diff --git a/include/oneapi/dpl/experimental/kt/esimd_radix_sort.h b/include/oneapi/dpl/experimental/kt/esimd_radix_sort.h index f765b415799..c32413b80ff 100644 --- a/include/oneapi/dpl/experimental/kt/esimd_radix_sort.h +++ b/include/oneapi/dpl/experimental/kt/esimd_radix_sort.h @@ -25,7 +25,7 @@ namespace oneapi::dpl::experimental::kt::gpu::esimd // TODO: make sure to provide sufficient diagnostic if input does not allow either reading or writing template std::enable_if_t, sycl::event> -radix_sort(sycl::queue __q, _KeysRng&& __keys_rng, _KernelParam __param = {}) +radix_sort(sycl::queue& __q, _KeysRng&& __keys_rng, _KernelParam __param = {}) { __impl::__check_esimd_sort_params<__radix_bits, _KernelParam::data_per_workitem, _KernelParam::workgroup_size>(); @@ -38,7 +38,7 @@ radix_sort(sycl::queue __q, _KeysRng&& __keys_rng, _KernelParam __param = {}) template std::enable_if_t, sycl::event> -radix_sort(sycl::queue __q, _KeysIterator __keys_first, _KeysIterator __keys_last, _KernelParam __param = {}) +radix_sort(sycl::queue& __q, _KeysIterator __keys_first, _KeysIterator __keys_last, _KernelParam __param = {}) { __impl::__check_esimd_sort_params<__radix_bits, _KernelParam::data_per_workitem, _KernelParam::workgroup_size>(); @@ -54,7 +54,7 @@ radix_sort(sycl::queue __q, _KeysIterator __keys_first, _KeysIterator __keys_las template std::enable_if_t, sycl::event> -radix_sort_by_key(sycl::queue __q, _KeysRng&& __keys_rng, _ValsRng&& __vals_rng, _KernelParam __param = {}) +radix_sort_by_key(sycl::queue& __q, _KeysRng&& __keys_rng, _ValsRng&& __vals_rng, _KernelParam __param = {}) { __impl::__check_esimd_sort_params<__radix_bits, _KernelParam::data_per_workitem, _KernelParam::workgroup_size>(); @@ -69,7 +69,7 @@ radix_sort_by_key(sycl::queue __q, _KeysRng&& __keys_rng, _ValsRng&& __vals_rng, template std::enable_if_t, sycl::event> -radix_sort_by_key(sycl::queue __q, _KeysIterator __keys_first, _KeysIterator __keys_last, _ValsIterator __vals_first, +radix_sort_by_key(sycl::queue& __q, _KeysIterator __keys_first, _KeysIterator __keys_last, _ValsIterator __vals_first, _KernelParam __param = {}) { __impl::__check_esimd_sort_params<__radix_bits, _KernelParam::data_per_workitem, _KernelParam::workgroup_size>(); @@ -89,7 +89,7 @@ radix_sort_by_key(sycl::queue __q, _KeysIterator __keys_first, _KeysIterator __k template std::enable_if_t, sycl::event> -radix_sort(sycl::queue __q, _KeysRng1&& __keys_rng, _KeysRng2&& __keys_rng_out, _KernelParam __param = {}) +radix_sort(sycl::queue& __q, _KeysRng1&& __keys_rng, _KeysRng2&& __keys_rng_out, _KernelParam __param = {}) { __impl::__check_esimd_sort_params<__radix_bits, _KernelParam::data_per_workitem, _KernelParam::workgroup_size>(); if (__keys_rng.size() == 0) @@ -104,7 +104,7 @@ radix_sort(sycl::queue __q, _KeysRng1&& __keys_rng, _KeysRng2&& __keys_rng_out, template std::enable_if_t, sycl::event> -radix_sort(sycl::queue __q, _KeysIterator1 __keys_first, _KeysIterator1 __keys_last, _KeysIterator2 __keys_out_first, +radix_sort(sycl::queue& __q, _KeysIterator1 __keys_first, _KeysIterator1 __keys_last, _KeysIterator2 __keys_out_first, _KernelParam __param = {}) { __impl::__check_esimd_sort_params<__radix_bits, _KernelParam::data_per_workitem, _KernelParam::workgroup_size>(); @@ -126,7 +126,7 @@ radix_sort(sycl::queue __q, _KeysIterator1 __keys_first, _KeysIterator1 __keys_l template std::enable_if_t, sycl::event> -radix_sort_by_key(sycl::queue __q, _KeysRng1&& __keys_rng, _ValsRng1&& __vals_rng, _KeysRng2&& __keys_out_rng, +radix_sort_by_key(sycl::queue& __q, 
_KeysRng1&& __keys_rng, _ValsRng1&& __vals_rng, _KeysRng2&& __keys_out_rng, _ValsRng2&& __vals_out_rng, _KernelParam __param = {}) { __impl::__check_esimd_sort_params<__radix_bits, _KernelParam::data_per_workitem, _KernelParam::workgroup_size>(); @@ -144,8 +144,9 @@ radix_sort_by_key(sycl::queue __q, _KeysRng1&& __keys_rng, _ValsRng1&& __vals_rn template std::enable_if_t, sycl::event> -radix_sort_by_key(sycl::queue __q, _KeysIterator1 __keys_first, _KeysIterator1 __keys_last, _ValsIterator1 __vals_first, - _KeysIterator2 __keys_out_first, _ValsIterator2 __vals_out_first, _KernelParam __param = {}) +radix_sort_by_key(sycl::queue& __q, _KeysIterator1 __keys_first, _KeysIterator1 __keys_last, + _ValsIterator1 __vals_first, _KeysIterator2 __keys_out_first, _ValsIterator2 __vals_out_first, + _KernelParam __param = {}) { __impl::__check_esimd_sort_params<__radix_bits, _KernelParam::data_per_workitem, _KernelParam::workgroup_size>(); diff --git a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_dispatchers.h b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_dispatchers.h index 9757c517aa5..21c10b290a6 100644 --- a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_dispatchers.h +++ b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_dispatchers.h @@ -51,7 +51,7 @@ class __esimd_radix_sort_onesweep_copyback_by_key; template sycl::event -__one_wg(sycl::queue __q, _RngPack&& __pack, ::std::size_t __n) +__one_wg(sycl::queue& __q, _RngPack&& __pack, std::size_t __n) { using _KeyT = typename ::std::decay_t<_RngPack>::_KeyT; using _EsimRadixSortKernel = @@ -64,7 +64,7 @@ __one_wg(sycl::queue __q, _RngPack&& __pack, ::std::size_t __n) template sycl::event -__one_wg(sycl::queue __q, _RngPack1&& __pack_in, _RngPack2&& __pack_out, ::std::size_t __n) +__one_wg(sycl::queue& __q, _RngPack1&& __pack_in, _RngPack2&& __pack_out, std::size_t __n) { using _KeyT = typename ::std::decay_t<_RngPack1>::_KeyT; using _EsimRadixSortKernel = diff --git a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h index 4fc274f2445..7f62546b824 100644 --- a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h +++ b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h @@ -41,7 +41,7 @@ struct __radix_sort_one_wg_submitter<__is_ascending, __radix_bits, __data_per_wo { template sycl::event - operator()(sycl::queue __q, _RngPack1&& __pack_in, _RngPack2&& __pack_out, ::std::size_t __n) const + operator()(sycl::queue& __q, _RngPack1&& __pack_in, _RngPack2&& __pack_out, std::size_t __n) const { sycl::nd_range<1> __nd_range{__work_group_size, __work_group_size}; return __q.submit([&](sycl::handler& __cgh) { diff --git a/include/oneapi/dpl/experimental/kt/single_pass_scan.h b/include/oneapi/dpl/experimental/kt/single_pass_scan.h index 754530e26bc..da8742e102b 100644 --- a/include/oneapi/dpl/experimental/kt/single_pass_scan.h +++ b/include/oneapi/dpl/experimental/kt/single_pass_scan.h @@ -145,7 +145,7 @@ struct __lookback_init_submitter<_FlagType, _Type, _BinaryOp, { template sycl::event - operator()(sycl::queue __q, _StatusFlags&& __status_flags, _PartialValues&& __partial_values, + operator()(sycl::queue& __q, _StatusFlags&& __status_flags, _PartialValues&& __partial_values, std::size_t __status_flags_size, std::uint16_t __status_flag_padding) const { return __q.submit([&](sycl::handler& __hdl) { @@ -276,8 +276,8 @@ struct 
__lookback_submitter<__data_per_workitem, __workgroup_size, _Type, _FlagT template sycl::event - operator()(sycl::queue __q, sycl::event __prev_event, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, - std::size_t __n, _StatusFlags&& __status_flags, std::size_t __status_flags_size, + operator()(sycl::queue& __q, sycl::event __prev_event, _InRng&& __in_rng, _OutRng&& __out_rng, + _BinaryOp __binary_op, std::size_t __n, _StatusFlags&& __status_flags, std::size_t __status_flags_size, _StatusValues&& __status_vals_full, _StatusValues&& __status_vals_partial, std::size_t __current_num_items) const { @@ -304,7 +304,7 @@ struct __lookback_submitter<__data_per_workitem, __workgroup_size, _Type, _FlagT template sycl::event -__single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op, _KernelParam) +__single_pass_scan(sycl::queue& __q, _InRange&& __in_rng, _OutRange&& __out_rng, _BinaryOp __binary_op, _KernelParam) { using _Type = oneapi::dpl::__internal::__value_t<_InRange>; using _FlagType = __scan_status_flag<_Type>; @@ -326,19 +326,19 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r "Only binary operators with known identity values are supported"); assert("This device does not support 64-bit atomics" && - (sizeof(_Type) < 8 || __queue.get_device().has(sycl::aspect::atomic64))); + (sizeof(_Type) < 8 || __q.get_device().has(sycl::aspect::atomic64))); // Next power of 2 greater than or equal to __n auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(__n); // Perform a single-work group scan if the input is small - if (oneapi::dpl::__par_backend_hetero::__group_scan_fits_in_slm<_Type>(__queue, __n, __n_uniform, /*limit=*/16384)) + if (oneapi::dpl::__par_backend_hetero::__group_scan_fits_in_slm<_Type>(__q, __n, __n_uniform, /*limit=*/16384)) { - return oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_single_group( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::execution::__dpl::make_device_policy(__queue), - std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), __n, - oneapi::dpl::__internal::__no_op{}, unseq_backend::__no_init_value<_Type>{}, __binary_op, std::true_type{}); + return oneapi::dpl::__par_backend_hetero::__parallel_transform_scan_single_group< + typename _KernelParam::kernel_name>(oneapi::dpl::__internal::__device_backend_tag{}, __q, + std::forward<_InRange>(__in_rng), std::forward<_OutRange>(__out_rng), + __n, oneapi::dpl::__internal::__no_op{}, + unseq_backend::__no_init_value<_Type>{}, __binary_op, std::true_type{}); } constexpr std::size_t __workgroup_size = _KernelParam::workgroup_size; @@ -358,7 +358,7 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r std::size_t __mem_bytes = __status_flags_bytes + __status_vals_full_offset_bytes + __status_vals_partial_offset_bytes + __mem_align_pad; - std::byte* __device_mem = reinterpret_cast(sycl::malloc_device(__mem_bytes, __queue)); + std::byte* __device_mem = reinterpret_cast(sycl::malloc_device(__mem_bytes, __q)); if (!__device_mem) throw std::bad_alloc(); @@ -372,14 +372,14 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r reinterpret_cast<_Type*>(__status_vals_full + __status_vals_full_offset_bytes / sizeof(_Type)); auto __fill_event = __lookback_init_submitter<_FlagType, _Type, _BinaryOp, _LookbackInitKernel>{}( - __queue, __status_flags, __status_vals_partial, __status_flags_size, __status_flag_padding); + __q, 
__status_flags, __status_vals_partial, __status_flags_size, __status_flag_padding); std::size_t __current_num_wgs = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __elems_in_tile); std::size_t __current_num_items = __current_num_wgs * __workgroup_size; auto __prev_event = __lookback_submitter<__data_per_workitem, __workgroup_size, _Type, _FlagType, _LookbackKernel>{}( - __queue, __fill_event, __in_rng, __out_rng, __binary_op, __n, __status_flags, __status_flags_size, + __q, __fill_event, __in_rng, __out_rng, __binary_op, __n, __status_flags, __status_flags_size, __status_vals_full, __status_vals_partial, __current_num_items); // TODO: Currently, the following portion of code makes this entire function synchronous. @@ -388,15 +388,15 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r // we should replace this code with the asynchronous version below. if (0) { - return __queue.submit([=](sycl::handler& __hdl) { + return __q.submit([=](sycl::handler& __hdl) { __hdl.depends_on(__prev_event); - __hdl.host_task([=]() { sycl::free(__device_mem, __queue); }); + __hdl.host_task([=]() { sycl::free(__device_mem, __q); }); }); } else { __prev_event.wait(); - sycl::free(__device_mem, __queue); + sycl::free(__device_mem, __q); return __prev_event; } } @@ -405,18 +405,18 @@ __single_pass_scan(sycl::queue __queue, _InRange&& __in_rng, _OutRange&& __out_r template sycl::event -inclusive_scan(sycl::queue __queue, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, +inclusive_scan(sycl::queue& __q, _InRng&& __in_rng, _OutRng&& __out_rng, _BinaryOp __binary_op, _KernelParam __param = {}) { auto __in_view = oneapi::dpl::__ranges::views::all(std::forward<_InRng>(__in_rng)); auto __out_view = oneapi::dpl::__ranges::views::all(std::forward<_OutRng>(__out_rng)); - return __impl::__single_pass_scan(__queue, std::move(__in_view), std::move(__out_view), __binary_op, __param); + return __impl::__single_pass_scan(__q, std::move(__in_view), std::move(__out_view), __binary_op, __param); } template sycl::event -inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, +inclusive_scan(sycl::queue& __q, _InIterator __in_begin, _InIterator __in_end, _OutIterator __out_begin, _BinaryOp __binary_op, _KernelParam __param = {}) { auto __n = __in_end - __in_begin; @@ -426,7 +426,7 @@ inclusive_scan(sycl::queue __queue, _InIterator __in_begin, _InIterator __in_end auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutIterator>(); auto __buf2 = __keep2(__out_begin, __out_begin + __n); - return __impl::__single_pass_scan(__queue, __buf1.all_view(), __buf2.all_view(), __binary_op, __param); + return __impl::__single_pass_scan(__q, __buf1.all_view(), __buf2.all_view(), __binary_op, __param); } } // namespace gpu diff --git a/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h b/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h index 05c56140db2..032c05e7ebe 100644 --- a/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h +++ b/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h @@ -140,9 +140,13 @@ __pattern_transform_reduce_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& _ oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _RandomAccessIterator2>(); auto __buf2 = __keep2(__first2, __first2 + __n); - return oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_RepackedTp, - ::std::true_type /*is_commutative*/>( - 
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __binary_op1, _Functor{__binary_op2}, + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + + return oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_CustomName, _RepackedTp, + std::true_type /*is_commutative*/>( + _BackendTag{}, __q_local, __binary_op1, _Functor{__binary_op2}, unseq_backend::__init_value<_RepackedTp>{__init}, // initial value __buf1.all_view(), __buf2.all_view()); } @@ -166,9 +170,13 @@ __pattern_transform_reduce_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& _ auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _ForwardIterator>(); auto __buf = __keep(__first, __last); - return oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_RepackedTp, - ::std::true_type /*is_commutative*/>( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __binary_op, _Functor{__unary_op}, + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + + return oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_CustomName, _RepackedTp, + std::true_type /*is_commutative*/>( + _BackendTag{}, __q_local, __binary_op, _Functor{__unary_op}, unseq_backend::__init_value<_RepackedTp>{__init}, // initial value __buf.all_view()); } @@ -204,9 +212,13 @@ __pattern_transform_scan_base_async(__hetero_tag<_BackendTag>, _ExecutionPolicy& auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _Iterator2>(); auto __buf2 = __keep2(__result, __result + __n); - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_transform_scan( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __n, __unary_op, - __init, __binary_op, _Inclusive{}); + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_transform_scan<_CustomName>( + _BackendTag{}, __q_local, __buf1.all_view(), __buf2.all_view(), __n, __unary_op, __init, __binary_op, + _Inclusive{}); return __res.__make_future(__result + __n); } diff --git a/include/oneapi/dpl/internal/async_impl/glue_async_impl.h b/include/oneapi/dpl/internal/async_impl/glue_async_impl.h index b148f13688b..5a24ffeaa52 100644 --- a/include/oneapi/dpl/internal/async_impl/glue_async_impl.h +++ b/include/oneapi/dpl/internal/async_impl/glue_async_impl.h @@ -102,8 +102,12 @@ sort_async(_ExecutionPolicy&& __exec, _Iterator __first, _Iterator __last, _Comp const auto __dispatch_tag = oneapi::dpl::__internal::__select_backend(__exec, __first); using __backend_tag = typename decltype(__dispatch_tag)::__backend_tag; - return __par_backend_hetero::__parallel_stable_sort(__backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), - __buf.all_view(), __comp, oneapi::dpl::identity{}); + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + return __par_backend_hetero::__parallel_stable_sort<_CustomName>(__backend_tag{}, __q_local, __buf.all_view(), + __comp, oneapi::dpl::identity{}); } template class __seg_scan_prefix_kernel; -template +template struct __sycl_scan_by_segment_impl { template @@ -105,20 +105,16 @@ struct __sycl_scan_by_segment_impl template using _SegScanPrefixPhase = 
__seg_scan_prefix_kernel<__is_inclusive, _Name...>; - template + template void - operator()(_BackendTag, _ExecutionPolicy&& __exec, _Range1&& __keys, _Range2&& __values, _Range3&& __out_values, + operator()(_BackendTag, sycl::queue& __q, _Range1&& __keys, _Range2&& __values, _Range3&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op, _T __init, _T __identity) { - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using _SegScanWgKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _SegScanWgPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _BinaryPredicate, - _BinaryOperator>; + _SegScanWgPhase, _CustomName, _Range1, _Range2, _Range3, _BinaryPredicate, _BinaryOperator>; using _SegScanPrefixKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _SegScanPrefixPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _BinaryPredicate, - _BinaryOperator>; + _SegScanPrefixPhase, _CustomName, _Range1, _Range2, _Range3, _BinaryPredicate, _BinaryOperator>; using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; @@ -129,21 +125,21 @@ struct __sycl_scan_by_segment_impl // Limit the work-group size to prevent large sizes on CPUs. Empirically found value. // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future. - std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec.queue(), (std::size_t)2048); + std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__q, (std::size_t)2048); // We require 2 * sizeof(__val_type) * __wgroup_size of SLM for the work group segmented scan. We add // an additional sizeof(__val_type) * __wgroup_size requirement to ensure sufficient SLM for the group algorithms. - __wgroup_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec.queue(), 3 * sizeof(__val_type), - __wgroup_size); + __wgroup_size = + oneapi::dpl::__internal::__slm_adjusted_work_group_size(__q, 3 * sizeof(__val_type), __wgroup_size); #if _ONEDPL_COMPILE_KERNEL auto __seg_scan_wg_kernel = - __par_backend_hetero::__internal::__kernel_compiler<_SegScanWgKernel>::__compile(__exec.queue()); + __par_backend_hetero::__internal::__kernel_compiler<_SegScanWgKernel>::__compile(__q); auto __seg_scan_prefix_kernel = - __par_backend_hetero::__internal::__kernel_compiler<_SegScanPrefixKernel>::__compile(__exec.queue()); - __wgroup_size = ::std::min( - {__wgroup_size, oneapi::dpl::__internal::__kernel_work_group_size(__exec.queue(), __seg_scan_wg_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec.queue(), __seg_scan_prefix_kernel)}); + __par_backend_hetero::__internal::__kernel_compiler<_SegScanPrefixKernel>::__compile(__q); + __wgroup_size = + std::min({__wgroup_size, oneapi::dpl::__internal::__kernel_work_group_size(__q, __seg_scan_wg_kernel), + oneapi::dpl::__internal::__kernel_work_group_size(__q, __seg_scan_prefix_kernel)}); #endif ::std::size_t __n_groups = __internal::__dpl_ceiling_div(__n, __wgroup_size * __vals_per_item); @@ -154,7 +150,7 @@ struct __sycl_scan_by_segment_impl auto __seg_ends = oneapi::dpl::__par_backend_hetero::__buffer(__n_groups).get_buffer(); // 1. 
Work group reduction - auto __wg_scan = __exec.queue().submit([&](sycl::handler& __cgh) { + auto __wg_scan = __q.submit([&](sycl::handler& __cgh) { auto __partials_acc = __partials.template get_access(__cgh); auto __seg_ends_acc = __seg_ends.template get_access(__cgh); @@ -254,8 +250,7 @@ struct __sycl_scan_by_segment_impl }); // 2. Apply work group carry outs, calculate output indices, and load results into correct indices. - __exec.queue() - .submit([&](sycl::handler& __cgh) { + __q.submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __keys, __out_values); auto __partials_acc = __partials.template get_access(__cgh); @@ -392,9 +387,14 @@ __scan_by_segment_impl_common(__internal::__hetero_tag<_BackendTag>, Policy&& po constexpr iter_value_t identity = unseq_backend::__known_identity; - __sycl_scan_by_segment_impl()(_BackendTag{}, ::std::forward(policy), key_buf.all_view(), - value_buf.all_view(), value_output_buf.all_view(), binary_pred, - binary_op, init, identity); + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name; + + sycl::queue __q_local = policy.queue(); + + __sycl_scan_by_segment_impl<_CustomName, Inclusive::value>{}(_BackendTag{}, __q_local, key_buf.all_view(), + value_buf.all_view(), value_output_buf.all_view(), + binary_pred, binary_op, init, identity); + return result + n; } diff --git a/include/oneapi/dpl/pstl/glue_algorithm_impl.h b/include/oneapi/dpl/pstl/glue_algorithm_impl.h index d8c83cb9a53..d506f246fd1 100644 --- a/include/oneapi/dpl/pstl/glue_algorithm_impl.h +++ b/include/oneapi/dpl/pstl/glue_algorithm_impl.h @@ -78,7 +78,7 @@ for_each(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator _ { const auto __dispatch_tag = oneapi::dpl::__internal::__select_backend(__exec, __first); - oneapi::dpl::__internal::__pattern_walk1(__dispatch_tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, + oneapi::dpl::__internal::__pattern_walk1(__dispatch_tag, std::forward<_ExecutionPolicy>(__exec), __first, __last, __f); } diff --git a/include/oneapi/dpl/pstl/glue_memory_impl.h b/include/oneapi/dpl/pstl/glue_memory_impl.h index af3bf00fc96..e8fca9b7824 100644 --- a/include/oneapi/dpl/pstl/glue_memory_impl.h +++ b/include/oneapi/dpl/pstl/glue_memory_impl.h @@ -224,6 +224,7 @@ struct __destroy_fn void operator()(_ReferenceType __val) const { + static_assert(std::is_reference_v<_ReferenceType>); __val.~_ValueType(); } }; diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index bb0f7e0cc3d..e3fcf86ee71 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -532,11 +532,15 @@ __pattern_min_element(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Ite auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); auto __buf = __keep(__first, __last); - auto __ret_idx = oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_ReduceValueType, _Commutative>( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __reduce_fn, __transform_fn, - unseq_backend::__no_init_value{}, // no initial value - __buf.all_view()) - .get(); + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + + auto __ret_idx = + oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_CustomName, _ReduceValueType, _Commutative>( + _BackendTag{}, __q_local, 
__reduce_fn, __transform_fn, unseq_backend::__no_init_value{}, // no initial value + __buf.all_view()) + .get(); return __first + ::std::get<0>(__ret_idx); } @@ -560,7 +564,7 @@ __pattern_min_element(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Ite // template -struct __pattern_minmax_element__reduce_fn +struct __pattern_minmax_element_reduce_fn { _Compare __comp; @@ -605,7 +609,7 @@ __pattern_minmax_element(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ // This operator doesn't track the lowest found index in case of equal min. values and the highest found index in // case of equal max. values. Thus, this operator is not commutative. - __pattern_minmax_element__reduce_fn<_Compare, _ReduceValueType> __reduce_fn{__comp}; + __pattern_minmax_element_reduce_fn<_Compare, _ReduceValueType> __reduce_fn{__comp}; // TODO: Doesn't work with `zip_iterator`. // In that case the first and the second arguments of `_ReduceValueType` will be @@ -615,12 +619,16 @@ __pattern_minmax_element(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); auto __buf = __keep(__first, __last); - auto __ret = oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_ReduceValueType, - ::std::false_type /*is_commutative*/>( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __reduce_fn, __transform_fn, - unseq_backend::__no_init_value{}, // no initial value - __buf.all_view()) - .get(); + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + + auto __ret = + oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_CustomName, _ReduceValueType, + std::false_type /*is_commutative*/>( + _BackendTag{}, __q_local, __reduce_fn, __transform_fn, unseq_backend::__no_init_value{}, // no initial value + __buf.all_view()) + .get(); return ::std::make_pair<_Iterator, _Iterator>(__first + ::std::get<0>(__ret), __first + ::std::get<1>(__ret)); } @@ -637,6 +645,8 @@ __pattern_adjacent_find(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _I if (__last - __first < 2) return __last; + sycl::queue __q_local = __exec.queue(); + using _Predicate = oneapi::dpl::unseq_backend::single_match_pred>; auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); @@ -646,9 +656,11 @@ __pattern_adjacent_find(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _I // TODO: in case of conflicting names // __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__or_policy_wrapper>() - bool result = __par_backend_hetero::__parallel_find_or( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - _Predicate{adjacent_find_fn<_BinaryPredicate>{__predicate}}, __par_backend_hetero::__parallel_or_tag{}, + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + bool result = __par_backend_hetero::__parallel_find_or<_CustomName>( + _BackendTag{}, __q_local, _Predicate{adjacent_find_fn<_BinaryPredicate>{__predicate}}, + __par_backend_hetero::__parallel_or_tag{}, oneapi::dpl::__ranges::make_zip_view(__buf1.all_view(), __buf2.all_view())); // inverted conditional because of @@ -666,15 +678,19 @@ __pattern_adjacent_find(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _I using _Predicate = oneapi::dpl::unseq_backend::single_match_pred>; - auto __result = __par_backend_hetero::__parallel_find( - _BackendTag{}, 
::std::forward<_ExecutionPolicy>(__exec), + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + auto __result = __par_backend_hetero::__parallel_find<_CustomName>( + _BackendTag{}, __q_local, __par_backend_hetero::zip( __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__first), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__first + 1)), __par_backend_hetero::zip( __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__last - 1), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__last)), - _Predicate{adjacent_find_fn<_BinaryPredicate>{__predicate}}, ::std::true_type{}); + _Predicate{adjacent_find_fn<_BinaryPredicate>{__predicate}}, std::true_type{}); auto __zip_at_first = __par_backend_hetero::zip( __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__first), @@ -718,9 +734,13 @@ __pattern_count(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); auto __buf = __keep(__first, __last); - return oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_ReduceValueType, - ::std::true_type /*is_commutative*/>( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __reduce_fn, __transform_fn, + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + + return oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_CustomName, _ReduceValueType, + std::true_type /*is_commutative*/>( + _BackendTag{}, __q_local, __reduce_fn, __transform_fn, unseq_backend::__no_init_value{}, // no initial value __buf.all_view()) .get(); @@ -743,11 +763,13 @@ __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); auto __buf = __keep(__first, __last); - return oneapi::dpl::__par_backend_hetero::__parallel_find_or( - _BackendTag{}, - __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__or_policy_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, __par_backend_hetero::__parallel_or_tag{}, __buf.all_view()); + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + + return oneapi::dpl::__par_backend_hetero::__parallel_find_or< + __par_backend_hetero::__or_policy_wrapper<_CustomName>>( + _BackendTag{}, __q_local, _Predicate{__pred}, __par_backend_hetero::__parallel_or_tag{}, __buf.all_view()); } //------------------------------------------------------------------------ @@ -762,6 +784,8 @@ __pattern_equal(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator1 if (__last1 == __first1 || __last2 == __first2 || __last1 - __first1 != __last2 - __first2) return false; + sycl::queue __q_local = __exec.queue(); + using _Predicate = oneapi::dpl::unseq_backend::single_match_pred>; auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); @@ -771,9 +795,10 @@ __pattern_equal(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator1 // TODO: in case of conflicting names // __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__or_policy_wrapper>() - return 
!__par_backend_hetero::__parallel_find_or( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), _Predicate{equal_predicate<_Pred>{__pred}}, - __par_backend_hetero::__parallel_or_tag{}, + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + return !__par_backend_hetero::__parallel_find_or<_CustomName>( + _BackendTag{}, __q_local, _Predicate{equal_predicate<_Pred>{__pred}}, __par_backend_hetero::__parallel_or_tag{}, oneapi::dpl::__ranges::make_zip_view(__buf1.all_view(), __buf2.all_view())); } @@ -804,11 +829,15 @@ __pattern_find_if(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato using _Predicate = oneapi::dpl::unseq_backend::single_match_pred<_Pred>; - return __par_backend_hetero::__parallel_find( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + return __par_backend_hetero::__parallel_find<_CustomName>( + _BackendTag{}, __q_local, __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__first), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__last), _Predicate{__pred}, - ::std::true_type{}); + std::true_type{}); } //------------------------------------------------------------------------ @@ -833,13 +862,17 @@ __pattern_find_end(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _ { using _Predicate = unseq_backend::multiple_match_pred<_Pred>; - return __par_backend_hetero::__parallel_find( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + return __par_backend_hetero::__parallel_find<_CustomName>( + _BackendTag{}, __q_local, __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__first), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__last), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__s_first), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__s_last), _Predicate{__pred}, - ::std::false_type{}); + std::false_type{}); } } @@ -857,15 +890,19 @@ __pattern_find_first_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _I using _Predicate = unseq_backend::first_match_pred<_Pred>; + sycl::queue __q_local = __exec.queue(); + // TODO: To check whether it makes sense to iterate over the second sequence in case of // distance(__first, __last) < distance(__s_first, __s_last). 
- return __par_backend_hetero::__parallel_find( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + return __par_backend_hetero::__parallel_find<_CustomName>( + _BackendTag{}, __q_local, __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__first), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__last), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__s_first), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__s_last), _Predicate{__pred}, - ::std::true_type{}); + std::true_type{}); } //------------------------------------------------------------------------ @@ -897,13 +934,18 @@ __pattern_search(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _It } using _Predicate = unseq_backend::multiple_match_pred<_Pred>; - return __par_backend_hetero::__parallel_find( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + return __par_backend_hetero::__parallel_find<_CustomName>( + _BackendTag{}, __q_local, __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__first), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__last), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__s_first), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__s_last), _Predicate{__pred}, - ::std::true_type{}); + std::true_type{}); } //------------------------------------------------------------------------ @@ -945,11 +987,16 @@ __pattern_search_n(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _ } using _Predicate = unseq_backend::n_elem_match_pred<_BinaryPredicate, _Tp, _Size>; - return __par_backend_hetero::__parallel_find( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + return __par_backend_hetero::__parallel_find<_CustomName>( + _BackendTag{}, __q_local, __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__first), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__last), - _Predicate{__pred, __value, __count}, ::std::true_type{}); + _Predicate{__pred, __value, __count}, std::true_type{}); } //------------------------------------------------------------------------ @@ -970,9 +1017,15 @@ __pattern_mismatch(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterat auto __first_zip = __par_backend_hetero::zip( __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__first1), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__first2)); - auto __result = __par_backend_hetero::__parallel_find( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __first_zip, __first_zip + __n, - _Predicate{equal_predicate<_Pred>{__pred}}, ::std::true_type{}); + + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + auto __result = __par_backend_hetero::__parallel_find<_CustomName>( + _BackendTag{}, __q_local, __first_zip, __first_zip + __n, _Predicate{equal_predicate<_Pred>{__pred}}, + std::true_type{}); + __n = __result 
- __first_zip; return ::std::make_pair(__first1 + __n, __first2 + __n); } @@ -999,8 +1052,12 @@ __pattern_copy_if(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _Iterator2>(); auto __buf2 = __keep2(__result_first, __result_first + __n); - auto __res = __par_backend_hetero::__parallel_copy_if(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - __buf1.all_view(), __buf2.all_view(), __n, __pred); + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + + auto __res = __par_backend_hetero::__parallel_copy_if<_CustomName>(_BackendTag{}, __q_local, __buf1.all_view(), + __buf2.all_view(), __n, __pred); ::std::size_t __num_copied = __res.get(); //is a blocking call return __result_first + __num_copied; @@ -1034,8 +1091,12 @@ __pattern_partition_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, decltype(__zipped_res)>(); auto __buf2 = __keep2(__zipped_res, __zipped_res + __n); - auto __result = oneapi::dpl::__par_backend_hetero::__parallel_partition_copy( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __pred); + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + auto __result = oneapi::dpl::__par_backend_hetero::__parallel_partition_copy<_CustomName>( + _BackendTag{}, __q_local, __buf1.all_view(), __buf2.all_view(), __pred); _It1DifferenceType __num_true = __result.get(); // blocking call @@ -1072,8 +1133,12 @@ __pattern_unique_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Ite auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _Iterator2>(); auto __buf2 = __keep2(__result_first, __result_first + __n); - auto __result = oneapi::dpl::__par_backend_hetero::__parallel_unique_copy( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __pred); + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + auto __result = oneapi::dpl::__par_backend_hetero::__parallel_unique_copy<_CustomName>( + _BackendTag{}, __q_local, __buf1.all_view(), __buf2.all_view(), __pred); return __result_first + __result.get(); // is a blocking call } @@ -1190,12 +1255,16 @@ __pattern_is_partitioned(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); auto __buf = __keep(__first, __last); - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_ReduceValueType, - ::std::false_type /*is_commutative*/>( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __reduce_fn, __transform_fn, - unseq_backend::__no_init_value{}, // no initial value - __buf.all_view()) - .get(); + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + + auto __res = + oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_CustomName, _ReduceValueType, + std::false_type /*is_commutative*/>( + _BackendTag{}, __q_local, __reduce_fn, __transform_fn, unseq_backend::__no_init_value{}, // no initial value + __buf.all_view()) + .get(); return __broken != 
__reduce_fn(_ReduceValueType{__all_true}, __res); } @@ -1229,11 +1298,15 @@ __pattern_is_heap_until(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _R using _Predicate = oneapi::dpl::unseq_backend::single_match_pred_by_idx<__is_heap_check<_Compare>>; - return __par_backend_hetero::__parallel_find( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + return __par_backend_hetero::__parallel_find<_CustomName>( + _BackendTag{}, __q_local, __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__first), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__last), _Predicate{__comp}, - ::std::true_type{}); + std::true_type{}); } template @@ -1246,8 +1319,12 @@ __pattern_is_heap(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _RandomA using _Predicate = oneapi::dpl::unseq_backend::single_match_pred_by_idx<__is_heap_check<_Compare>>; - return !__par_backend_hetero::__parallel_or( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + return !__par_backend_hetero::__parallel_or<_CustomName>( + _BackendTag{}, __q_local, __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__first), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__last), _Predicate{__comp}); } @@ -1291,8 +1368,12 @@ __pattern_merge(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Ite auto __keep3 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _Iterator3>(); auto __buf3 = __keep3(__d_first, __d_first + __n); - __par_backend_hetero::__parallel_merge(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - __buf1.all_view(), __buf2.all_view(), __buf3.all_view(), __comp) + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + + __par_backend_hetero::__parallel_merge<_CustomName>(_BackendTag{}, __q_local, __buf1.all_view(), + __buf2.all_view(), __buf3.all_view(), __comp) .__deferrable_wait(); } return __d_first + __n; @@ -1349,8 +1430,12 @@ __stable_sort_with_projection(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __ex auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator>(); auto __buf = __keep(__first, __last); - __par_backend_hetero::__parallel_stable_sort(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - __buf.all_view(), __comp, __proj) + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + __par_backend_hetero::__parallel_stable_sort<_CustomName>(_BackendTag{}, __q_local, __buf.all_view(), __comp, + __proj) .__deferrable_wait(); } @@ -1506,11 +1591,14 @@ __pattern_lexicographical_compare(__hetero_tag<_BackendTag>, _ExecutionPolicy&& auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator2>(); auto __buf2 = __keep2(__first2, __first2 + __shared_size); + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + auto __ret_idx = - oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_ReduceValueType, - ::std::false_type /*is_commutative*/>( - 
_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __reduce_fn, __transform_fn, - unseq_backend::__no_init_value{}, // no initial value + oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_CustomName, _ReduceValueType, + std::false_type /*is_commutative*/>( + _BackendTag{}, __q_local, __reduce_fn, __transform_fn, unseq_backend::__no_init_value{}, // no initial value __buf1.all_view(), __buf2.all_view()) .get(); @@ -1536,8 +1624,13 @@ __pattern_includes(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Forwar typedef typename ::std::iterator_traits<_ForwardIterator2>::difference_type _Size2; using __brick_include_type = unseq_backend::__brick_includes<_Compare, _Size1, _Size2>; - return !__par_backend_hetero::__parallel_or( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + return !__par_backend_hetero::__parallel_or<_CustomName>( + _BackendTag{}, __q_local, __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__first2), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__last2), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read>(__first1), @@ -1557,8 +1650,12 @@ __pattern_partial_sort(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _It if (__last - __first < 2) return; - __par_backend_hetero::__parallel_partial_sort( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + + __par_backend_hetero::__parallel_partial_sort<_CustomName>( + _BackendTag{}, __q_local, __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read_write>(__first), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read_write>(__mid), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read_write>(__last), __comp) @@ -1653,8 +1750,12 @@ __pattern_partial_sort_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& // sycl::buffer and sycl accessors. SYCL runtime makes a dependency graph to prevent the races between // the patterns: __pattern_walk2, __parallel_partial_sort and __pattern_walk2. 
- __par_backend_hetero::__parallel_partial_sort( - _BackendTag{}, __par_backend_hetero::make_wrapped_policy<__partial_sort_2>(__exec), + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + + __par_backend_hetero::__parallel_partial_sort<__partial_sort_2<_CustomName>>( + _BackendTag{}, __q_local, __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read_write>(__buf_first), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read_write>(__buf_mid), __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::read_write>(__buf_last), __comp); @@ -1817,7 +1918,7 @@ __pattern_rotate_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Bid const auto __shift = __new_first - __first; oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), unseq_backend::__rotate_copy::difference_type, decltype(__buf1.all_view()), decltype(__buf2.all_view())>{__n, __shift}, __n, __buf1.all_view(), __buf2.all_view()) @@ -1847,9 +1948,13 @@ __pattern_hetero_set_op(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _F auto __keep3 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutputIterator>(); auto __buf3 = __keep3(__result, __result + __n1); - auto __result_size = __par_backend_hetero::__parallel_set_op(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), - __buf1.all_view(), __buf2.all_view(), - __buf3.all_view(), __comp, __is_op_difference) + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + auto __result_size = __par_backend_hetero::__parallel_set_op<_CustomName>( + _BackendTag{}, __q_local, __buf1.all_view(), __buf2.all_view(), __buf3.all_view(), __comp, + __is_op_difference) .get(); return __result + __result_size; diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index eac1406f824..60db3178a7b 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -228,12 +228,16 @@ __pattern_equal(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& if (__rng1.empty() || __rng2.empty() || __rng1.size() != __rng2.size()) return false; + sycl::queue __q_local = __exec.queue(); + using _Predicate = oneapi::dpl::unseq_backend::single_match_pred>; // TODO: in case of conflicting names // __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__or_policy_wrapper>() - return !oneapi::dpl::__par_backend_hetero::__parallel_find_or( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), _Predicate{equal_predicate<_Pred>{__pred}}, + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + return !oneapi::dpl::__par_backend_hetero::__parallel_find_or<_CustomName>( + _BackendTag{}, __q_local, _Predicate{equal_predicate<_Pred>{__pred}}, oneapi::dpl::__par_backend_hetero::__parallel_or_tag{}, oneapi::dpl::__ranges::zip_view(::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2))); } @@ -265,14 +269,16 @@ __pattern_find_if(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range&& if (__rng.empty()) return __rng.size(); + sycl::queue __q_local = __exec.queue(); + using _Predicate = 
oneapi::dpl::unseq_backend::single_match_pred<_Pred>; using _TagType = oneapi::dpl::__par_backend_hetero::__parallel_find_forward_tag<_Range>; - return oneapi::dpl::__par_backend_hetero::__parallel_find_or( - _BackendTag{}, - __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__find_policy_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, _TagType{}, ::std::forward<_Range>(__rng)); + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + return oneapi::dpl::__par_backend_hetero::__parallel_find_or< + __par_backend_hetero::__find_policy_wrapper<_CustomName>>(_BackendTag{}, __q_local, _Predicate{__pred}, + _TagType{}, std::forward<_Range>(__rng)); } #if _ONEDPL_CPP20_RANGES_PRESENT @@ -309,14 +315,17 @@ __pattern_find_end(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _ return __res ? 0 : __rng1.size(); } + sycl::queue __q_local = __exec.queue(); + using _Predicate = unseq_backend::multiple_match_pred<_Pred>; using _TagType = __par_backend_hetero::__parallel_find_backward_tag<_Range1>; - return oneapi::dpl::__par_backend_hetero::__parallel_find_or( - _BackendTag{}, - __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__find_policy_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, _TagType{}, ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + return oneapi::dpl::__par_backend_hetero::__parallel_find_or< + __par_backend_hetero::__find_policy_wrapper<_CustomName>>(_BackendTag{}, __q_local, _Predicate{__pred}, + _TagType{}, std::forward<_Range1>(__rng1), + std::forward<_Range2>(__rng2)); } //------------------------------------------------------------------------ @@ -332,15 +341,18 @@ __pattern_find_first_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _R if (__rng1.empty() || __rng2.empty()) return __rng1.size(); + sycl::queue __q_local = __exec.queue(); + using _Predicate = unseq_backend::first_match_pred<_Pred>; using _TagType = oneapi::dpl::__par_backend_hetero::__parallel_find_forward_tag<_Range1>; + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + //TODO: To check whether it makes sense to iterate over the second sequence in case of __rng1.size() < __rng2.size() - return oneapi::dpl::__par_backend_hetero::__parallel_find_or( - _BackendTag{}, - __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__find_policy_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, _TagType{}, ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); + return oneapi::dpl::__par_backend_hetero::__parallel_find_or< + __par_backend_hetero::__find_policy_wrapper<_CustomName>>(_BackendTag{}, __q_local, _Predicate{__pred}, + _TagType{}, std::forward<_Range1>(__rng1), + std::forward<_Range2>(__rng2)); } //------------------------------------------------------------------------ @@ -354,12 +366,16 @@ __pattern_any_of(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range&& if (__rng.empty()) return false; + sycl::queue __q_local = __exec.queue(); + using _Predicate = oneapi::dpl::unseq_backend::single_match_pred<_Pred>; - return oneapi::dpl::__par_backend_hetero::__parallel_find_or( - _BackendTag{}, - __par_backend_hetero::make_wrapped_policy( - ::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, oneapi::dpl::__par_backend_hetero::__parallel_or_tag{}, 
::std::forward<_Range>(__rng)); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + return oneapi::dpl::__par_backend_hetero::__parallel_find_or< + oneapi::dpl::__par_backend_hetero::__or_policy_wrapper<_CustomName>>( + _BackendTag{}, __q_local, _Predicate{__pred}, oneapi::dpl::__par_backend_hetero::__parallel_or_tag{}, + std::forward<_Range>(__rng)); } #if _ONEDPL_CPP20_RANGES_PRESENT @@ -405,11 +421,14 @@ __pattern_search(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Ra using _Predicate = unseq_backend::multiple_match_pred<_Pred>; using _TagType = oneapi::dpl::__par_backend_hetero::__parallel_find_forward_tag<_Range1>; - return oneapi::dpl::__par_backend_hetero::__parallel_find_or( - _BackendTag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy< - oneapi::dpl::__par_backend_hetero::__find_policy_wrapper>(::std::forward<_ExecutionPolicy>(__exec)), - _Predicate{__pred}, _TagType{}, ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)); + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + return oneapi::dpl::__par_backend_hetero::__parallel_find_or< + oneapi::dpl::__par_backend_hetero::__find_policy_wrapper<_CustomName>>( + _BackendTag{}, __q_local, _Predicate{__pred}, _TagType{}, std::forward<_Range1>(__rng1), + std::forward<_Range2>(__rng2)); } #if _ONEDPL_CPP20_RANGES_PRESENT @@ -508,6 +527,8 @@ __pattern_adjacent_find(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _R if (__rng.size() < 2) return __rng.size(); + sycl::queue __q_local = __exec.queue(); + using _Predicate = oneapi::dpl::unseq_backend::single_match_pred>; using _TagType = ::std::conditional_t<__is__or_semantic(), oneapi::dpl::__par_backend_hetero::__parallel_or_tag, oneapi::dpl::__par_backend_hetero::__parallel_find_forward_tag<_Range>>; @@ -522,9 +543,10 @@ __pattern_adjacent_find(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _R // TODO: in case of conflicting names // __par_backend_hetero::make_wrapped_policy<__par_backend_hetero::__or_policy_wrapper>() - auto result = oneapi::dpl::__par_backend_hetero::__parallel_find_or( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - _Predicate{adjacent_find_fn<_BinaryPredicate>{__predicate}}, _TagType{}, + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + auto result = oneapi::dpl::__par_backend_hetero::__parallel_find_or<_CustomName>( + _BackendTag{}, __q_local, _Predicate{adjacent_find_fn<_BinaryPredicate>{__predicate}}, _TagType{}, oneapi::dpl::__ranges::zip_view(__rng1, __rng2)); // inverted conditional because of @@ -591,11 +613,15 @@ __pattern_count(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range&& _ auto __reduce_fn = ::std::plus<_ReduceValueType>{}; __pattern_count_transform_fn<_Predicate> __transform_fn{__predicate}; - return oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_ReduceValueType, - ::std::true_type /*is_commutative*/>( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __reduce_fn, __transform_fn, + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + + return oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_CustomName, _ReduceValueType, + std::true_type /*is_commutative*/>( + _BackendTag{}, __q_local, __reduce_fn, __transform_fn, unseq_backend::__no_init_value{}, // no initial value - ::std::forward<_Range>(__rng)) 
+ std::forward<_Range>(__rng)) .get(); } @@ -625,9 +651,12 @@ __pattern_copy_if(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1& if (__n == 0) return 0; - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_copy_if( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), - std::forward<_Range2>(__rng2), __n, __pred, __assign); + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_copy_if<_CustomName>( + _BackendTag{}, __q_local, std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), __n, __pred, __assign); return __res.get(); //is a blocking call } @@ -713,9 +742,12 @@ __pattern_unique_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Ran return 1; } - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_unique_copy( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __pred); + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_unique_copy<_CustomName>( + _BackendTag{}, __q_local, std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __pred); return __res.get(); // is a blocking call } @@ -798,9 +830,13 @@ __pattern_merge(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Ran return {__res, 0}; } - auto __res = __par_backend_hetero::__parallel_merge( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__rng1), - ::std::forward<_Range2>(__rng2), ::std::forward<_Range3>(__rng3), __comp, __out_size_limit{}); + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + + auto __res = __par_backend_hetero::__parallel_merge<_CustomName>( + _BackendTag{}, __q_local, std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp, __out_size_limit{}); auto __val = __res.get(); return {__val.first, __val.second}; @@ -844,10 +880,16 @@ void __pattern_stable_sort(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp, _Proj __proj) { + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + if (__rng.size() >= 2) - __par_backend_hetero::__parallel_stable_sort(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - ::std::forward<_Range>(__rng), __comp, __proj) + { + sycl::queue __q_local = __exec.queue(); + + __par_backend_hetero::__parallel_stable_sort<_CustomName>(_BackendTag{}, __q_local, std::forward<_Range>(__rng), + __comp, __proj) .__deferrable_wait(); + } } #if _ONEDPL_CPP20_RANGES_PRESENT @@ -915,12 +957,15 @@ __pattern_min_element(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Ran __pattern_min_element_reduce_fn<_Compare, _ReduceValueType> __reduce_fn{__comp}; __pattern_min_element_transform_fn<_ReduceValueType> __transform_fn; + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + auto __ret_idx = - oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_ReduceValueType, - ::std::false_type /*is_commutative*/>( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __reduce_fn, __transform_fn, - unseq_backend::__no_init_value{}, // no initial value - 
::std::forward<_Range>(__rng)) + oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_CustomName, _ReduceValueType, + std::false_type /*is_commutative*/>( + _BackendTag{}, __q_local, __reduce_fn, __transform_fn, unseq_backend::__no_init_value{}, // no initial value + std::forward<_Range>(__rng)) .get(); using ::std::get; @@ -1000,12 +1045,15 @@ __pattern_minmax_element(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ // a `tuple` of `difference_type`, not the `difference_type` itself. __pattern_minmax_element_transform_fn<_ReduceValueType> __transform_fn; + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + _ReduceValueType __ret = - oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_ReduceValueType, - ::std::false_type /*is_commutative*/>( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __reduce_fn, __transform_fn, - unseq_backend::__no_init_value{}, // no initial value - ::std::forward<_Range>(__rng)) + oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_CustomName, _ReduceValueType, + std::false_type /*is_commutative*/>( + _BackendTag{}, __q_local, __reduce_fn, __transform_fn, unseq_backend::__no_init_value{}, // no initial value + std::forward<_Range>(__rng)) .get(); using ::std::get; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index d0c61dcbab1..d52ce0773a1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -232,38 +232,35 @@ struct __parallel_scan_submitter; template struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name<_PropagateScanName...>> { - template + template __future> - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _InitType __init, _LocalScan __local_scan, + operator()(sycl::queue& __q, _Range1&& __rng1, _Range2&& __rng2, _InitType __init, _LocalScan __local_scan, _GroupScan __group_scan, _GlobalScan __global_scan) const { using _Type = typename _InitType::__value_type; using _LocalScanKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - __scan_local_kernel, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Type, _LocalScan, _GroupScan, - _GlobalScan>; + __scan_local_kernel, _CustomName, _Range1, _Range2, _Type, _LocalScan, _GroupScan, _GlobalScan>; using _GroupScanKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - __scan_group_kernel, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Type, _LocalScan, _GroupScan, - _GlobalScan>; + __scan_group_kernel, _CustomName, _Range1, _Range2, _Type, _LocalScan, _GroupScan, _GlobalScan>; auto __n = __rng1.size(); assert(__n > 0); - auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec.queue()); + auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__q); // get the work group size adjusted to the local memory limit // TODO: find a way to generalize getting of reliable work-group sizes - std::size_t __wgroup_size = - oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec.queue(), sizeof(_Type)); + std::size_t __wgroup_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__q, sizeof(_Type)); // Limit the work-group size to prevent large sizes on CPUs. Empirically found value. 
// This value matches the current practical limit for GPUs, but may need to be re-evaluated in the future. __wgroup_size = std::min(__wgroup_size, (std::size_t)1024); #if _ONEDPL_COMPILE_KERNEL //Actually there is one kernel_bundle for the all kernels of the pattern. - auto __kernels = __internal::__kernel_compiler<_LocalScanKernel, _GroupScanKernel>::__compile(__exec.queue()); + auto __kernels = __internal::__kernel_compiler<_LocalScanKernel, _GroupScanKernel>::__compile(__q); auto __kernel_1 = __kernels[0]; auto __kernel_2 = __kernels[1]; - auto __wgroup_size_kernel_1 = oneapi::dpl::__internal::__kernel_work_group_size(__exec.queue(), __kernel_1); - auto __wgroup_size_kernel_2 = oneapi::dpl::__internal::__kernel_work_group_size(__exec.queue(), __kernel_2); + auto __wgroup_size_kernel_1 = oneapi::dpl::__internal::__kernel_work_group_size(__q, __kernel_1); + auto __wgroup_size_kernel_2 = oneapi::dpl::__internal::__kernel_work_group_size(__q, __kernel_2); __wgroup_size = ::std::min({__wgroup_size, __wgroup_size_kernel_1, __wgroup_size_kernel_2}); #endif @@ -274,12 +271,12 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name // Storage for the results of scan for each workgroup using __result_and_scratch_storage_t = __result_and_scratch_storage<_Type>; - __result_and_scratch_storage_t __result_and_scratch{__exec.queue(), __n_groups + 1}; + __result_and_scratch_storage_t __result_and_scratch{__q, __n_groups + 1}; - _PRINT_INFO_IN_DEBUG_MODE(__exec.queue(), __wgroup_size, __max_cu); + _PRINT_INFO_IN_DEBUG_MODE(__q, __wgroup_size, __max_cu); // 1. Local scan on each workgroup - auto __submit_event = __exec.queue().submit([&](sycl::handler& __cgh) { + auto __submit_event = __q.submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); //get an access to data under SYCL buffer auto __temp_acc = __result_and_scratch.template __get_scratch_acc( __cgh, __dpl_sycl::__no_init{}); @@ -301,7 +298,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name if (__n_groups > 1) { auto __iters_per_single_wg = oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, __wgroup_size); - __submit_event = __exec.queue().submit([&](sycl::handler& __cgh) { + __submit_event = __q.submit([&](sycl::handler& __cgh) { __cgh.depends_on(__submit_event); auto __temp_acc = __result_and_scratch.template __get_scratch_acc(__cgh); __dpl_sycl::__local_accessor<_Type> __local_acc(__wgroup_size, __cgh); @@ -322,7 +319,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name } // 3. 
Final scan for whole range - auto __final_event = __exec.queue().submit([&](sycl::handler& __cgh) { + auto __final_event = __q.submit([&](sycl::handler& __cgh) { __cgh.depends_on(__submit_event); oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); //get an access to data under SYCL buffer auto __temp_acc = __result_and_scratch.template __get_scratch_acc(__cgh); @@ -371,10 +368,9 @@ template struct __parallel_transform_scan_dynamic_single_group_submitter<_Inclusive, __internal::__optional_kernel_name<_ScanKernelName...>> { - template + template sycl::event - operator()(const _Policy& __policy, _InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _InitType __init, + operator()(sycl::queue& __q, _InRng&& __in_rng, _OutRng&& __out_rng, std::size_t __n, _InitType __init, _BinaryOperation __bin_op, _UnaryOp __unary_op, ::std::uint16_t __wg_size) { using _ValueType = typename _InitType::__value_type; @@ -382,7 +378,7 @@ struct __parallel_transform_scan_dynamic_single_group_submitter<_Inclusive, const ::std::uint16_t __elems_per_item = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __wg_size); const ::std::uint16_t __elems_per_wg = __elems_per_item * __wg_size; - return __policy.queue().submit([&](sycl::handler& __hdl) { + return __q.submit([&](sycl::handler& __hdl) { oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng); auto __lacc = __dpl_sycl::__local_accessor<_ValueType>(sycl::range<1>{__elems_per_wg}, __hdl); @@ -426,17 +422,16 @@ template > { - template + template sycl::event - operator()(const _Policy& __policy, _InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _InitType __init, + operator()(sycl::queue& __q, _InRng&& __in_rng, _OutRng&& __out_rng, std::size_t __n, _InitType __init, _BinaryOperation __bin_op, _UnaryOp __unary_op) { using _ValueType = typename _InitType::__value_type; constexpr ::uint32_t __elems_per_wg = _ElemsPerItem * _WGSize; - return __policy.queue().submit([&](sycl::handler& __hdl) { + return __q.submit([&](sycl::handler& __hdl) { oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng); auto __lacc = __dpl_sycl::__local_accessor<_ValueType>(sycl::range<1>{__elems_per_wg}, __hdl); @@ -482,10 +477,10 @@ template > { - template + template __future> - operator()(_Policy&& __policy, _InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _InitType __init, + operator()(sycl::queue& __q, _InRng&& __in_rng, _OutRng&& __out_rng, std::size_t __n, _InitType __init, _BinaryOperation __bin_op, _UnaryOp __unary_op, _Assign __assign) { using _ValueType = ::std::uint16_t; @@ -498,9 +493,9 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W constexpr ::std::uint32_t __elems_per_wg = _ElemsPerItem * _WGSize; using __result_and_scratch_storage_t = __result_and_scratch_storage<_Size>; - __result_and_scratch_storage_t __result{__policy.queue(), 0}; + __result_and_scratch_storage_t __result{__q, 0}; - auto __event = __policy.queue().submit([&](sycl::handler& __hdl) { + auto __event = __q.submit([&](sycl::handler& __hdl) { oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng); // Local memory is split into two parts. 
The first half stores the result of applying the @@ -555,17 +550,15 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W } }; -template sycl::event -__parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, +__parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend_tag, sycl::queue& __q, _InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _UnaryOperation __unary_op, _InitType __init, _BinaryOperation __binary_op, _Inclusive) { - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - - ::std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec.queue()); + std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__q); // Specialization for devices that have a max work-group size of 1024 constexpr ::std::uint16_t __targeted_wg_size = 1024; @@ -587,8 +580,8 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend ::std::integral_constant<::std::uint16_t, __wg_size>, ::std::integral_constant<::std::uint16_t, __num_elems_per_item>, _BinaryOperation, /* _IsFullGroup= */ std::true_type, _Inclusive, _CustomName>>>()( - ::std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op); + __q, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, + __unary_op); else return __parallel_transform_scan_static_single_group_submitter< _Inclusive::value, __num_elems_per_item, __wg_size, @@ -597,8 +590,8 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend ::std::integral_constant<::std::uint16_t, __wg_size>, ::std::integral_constant<::std::uint16_t, __num_elems_per_item>, _BinaryOperation, /* _IsFullGroup= */ ::std::false_type, _Inclusive, _CustomName>>>()( - ::std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op); + __q, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, + __unary_op); }; if (__n <= 16) return __single_group_scan_f(std::integral_constant<::std::uint16_t, 16>{}); @@ -629,35 +622,33 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend __par_backend_hetero::__scan_single_wg_dynamic_kernel<_BinaryOperation, _CustomName>>; return __parallel_transform_scan_dynamic_single_group_submitter<_Inclusive::value, _DynamicGroupScanKernel>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), - __n, __init, __binary_op, __unary_op, __max_wg_size); + __q, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op, + __max_wg_size); } } -template __future> -__parallel_transform_scan_base(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, - _Range1&& __in_rng, _Range2&& __out_rng, _InitType __init, _LocalScan __local_scan, - _GroupScan __group_scan, _GlobalScan __global_scan) +__parallel_transform_scan_base(oneapi::dpl::__internal::__device_backend_tag, sycl::queue& __q, _Range1&& __in_rng, + _Range2&& __out_rng, _InitType __init, _LocalScan __local_scan, _GroupScan __group_scan, + _GlobalScan __global_scan) { - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using _PropagateKernel = 
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__scan_propagate_kernel<_CustomName>>; - return __parallel_scan_submitter<_CustomName, _PropagateKernel>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), - __init, __local_scan, __group_scan, __global_scan); + return __parallel_scan_submitter<_CustomName, _PropagateKernel>()(__q, std::forward<_Range1>(__in_rng), + std::forward<_Range2>(__out_rng), __init, + __local_scan, __group_scan, __global_scan); } template bool -__group_scan_fits_in_slm(const sycl::queue& __queue, std::size_t __n, std::size_t __n_uniform, +__group_scan_fits_in_slm(const sycl::queue& __q, std::size_t __n, std::size_t __n_uniform, std::size_t __single_group_upper_limit) { // Pessimistically only use half of the memory to take into account memory used by compiled kernel - const std::size_t __max_slm_size = __queue.get_device().template get_info() / 2; + const std::size_t __max_slm_size = __q.get_device().template get_info() / 2; const auto __req_slm_size = sizeof(_Type) * __n_uniform; return (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size); @@ -1038,10 +1029,10 @@ struct __write_to_id_if_else _Assign __assign; }; -template __future> -__parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, +__parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, _Range1&& __in_rng, _Range2&& __out_rng, std::size_t __n, _UnaryOperation __unary_op, _InitType __init, _BinaryOperation __binary_op, _Inclusive) { @@ -1051,7 +1042,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen // work-group implementation requires a fundamental type which must also be trivially copyable. if constexpr (std::is_trivially_copyable_v<_Type>) { - bool __use_reduce_then_scan = oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec); + bool __use_reduce_then_scan = oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__q); // TODO: Consider re-implementing single group scan to support types without known identities. This could also // allow us to use single wg scan for the last block of reduce-then-scan if it is sufficiently small. @@ -1063,15 +1054,15 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen // Empirically found values for reduce-then-scan and multi pass scan implementation for single wg cutoff std::size_t __single_group_upper_limit = __use_reduce_then_scan ? 
2048 : 16384; - if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform, __single_group_upper_limit)) + if (__group_scan_fits_in_slm<_Type>(__q, __n, __n_uniform, __single_group_upper_limit)) { - auto __event = __parallel_transform_scan_single_group( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{}); + auto __event = __parallel_transform_scan_single_group<_CustomName>( + __backend_tag, __q, std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), __n, + __unary_op, __init, __binary_op, _Inclusive{}); // Although we do not actually need result storage in this case, we need to construct // a placeholder here to match the return type of the non-single-work-group implementation - __result_and_scratch_storage<_Type> __dummy_result_and_scratch{__exec, 0}; + __result_and_scratch_storage<_Type> __dummy_result_and_scratch{__q, 0}; return __future{std::move(__event), std::move(__dummy_result_and_scratch)}; } @@ -1084,10 +1075,10 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _GenInput __gen_transform{__unary_op}; - return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, - _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}); + return __parallel_transform_reduce_then_scan<_CustomName>( + __backend_tag, __q, std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), __gen_transform, + __binary_op, __gen_transform, _ScanInputTransform{}, _WriteOp{}, __init, _Inclusive{}, + /*_IsUniquePattern=*/std::false_type{}); } } @@ -1101,9 +1092,8 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _NoAssign __no_assign_op; _NoOpFunctor __get_data_op; - return __parallel_transform_scan_base( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __init, + return __parallel_transform_scan_base<_CustomName>( + __backend_tag, __q, std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), __init, // local scan unseq_backend::__scan<_Inclusive, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner, _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, __assign_op, @@ -1116,23 +1106,22 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init}); } -template +template struct __invoke_single_group_copy_if { // Specialization for devices that have a max work-group size of at least 1024 static constexpr ::std::uint16_t __targeted_wg_size = 1024; - template auto - operator()(_ExecutionPolicy&& __exec, std::size_t __n, _InRng&& __in_rng, _OutRng&& __out_rng, _Pred __pred, + operator()(sycl::queue& __q, std::size_t __n, _InRng&& __in_rng, _OutRng&& __out_rng, _Pred __pred, _Assign __assign) { constexpr ::std::uint16_t __wg_size = ::std::min(_Size, __targeted_wg_size); constexpr ::std::uint16_t __num_elems_per_item = ::oneapi::dpl::__internal::__dpl_ceiling_div(_Size, __wg_size); const bool __is_full_group = __n == __wg_size; - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _InitType = unseq_backend::__no_init_value<::std::uint16_t>; 
using _ReduceOp = ::std::plus<::std::uint16_t>; if (__is_full_group) @@ -1144,8 +1133,8 @@ struct __invoke_single_group_copy_if using _FullKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<_FullKernel>; return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter< _SizeType, __num_elems_per_item, __wg_size, true, _FullKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, __pred, __assign); + __q, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, + __pred, __assign); } else { @@ -1157,16 +1146,16 @@ struct __invoke_single_group_copy_if oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<_NonFullKernel>; return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter< _SizeType, __num_elems_per_item, __wg_size, false, _NonFullKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, __pred, __assign); + __q, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, _InitType{}, _ReduceOp{}, + __pred, __assign); } } }; -template +template __future> -__parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, +__parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, _InRng&& __in_rng, _OutRng&& __out_rng, _Size, _GenMask __generate_mask, _WriteOp __write_op, _IsUniquePattern __is_unique_pattern) { @@ -1175,20 +1164,18 @@ __parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag _ using _GenScanInput = oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMask>; using _ScanInputTransform = oneapi::dpl::__par_backend_hetero::__get_zeroth_element; - return __parallel_transform_reduce_then_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), - _GenReduceInput{__generate_mask}, _ReduceOp{}, - _GenScanInput{__generate_mask, {}}, _ScanInputTransform{}, __write_op, - oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, - /*_Inclusive=*/std::true_type{}, __is_unique_pattern); + return __parallel_transform_reduce_then_scan<_CustomName>( + __backend_tag, __q, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), + _GenReduceInput{__generate_mask}, _ReduceOp{}, _GenScanInput{__generate_mask, {}}, _ScanInputTransform{}, + __write_op, oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, + /*_Inclusive=*/std::true_type{}, __is_unique_pattern); } -template __future> -__parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _CreateMaskOp __create_mask_op, - _CopyByMaskOp __copy_by_mask_op) +__parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, _InRng&& __in_rng, + _OutRng&& __out_rng, _Size __n, _CreateMaskOp __create_mask_op, _CopyByMaskOp __copy_by_mask_op) { using _ReduceOp = std::plus<_Size>; using _Assigner = unseq_backend::__scan_assigner; @@ -1205,8 +1192,8 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag // temporary buffer to store boolean mask oneapi::dpl::__par_backend_hetero::__buffer __mask_buf(__n); - return __parallel_transform_scan_base( - 
__backend_tag, std::forward<_ExecutionPolicy>(__exec), + return __parallel_transform_scan_base<_CustomName>( + __backend_tag, __q, oneapi::dpl::__ranges::zip_view( __in_rng, oneapi::dpl::__ranges::all_view( __mask_buf.get_buffer())), @@ -1222,10 +1209,10 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag __copy_by_mask_op); } -template +template __future>> -__parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _Range1&& __rng, _Range2&& __result, _BinaryPredicate __pred) +__parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, _Range1&& __rng, + _Range2&& __result, _BinaryPredicate __pred) { using _Assign = oneapi::dpl::__internal::__pstl_assign; oneapi::dpl::__internal::__difference_t<_Range1> __n = __rng.size(); @@ -1234,15 +1221,15 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t // can simply copy the input range to the output. assert(__n > 1); - if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec)) + if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__q)) { using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>; using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<1, _Assign>; - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, - _GenMask{__pred}, _WriteOp{_Assign{}}, - /*_IsUniquePattern=*/std::true_type{}); + return __parallel_reduce_then_scan_copy<_CustomName>(__backend_tag, __q, std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __n, _GenMask{__pred}, + _WriteOp{_Assign{}}, + /*_IsUniquePattern=*/std::true_type{}); } else { @@ -1253,19 +1240,18 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t decltype(__n)>; using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>; - return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __n, - _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}}, - _CopyOp{_ReduceOp{}, _Assign{}}); + return __parallel_scan_copy<_CustomName>( + __backend_tag, __q, std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, + _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}}, _CopyOp{_ReduceOp{}, _Assign{}}); } } -template __future>>> __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, - _ExecutionPolicy&& __exec, _Range1&& __keys, _Range2&& __values, + sycl::queue& __q, _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { @@ -1283,8 +1269,9 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ std::size_t __n = __keys.size(); // __gen_red_by_seg_scan_input requires that __n > 1 assert(__n > 1); - return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), + + return __parallel_transform_reduce_then_scan<_CustomName>( + __backend_tag, __q, oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), 
std::forward<_Range4>(__out_values)), _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, @@ -1293,22 +1280,22 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ /*Inclusive*/ std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); } -template +template __future>> -__parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, +__parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, _Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred) { oneapi::dpl::__internal::__difference_t<_Range1> __n = __rng.size(); - if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec)) + if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__q)) { using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>; using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if_else; - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, - _GenMask{__pred, {}}, _WriteOp{}, - /*_IsUniquePattern=*/std::false_type{}); + return __parallel_reduce_then_scan_copy<_CustomName>(__backend_tag, __q, std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __n, _GenMask{__pred, {}}, + _WriteOp{}, + /*_IsUniquePattern=*/std::false_type{}); } else { @@ -1316,32 +1303,32 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>; using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>; - return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}}); + return __parallel_scan_copy<_CustomName>(__backend_tag, __q, std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __n, _CreateOp{__pred}, + _CopyOp{_ReduceOp{}}); } } -template __future> -__parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred, _Assign __assign = _Assign{}) +__parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, _InRng&& __in_rng, + _OutRng&& __out_rng, _Size __n, _Pred __pred, _Assign __assign = _Assign{}) { - using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>; + using _SingleGroupInvoker = __invoke_single_group_copy_if<_CustomName, _Size>; // Next power of 2 greater than or equal to __n auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(static_cast>(__n)); // Pessimistically only use half of the memory to take into account memory used by compiled kernel - const std::size_t __max_slm_size = - __exec.queue().get_device().template get_info() / 2; + const std::size_t __max_slm_size = __q.get_device().template get_info() / 2; // The kernel stores n integers for the predicate and another n integers for the offsets const auto __req_slm_size = sizeof(std::uint16_t) * __n_uniform * 2; constexpr std::uint16_t __single_group_upper_limit = 2048; - std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec.queue()); + std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__q); if (__n <= 
__single_group_upper_limit && __max_slm_size >= __req_slm_size && __max_wg_size >= _SingleGroupInvoker::__targeted_wg_size) @@ -1349,18 +1336,18 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, using _SizeBreakpoints = std::integer_sequence; return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch( - _SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), __pred, __assign); + _SingleGroupInvoker{}, __n, __q, __n, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), + __pred, __assign); } - else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec)) + else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__q)) { using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>; using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<0, _Assign>; - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, - _GenMask{__pred, {}}, _WriteOp{__assign}, - /*_IsUniquePattern=*/std::false_type{}); + return __parallel_reduce_then_scan_copy<_CustomName>(__backend_tag, __q, std::forward<_InRng>(__in_rng), + std::forward<_OutRng>(__out_rng), __n, + _GenMask{__pred, {}}, _WriteOp{__assign}, + /*_IsUniquePattern=*/std::false_type{}); } else { @@ -1369,16 +1356,16 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>; - return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, - _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign}); + return __parallel_scan_copy<_CustomName>(__backend_tag, __q, std::forward<_InRng>(__in_rng), + std::forward<_OutRng>(__out_rng), __n, _CreateOp{__pred}, + _CopyOp{_ReduceOp{}, __assign}); } } -template __future>> -__parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, +__parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp, _IsOpDifference) { @@ -1398,8 +1385,8 @@ __parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __ oneapi::dpl::__par_backend_hetero::__buffer __mask_buf(__rng1.size()); - return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), + return __parallel_transform_reduce_then_scan<_CustomName>( + __backend_tag, __q, oneapi::dpl::__ranges::make_zip_view( std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), oneapi::dpl::__ranges::all_view( @@ -1410,11 +1397,11 @@ __parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __ /*_Inclusive=*/std::true_type{}, /*__is_unique_pattern=*/std::false_type{}); } -template __future>> -__parallel_set_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp, _IsOpDifference) +__parallel_set_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, _Range1&& __rng1, + _Range2&& __rng2, _Range3&& __result, _Compare __comp, _IsOpDifference) { using 
_Size1 = oneapi::dpl::__internal::__difference_t<_Range1>; using _Size2 = oneapi::dpl::__internal::__difference_t<_Range2>; @@ -1440,8 +1427,8 @@ __parallel_set_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, // temporary buffer to store boolean mask oneapi::dpl::__par_backend_hetero::__buffer __mask_buf(__n1); - return __par_backend_hetero::__parallel_transform_scan_base( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), + return __par_backend_hetero::__parallel_transform_scan_base<_CustomName>( + __backend_tag, __q, oneapi::dpl::__ranges::make_zip_view( std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), oneapi::dpl::__ranges::all_view( @@ -1458,24 +1445,23 @@ __parallel_set_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, __copy_by_mask_op); } -template __future>> -__parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp, - _IsOpDifference __is_op_difference) +__parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, _Range1&& __rng1, + _Range2&& __rng2, _Range3&& __result, _Compare __comp, _IsOpDifference __is_op_difference) { - if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec)) + if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__q)) { - return __parallel_set_reduce_then_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__result), __comp, __is_op_difference); + return __parallel_set_reduce_then_scan<_CustomName>( + __backend_tag, __q, std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__result), __comp, __is_op_difference); } else { - return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), - std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp, - __is_op_difference); + return __parallel_set_scan<_CustomName>(__backend_tag, __q, std::forward<_Range1>(__rng1), + std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp, + __is_op_difference); } } @@ -1666,22 +1652,20 @@ template struct __parallel_find_or_nd_range_tuner { // Tune the amount of work-groups and work-group size - template std::tuple - operator()(const _ExecutionPolicy& __exec, const std::size_t __rng_n) const + operator()(const sycl::queue& __q, const std::size_t __rng_n) const { // TODO: find a way to generalize getting of reliable work-group size // Limit the work-group size to prevent large sizes on CPUs. Empirically found value. // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future. 
- const std::size_t __wgroup_size = - oneapi::dpl::__internal::__max_work_group_size(__exec.queue(), (std::size_t)4096); + const std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__q, (std::size_t)4096); std::size_t __n_groups = 1; // If no more than 32 data elements per work item, a single work group will be used if (__rng_n > __wgroup_size * 32) { // Compute the number of groups and limit by the number of compute units __n_groups = std::min(oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __wgroup_size), - oneapi::dpl::__internal::__max_compute_units(__exec.queue())); + oneapi::dpl::__internal::__max_compute_units(__q)); } return {__n_groups, __wgroup_size}; @@ -1694,12 +1678,11 @@ template <> struct __parallel_find_or_nd_range_tuner { // Tune the amount of work-groups and work-group size - template std::tuple - operator()(const _ExecutionPolicy& __exec, const std::size_t __rng_n) const + operator()(const sycl::queue& __q, const std::size_t __rng_n) const { // Call common tuning function to get the work-group size - auto [__n_groups, __wgroup_size] = __parallel_find_or_nd_range_tuner{}(__exec, __rng_n); + auto [__n_groups, __wgroup_size] = __parallel_find_or_nd_range_tuner{}(__q, __rng_n); if (__n_groups > 1) { @@ -1739,21 +1722,20 @@ struct __parallel_find_or_impl_one_wg; template struct __parallel_find_or_impl_one_wg<__or_tag_check, __internal::__optional_kernel_name> { - template + template __FoundStateType - operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _BrickTag __brick_tag, + operator()(oneapi::dpl::__internal::__device_backend_tag, sycl::queue& __q, _BrickTag __brick_tag, const std::size_t __rng_n, const std::size_t __wgroup_size, const __FoundStateType __init_value, _Predicate __pred, _Ranges&&... __rngs) { using __result_and_scratch_storage_t = __result_and_scratch_storage<__FoundStateType>; - __result_and_scratch_storage_t __result_storage{__exec.queue(), 0}; + __result_and_scratch_storage_t __result_storage{__q, 0}; // Calculate the number of elements to be processed by each work-item. const auto __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __wgroup_size); // main parallel_for - auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { + auto __event = __q.submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); auto __result_acc = __result_storage.template __get_result_acc(__cgh, __dpl_sycl::__no_init{}); @@ -1804,10 +1786,9 @@ struct __parallel_find_or_impl_multiple_wgs; template struct __parallel_find_or_impl_multiple_wgs<__or_tag_check, __internal::__optional_kernel_name> { - template + template _AtomicType - operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _BrickTag __brick_tag, + operator()(oneapi::dpl::__internal::__device_backend_tag, sycl::queue& __q, _BrickTag __brick_tag, const std::size_t __rng_n, const std::size_t __n_groups, const std::size_t __wgroup_size, const _AtomicType __init_value, _Predicate __pred, _Ranges&&... 
__rngs) { @@ -1822,7 +1803,7 @@ struct __parallel_find_or_impl_multiple_wgs<__or_tag_check, __internal::__option sycl::buffer<_AtomicType, 1> __result_sycl_buf(&__result, 1); // temporary storage for global atomic // main parallel_for - __exec.queue().submit([&](sycl::handler& __cgh) { + __q.submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); auto __result_sycl_buf_acc = __result_sycl_buf.template get_access(__cgh); @@ -1871,23 +1852,21 @@ struct __parallel_find_or_impl_multiple_wgs<__or_tag_check, __internal::__option }; // Base pattern for __parallel_or and __parallel_find. The execution depends on tag type _BrickTag. -template +template ::std::conditional_t< ::std::is_same_v<_BrickTag, __parallel_or_tag>, bool, oneapi::dpl::__internal::__difference_t::type>> -__parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Brick __f, - _BrickTag __brick_tag, _Ranges&&... __rngs) +__parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, sycl::queue& __q, _Brick __f, _BrickTag __brick_tag, + _Ranges&&... __rngs) { - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - auto __rng_n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); assert(__rng_n > 0); // Evaluate the amount of work-groups and work-group size const auto [__n_groups, __wgroup_size] = - __parallel_find_or_nd_range_tuner{}(__exec, __rng_n); + __parallel_find_or_nd_range_tuner{}(__q, __rng_n); - _PRINT_INFO_IN_DEBUG_MODE(__exec.queue(), __wgroup_size); + _PRINT_INFO_IN_DEBUG_MODE(__q, __wgroup_size); using _AtomicType = typename _BrickTag::_AtomicType; const _AtomicType __init_value = _BrickTag::__init_value(__rng_n); @@ -1906,21 +1885,21 @@ __parallel_find_or(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli // Single WG implementation __result = __parallel_find_or_impl_one_wg<__or_tag_check, __find_or_one_wg_kernel_name>()( - oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __brick_tag, - __rng_n, __wgroup_size, __init_value, __pred, std::forward<_Ranges>(__rngs)...); + oneapi::dpl::__internal::__device_backend_tag{}, __q, __brick_tag, __rng_n, __wgroup_size, __init_value, + __pred, std::forward<_Ranges>(__rngs)...); } else { assert("This device does not support 64-bit atomics" && - (sizeof(_AtomicType) < 8 || __exec.queue().get_device().has(sycl::aspect::atomic64))); + (sizeof(_AtomicType) < 8 || __q.get_device().has(sycl::aspect::atomic64))); using __find_or_kernel_name = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__find_or_kernel<_CustomName>>; // Multiple WG implementation __result = __parallel_find_or_impl_multiple_wgs<__or_tag_check, __find_or_kernel_name>()( - oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __brick_tag, - __rng_n, __n_groups, __wgroup_size, __init_value, __pred, std::forward<_Ranges>(__rngs)...); + oneapi::dpl::__internal::__device_backend_tag{}, __q, __brick_tag, __rng_n, __n_groups, __wgroup_size, + __init_value, __pred, std::forward<_Ranges>(__rngs)...); } if constexpr (__or_tag_check) @@ -1937,37 +1916,33 @@ class __or_policy_wrapper { }; -template +template bool -__parallel_or(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _Iterator1 __first, _Iterator1 __last, _Iterator2 __s_first, _Iterator2 __s_last, _Brick __f) +__parallel_or(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, 
_Iterator1 __first, + _Iterator1 __last, _Iterator2 __s_first, _Iterator2 __s_last, _Brick __f) { auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); auto __buf = __keep(__first, __last); auto __s_keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator2>(); auto __s_buf = __s_keep(__s_first, __s_last); - return oneapi::dpl::__par_backend_hetero::__parallel_find_or( - __backend_tag, - __par_backend_hetero::make_wrapped_policy<__or_policy_wrapper>(::std::forward<_ExecutionPolicy>(__exec)), __f, - __parallel_or_tag{}, __buf.all_view(), __s_buf.all_view()); + return oneapi::dpl::__par_backend_hetero::__parallel_find_or<_CustomName>( + __backend_tag, __q, __f, __parallel_or_tag{}, __buf.all_view(), __s_buf.all_view()); } // Special overload for single sequence cases. // TODO: check if similar pattern may apply to other algorithms. If so, these overloads should be moved out of // backend code. -template +template bool -__parallel_or(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, _Iterator __first, +__parallel_or(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, _Iterator __first, _Iterator __last, _Brick __f) { auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); auto __buf = __keep(__first, __last); - return oneapi::dpl::__par_backend_hetero::__parallel_find_or( - __backend_tag, - __par_backend_hetero::make_wrapped_policy<__or_policy_wrapper>(::std::forward<_ExecutionPolicy>(__exec)), __f, - __parallel_or_tag{}, __buf.all_view()); + return oneapi::dpl::__par_backend_hetero::__parallel_find_or<_CustomName>(__backend_tag, __q, __f, + __parallel_or_tag{}, __buf.all_view()); } //------------------------------------------------------------------------ @@ -1979,10 +1954,10 @@ class __find_policy_wrapper { }; -template +template _Iterator1 -__parallel_find(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _Iterator1 __first, _Iterator1 __last, _Iterator2 __s_first, _Iterator2 __s_last, _Brick __f, _IsFirst) +__parallel_find(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, _Iterator1 __first, + _Iterator1 __last, _Iterator2 __s_first, _Iterator2 __s_last, _Brick __f, _IsFirst) { auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); auto __buf = __keep(__first, __last); @@ -1991,31 +1966,27 @@ __parallel_find(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _Ex using _TagType = ::std::conditional_t<_IsFirst::value, __parallel_find_forward_tag, __parallel_find_backward_tag>; - return __first + oneapi::dpl::__par_backend_hetero::__parallel_find_or( - __backend_tag, - __par_backend_hetero::make_wrapped_policy<__find_policy_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), - __f, _TagType{}, __buf.all_view(), __s_buf.all_view()); + + return __first + oneapi::dpl::__par_backend_hetero::__parallel_find_or<__find_policy_wrapper<_CustomName>>( + __backend_tag, __q, __f, _TagType{}, __buf.all_view(), __s_buf.all_view()); } // Special overload for single sequence cases. // TODO: check if similar pattern may apply to other algorithms. If so, these overloads should be moved out of // backend code. 
-template +template _Iterator -__parallel_find(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _Iterator __first, _Iterator __last, _Brick __f, _IsFirst) +__parallel_find(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, _Iterator __first, + _Iterator __last, _Brick __f, _IsFirst) { auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator>(); auto __buf = __keep(__first, __last); using _TagType = ::std::conditional_t<_IsFirst::value, __parallel_find_forward_tag, __parallel_find_backward_tag>; - return __first + oneapi::dpl::__par_backend_hetero::__parallel_find_or( - __backend_tag, - __par_backend_hetero::make_wrapped_policy<__find_policy_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), - __f, _TagType{}, __buf.all_view()); + + return __first + oneapi::dpl::__par_backend_hetero::__parallel_find_or<__find_policy_wrapper<_CustomName>>( + __backend_tag, __q, __f, _TagType{}, __buf.all_view()); } //------------------------------------------------------------------------ @@ -2103,9 +2074,9 @@ template struct __parallel_partial_sort_submitter<__internal::__optional_kernel_name<_GlobalSortName...>, __internal::__optional_kernel_name<_CopyBackName...>> { - template + template __future - operator()(_BackendTag, _ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge, _Compare __comp) const + operator()(_BackendTag, sycl::queue& __q, _Range&& __rng, _Merge __merge, _Compare __comp) const { using _Tp = oneapi::dpl::__internal::__value_t<_Range>; using _Size = oneapi::dpl::__internal::__difference_t<_Range>; @@ -2115,14 +2086,14 @@ struct __parallel_partial_sort_submitter<__internal::__optional_kernel_name<_Glo oneapi::dpl::__par_backend_hetero::__buffer<_Tp> __temp_buf(__n); auto __temp = __temp_buf.get_buffer(); - _PRINT_INFO_IN_DEBUG_MODE(__exec.queue()); + _PRINT_INFO_IN_DEBUG_MODE(__q); _Size __k = 1; bool __data_in_temp = false; sycl::event __event1; do { - __event1 = __exec.queue().submit([&, __data_in_temp, __k](sycl::handler& __cgh) { + __event1 = __q.submit([&, __data_in_temp, __k](sycl::handler& __cgh) { __cgh.depends_on(__event1); oneapi::dpl::__ranges::__require_access(__cgh, __rng); auto __temp_acc = __temp.template get_access(__cgh); @@ -2153,7 +2124,7 @@ struct __parallel_partial_sort_submitter<__internal::__optional_kernel_name<_Glo // if results are in temporary buffer then copy back those if (__data_in_temp) { - __event1 = __exec.queue().submit([&](sycl::handler& __cgh) { + __event1 = __q.submit([&](sycl::handler& __cgh) { __cgh.depends_on(__event1); oneapi::dpl::__ranges::__require_access(__cgh, __rng); auto __temp_acc = __temp.template get_access(__cgh); @@ -2171,20 +2142,18 @@ struct __parallel_partial_sort_submitter<__internal::__optional_kernel_name<_Glo template class __sort_global_kernel; -template +template __future -__parallel_partial_sort_impl(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range&& __rng, +__parallel_partial_sort_impl(oneapi::dpl::__internal::__device_backend_tag, sycl::queue& __q, _Range&& __rng, _Merge __merge, _Compare __comp) { - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _GlobalSortKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__sort_global_kernel<_CustomName>>; using _CopyBackKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__sort_copy_back_kernel<_CustomName>>; return 
__parallel_partial_sort_submitter<_GlobalSortKernel, _CopyBackKernel>()( - oneapi::dpl::__internal::__device_backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), - ::std::forward<_Range>(__rng), __merge, __comp); + oneapi::dpl::__internal::__device_backend_tag{}, __q, std::forward<_Range>(__rng), __merge, __comp); } //------------------------------------------------------------------------ @@ -2206,28 +2175,28 @@ struct __is_radix_sort_usable_for_type #if _ONEDPL_USE_RADIX_SORT template < - typename _ExecutionPolicy, typename _Range, typename _Compare, typename _Proj, + typename _CustomName, typename _Range, typename _Compare, typename _Proj, ::std::enable_if_t< __is_radix_sort_usable_for_type, _Compare>::value, int> = 0> __future -__parallel_stable_sort(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _Range&& __rng, _Compare, _Proj __proj) +__parallel_stable_sort(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, _Range&& __rng, + _Compare, _Proj __proj) { - return __parallel_radix_sort<__internal::__is_comp_ascending<::std::decay_t<_Compare>>::value>( - __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range>(__rng), __proj); + return __parallel_radix_sort<_CustomName, __internal::__is_comp_ascending>::value>( + __backend_tag, __q, std::forward<_Range>(__rng), __proj); } #endif // _ONEDPL_USE_RADIX_SORT template < - typename _ExecutionPolicy, typename _Range, typename _Compare, typename _Proj, + typename _CustomName, typename _Range, typename _Compare, typename _Proj, ::std::enable_if_t< !__is_radix_sort_usable_for_type, _Compare>::value, int> = 0> __future> -__parallel_stable_sort(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _Range&& __rng, _Compare __comp, _Proj __proj) +__parallel_stable_sort(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, _Range&& __rng, + _Compare __comp, _Proj __proj) { - return __parallel_sort_impl(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range>(__rng), - oneapi::dpl::__internal::__compare<_Compare, _Proj>{__comp, __proj}); + return __parallel_sort_impl<_CustomName>(__backend_tag, __q, std::forward<_Range>(__rng), + oneapi::dpl::__internal::__compare<_Compare, _Proj>{__comp, __proj}); } //------------------------------------------------------------------------ @@ -2237,9 +2206,9 @@ __parallel_stable_sort(oneapi::dpl::__internal::__device_backend_tag __backend_t // TODO: check if it makes sense to move these wrappers out of backend to a common place // TODO: consider changing __partial_merge_kernel to make it compatible with // __full_merge_kernel in order to use __parallel_sort_impl routine -template +template __future -__parallel_partial_sort(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, +__parallel_partial_sort(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, _Iterator __first, _Iterator __mid, _Iterator __last, _Compare __comp) { const auto __mid_idx = __mid - __first; @@ -2247,8 +2216,8 @@ __parallel_partial_sort(oneapi::dpl::__internal::__device_backend_tag __backend_ auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator>(); auto __buf = __keep(__first, __last); - return __parallel_partial_sort_impl(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __buf.all_view(), - __partial_merge_kernel{__mid_idx}, __comp); + return 
__parallel_partial_sort_impl<_CustomName>(__backend_tag, __q, __buf.all_view(), + __partial_merge_kernel{__mid_idx}, __comp); } //------------------------------------------------------------------------ @@ -2319,10 +2288,13 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ _BinaryOperator __binary_op, /*known_identity=*/std::false_type) { + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; + sycl::queue __q_local = __exec.queue(); + const auto __n = __keys.size(); // Round 1: reduce with extra indices added to avoid long segments // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. @@ -2343,18 +2315,17 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ oneapi::dpl::__ranges::views::all_write(__idx)); // use work group size adjusted to shared local memory as the maximum segment size. - std::size_t __wgroup_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size( - __exec.queue(), sizeof(__key_type) + sizeof(__val_type)); + std::size_t __wgroup_size = + oneapi::dpl::__internal::__slm_adjusted_work_group_size(__q_local, sizeof(__key_type) + sizeof(__val_type)); // element is copied if it is the 0th element (marks beginning of first segment), is in an index // evenly divisible by wg size (ensures segments are not long), or has a key not equal to the // adjacent element (marks end of real segments) // TODO: replace wgroup size with segment size based on platform specifics. auto __intermediate_result_end = - oneapi::dpl::__par_backend_hetero::__parallel_copy_if( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key1_wrapper>(__exec), __view1, __view2, - __n, __internal::__parallel_reduce_by_segment_fallback_fn1<_BinaryPredicate>{__binary_pred, __wgroup_size}, + oneapi::dpl::__par_backend_hetero::__parallel_copy_if<__assign_key1_wrapper<_CustomName>>( + oneapi::dpl::__internal::__device_backend_tag{}, __q_local, __view1, __view2, __n, + __internal::__parallel_reduce_by_segment_fallback_fn1<_BinaryPredicate>{__binary_pred, __wgroup_size}, unseq_backend::__brick_assign_key_position{}) .get(); @@ -2391,10 +2362,9 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ // element is copied if it is the 0th element (marks beginning of first segment), or has a key not equal to // the adjacent element (end of a segment). Artificial segments based on wg size are not created. 
auto __result_end = - oneapi::dpl::__par_backend_hetero::__parallel_copy_if( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key2_wrapper>(__exec), __view3, __view4, - __view3.size(), __internal::__parallel_reduce_by_segment_fallback_fn2<_BinaryPredicate>{__binary_pred}, + oneapi::dpl::__par_backend_hetero::__parallel_copy_if<__assign_key2_wrapper<_CustomName>>( + oneapi::dpl::__internal::__device_backend_tag{}, __q_local, __view3, __view4, __view3.size(), + __internal::__parallel_reduce_by_segment_fallback_fn2<_BinaryPredicate>{__binary_pred}, unseq_backend::__brick_assign_key_position{}) .get(); @@ -2428,17 +2398,21 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _Exe // __out_keys = { 1, 2, 3, 4, 1, 3, 1, 3, 0 } // __out_values = { 1, 2, 3, 4, 2, 6, 2, 6, 0 } + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; // Prior to icpx 2025.0, the reduce-then-scan path performs poorly and should be avoided. #if !defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER >= 20250000 if constexpr (std::is_trivially_copyable_v<__val_type>) { - if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec)) + if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__q_local)) { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( - oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), - std::forward<_Range4>(__out_values), __binary_pred, __binary_op); + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan<_CustomName>( + oneapi::dpl::__internal::__device_backend_tag{}, __q_local, std::forward<_Range1>(__keys), + std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), + __binary_pred, __binary_op); // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the // past-the-end iterator pair of segmented reduction. return std::get<0>(__res.get()) + 1; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h index 3b423d6ed39..d7e7d9e380a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h @@ -56,13 +56,13 @@ struct __parallel_for_small_submitter; template struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name...>> { - template + template __future - operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const + operator()(sycl::queue& __q, _Fp __brick, _Index __count, _Ranges&&... __rngs) const { assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) 
> 0); - _PRINT_INFO_IN_DEBUG_MODE(__exec.queue()); - auto __event = __exec.queue().submit([__rngs..., __brick, __count](sycl::handler& __cgh) { + _PRINT_INFO_IN_DEBUG_MODE(__q); + auto __event = __q.submit([__rngs..., __brick, __count](sycl::handler& __cgh) { //get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); @@ -127,25 +127,25 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // Once there is enough work to launch a group on each compute unit with our chosen __iters_per_item, // then we should start using this code path. - template + template static std::size_t - __estimate_best_start_size(const _ExecutionPolicy& __exec, _Fp __brick) + __estimate_best_start_size(const sycl::queue& __q, _Fp __brick) { const std::size_t __work_group_size = - oneapi::dpl::__internal::__max_work_group_size(__exec.queue(), __max_work_group_size); - const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec.queue()); + oneapi::dpl::__internal::__max_work_group_size(__q, __max_work_group_size); + const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__q); return __work_group_size * _Fp::__preferred_iters_per_item * __max_cu; } - template + template __future - operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const + operator()(sycl::queue& __q, _Fp __brick, _Index __count, _Ranges&&... __rngs) const { assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0); const std::size_t __work_group_size = - oneapi::dpl::__internal::__max_work_group_size(__exec.queue(), __max_work_group_size); - _PRINT_INFO_IN_DEBUG_MODE(__exec.queue()); - auto __event = __exec.queue().submit([__rngs..., __brick, __work_group_size, __count](sycl::handler& __cgh) { + oneapi::dpl::__internal::__max_work_group_size(__q, __max_work_group_size); + _PRINT_INFO_IN_DEBUG_MODE(__q); + auto __event = __q.submit([__rngs..., __brick, __work_group_size, __count](sycl::handler& __cgh) { //get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); constexpr std::uint8_t __iters_per_work_item = _Fp::__preferred_iters_per_item; @@ -187,6 +187,8 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& using _ForKernelLarge = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_large_kernel<_CustomName>>; + sycl::queue __q_local = __exec.queue(); + using __small_submitter = __parallel_for_small_submitter<_ForKernelSmall>; using __large_submitter = __parallel_for_large_submitter<_ForKernelLarge>; // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a @@ -194,14 +196,12 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& // then only compile the basic kernel as the two versions are effectively the same. 
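The threshold behind __estimate_best_start_size is just enough elements to give every compute unit a full work-group at the brick's preferred iterations per item. A rough standalone sketch under that reading, assuming the device's maximum work-group size is used directly; preferred_iters_per_item and count are made-up values here, not the brick's actual members.

#include <sycl/sycl.hpp>
#include <cstddef>
#include <cstdint>
#include <iostream>

int main()
{
    sycl::queue q;
    const std::size_t wg_size =
        q.get_device().get_info<sycl::info::device::max_work_group_size>();
    const std::uint32_t max_cu =
        q.get_device().get_info<sycl::info::device::max_compute_units>();

    // Enough work to launch one full work-group per compute unit at the
    // preferred per-item iteration count; below this, the basic kernel is used.
    const std::size_t preferred_iters_per_item = 4; // illustrative only
    const std::size_t threshold = wg_size * preferred_iters_per_item * max_cu;

    const std::size_t count = std::size_t{1} << 22; // hypothetical input size
    std::cout << (count >= threshold ? "large-submitter path\n" : "small-submitter path\n");
}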
if constexpr (_Fp::__preferred_iters_per_item > 1 || _Fp::__preferred_vector_size > 1) { - if (__count >= __large_submitter::__estimate_best_start_size(__exec, __brick)) + if (__count >= __large_submitter::__estimate_best_start_size(__q_local, __brick)) { - return __large_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count, - std::forward<_Ranges>(__rngs)...); + return __large_submitter{}(__q_local, __brick, __count, std::forward<_Ranges>(__rngs)...); } } - return __small_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count, - std::forward<_Ranges>(__rngs)...); + return __small_submitter{}(__q_local, __brick, __count, std::forward<_Ranges>(__rngs)...); } } // namespace __par_backend_hetero diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h index 067f187365a..1d8f76b3dd2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h @@ -55,19 +55,19 @@ struct __parallel_for_fpga_submitter; template struct __parallel_for_fpga_submitter<__internal::__optional_kernel_name<_Name...>> { - template + template __future - operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const + operator()(sycl::queue& __q, _Fp __brick, _Index __count, _Ranges&&... __rngs) const { assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0); - _PRINT_INFO_IN_DEBUG_MODE(__exec.queue()); - auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) { + _PRINT_INFO_IN_DEBUG_MODE(__q); + auto __event = __q.submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) { //get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); __cgh.single_task<_Name...>([=]() { -#pragma unroll(::std::decay <_ExecutionPolicy>::type::unroll_factor) +#pragma unroll(unroll_factor) for (auto __idx = 0; __idx < __count; ++__idx) { __brick.__scalar_path_impl(std::true_type{}, __idx, __rngs...); @@ -87,8 +87,12 @@ __parallel_for(oneapi::dpl::__internal::__fpga_backend_tag, _ExecutionPolicy&& _ using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using __parallel_for_name = __internal::__kernel_name_provider<_CustomName>; - return __parallel_for_fpga_submitter<__parallel_for_name>()(std::forward<_ExecutionPolicy>(__exec), __brick, - __count, std::forward<_Ranges>(__rngs)...); + constexpr unsigned int unroll_factor = std::decay<_ExecutionPolicy>::type::unroll_factor; + + sycl::queue __q_local = __exec.queue(); + + return __parallel_for_fpga_submitter<__parallel_for_name>{}.template operator()( + __q_local, __brick, __count, std::forward<_Ranges>(__rngs)...); } //------------------------------------------------------------------------ @@ -96,18 +100,18 @@ __parallel_for(oneapi::dpl::__internal::__fpga_backend_tag, _ExecutionPolicy&& _ //----------------------------------------------------------------------- // TODO: check if it makes sense to move these wrappers out of backend to a common place -template +template auto -__parallel_histogram(oneapi::dpl::__internal::__fpga_backend_tag, _ExecutionPolicy&& __exec, const _Event& __init_event, +__parallel_histogram(oneapi::dpl::__internal::__fpga_backend_tag, sycl::queue& __q, const _Event& __init_event, _Range1&& __input, _Range2&& __bins, const _BinHashMgr& __binhash_manager) { 
static_assert(sizeof(oneapi::dpl::__internal::__value_t<_Range2>) <= sizeof(::std::uint32_t), "histogram is not supported on FPGA devices with output types greater than 32 bits"); // workaround until we implement more performant version for patterns - return oneapi::dpl::__par_backend_hetero::__parallel_histogram( - oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __init_event, - std::forward<_Range1>(__input), std::forward<_Range2>(__bins), __binhash_manager); + return oneapi::dpl::__par_backend_hetero::__parallel_histogram<_CustomName>( + oneapi::dpl::__internal::__device_backend_tag{}, __q, __init_event, std::forward<_Range1>(__input), + std::forward<_Range2>(__bins), __binhash_manager); } } // namespace __par_backend_hetero diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h index 4737d6f9531..ee0c72f575b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h @@ -213,10 +213,10 @@ template <::std::uint16_t __iters_per_work_item, ::std::uint8_t __bins_per_work_ struct __histogram_general_registers_local_reduction_submitter<__iters_per_work_item, __bins_per_work_item, __internal::__optional_kernel_name<_KernelName...>> { - template + template sycl::event - operator()(_ExecutionPolicy&& __exec, const sycl::event& __init_event, ::std::uint16_t __work_group_size, - _Range1&& __input, _Range2&& __bins, const _BinHashMgr& __binhash_manager) + operator()(sycl::queue& __q, const sycl::event& __init_event, std::uint16_t __work_group_size, _Range1&& __input, + _Range2&& __bins, const _BinHashMgr& __binhash_manager) { const ::std::size_t __n = __input.size(); const ::std::uint8_t __num_bins = __bins.size(); @@ -229,7 +229,7 @@ struct __histogram_general_registers_local_reduction_submitter<__iters_per_work_ ::std::size_t __extra_SLM_elements = __binhash_manager.get_required_SLM_elements(); ::std::size_t __segments = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __work_group_size * __iters_per_work_item); - return __exec.queue().submit([&](auto& __h) { + return __q.submit([&](auto& __h) { __h.depends_on(__init_event); auto _device_copyable_func = __binhash_manager.prepare_device_binhash(__h); oneapi::dpl::__ranges::__require_access(__h, __input, __bins); @@ -285,15 +285,13 @@ struct __histogram_general_registers_local_reduction_submitter<__iters_per_work_ } }; -template <::std::uint16_t __iters_per_work_item, ::std::uint8_t __bins_per_work_item, typename _ExecutionPolicy, +template sycl::event -__histogram_general_registers_local_reduction(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, +__histogram_general_registers_local_reduction(oneapi::dpl::__internal::__device_backend_tag, sycl::queue& __q, const sycl::event& __init_event, ::std::uint16_t __work_group_size, _Range1&& __input, _Range2&& __bins, const _BinHashMgr& __binhash_manager) { - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using _iters_per_work_item_t = ::std::integral_constant<::std::uint16_t, __iters_per_work_item>; // Required to include _iters_per_work_item_t in kernel name because we compile multiple kernels and decide between @@ -304,8 +302,8 @@ __histogram_general_registers_local_reduction(oneapi::dpl::__internal::__device_ return __histogram_general_registers_local_reduction_submitter<__iters_per_work_item, 
__bins_per_work_item, _RegistersLocalReducName>()( - ::std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size, ::std::forward<_Range1>(__input), - ::std::forward<_Range2>(__bins), __binhash_manager); + __q, __init_event, __work_group_size, std::forward<_Range1>(__input), std::forward<_Range2>(__bins), + __binhash_manager); } template <::std::uint16_t __iters_per_work_item, typename _KernelName> @@ -315,10 +313,10 @@ template <::std::uint16_t __iters_per_work_item, typename... _KernelName> struct __histogram_general_local_atomics_submitter<__iters_per_work_item, __internal::__optional_kernel_name<_KernelName...>> { - template + template sycl::event - operator()(_ExecutionPolicy&& __exec, const sycl::event& __init_event, ::std::uint16_t __work_group_size, - _Range1&& __input, _Range2&& __bins, const _BinHashMgr& __binhash_manager) + operator()(sycl::queue& __q, const sycl::event& __init_event, std::uint16_t __work_group_size, _Range1&& __input, + _Range2&& __bins, const _BinHashMgr& __binhash_manager) { using _local_histogram_type = ::std::uint32_t; using _bin_type = oneapi::dpl::__internal::__value_t<_Range2>; @@ -330,7 +328,7 @@ struct __histogram_general_local_atomics_submitter<__iters_per_work_item, const ::std::size_t __num_bins = __bins.size(); ::std::size_t __segments = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __work_group_size * __iters_per_work_item); - return __exec.queue().submit([&](auto& __h) { + return __q.submit([&](auto& __h) { __h.depends_on(__init_event); auto _device_copyable_func = __binhash_manager.prepare_device_binhash(__h); oneapi::dpl::__ranges::__require_access(__h, __input, __bins); @@ -380,15 +378,13 @@ struct __histogram_general_local_atomics_submitter<__iters_per_work_item, } }; -template <::std::uint16_t __iters_per_work_item, typename _ExecutionPolicy, typename _Range1, typename _Range2, +template sycl::event -__histogram_general_local_atomics(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, +__histogram_general_local_atomics(oneapi::dpl::__internal::__device_backend_tag, sycl::queue& __q, const sycl::event& __init_event, ::std::uint16_t __work_group_size, _Range1&& __input, _Range2&& __bins, const _BinHashMgr& __binhash_manager) { - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using _iters_per_work_item_t = ::std::integral_constant<::std::uint16_t, __iters_per_work_item>; // Required to include _iters_per_work_item_t in kernel name because we compile multiple kernels and decide between @@ -398,8 +394,8 @@ __histogram_general_local_atomics(oneapi::dpl::__internal::__device_backend_tag, __histo_kernel_local_atomics<_iters_per_work_item_t, _CustomName>>; return __histogram_general_local_atomics_submitter<__iters_per_work_item, _local_atomics_name>()( - ::std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size, ::std::forward<_Range1>(__input), - ::std::forward<_Range2>(__bins), __binhash_manager); + __q, __init_event, __work_group_size, std::forward<_Range1>(__input), std::forward<_Range2>(__bins), + __binhash_manager); } template @@ -408,9 +404,9 @@ struct __histogram_general_private_global_atomics_submitter; template struct __histogram_general_private_global_atomics_submitter<__internal::__optional_kernel_name<_KernelName...>> { - template + template sycl::event - operator()(_BackendTag, _ExecutionPolicy&& __exec, const sycl::event& __init_event, + operator()(_BackendTag, sycl::queue& __q, const sycl::event& __init_event, ::std::uint16_t 
__min_iters_per_work_item, ::std::uint16_t __work_group_size, _Range1&& __input, _Range2&& __bins, const _BinHashMgr& __binhash_manager) { @@ -419,7 +415,7 @@ struct __histogram_general_private_global_atomics_submitter<__internal::__option using _bin_type = oneapi::dpl::__internal::__value_t<_Range2>; using _histogram_index_type = ::std::int32_t; - auto __global_mem_size = __exec.queue().get_device().template get_info(); + auto __global_mem_size = __q.get_device().template get_info(); const ::std::size_t __max_segments = ::std::min(__global_mem_size / (__num_bins * sizeof(_bin_type)), oneapi::dpl::__internal::__dpl_ceiling_div(__n, __work_group_size * __min_iters_per_work_item)); @@ -431,7 +427,7 @@ struct __histogram_general_private_global_atomics_submitter<__internal::__option auto __private_histograms = oneapi::dpl::__par_backend_hetero::__buffer<_bin_type>(__segments * __num_bins).get_buffer(); - return __exec.queue().submit([&](auto& __h) { + return __q.submit([&](auto& __h) { __h.depends_on(__init_event); auto _device_copyable_func = __binhash_manager.prepare_device_binhash(__h); oneapi::dpl::__ranges::__require_access(__h, __input, __bins); @@ -478,30 +474,27 @@ struct __histogram_general_private_global_atomics_submitter<__internal::__option }); } }; -template +template sycl::event -__histogram_general_private_global_atomics(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, +__histogram_general_private_global_atomics(oneapi::dpl::__internal::__device_backend_tag, sycl::queue& __q, const sycl::event& __init_event, ::std::uint16_t __min_iters_per_work_item, ::std::uint16_t __work_group_size, _Range1&& __input, _Range2&& __bins, const _BinHashMgr& __binhash_manager) { - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using _global_atomics_name = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __histo_kernel_private_glocal_atomics<_CustomName>>; return __histogram_general_private_global_atomics_submitter<_global_atomics_name>()( - oneapi::dpl::__internal::__device_backend_tag{}, ::std::forward<_ExecutionPolicy>(__exec), __init_event, - __min_iters_per_work_item, __work_group_size, ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), - __binhash_manager); + oneapi::dpl::__internal::__device_backend_tag{}, __q, __init_event, __min_iters_per_work_item, + __work_group_size, std::forward<_Range1>(__input), std::forward<_Range2>(__bins), __binhash_manager); } -template <::std::uint16_t __iters_per_work_item, typename _ExecutionPolicy, typename _Range1, typename _Range2, +template __future -__parallel_histogram_select_kernel(oneapi::dpl::__internal::__device_backend_tag __backend_tag, - _ExecutionPolicy&& __exec, const sycl::event& __init_event, _Range1&& __input, - _Range2&& __bins, const _BinHashMgr& __binhash_manager) +__parallel_histogram_select_kernel(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, + const sycl::event& __init_event, _Range1&& __input, _Range2&& __bins, + const _BinHashMgr& __binhash_manager) { using _private_histogram_type = ::std::uint16_t; using _local_histogram_type = ::std::uint32_t; @@ -509,28 +502,27 @@ __parallel_histogram_select_kernel(oneapi::dpl::__internal::__device_backend_tag const auto __num_bins = __bins.size(); // Limit the maximum work-group size for better performance. Empirically found value. 
- std::uint16_t __work_group_size = - oneapi::dpl::__internal::__max_work_group_size(__exec.queue(), std::uint16_t(1024)); + std::uint16_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__q, std::uint16_t(1024)); - auto __local_mem_size = __exec.queue().get_device().template get_info(); + auto __local_mem_size = __q.get_device().template get_info(); constexpr ::std::uint8_t __max_work_item_private_bins = 16 / sizeof(_private_histogram_type); // if bins fit into registers, use register private accumulation if (__num_bins <= __max_work_item_private_bins) { - return __future( - __histogram_general_registers_local_reduction<__iters_per_work_item, __max_work_item_private_bins>( - __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size, - ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), __binhash_manager)); + return __future(__histogram_general_registers_local_reduction<_CustomName, __iters_per_work_item, + __max_work_item_private_bins>( + __backend_tag, __q, __init_event, __work_group_size, std::forward<_Range1>(__input), + std::forward<_Range2>(__bins), __binhash_manager)); } // if bins fit into SLM, use local atomics else if (__num_bins * sizeof(_local_histogram_type) + __binhash_manager.get_required_SLM_elements() * sizeof(_extra_memory_type) < __local_mem_size) { - return __future(__histogram_general_local_atomics<__iters_per_work_item>( - __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size, - ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), __binhash_manager)); + return __future(__histogram_general_local_atomics<_CustomName, __iters_per_work_item>( + __backend_tag, __q, __init_event, __work_group_size, std::forward<_Range1>(__input), + std::forward<_Range2>(__bins), __binhash_manager)); } else // otherwise, use global atomics (private copies per workgroup) { @@ -539,29 +531,29 @@ __parallel_histogram_select_kernel(oneapi::dpl::__internal::__device_backend_tag // suggestion which but global memory limitations may increase this value to be able to fit the workgroup // private copies of the histogram bins in global memory. No unrolling is taken advantage of here because it // is a runtime argument. 
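Taken together, the branches above and the global-atomics fallback that follows amount to a three-way choice driven by the bin count and the device's SLM budget. A simplified sketch of that decision, assuming a 16-bit private counter and a 32-bit SLM counter as in the hunk; choose_strategy and extra_slm_bytes are hypothetical names, and the extra SLM requirement is folded into a single byte count instead of the _BinHashMgr query.

#include <sycl/sycl.hpp>
#include <cstddef>
#include <cstdint>
#include <iostream>

enum class histo_strategy { registers, local_atomics, global_atomics };

// Mirrors the selection in __parallel_histogram_select_kernel: small bin counts
// are accumulated in work-item registers, bin sets that fit in SLM use local
// atomics, everything else falls back to per-work-group copies in global memory.
histo_strategy
choose_strategy(const sycl::queue& q, std::size_t num_bins, std::size_t extra_slm_bytes)
{
    using private_t = std::uint16_t; // per-work-item private counter type
    using local_t = std::uint32_t;   // SLM counter type
    constexpr std::size_t max_private_bins = 16 / sizeof(private_t);

    const auto local_mem =
        q.get_device().get_info<sycl::info::device::local_mem_size>();

    if (num_bins <= max_private_bins)
        return histo_strategy::registers;
    if (num_bins * sizeof(local_t) + extra_slm_bytes < local_mem)
        return histo_strategy::local_atomics;
    return histo_strategy::global_atomics;
}

int main()
{
    sycl::queue q;
    std::cout << static_cast<int>(choose_strategy(q, 256, 0)) << '\n';
}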
- return __future(__histogram_general_private_global_atomics( - __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __init_event, __iters_per_work_item, - __work_group_size, ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), __binhash_manager)); + return __future(__histogram_general_private_global_atomics<_CustomName>( + __backend_tag, __q, __init_event, __iters_per_work_item, __work_group_size, std::forward<_Range1>(__input), + std::forward<_Range2>(__bins), __binhash_manager)); } } -template +template __future -__parallel_histogram(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, +__parallel_histogram(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, const sycl::event& __init_event, _Range1&& __input, _Range2&& __bins, const _BinHashMgr& __binhash_manager) { if (__input.size() < 1048576) // 2^20 { - return __parallel_histogram_select_kernel( - __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __init_event, ::std::forward<_Range1>(__input), - ::std::forward<_Range2>(__bins), __binhash_manager); + return __parallel_histogram_select_kernel<_CustomName, /*iters_per_workitem = */ 4>( + __backend_tag, __q, __init_event, std::forward<_Range1>(__input), std::forward<_Range2>(__bins), + __binhash_manager); } else { - return __parallel_histogram_select_kernel( - __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __init_event, ::std::forward<_Range1>(__input), - ::std::forward<_Range2>(__bins), __binhash_manager); + return __parallel_histogram_select_kernel<_CustomName, /*iters_per_workitem = */ 32>( + __backend_tag, __q, __init_event, std::forward<_Range1>(__input), std::forward<_Range2>(__bins), + __binhash_manager); } } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 1c560bc55c9..7082d17398e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -204,9 +204,9 @@ struct __parallel_merge_submitter; template struct __parallel_merge_submitter<_OutSizeLimit, _IdType, __internal::__optional_kernel_name<_Name...>> { - template + template __future> - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + operator()(sycl::queue& __q, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const { const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); @@ -214,10 +214,10 @@ struct __parallel_merge_submitter<_OutSizeLimit, _IdType, __internal::__optional assert(__n1 > 0 || __n2 > 0); - _PRINT_INFO_IN_DEBUG_MODE(__exec.queue()); + _PRINT_INFO_IN_DEBUG_MODE(__q); // Empirical number of values to process per work-item - const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; + const _IdType __chunk = __q.get_device().is_cpu() ? 
128 : 4; const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); @@ -228,15 +228,15 @@ struct __parallel_merge_submitter<_OutSizeLimit, _IdType, __internal::__optional __result_and_scratch_storage_t* __p_res_storage = nullptr; if constexpr (_OutSizeLimit{}) - __p_res_storage = new __result_and_scratch_storage_t(__exec.queue(), 0); + __p_res_storage = new __result_and_scratch_storage_t(__q, 0); else assert(__rng3.size() >= __n1 + __n2); std::shared_ptr<__result_and_scratch_storage_base> __p_result_and_scratch_storage_base( static_cast<__result_and_scratch_storage_base*>(__p_res_storage)); - auto __event = __exec.queue().submit([&__rng1, &__rng2, &__rng3, __p_res_storage, __comp, __chunk, __steps, __n, - __n1, __n2](sycl::handler& __cgh) { + auto __event = __q.submit([&__rng1, &__rng2, &__rng3, __p_res_storage, __comp, __chunk, __steps, __n, __n1, + __n2](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); auto __result_acc = __get_acc(__p_res_storage, __cgh); @@ -299,13 +299,13 @@ struct __parallel_merge_submitter_large<_OutSizeLimit, _IdType, _CustomName, }; // Calculate nd-range parameters - template + template nd_range_params - eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2, + eval_nd_range_params(const sycl::queue& __q, const _Range1& __rng1, const _Range2& __rng2, const std::size_t __n) const { // Empirical number of values to process per work-item - const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; + const std::uint8_t __chunk = __q.get_device().is_cpu() ? 128 : 4; const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); // TODO required to evaluate this value based on available SLM size for each work-group. 
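Both merge submitters derive their launch size the same way: an empirical chunk of values per work-item (128 on CPU, 4 otherwise) and a ceiling division to get the number of steps. A minimal sketch with the ceiling division written out, since __dpl_ceiling_div is an internal helper; n is a made-up combined input size.

#include <sycl/sycl.hpp>
#include <cstddef>
#include <iostream>

int main()
{
    sycl::queue q;
    const std::size_t n = 1'000'000; // hypothetical: rng1.size() + rng2.size(), capped by rng3.size()

    // Empirical number of values each work-item merges; CPUs prefer larger chunks.
    const std::size_t chunk = q.get_device().is_cpu() ? 128 : 4;
    // Equivalent of oneapi::dpl::__internal::__dpl_ceiling_div(n, chunk).
    const std::size_t steps = (n + chunk - 1) / chunk;

    std::cout << "chunk = " << chunk << ", work-items (steps) = " << steps << '\n';
}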
@@ -317,10 +317,10 @@ struct __parallel_merge_submitter_large<_OutSizeLimit, _IdType, _CustomName, } // Calculation of split points on each base diagonal - template + template sycl::event - eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _IdType __n, - _Compare __comp, const nd_range_params& __nd_range_params, + eval_split_points_for_groups(sycl::queue& __q, _Range1&& __rng1, _Range2&& __rng2, _IdType __n, _Compare __comp, + const nd_range_params& __nd_range_params, _Storage& __base_diagonals_sp_global_storage) const { const _IdType __n1 = __rng1.size(); @@ -328,8 +328,8 @@ struct __parallel_merge_submitter_large<_OutSizeLimit, _IdType, _CustomName, const _IdType __base_diag_chunk = __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; - return __exec.queue().submit([&__rng1, &__rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage, - __n1, __n2, __n, __base_diag_chunk](sycl::handler& __cgh) { + return __q.submit([&__rng1, &__rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage, __n1, __n2, + __n, __base_diag_chunk](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc( @@ -353,10 +353,9 @@ struct __parallel_merge_submitter_large<_OutSizeLimit, _IdType, _CustomName, } // Process parallel merge - template + template sycl::event - run_parallel_merge(const sycl::event& __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, + run_parallel_merge(const sycl::event& __event, sycl::queue& __q, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, const nd_range_params& __nd_range_params, const _Storage& __base_diagonals_sp_global_storage) const { @@ -364,8 +363,8 @@ struct __parallel_merge_submitter_large<_OutSizeLimit, _IdType, _CustomName, const _IdType __n2 = __rng2.size(); const _IdType __n = std::min<_IdType>(__n1 + __n2, __rng3.size()); - return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __n, __comp, __nd_range_params, - __base_diagonals_sp_global_storage, __n1, __n2](sycl::handler& __cgh) { + return __q.submit([&__event, &__rng1, &__rng2, &__rng3, __n, __comp, __nd_range_params, + __base_diagonals_sp_global_storage, __n1, __n2](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); @@ -423,9 +422,9 @@ struct __parallel_merge_submitter_large<_OutSizeLimit, _IdType, _CustomName, } public: - template + template __future> - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + operator()(sycl::queue& __q, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const { const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); @@ -433,10 +432,10 @@ struct __parallel_merge_submitter_large<_OutSizeLimit, _IdType, _CustomName, const _IdType __n = std::min<_IdType>(__n1 + __n2, __rng3.size()); - _PRINT_INFO_IN_DEBUG_MODE(__exec.queue()); + _PRINT_INFO_IN_DEBUG_MODE(__q); // Calculate nd-range parameters - const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2, __n); + const nd_range_params __nd_range_params = eval_nd_range_params(__q, __rng1, __rng2, __n); // Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last 
work-group) using __val_t = _split_point_t<_IdType>; @@ -444,18 +443,18 @@ struct __parallel_merge_submitter_large<_OutSizeLimit, _IdType, _CustomName, std::integral_constant>; using __result_and_scratch_storage_t = __result_and_scratch_storage<__val_t, _NResults::value>; auto __p_base_diagonals_sp_global_storage = - new __result_and_scratch_storage_t(__exec, __nd_range_params.base_diag_count + 1); + new __result_and_scratch_storage_t(__q, __nd_range_params.base_diag_count + 1); // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. std::shared_ptr<__result_and_scratch_storage_base> __p_result_and_scratch_storage_base( static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); // Find split-points on the base diagonals - sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __n, __comp, __nd_range_params, + sycl::event __event = eval_split_points_for_groups(__q, __rng1, __rng2, __n, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); // Merge data using split points on each diagonal - __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, + __event = run_parallel_merge(__event, __q, __rng1, __rng2, __rng3, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); return __future{std::move(__event), std::move(__p_result_and_scratch_storage_base)}; @@ -485,14 +484,12 @@ __get_starting_size_limit_for_large_submitter() return 16 * 1'048'576; // 16 MB } -template __future> -__parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, - _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, _OutSizeLimit = {}) +__parallel_merge(oneapi::dpl::__internal::__device_backend_tag, sycl::queue& __q, _Range1&& __rng1, _Range2&& __rng2, + _Range3&& __rng3, _Compare __comp, _OutSizeLimit = {}) { - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using __value_type = oneapi::dpl::__internal::__value_t<_Range3>; const std::size_t __n = std::min(__rng1.size() + __rng2.size(), __rng3.size()); @@ -504,8 +501,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; return __parallel_merge_submitter<_OutSizeLimit, _WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); + __q, std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } else { @@ -517,9 +513,9 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name_large<_CustomName, _WiIndex>>; return __parallel_merge_submitter_large<_OutSizeLimit, _WiIndex, _CustomName, _DiagonalsKernelName, - _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); + _MergeKernelName>()(__q, std::forward<_Range1>(__rng1), + std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); } else { @@ -529,9 +525,9 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy using _MergeKernelName = 
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name_large<_CustomName, _WiIndex>>; return __parallel_merge_submitter_large<_OutSizeLimit, _WiIndex, _CustomName, _DiagonalsKernelName, - _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); + _MergeKernelName>()(__q, std::forward<_Range1>(__rng1), + std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); } } } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 39b229a4134..e4d00243ee4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -336,26 +336,24 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name template DropViews(Rng&, const WorkDataArea&) -> DropViews; - template std::size_t - get_max_base_diags_count(const _ExecutionPolicy& __exec, const _IndexT __chunk, std::size_t __n) const + get_max_base_diags_count(const sycl::queue& __q, const _IndexT __chunk, std::size_t __n) const { - const std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec.queue()); + const std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__q); return oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk * __max_wg_size); } // Calculate nd-range params - template nd_range_params - eval_nd_range_params(const _ExecutionPolicy& __exec, const std::size_t __rng_size, const _IndexT __n_sorted) const + eval_nd_range_params(const sycl::queue& __q, const std::size_t __rng_size, const _IndexT __n_sorted) const { - const bool __is_cpu = __exec.queue().get_device().is_cpu(); + const bool __is_cpu = __q.get_device().is_cpu(); // The chunk size must not exceed two sorted sub-sequences to be merged, // ensuring that at least one work-item processes them. const _IndexT __chunk = std::min<_IndexT>(__is_cpu ? 
32 : 4, __n_sorted * 2); const _IndexT __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__rng_size, __chunk); - _IndexT __base_diag_count = get_max_base_diags_count(__exec, __chunk, __n_sorted); + _IndexT __base_diag_count = get_max_base_diags_count(__q, __chunk, __n_sorted); _IndexT __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; @@ -381,18 +379,17 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name } // Calculation of split points on each base diagonal - template + template sycl::event eval_split_points_for_groups(const sycl::event& __event_chain, const _IndexT __n_sorted, const bool __data_in_temp, - const _ExecutionPolicy& __exec, const _Range& __rng, _TempBuf& __temp_buf, - _Compare __comp, const nd_range_params& __nd_range_params, + sycl::queue& __q, const _Range& __rng, _TempBuf& __temp_buf, _Compare __comp, + const nd_range_params& __nd_range_params, _Storage& __base_diagonals_sp_global_storage) const { const _IndexT __n = __rng.size(); - return __exec.queue().submit([&__event_chain, __n_sorted, __data_in_temp, &__rng, &__temp_buf, __comp, - __nd_range_params, &__base_diagonals_sp_global_storage, - __n](sycl::handler& __cgh) { + return __q.submit([&__event_chain, __n_sorted, __data_in_temp, &__rng, &__temp_buf, __comp, __nd_range_params, + &__base_diagonals_sp_global_storage, __n](sycl::handler& __cgh) { __cgh.depends_on(__event_chain); oneapi::dpl::__ranges::__require_access(__cgh, __rng); @@ -474,16 +471,16 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name } // Process parallel merge - template + template sycl::event run_parallel_merge(const sycl::event& __event_chain, const _IndexT __n_sorted, const bool __data_in_temp, - const _ExecutionPolicy& __exec, _Range& __rng, _TempBuf& __temp_buf, _Compare __comp, + sycl::queue& __q, _Range& __rng, _TempBuf& __temp_buf, _Compare __comp, const nd_range_params& __nd_range_params) const { const _IndexT __n = __rng.size(); - return __exec.queue().submit([&__event_chain, __n_sorted, __data_in_temp, &__rng, &__temp_buf, __comp, - __nd_range_params, __n](sycl::handler& __cgh) { + return __q.submit([&__event_chain, __n_sorted, __data_in_temp, &__rng, &__temp_buf, __comp, __nd_range_params, + __n](sycl::handler& __cgh) { __cgh.depends_on(__event_chain); oneapi::dpl::__ranges::__require_access(__cgh, __rng); @@ -515,18 +512,17 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name } // Process parallel merge with usage of split-points on base diagonals - template + template sycl::event run_parallel_merge_from_diagonals(const sycl::event& __event_chain, const _IndexT __n_sorted, - const bool __data_in_temp, const _ExecutionPolicy& __exec, _Range& __rng, - _TempBuf& __temp_buf, _Compare __comp, const nd_range_params& __nd_range_params, + const bool __data_in_temp, sycl::queue& __q, _Range& __rng, _TempBuf& __temp_buf, + _Compare __comp, const nd_range_params& __nd_range_params, _Storage& __base_diagonals_sp_global_storage) const { const _IndexT __n = __rng.size(); - return __exec.queue().submit([&__event_chain, __n_sorted, __data_in_temp, &__rng, &__temp_buf, __comp, - __nd_range_params, &__base_diagonals_sp_global_storage, - __n](sycl::handler& __cgh) { + return __q.submit([&__event_chain, __n_sorted, __data_in_temp, &__rng, &__temp_buf, __comp, __nd_range_params, + &__base_diagonals_sp_global_storage, 
__n](sycl::handler& __cgh) { __cgh.depends_on(__event_chain); oneapi::dpl::__ranges::__require_access(__cgh, __rng); @@ -568,9 +564,9 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name } public: - template + template std::tuple> - operator()(_ExecutionPolicy&& __exec, _Range& __rng, _Compare __comp, _LeafSizeT __leaf_size, _TempBuf& __temp_buf, + operator()(sycl::queue& __q, _Range& __rng, _Compare __comp, _LeafSizeT __leaf_size, _TempBuf& __temp_buf, sycl::event __event_chain) const { // 1 final base diagonal for save final sp(0,0) @@ -584,7 +580,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name using __value_type = oneapi::dpl::__internal::__value_t<_Range>; // Calculate nd-range params - const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __n, __n_sorted); + const nd_range_params __nd_range_params = eval_nd_range_params(__q, __n, __n_sorted); using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_merge_split_point_t, /* _NResults */ 0>; @@ -601,7 +597,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // Max amount of base diagonals const std::size_t __max_base_diags_count = - get_max_base_diags_count(__exec, __nd_range_params.chunk, __n) + __1_final_base_diag; + get_max_base_diags_count(__q, __nd_range_params.chunk, __n) + __1_final_base_diag; for (std::int64_t __i = 0; __i < __n_iter; ++__i) { @@ -609,7 +605,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name if (2 * __n_sorted < __get_starting_size_limit_for_large_submitter<__value_type>()) { // Process parallel merge - __event_chain = run_parallel_merge(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf, + __event_chain = run_parallel_merge(__event_chain, __n_sorted, __data_in_temp, __q, __rng, __temp_buf, __comp, __nd_range_params); } else @@ -618,7 +614,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name { // Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) __p_base_diagonals_sp_global_storage = - new __base_diagonals_sp_storage_t(__exec.queue(), __max_base_diags_count); + new __base_diagonals_sp_storage_t(__q, __max_base_diags_count); // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. 
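The ownership trick referenced in the comment above is to pair the returned event with shared ownership of the scratch allocation, so the storage outlives the asynchronous submission and is released only when the last holder of the future drops it. A stripped-down analogue using USM device memory instead of __result_and_scratch_storage; future_like, submit_with_scratch and the sizes are hypothetical, and the caller is still expected to wait before discarding the future.

#include <sycl/sycl.hpp>
#include <cstddef>
#include <memory>
#include <utility>

// Minimal stand-in for __future: an event plus shared ownership of scratch memory.
struct future_like
{
    sycl::event event;
    std::shared_ptr<void> scratch_keep_alive;

    void wait() { event.wait(); }
};

future_like
submit_with_scratch(sycl::queue& q, std::size_t n)
{
    int* scratch = sycl::malloc_device<int>(n, q);
    // The deleter frees the allocation once the last owner (the returned future) goes away.
    std::shared_ptr<void> keep_alive(scratch, [q](void* p) { sycl::free(p, q); });

    sycl::event e = q.parallel_for(sycl::range<1>{n},
                                   [=](sycl::id<1> i) { scratch[i[0]] = static_cast<int>(i[0]); });
    return future_like{e, std::move(keep_alive)};
}

int main()
{
    sycl::queue q;
    auto f = submit_with_scratch(q, 1024);
    f.wait(); // scratch is released when f is destroyed after this point
}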
__p_result_and_scratch_storage_base.reset( @@ -626,7 +622,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name } nd_range_params __nd_range_params_this = - eval_nd_range_params(__exec, std::size_t(2 * __n_sorted), __n_sorted); + eval_nd_range_params(__q, std::size_t(2 * __n_sorted), __n_sorted); // Check that each base diagonal started from beginning of merge matrix assert(0 == (2 * __n_sorted) % @@ -640,12 +636,12 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name // Calculation of split-points on each base diagonal __event_chain = - eval_split_points_for_groups(__event_chain, __n_sorted, __data_in_temp, __exec, __rng, __temp_buf, + eval_split_points_for_groups(__event_chain, __n_sorted, __data_in_temp, __q, __rng, __temp_buf, __comp, __nd_range_params_this, *__p_base_diagonals_sp_global_storage); // Process parallel merge with usage of split-points on base diagonals - __event_chain = run_parallel_merge_from_diagonals(__event_chain, __n_sorted, __data_in_temp, __exec, - __rng, __temp_buf, __comp, __nd_range_params_this, + __event_chain = run_parallel_merge_from_diagonals(__event_chain, __n_sorted, __data_in_temp, __q, __rng, + __temp_buf, __comp, __nd_range_params_this, *__p_base_diagonals_sp_global_storage); } @@ -696,13 +692,12 @@ class __sort_global_kernel2; template class __sort_copy_back_kernel; -template +template __future> -__merge_sort(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp, _LeafSorter& __leaf_sorter) +__merge_sort(sycl::queue& __q, _Range&& __rng, _Compare __comp, _LeafSorter& __leaf_sorter) { using _Tp = oneapi::dpl::__internal::__value_t<_Range>; - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _LeafSortKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__sort_leaf_kernel<_CustomName>>; using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< @@ -718,8 +713,6 @@ __merge_sort(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp, _LeafSo assert((__leaf_sorter.__process_size & (__leaf_sorter.__process_size - 1)) == 0 && "Leaf size must be a power of 2"); - sycl::queue __q = __exec.queue(); - // 1. Perform sorting of the leaves of the merge sort tree sycl::event __event_leaf_sort = __merge_sort_leaf_submitter<_LeafSortKernel>()(__q, __rng, __leaf_sorter); @@ -728,7 +721,7 @@ __merge_sort(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp, _LeafSo auto __temp_buf = __temp.get_buffer(); auto [__event_sort, __data_in_temp, __temp_sp_storages] = __merge_sort_global_submitter<_IndexT, _DiagonalsKernelName, _GlobalSortKernel1, _GlobalSortKernel2>()( - __exec, __rng, __comp, __leaf_sorter.__process_size, __temp_buf, __event_leaf_sort); + __q, __rng, __comp, __leaf_sorter.__process_size, __temp_buf, __event_leaf_sort); // 3. 
If the data remained in the temporary buffer then copy it back if (__data_in_temp) @@ -739,15 +732,15 @@ __merge_sort(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp, _LeafSo return __future{std::move(__event_sort), std::move(__temp_sp_storages)}; } -template +template __future> -__submit_selecting_leaf(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp) +__submit_selecting_leaf(sycl::queue& __q, _Range&& __rng, _Compare __comp) { using _Leaf = __leaf_sorter, _Compare>; using _Tp = oneapi::dpl::__internal::__value_t<_Range>; const std::size_t __n = __rng.size(); - sycl::device __device = __exec.queue().get_device(); + sycl::device __device = __q.get_device(); const std::size_t __max_wg_size = __device.template get_info(); @@ -789,23 +782,20 @@ __submit_selecting_leaf(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __co __wg_size = oneapi::dpl::__internal::__dpl_bit_floor(__wg_size); _Leaf __leaf(__rng, __comp, __data_per_workitem, __wg_size); - return __merge_sort<_IndexT>(std::forward<_ExecutionPolicy>(__exec), std::forward<_Range>(__rng), __comp, __leaf); + return __merge_sort<_CustomName, _IndexT>(__q, std::forward<_Range>(__rng), __comp, __leaf); }; -template +template __future> -__parallel_sort_impl(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range&& __rng, - _Compare __comp) +__parallel_sort_impl(oneapi::dpl::__internal::__device_backend_tag, sycl::queue& __q, _Range&& __rng, _Compare __comp) { if (__rng.size() <= std::numeric_limits::max()) { - return __submit_selecting_leaf(std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range>(__rng), __comp); + return __submit_selecting_leaf<_CustomName, std::uint32_t>(__q, std::forward<_Range>(__rng), __comp); } else { - return __submit_selecting_leaf(std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range>(__rng), __comp); + return __submit_selecting_leaf<_CustomName, std::uint64_t>(__q, std::forward<_Range>(__rng), __comp); } } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h index ea2da90245d..83244d315bf 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h @@ -161,14 +161,14 @@ class __radix_sort_reorder_kernel; // radix sort: count kernel (per iteration) //----------------------------------------------------------------------- -template sycl::event -__radix_sort_count_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments, ::std::size_t __wg_size, +__radix_sort_count_submit(sycl::queue& __q, std::size_t __segments, std::size_t __wg_size, ::std::uint32_t __radix_offset, _ValRange&& __val_rng, _CountBuf& __count_buf, sycl::event __dependency_event, _Proj __proj #if _ONEDPL_COMPILE_KERNEL @@ -191,7 +191,7 @@ __radix_sort_count_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments, : oneapi::dpl::__ranges::all_view<_CountT, __par_backend_hetero::access_mode::read_write>(__count_buf); // submit to compute arrays with local count values - sycl::event __count_levent = __exec.queue().submit([&](sycl::handler& __hdl) { + sycl::event __count_levent = __q.submit([&](sycl::handler& __hdl) { __hdl.depends_on(__dependency_event); // ensure the input data and the space for counters are accessible @@ -266,13 +266,13 @@ __radix_sort_count_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments, : // radix sort: scan kernel (per iteration) 
//----------------------------------------------------------------------- -template sycl::event -__radix_sort_scan_submit(_ExecutionPolicy&& __exec, ::std::size_t __scan_wg_size, ::std::size_t __segments, +__radix_sort_scan_submit(sycl::queue& __q, std::size_t __scan_wg_size, std::size_t __segments, _CountBuf& __count_buf, ::std::size_t __n, sycl::event __dependency_event #if _ONEDPL_COMPILE_KERNEL , _Kernel& __kernel @@ -295,7 +295,7 @@ __radix_sort_scan_submit(_ExecutionPolicy&& __exec, ::std::size_t __scan_wg_size // compilation of the kernel prevents out of resources issue, which may occur due to usage of // collective algorithms such as joint_exclusive_scan even if local memory is not explicitly requested - sycl::event __scan_event = __exec.queue().submit([&](sycl::handler& __hdl) { + sycl::event __scan_event = __q.submit([&](sycl::handler& __hdl) { __hdl.depends_on(__dependency_event); // access the counters for all work groups oneapi::dpl::__ranges::__require_access(__hdl, __count_rng); @@ -501,16 +501,15 @@ __copy_kernel_for_radix_sort(::std::size_t __segments, const ::std::size_t __ele // radix sort: reorder kernel (per iteration) //----------------------------------------------------------------------- template sycl::event -__radix_sort_reorder_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments, - ::std::size_t __sg_size, ::std::uint32_t __radix_offset, _InRange&& __input_rng, - _OutRange&& __output_rng, _OffsetBuf& __offset_buf, sycl::event __dependency_event, - _Proj __proj +__radix_sort_reorder_submit(sycl::queue& __q, std::size_t __segments, std::size_t __sg_size, + std::uint32_t __radix_offset, _InRange&& __input_rng, _OutRange&& __output_rng, + _OffsetBuf& __offset_buf, sycl::event __dependency_event, _Proj __proj #if _ONEDPL_COMPILE_KERNEL , _Kernel& __kernel #endif @@ -535,7 +534,7 @@ __radix_sort_reorder_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments, oneapi::dpl::__ranges::all_view<::std::uint32_t, __par_backend_hetero::access_mode::read>(__offset_buf); // submit to reorder values - sycl::event __reorder_event = __exec.queue().submit([&](sycl::handler& __hdl) { + sycl::event __reorder_event = __q.submit([&](sycl::handler& __hdl) { __hdl.depends_on(__dependency_event); // access the offsets for all work groups oneapi::dpl::__ranges::__require_access(__hdl, __offset_rng); @@ -635,7 +634,7 @@ __radix_sort_reorder_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments, // radix sort: one iteration //----------------------------------------------------------------------- -template <::std::uint32_t __radix_bits, bool __is_ascending, bool __even> +template struct __parallel_radix_sort_iteration { template @@ -647,29 +646,28 @@ struct __parallel_radix_sort_iteration template using __reorder_phase = __radix_sort_reorder_kernel<__radix_bits, __is_ascending, __even, _Name...>; - template + template static sycl::event - submit(_ExecutionPolicy&& __exec, ::std::size_t __segments, ::std::uint32_t __radix_iter, _InRange&& __in_rng, + submit(sycl::queue& __q, std::size_t __segments, std::uint32_t __radix_iter, _InRange&& __in_rng, _OutRange&& __out_rng, _TmpBuf& __tmp_buf, sycl::event __dependency_event, _Proj __proj) { - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _RadixCountKernel = - __internal::__kernel_name_generator<__count_phase, _CustomName, _ExecutionPolicy, ::std::decay_t<_InRange>, + __internal::__kernel_name_generator<__count_phase, _CustomName, std::decay_t<_InRange>, ::std::decay_t<_TmpBuf>, _Proj>; - 
using _RadixLocalScanKernel = __internal::__kernel_name_generator<__local_scan_phase, _CustomName, - _ExecutionPolicy, ::std::decay_t<_TmpBuf>>; + using _RadixLocalScanKernel = + __internal::__kernel_name_generator<__local_scan_phase, _CustomName, std::decay_t<_TmpBuf>>; using _RadixReorderPeerKernel = - __internal::__kernel_name_generator<__reorder_peer_phase, _CustomName, _ExecutionPolicy, - ::std::decay_t<_InRange>, ::std::decay_t<_OutRange>, _Proj>; + __internal::__kernel_name_generator<__reorder_peer_phase, _CustomName, std::decay_t<_InRange>, + std::decay_t<_OutRange>, _Proj>; using _RadixReorderKernel = - __internal::__kernel_name_generator<__reorder_phase, _CustomName, _ExecutionPolicy, - ::std::decay_t<_InRange>, ::std::decay_t<_OutRange>, _Proj>; + __internal::__kernel_name_generator<__reorder_phase, _CustomName, std::decay_t<_InRange>, + std::decay_t<_OutRange>, _Proj>; - ::std::size_t __max_sg_size = oneapi::dpl::__internal::__max_sub_group_size(__exec.queue()); + std::size_t __max_sg_size = oneapi::dpl::__internal::__max_sub_group_size(__q); ::std::size_t __reorder_sg_size = __max_sg_size; // Limit the work-group size to prevent large sizes on CPUs. Empirically found value. // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future. - std::size_t __scan_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec.queue(), (std::size_t)4096); + std::size_t __scan_wg_size = oneapi::dpl::__internal::__max_work_group_size(__q, (std::size_t)4096); #if _ONEDPL_RADIX_WORKLOAD_TUNING ::std::size_t __count_wg_size = (__in_rng.size() > (1 << 21) /*2M*/ ? 128 : __max_sg_size); #else @@ -678,17 +676,16 @@ struct __parallel_radix_sort_iteration // correct __count_wg_size, __scan_wg_size, __reorder_sg_size after introspection of the kernels #if _ONEDPL_COMPILE_KERNEL - auto __kernels = - __internal::__kernel_compiler<_RadixCountKernel, _RadixLocalScanKernel, _RadixReorderPeerKernel, - _RadixReorderKernel>::__compile(__exec.queue()); + auto __kernels = __internal::__kernel_compiler<_RadixCountKernel, _RadixLocalScanKernel, + _RadixReorderPeerKernel, _RadixReorderKernel>::__compile(__q); auto __count_kernel = __kernels[0]; auto __local_scan_kernel = __kernels[1]; auto __reorder_peer_kernel = __kernels[2]; auto __reorder_kernel = __kernels[3]; - std::size_t __count_sg_size = oneapi::dpl::__internal::__kernel_sub_group_size(__exec.queue(), __count_kernel); - __reorder_sg_size = oneapi::dpl::__internal::__kernel_sub_group_size(__exec.queue(), __reorder_kernel); - __scan_wg_size = sycl::min( - __scan_wg_size, oneapi::dpl::__internal::__kernel_work_group_size(__exec.queue(), __local_scan_kernel)); + std::size_t __count_sg_size = oneapi::dpl::__internal::__kernel_sub_group_size(__q, __count_kernel); + __reorder_sg_size = oneapi::dpl::__internal::__kernel_sub_group_size(__q, __reorder_kernel); + __scan_wg_size = + sycl::min(__scan_wg_size, oneapi::dpl::__internal::__kernel_work_group_size(__q, __local_scan_kernel)); __count_wg_size = sycl::max(__count_sg_size, __reorder_sg_size); #endif const ::std::uint32_t __radix_states = 1 << __radix_bits; @@ -696,7 +693,7 @@ struct __parallel_radix_sort_iteration // correct __count_wg_size according to local memory limit in count phase using _CounterType = typename ::std::decay_t<_TmpBuf>::value_type; const auto __max_count_wg_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size( - __exec.queue(), sizeof(_CounterType) * __radix_states, __count_wg_size); + __q, sizeof(_CounterType) * __radix_states, 
__count_wg_size); __count_wg_size = static_cast<::std::size_t>((__max_count_wg_size / __radix_states)) * __radix_states; // work-group size must be a power of 2 and not less than the number of states. @@ -709,7 +706,7 @@ struct __parallel_radix_sort_iteration // 1. Count Phase sycl::event __count_event = __radix_sort_count_submit<_RadixCountKernel, __radix_bits, __is_ascending>( - __exec, __segments, __count_wg_size, __radix_offset, __in_rng, __tmp_buf, __dependency_event, __proj + __q, __segments, __count_wg_size, __radix_offset, __in_rng, __tmp_buf, __dependency_event, __proj #if _ONEDPL_COMPILE_KERNEL , __count_kernel #endif @@ -717,7 +714,7 @@ struct __parallel_radix_sort_iteration // 2. Scan Phase sycl::event __scan_event = __radix_sort_scan_submit<_RadixLocalScanKernel, __radix_bits>( - __exec, __scan_wg_size, __segments, __tmp_buf, __in_rng.size(), __count_event + __q, __scan_wg_size, __segments, __tmp_buf, __in_rng.size(), __count_event #if _ONEDPL_COMPILE_KERNEL , __local_scan_kernel #endif @@ -737,8 +734,8 @@ struct __parallel_radix_sort_iteration __reorder_event = __radix_sort_reorder_submit<_RadixReorderPeerKernel, __radix_bits, __is_ascending, __peer_algorithm>( - __exec, __segments, __reorder_sg_size, __radix_offset, ::std::forward<_InRange>(__in_rng), - ::std::forward<_OutRange>(__out_rng), __tmp_buf, __scan_event, __proj + __q, __segments, __reorder_sg_size, __radix_offset, std::forward<_InRange>(__in_rng), + std::forward<_OutRange>(__out_rng), __tmp_buf, __scan_event, __proj #if _ONEDPL_COMPILE_KERNEL , __reorder_peer_kernel #endif @@ -748,7 +745,7 @@ struct __parallel_radix_sort_iteration { __reorder_event = __radix_sort_reorder_submit<_RadixReorderKernel, __radix_bits, __is_ascending, __peer_prefix_algo::scan_then_broadcast>( - __exec, __segments, __reorder_sg_size, __radix_offset, ::std::forward<_InRange>(__in_rng), + __q, __segments, __reorder_sg_size, __radix_offset, std::forward<_InRange>(__in_rng), ::std::forward<_OutRange>(__out_rng), __tmp_buf, __scan_event, __proj #if _ONEDPL_COMPILE_KERNEL , __reorder_kernel @@ -766,10 +763,9 @@ struct __parallel_radix_sort_iteration //----------------------------------------------------------------------- // radix sort: main function //----------------------------------------------------------------------- -template +template __future -__parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range&& __in_rng, - _Proj __proj) +__parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, sycl::queue& __q, _Range&& __in_rng, _Proj __proj) { const ::std::size_t __n = __in_rng.size(); assert(__n > 1); @@ -785,38 +781,37 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP // Limit the work-group size to prevent large sizes on CPUs. Empirically found value. // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future. 
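The count-phase work-group sizing above can be read as: fit one counter per radix state per work-item into SLM, then round the size down to a multiple of the state count and keep it a power of two no smaller than that count. A rough sketch under those assumptions; the SLM adjustment is approximated directly from local_mem_size rather than via the internal __slm_adjusted_work_group_size helper.

#include <sycl/sycl.hpp>
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>

int main()
{
    sycl::queue q;
    constexpr std::uint32_t radix_bits = 4;
    constexpr std::size_t radix_states = std::size_t{1} << radix_bits; // 16 buckets per digit
    using counter_t = std::uint32_t;

    std::size_t count_wg_size =
        q.get_device().get_info<sycl::info::device::max_work_group_size>();

    // Approximate the SLM adjustment: each work-item keeps radix_states counters
    // in local memory, so the work-group cannot exceed what fits.
    const std::size_t local_mem =
        q.get_device().get_info<sycl::info::device::local_mem_size>();
    count_wg_size = std::min(count_wg_size, local_mem / (sizeof(counter_t) * radix_states));

    // Round down to a multiple of the number of radix states, as in the hunk above.
    count_wg_size = (count_wg_size / radix_states) * radix_states;

    // Keep it a power of two, but never below the number of states.
    std::size_t pow2 = radix_states;
    while (pow2 * 2 <= count_wg_size)
        pow2 *= 2;
    count_wg_size = pow2;

    std::cout << "count-phase work-group size: " << count_wg_size << '\n';
}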
- const std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec.queue(), (std::size_t)4096); + const std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__q, (std::size_t)4096); //TODO: 1.to reduce number of the kernels; 2.to define work group size in runtime, depending on number of elements constexpr std::size_t __wg_size = 64; - const auto __subgroup_sizes = __exec.queue().get_device().template get_info(); + const auto __subgroup_sizes = __q.get_device().template get_info(); const bool __dev_has_sg16 = std::find(__subgroup_sizes.begin(), __subgroup_sizes.end(), static_cast(16)) != __subgroup_sizes.end(); //TODO: with _RadixSortKernel also the following a couple of compile time constants is used for unique kernel name - using _RadixSortKernel = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; if (__n <= 64 && __wg_size <= __max_wg_size) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size, 1, __radix_bits, __is_ascending>{}( - __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); + __q, std::forward<_Range>(__in_rng), __proj); else if (__n <= 128 && __wg_size * 2 <= __max_wg_size) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 2, 1, __radix_bits, __is_ascending>{}( - __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); + __q, std::forward<_Range>(__in_rng), __proj); else if (__n <= 256 && __wg_size * 2 <= __max_wg_size) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 2, 2, __radix_bits, __is_ascending>{}( - __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); + __q, std::forward<_Range>(__in_rng), __proj); else if (__n <= 512 && __wg_size * 2 <= __max_wg_size) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 2, 4, __radix_bits, __is_ascending>{}( - __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); + __q, std::forward<_Range>(__in_rng), __proj); else if (__n <= 1024 && __wg_size * 2 <= __max_wg_size) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 2, 8, __radix_bits, __is_ascending>{}( - __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); + __q, std::forward<_Range>(__in_rng), __proj); else if (__n <= 2048 && __wg_size * 4 <= __max_wg_size) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 4, 8, __radix_bits, __is_ascending>{}( - __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); + __q, std::forward<_Range>(__in_rng), __proj); else if (__n <= 4096 && __wg_size * 4 <= __max_wg_size) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 4, 16, __radix_bits, __is_ascending>{}( - __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); + __q, std::forward<_Range>(__in_rng), __proj); // In __subgroup_radix_sort, we request a sub-group size of 16 via _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE_IF_SUPPORTED // for compilation targets that support this option. For the below cases, register spills that result in // runtime exceptions have been observed on accelerators that do not support the requested sub-group size of 16. @@ -824,10 +819,10 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP // register spills on assessed hardware. 
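// Illustrative sketch (not part of the patch): __dev_has_sg16 above gates the two largest
// single-work-group cases because their kernels request a sub-group size of 16 and have
// been observed to spill registers on accelerators that cannot honor that request. A
// self-contained version of the device capability check (standard SYCL 2020 only) might be:
#include <algorithm>
#include <cstddef>
#include <vector>
#include <sycl/sycl.hpp>

bool device_supports_sub_group_16(const sycl::queue& q)
{
    const std::vector<std::size_t> sizes =
        q.get_device().get_info<sycl::info::device::sub_group_sizes>();
    return std::find(sizes.begin(), sizes.end(), std::size_t{16}) != sizes.end();
}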
else if (__n <= 8192 && __wg_size * 8 <= __max_wg_size && __dev_has_sg16) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 8, 16, __radix_bits, __is_ascending>{}( - __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); + __q, std::forward<_Range>(__in_rng), __proj); else if (__n <= 16384 && __wg_size * 8 <= __max_wg_size && __dev_has_sg16) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 8, 32, __radix_bits, __is_ascending>{}( - __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); + __q, std::forward<_Range>(__in_rng), __proj); else { constexpr ::std::uint32_t __radix_iters = __get_buckets_in_type<_KeyT>(__radix_bits); @@ -859,11 +854,15 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP { // TODO: convert to ordered type once at the first iteration and convert back at the last one if (__radix_iter % 2 == 0) - __event = __parallel_radix_sort_iteration<__radix_bits, __is_ascending, /*even=*/true>::submit( - __exec, __segments, __radix_iter, __in_rng, __out_rng, __tmp_buf, __event, __proj); + __event = + __parallel_radix_sort_iteration<_RadixSortKernel, __radix_bits, __is_ascending, + /*even=*/true>::submit(__q, __segments, __radix_iter, __in_rng, + __out_rng, __tmp_buf, __event, __proj); else //swap __in_rng and __out_rng - __event = __parallel_radix_sort_iteration<__radix_bits, __is_ascending, /*even=*/false>::submit( - __exec, __segments, __radix_iter, __out_rng, __in_rng, __tmp_buf, __event, __proj); + __event = + __parallel_radix_sort_iteration<_RadixSortKernel, __radix_bits, __is_ascending, + /*even=*/false>::submit(__q, __segments, __radix_iter, __out_rng, + __in_rng, __tmp_buf, __event, __proj); } } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h index 51d1cdafa9f..3b3e5230011 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h @@ -35,7 +35,7 @@ struct __subgroup_radix_sort { template sycl::event - operator()(sycl::queue __q, _RangeIn&& __src, _Proj __proj) + operator()(sycl::queue& __q, _RangeIn&& __src, _Proj __proj) { using __wg_size_t = ::std::integral_constant<::std::uint16_t, __wg_size>; using __block_size_t = ::std::integral_constant<::std::uint16_t, __block_size>; @@ -127,7 +127,7 @@ struct __subgroup_radix_sort template auto - __check_slm_size(sycl::queue __q, _Size __n) + __check_slm_size(const sycl::queue& __q, _Size __n) { assert(__n <= 1 << 16); //the kernel is designed for data size <= 64K @@ -156,7 +156,7 @@ struct __subgroup_radix_sort { template sycl::event - operator()(sycl::queue __q, _RangeIn&& __src, _Proj __proj, _SLM_tag_val, _SLM_counter) + operator()(sycl::queue& __q, _RangeIn&& __src, _Proj __proj, _SLM_tag_val, _SLM_counter) { uint16_t __n = __src.size(); assert(__n <= __block_size * __wg_size); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h index ffe3c2da2d1..5d7e51d73bc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h @@ -123,10 +123,9 @@ template > { - template + template __future> - operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, const _Size __n, + 
operator()(oneapi::dpl::__internal::__device_backend_tag, sycl::queue& __q, const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op, _TransformOp __transform_op, _InitType __init, _Ranges&&... __rngs) const { @@ -137,9 +136,9 @@ struct __parallel_transform_reduce_small_submitter<_Tp, _Commutative, _VecSize, const bool __is_full = __n == __work_group_size * __iters_per_work_item; using __result_and_scratch_storage_t = __result_and_scratch_storage<_Tp>; - __result_and_scratch_storage_t __scratch_container{__exec.queue(), 0}; + __result_and_scratch_storage_t __scratch_container{__q, 0}; - sycl::event __reduce_event = __exec.queue().submit([&, __n](sycl::handler& __cgh) { + sycl::event __reduce_event = __q.submit([&, __n](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); // get an access to data under SYCL buffer auto __res_acc = __scratch_container.template __get_result_acc(__cgh, __dpl_sycl::__no_init{}); @@ -159,21 +158,20 @@ struct __parallel_transform_reduce_small_submitter<_Tp, _Commutative, _VecSize, } }; // struct __parallel_transform_reduce_small_submitter -template __future> -__parallel_transform_reduce_small_impl(oneapi::dpl::__internal::__device_backend_tag __backend_tag, - _ExecutionPolicy&& __exec, const _Size __n, const _Size __work_group_size, +__parallel_transform_reduce_small_impl(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, + const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op, _TransformOp __transform_op, _InitType __init, _Ranges&&... __rngs) { - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__reduce_small_kernel<_CustomName>>; return __parallel_transform_reduce_small_submitter<_Tp, _Commutative, _VecSize, _ReduceKernel>()( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), __n, __work_group_size, __iters_per_work_item, - __reduce_op, __transform_op, __init, std::forward<_Ranges>(__rngs)...); + __backend_tag, __q, __n, __work_group_size, __iters_per_work_item, __reduce_op, __transform_op, __init, + std::forward<_Ranges>(__rngs)...); } // Submits the first kernel of the parallel_transform_reduce for mid-sized arrays. @@ -186,9 +184,9 @@ template > { - template + template sycl::event - operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, const _Size __n, + operator()(oneapi::dpl::__internal::__device_backend_tag, sycl::queue& __q, const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op, _TransformOp __transform_op, __result_and_scratch_storage<_Tp>& __scratch_container, _Ranges&&... 
__rngs) const @@ -204,7 +202,7 @@ struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _V const _Size __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __size_per_work_group); const bool __is_full = __n == __size_per_work_group * __n_groups; - return __exec.queue().submit([&, __n](sycl::handler& __cgh) { + return __q.submit([&, __n](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); // get an access to data under SYCL buffer std::size_t __local_mem_size = __reduce_pattern.local_mem_req(__work_group_size); __dpl_sycl::__local_accessor<_Tp> __temp_local(sycl::range<1>(__local_mem_size), __cgh); @@ -232,12 +230,11 @@ template > { - template + template __future> - operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, - const sycl::event& __reduce_event, const _Size __n, const _Size __work_group_size, - const _Size __iters_per_work_item, _ReduceOp __reduce_op, _InitType __init, - __result_and_scratch_storage<_Tp>&& __scratch_container) const + operator()(oneapi::dpl::__internal::__device_backend_tag, sycl::queue& __q, const sycl::event& __reduce_event, + const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op, + _InitType __init, __result_and_scratch_storage<_Tp>&& __scratch_container) const { using __result_and_scratch_storage_t = __result_and_scratch_storage<_Tp>; @@ -249,7 +246,7 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative const bool __is_full = __n == __work_group_size * __iters_per_work_item; - auto __event = __exec.queue().submit([&, __n](sycl::handler& __cgh) { + auto __event = __q.submit([&, __n](sycl::handler& __cgh) { __cgh.depends_on(__reduce_event); auto __temp_acc = __scratch_container.template __get_scratch_acc(__cgh); @@ -272,16 +269,15 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative } }; // struct __parallel_transform_reduce_work_group_kernel_submitter -template __future> -__parallel_transform_reduce_mid_impl(oneapi::dpl::__internal::__device_backend_tag __backend_tag, - _ExecutionPolicy&& __exec, const _Size __n, const _Size __work_group_size, +__parallel_transform_reduce_mid_impl(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, + const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item_device_kernel, const _Size __iters_per_work_item_work_group_kernel, _ReduceOp __reduce_op, _TransformOp __transform_op, _InitType __init, _Ranges&&... 
__rngs) { - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _ReduceDeviceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__reduce_mid_device_kernel<_CustomName>>; using _ReduceWorkGroupKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< @@ -290,32 +286,30 @@ __parallel_transform_reduce_mid_impl(oneapi::dpl::__internal::__device_backend_t // number of buffer elements processed within workgroup const _Size __size_per_work_group = __iters_per_work_item_device_kernel * __work_group_size; const _Size __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __size_per_work_group); - __result_and_scratch_storage<_Tp> __scratch_container{__exec.queue(), __n_groups}; + __result_and_scratch_storage<_Tp> __scratch_container{__q, __n_groups}; sycl::event __reduce_event = __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _VecSize, _ReduceDeviceKernel>()( - __backend_tag, __exec, __n, __work_group_size, __iters_per_work_item_device_kernel, __reduce_op, + __backend_tag, __q, __n, __work_group_size, __iters_per_work_item_device_kernel, __reduce_op, __transform_op, __scratch_container, std::forward<_Ranges>(__rngs)...); // __n_groups preliminary results from the device kernel. return __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative, _VecSize, _ReduceWorkGroupKernel>()( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), __reduce_event, __n_groups, __work_group_size, - __iters_per_work_item_work_group_kernel, __reduce_op, __init, std::move(__scratch_container)); + __backend_tag, __q, __reduce_event, __n_groups, __work_group_size, __iters_per_work_item_work_group_kernel, + __reduce_op, __init, std::move(__scratch_container)); } // General implementation using a tree reduction -template +template struct __parallel_transform_reduce_impl { - template + template static __future> - submit(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Size __n, _Size __work_group_size, + submit(oneapi::dpl::__internal::__device_backend_tag, sycl::queue& __q, _Size __n, _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op, _TransformOp __transform_op, _InitType __init, _Ranges&&... 
__rngs) { - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _NoOpFunctor = unseq_backend::walk_n; using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< __reduce_kernel, _CustomName, _ReduceOp, _TransformOp, _NoOpFunctor, _Ranges...>; @@ -329,8 +323,8 @@ struct __parallel_transform_reduce_impl auto __reduce_pattern = unseq_backend::reduce_over_group<_ReduceOp, _Tp>{__reduce_op}; #if _ONEDPL_COMPILE_KERNEL - auto __kernel = __internal::__kernel_compiler<_ReduceKernel>::__compile(__exec.queue()); - _Size __adjusted_work_group_size = oneapi::dpl::__internal::__kernel_work_group_size(__exec.queue(), __kernel); + auto __kernel = __internal::__kernel_compiler<_ReduceKernel>::__compile(__q); + _Size __adjusted_work_group_size = oneapi::dpl::__internal::__kernel_work_group_size(__q, __kernel); __work_group_size = std::min(__work_group_size, __adjusted_work_group_size); #endif @@ -341,7 +335,7 @@ struct __parallel_transform_reduce_impl // Create temporary global buffers to store temporary values const std::size_t __n_scratch = 2 * __n_groups; using __result_and_scratch_storage_t = __result_and_scratch_storage<_Tp>; - __result_and_scratch_storage_t __scratch_container{__exec.queue(), __n_scratch}; + __result_and_scratch_storage_t __scratch_container{__q, __n_scratch}; // __is_first == true. Reduce over each work_group // __is_first == false. Reduce between work groups @@ -355,8 +349,7 @@ struct __parallel_transform_reduce_impl sycl::event __reduce_event; do { - __reduce_event = __exec.queue().submit([&, __is_first, __offset_1, __offset_2, __n, - __n_groups](sycl::handler& __cgh) { + __reduce_event = __q.submit([&, __is_first, __offset_1, __offset_2, __n, __n_groups](sycl::handler& __cgh) { __cgh.depends_on(__reduce_event); auto __temp_acc = __scratch_container.template __get_scratch_acc( __cgh, __is_first ? sycl::property_list{__dpl_sycl::__no_init{}} : sycl::property_list{}); @@ -436,10 +429,10 @@ struct __parallel_transform_reduce_impl // Mid-sized arrays use two tree reductions with independent __iters_per_work_item. // Big arrays are processed with a recursive tree reduction. __work_group_size * __iters_per_work_item elements are // reduced in each step. -template __future> -__parallel_transform_reduce(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, +__parallel_transform_reduce(oneapi::dpl::__internal::__device_backend_tag __backend_tag, sycl::queue& __q, _ReduceOp __reduce_op, _TransformOp __transform_op, _InitType __init, _Ranges&&... __rngs) { auto __n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); @@ -457,8 +450,8 @@ __parallel_transform_reduce(oneapi::dpl::__internal::__device_backend_tag __back // Get the work group size adjusted to the local memory limit. // Pessimistically double the memory requirement to take into account memory used by compiled kernel. // TODO: find a way to generalize getting of reliable work-group size. - std::size_t __work_group_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size( - __exec.queue(), static_cast(sizeof(_Tp) * 2)); + std::size_t __work_group_size = + oneapi::dpl::__internal::__slm_adjusted_work_group_size(__q, static_cast(sizeof(_Tp) * 2)); // Limit work-group size to __max_work_group_size for performance on GPUs. Empirically tested. 
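// Illustrative sketch (not part of the patch): __slm_adjusted_work_group_size is assumed
// here to shrink the work-group until the per-work-item local-memory demand fits the
// device; the call site above doubles sizeof(_Tp) to leave headroom for memory used by the
// compiled kernel. A rough standalone model of that adjustment, with illustrative names:
#include <algorithm>
#include <cstddef>
#include <sycl/sycl.hpp>

std::size_t slm_adjusted_work_group_size(const sycl::queue& q, std::size_t bytes_per_work_item,
                                         std::size_t desired_wg_size)
{
    const std::size_t local_mem =
        q.get_device().get_info<sycl::info::device::local_mem_size>();
    const std::size_t fitting =
        bytes_per_work_item == 0 ? desired_wg_size : local_mem / bytes_per_work_item;
    // Keep at least one work-item, never exceed the caller's desired size.
    return std::min(desired_wg_size, std::max<std::size_t>(fitting, 1));
}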
__work_group_size = std::min(__work_group_size, __max_work_group_size); @@ -473,9 +466,9 @@ __parallel_transform_reduce(oneapi::dpl::__internal::__device_backend_tag __back const auto __work_group_size_short = static_cast(__work_group_size); std::uint16_t __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__n_short, __work_group_size); __iters_per_work_item = __adjust_iters_per_work_item<__vector_size>(__iters_per_work_item); - return __parallel_transform_reduce_small_impl<_Tp, _Commutative, __vector_size>( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), __n_short, __work_group_size_short, - __iters_per_work_item, __reduce_op, __transform_op, __init, std::forward<_Ranges>(__rngs)...); + return __parallel_transform_reduce_small_impl<_CustomName, _Tp, _Commutative, __vector_size>( + __backend_tag, __q, __n_short, __work_group_size_short, __iters_per_work_item, __reduce_op, __transform_op, + __init, std::forward<_Ranges>(__rngs)...); } // Use two-step tree reduction. // First step reduces __work_group_size * __iters_per_work_item_device_kernel elements. @@ -489,7 +482,7 @@ __parallel_transform_reduce(oneapi::dpl::__internal::__device_backend_tag __back const auto __work_group_size_short = static_cast(__work_group_size); // Fully-utilize the device by running a work-group per compute unit. // Add a factor more work-groups than compute units to fully utilizes the device and hide latencies. - const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec.queue()); + const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__q); std::uint32_t __n_groups = __max_cu * __oversubscription; std::uint32_t __iters_per_work_item_device_kernel = oneapi::dpl::__internal::__dpl_ceiling_div(__n_short, __n_groups * __work_group_size_short); @@ -507,16 +500,16 @@ __parallel_transform_reduce(oneapi::dpl::__internal::__device_backend_tag __back oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, __work_group_size_short); __iters_per_work_item_work_group_kernel = __adjust_iters_per_work_item<__vector_size>(__iters_per_work_item_work_group_kernel); - return __parallel_transform_reduce_mid_impl<_Tp, _Commutative, __vector_size>( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), __n_short, __work_group_size_short, - __iters_per_work_item_device_kernel, __iters_per_work_item_work_group_kernel, __reduce_op, __transform_op, - __init, std::forward<_Ranges>(__rngs)...); + return __parallel_transform_reduce_mid_impl<_CustomName, _Tp, _Commutative, __vector_size>( + __backend_tag, __q, __n_short, __work_group_size_short, __iters_per_work_item_device_kernel, + __iters_per_work_item_work_group_kernel, __reduce_op, __transform_op, __init, + std::forward<_Ranges>(__rngs)...); } // Otherwise use a recursive tree reduction with __max_iters_per_work_item __iters_per_work_item. 
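// Illustrative sketch (not part of the patch): the mid-size branch above derives its launch
// geometry from the compute-unit count and ceiling division. The arithmetic restated with
// illustrative names; the oversubscription default below is an assumption, not the
// library's value:
#include <cstdint>
#include <sycl/sycl.hpp>

inline std::uint32_t ceil_div(std::uint32_t a, std::uint32_t b) { return (a + b - 1) / b; }

struct mid_reduce_geometry
{
    std::uint32_t n_groups;                // work-groups launched by the device kernel
    std::uint32_t iters_device_kernel;     // elements per work-item in the first kernel
    std::uint32_t iters_work_group_kernel; // elements per work-item in the second kernel
};

mid_reduce_geometry compute_mid_reduce_geometry(const sycl::queue& q, std::uint32_t n,
                                                std::uint32_t wg_size,
                                                std::uint32_t oversubscription = 2)
{
    const std::uint32_t max_cu =
        q.get_device().get_info<sycl::info::device::max_compute_units>();
    mid_reduce_geometry g{};
    g.n_groups = max_cu * oversubscription;               // extra groups hide latency
    g.iters_device_kernel = ceil_div(n, g.n_groups * wg_size);
    g.iters_work_group_kernel = ceil_div(g.n_groups, wg_size); // second kernel folds the partials
    return g;
}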
const auto __work_group_size_long = static_cast<_Size>(__work_group_size); - return __parallel_transform_reduce_impl<_Tp, _Commutative, __vector_size>::submit( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), __n, __work_group_size_long, __max_iters_per_work_item, - __reduce_op, __transform_op, __init, std::forward<_Ranges>(__rngs)...); + return __parallel_transform_reduce_impl<_CustomName, _Tp, _Commutative, __vector_size>::submit( + __backend_tag, __q, __n, __work_group_size_long, __max_iters_per_work_item, __reduce_op, __transform_op, __init, + std::forward<_Ranges>(__rngs)...); } } // namespace __par_backend_hetero diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h index dba0afdb03d..94d3e14b522 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h @@ -98,17 +98,15 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _SegReduceCountKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _SegReduceCountPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, - _BinaryOperator>; + _SegReduceCountPhase, _CustomName, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, _BinaryOperator>; using _SegReduceOffsetKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _SegReduceOffsetPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, - _BinaryOperator>; + _SegReduceOffsetPhase, _CustomName, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, _BinaryOperator>; using _SegReduceWgKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _SegReduceWgPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, - _BinaryOperator>; + _SegReduceWgPhase, _CustomName, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, _BinaryOperator>; using _SegReducePrefixKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _SegReducePrefixPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, - _BinaryOperator>; + _SegReducePrefixPhase, _CustomName, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, _BinaryOperator>; + + sycl::queue __q_local = __exec.queue(); using __diff_type = oneapi::dpl::__internal::__difference_t<_Range3>; using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; @@ -116,32 +114,32 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ const std::size_t __n = __keys.size(); - constexpr std::uint16_t __vals_per_item = - 16; // Each work item serially processes 16 items. Best observed performance on gpu + // Each work item serially processes 16 items. Best observed performance on gpu + constexpr std::uint16_t __vals_per_item = 16; // Limit the work-group size to prevent large sizes on CPUs. Empirically found value. // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future. 
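// Illustrative sketch (not part of the patch): the "sycl::queue __q_local = __exec.queue();"
// lines introduced throughout this change exist because the refactored internal entry points
// now take the queue by non-const lvalue reference, and the temporary returned by
// __exec.queue() (assumed to be returned by value) cannot bind to that. submit_work below is
// a hypothetical stand-in for such an entry point:
#include <sycl/sycl.hpp>

void submit_work(sycl::queue& q) { (void)q; } // hypothetical internal API taking queue&

void caller(const sycl::queue& policy_queue)
{
    // submit_work(sycl::queue{policy_queue});  // ill-formed: rvalue cannot bind to queue&
    sycl::queue q_local = policy_queue;          // cheap: SYCL queues are shared handles
    submit_work(q_local);                        // lvalue binds to sycl::queue&
}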
- std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec.queue(), (std::size_t)2048); + std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__q_local, (std::size_t)2048); // adjust __wgroup_size according to local memory limit. Double the requirement on __val_type due to sycl group algorithm's use // of SLM. __wgroup_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size( - __exec.queue(), sizeof(__key_type) + 2 * sizeof(__val_type), __wgroup_size); + __q_local, sizeof(__key_type) + 2 * sizeof(__val_type), __wgroup_size); #if _ONEDPL_COMPILE_KERNEL auto __seg_reduce_count_kernel = - __par_backend_hetero::__internal::__kernel_compiler<_SegReduceCountKernel>::__compile(__exec.queue()); + __par_backend_hetero::__internal::__kernel_compiler<_SegReduceCountKernel>::__compile(__q_local); auto __seg_reduce_offset_kernel = - __par_backend_hetero::__internal::__kernel_compiler<_SegReduceOffsetKernel>::__compile(__exec.queue()); + __par_backend_hetero::__internal::__kernel_compiler<_SegReduceOffsetKernel>::__compile(__q_local); auto __seg_reduce_wg_kernel = - __par_backend_hetero::__internal::__kernel_compiler<_SegReduceWgKernel>::__compile(__exec.queue()); + __par_backend_hetero::__internal::__kernel_compiler<_SegReduceWgKernel>::__compile(__q_local); auto __seg_reduce_prefix_kernel = - __par_backend_hetero::__internal::__kernel_compiler<_SegReducePrefixKernel>::__compile(__exec.queue()); + __par_backend_hetero::__internal::__kernel_compiler<_SegReducePrefixKernel>::__compile(__q_local); __wgroup_size = std::min( - {__wgroup_size, oneapi::dpl::__internal::__kernel_work_group_size(__exec.queue(), __seg_reduce_count_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec.queue(), __seg_reduce_offset_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec.queue(), __seg_reduce_wg_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec.queue(), __seg_reduce_prefix_kernel)}); + {__wgroup_size, oneapi::dpl::__internal::__kernel_work_group_size(__q_local, __seg_reduce_count_kernel), + oneapi::dpl::__internal::__kernel_work_group_size(__q_local, __seg_reduce_offset_kernel), + oneapi::dpl::__internal::__kernel_work_group_size(__q_local, __seg_reduce_wg_kernel), + oneapi::dpl::__internal::__kernel_work_group_size(__q_local, __seg_reduce_prefix_kernel)}); #endif std::size_t __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __wgroup_size * __vals_per_item); @@ -158,7 +156,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ auto __seg_ends_scanned = oneapi::dpl::__par_backend_hetero::__buffer<__diff_type>(__n_groups).get_buffer(); // 1. 
Count the segment ends in each workgroup - auto __seg_end_identification = __exec.queue().submit([&](sycl::handler& __cgh) { + auto __seg_end_identification = __q_local.submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __keys); auto __seg_ends_acc = __seg_ends.template get_access(__cgh); #if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT @@ -195,7 +193,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ }); // 1.5 Small single-group kernel - auto __single_group_scan = __exec.queue().submit([&](sycl::handler& __cgh) { + auto __single_group_scan = __q_local.submit([&](sycl::handler& __cgh) { __cgh.depends_on(__seg_end_identification); auto __seg_ends_acc = __seg_ends.template get_access(__cgh); auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); @@ -215,7 +213,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ }); // 2. Work group reduction - auto __wg_reduce = __exec.queue().submit([&](sycl::handler& __cgh) { + auto __wg_reduce = __q_local.submit([&](sycl::handler& __cgh) { __cgh.depends_on(__single_group_scan); oneapi::dpl::__ranges::__require_access(__cgh, __keys, __out_keys, __out_values, __values); @@ -332,7 +330,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ }); // 3. Apply inter work-group aggregates - __exec.queue() + __q_local .submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __keys, __out_keys, __out_values); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 43dcbab1fd9..3b60c0f6adf 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -344,14 +344,14 @@ struct __parallel_reduce_then_scan_reduce_submitter<__max_inputs_per_item, __is_ static constexpr std::uint8_t __sub_group_size = __get_reduce_then_scan_actual_sg_sz_device(); // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory // input buffer - template + template sycl::event - operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, + operator()(sycl::queue& __q, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::size_t __inputs_remaining, const std::size_t __block_num) const { using _InitValueType = typename _InitType::__value_type; - return __exec.queue().submit([&, this](sycl::handler& __cgh) { + return __q.submit([&, this](sycl::handler& __cgh) { __dpl_sycl::__local_accessor<_InitValueType> __sub_group_partials(__max_num_sub_groups_local, __cgh); __cgh.depends_on(__prior_event); oneapi::dpl::__ranges::__require_access(__cgh, __in_rng); @@ -506,14 +506,14 @@ struct __parallel_reduce_then_scan_scan_submitter<__max_inputs_per_item, __is_in __tmp_ptr[__num_sub_groups_global + 1 - (__block_num % 2)] = __block_carry_out; } - template + template sycl::event - operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng, + operator()(sycl::queue& __q, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::size_t __inputs_remaining, const std::size_t __block_num) const { 
std::uint32_t __inputs_in_block = std::min(__n - __block_num * __max_block_size, std::size_t{__max_block_size}); - return __exec.queue().submit([&, this](sycl::handler& __cgh) { + return __q.submit([&, this](sycl::handler& __cgh) { // We need __num_sub_groups_local + 1 temporary SLM locations to store intermediate results: // __num_sub_groups_local for each sub-group partial from the reduce kernel + // 1 element for the accumulated block-local carry-in from previous groups in the block @@ -818,12 +818,11 @@ struct __parallel_reduce_then_scan_scan_submitter<__max_inputs_per_item, __is_in // Enable reduce-then-scan if the device uses the required sub-group size and is ran on a device // with fast coordinated subgroup operations. We do not want to run this scan on CPU targets, as they are not // performant with this algorithm. -template -bool -__is_gpu_with_reduce_then_scan_sg_sz(const _ExecutionPolicy& __exec) +inline bool +__is_gpu_with_reduce_then_scan_sg_sz(const sycl::queue& __q) { - return (__exec.queue().get_device().is_gpu() && - oneapi::dpl::__internal::__supports_sub_group_size(__exec, __get_reduce_then_scan_reqd_sg_sz_host())); + return (__q.get_device().is_gpu() && + oneapi::dpl::__internal::__supports_sub_group_size(__q, __get_reduce_then_scan_reqd_sg_sz_host())); } // General scan-like algorithm helpers @@ -836,17 +835,16 @@ __is_gpu_with_reduce_then_scan_sg_sz(const _ExecutionPolicy& __exec) // _ReduceOp - a binary function which is used in the reduction and scan operations // _WriteOp - a function which accepts output range, index, and output of `_GenScanInput` applied to the input range // and performs the final write to output operation -template __future> -__parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, +__parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, sycl::queue& __q, _InRng&& __in_rng, _OutRng&& __out_rng, _GenReduceInput __gen_reduce_input, _ReduceOp __reduce_op, _GenScanInput __gen_scan_input, _ScanInputTransform __scan_input_transform, _WriteOp __write_op, _InitType __init, _Inclusive, _IsUniquePattern) { - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __reduce_then_scan_reduce_kernel<_CustomName>>; using _ScanKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< @@ -861,7 +859,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ constexpr bool __inclusive = _Inclusive::value; constexpr bool __is_unique_pattern_v = _IsUniquePattern::value; - const std::uint32_t __max_work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec.queue(), 8192); + const std::uint32_t __max_work_group_size = oneapi::dpl::__internal::__max_work_group_size(__q, 8192); // Round down to nearest multiple of the subgroup size const std::uint32_t __work_group_size = (__max_work_group_size / __max_sub_group_size) * __max_sub_group_size; @@ -894,7 +892,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ // We need temporary storage for reductions of each sub-group (__num_sub_groups_global). // Additionally, we need two elements for the block carry-out to prevent a race condition // between reading and writing the block carry-out within a single kernel. 
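// Illustrative sketch (not part of the patch): the two extra elements mentioned in the
// comment above give the block carry-out a pair of slots, so one block can publish its
// carry-out while the previous block's value is still being consumed. The scan submitter
// alternates between the slots by block parity
// (__tmp_ptr[__num_sub_groups_global + 1 - (__block_num % 2)] earlier in this diff).
// Reduced to the indexing alone:
#include <cstddef>

inline std::size_t carry_out_slot(std::size_t num_sub_groups_global, std::size_t block_num)
{
    // Even-numbered blocks write the last slot, odd-numbered blocks the second-to-last.
    return num_sub_groups_global + 1 - (block_num % 2);
}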
- __result_and_scratch_storage<_ValueType> __result_and_scratch{__exec.queue(), __max_num_sub_groups_global + 2}; + __result_and_scratch_storage<_ValueType> __result_and_scratch{__q, __max_num_sub_groups_global + 2}; // Reduce and scan step implementations using _ReduceSubmitter = @@ -937,10 +935,10 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ auto __local_range = sycl::range<1>(__work_group_size); auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range); // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory. - __event = __reduce_submitter(__exec, __kernel_nd_range, __in_rng, __result_and_scratch, __event, + __event = __reduce_submitter(__q, __kernel_nd_range, __in_rng, __result_and_scratch, __event, __inputs_remaining, __b); // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. - __event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event, + __event = __scan_submitter(__q, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event, __inputs_remaining, __b); __inputs_remaining -= std::min(__inputs_remaining, __block_size); if (__b + 2 == __num_blocks) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index 8f83241a882..0997e387b18 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -129,7 +129,7 @@ template struct __pattern_min_element_reduce_fn; template -struct __pattern_minmax_element__reduce_fn; +struct __pattern_minmax_element_reduce_fn; template struct __pattern_count_transform_fn; @@ -314,7 +314,7 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__internal:: }; template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__internal::__pattern_minmax_element__reduce_fn, +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__internal::__pattern_minmax_element_reduce_fn, _Compare, _ReduceValueType)> : oneapi::dpl::__internal::__are_all_device_copyable<_Compare, _ReduceValueType> { diff --git a/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h index 774c8647548..9c6e6c27b1d 100644 --- a/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h @@ -149,6 +149,10 @@ __pattern_histogram(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Rando if (__n > 0) { + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + //need __binhash_manager to stay in scope until the kernel completes to keep the buffer alive // __make_binhash_manager will call __get_sycl_range for any data which requires it within __func auto __binhash_manager = __make_binhash_manager(::std::forward<_BinHash>(__func)); @@ -157,8 +161,8 @@ __pattern_histogram(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Rando _RandomAccessIterator1>(); auto __input_buf = __keep_input(__first, __last); - __parallel_histogram(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __init_event, - __input_buf.all_view(), ::std::move(__bins), __binhash_manager) + __parallel_histogram<_CustomName>(_BackendTag{}, __q_local, __init_event, __input_buf.all_view(), + std::move(__bins), __binhash_manager) .__deferrable_wait(); 
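// Illustrative sketch (not part of the patch): once the policy stops flowing into the
// backend, the kernel-name tag (_CustomName) travels as an explicit template argument, as in
// the __parallel_histogram<_CustomName>(...) call above. The underlying SYCL mechanism is a
// caller-supplied type used to name the kernel; a minimal, hypothetical illustration
// (fill_async and my_fill_kernel are not library names, and usm_data is assumed to be
// device-accessible USM):
#include <cstddef>
#include <sycl/sycl.hpp>

template <typename KernelName, typename T>
sycl::event
fill_async(sycl::queue& q, T* usm_data, std::size_t n, T value)
{
    return q.submit([&](sycl::handler& cgh) {
        // KernelName plays the role the _CustomName-derived kernel names play above.
        cgh.parallel_for<KernelName>(sycl::range<1>{n},
                                     [=](sycl::id<1> i) { usm_data[i] = value; });
    });
}

class my_fill_kernel; // caller-side tag, analogous to the policy's kernel name
// usage: sycl::event e = fill_async<my_fill_kernel>(q, ptr, n, 0); e.wait();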
} else diff --git a/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h index ba10e48ce69..fb7848e1590 100644 --- a/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h @@ -58,9 +58,13 @@ __pattern_transform_reduce(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _RandomAccessIterator2>(); auto __buf2 = __keep2(__first2, __first2 + __n); - return oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_RepackedTp, - ::std::true_type /*is_commutative*/>( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __binary_op1, _Functor{__binary_op2}, + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + + return oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_CustomName, _RepackedTp, + std::true_type /*is_commutative*/>( + _BackendTag{}, __q_local, __binary_op1, _Functor{__binary_op2}, unseq_backend::__init_value<_RepackedTp>{__init}, // initial value __buf1.all_view(), __buf2.all_view()) .get(); @@ -86,9 +90,13 @@ __pattern_transform_reduce(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _ForwardIterator>(); auto __buf = __keep(__first, __last); - return oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_RepackedTp, - ::std::true_type /*is_commutative*/>( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __binary_op, _Functor{__unary_op}, + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + + return oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_CustomName, _RepackedTp, + std::true_type /*is_commutative*/>( + _BackendTag{}, __q_local, __binary_op, _Functor{__unary_op}, unseq_backend::__init_value<_RepackedTp>{__init}, // initial value __buf.all_view()) .get(); @@ -137,6 +145,10 @@ __pattern_transform_scan_base(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy& auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); auto __buf1 = __keep1(__first, __last); + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + // This is a temporary workaround for an in-place exclusive scan while the SYCL backend scan pattern is not fixed. 
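// Illustrative sketch (not part of the patch): the in-place workaround hinges on detecting
// that the output may alias the input (__iterators_possibly_equal, just below). A
// conservative standalone check is straightforward for raw pointers and must err on the
// side of "possibly equal" for anything else; the name possibly_equal is illustrative only:
#include <type_traits>

template <typename It1, typename It2>
bool possibly_equal(It1 a, It2 b)
{
    if constexpr (std::is_pointer_v<It1> && std::is_pointer_v<It2> &&
                  std::is_same_v<std::remove_cv_t<std::remove_pointer_t<It1>>,
                                 std::remove_cv_t<std::remove_pointer_t<It2>>>)
    {
        return a == b; // same element type: compare addresses directly
    }
    else
    {
        (void)a; (void)b;
        return true;   // cannot prove distinct storage: conservatively report possible aliasing
    }
}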
const bool __is_scan_inplace_exclusive = __n > 1 && !_Inclusive{} && __iterators_possibly_equal(__first, __result); if (!__is_scan_inplace_exclusive) @@ -144,9 +156,9 @@ __pattern_transform_scan_base(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy& auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _Iterator2>(); auto __buf2 = __keep2(__result, __result + __n); - oneapi::dpl::__par_backend_hetero::__parallel_transform_scan( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __n, - __unary_op, __init, __binary_op, _Inclusive{}) + oneapi::dpl::__par_backend_hetero::__parallel_transform_scan<_CustomName>( + _BackendTag{}, __q_local, __buf1.all_view(), __buf2.all_view(), __n, __unary_op, __init, __binary_op, + _Inclusive{}) .__deferrable_wait(); } else @@ -168,9 +180,10 @@ __pattern_transform_scan_base(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy& auto __buf2 = __keep2(__first_tmp, __last_tmp); // Run main algorithm and save data into temporary buffer - oneapi::dpl::__par_backend_hetero::__parallel_transform_scan(_BackendTag{}, __policy, __buf1.all_view(), - __buf2.all_view(), __n, __unary_op, __init, - __binary_op, _Inclusive{}) + oneapi::dpl::__par_backend_hetero::__parallel_transform_scan< + oneapi::dpl::__internal::__policy_kernel_name>( + _BackendTag{}, __q_local, __buf1.all_view(), __buf2.all_view(), __n, __unary_op, __init, __binary_op, + _Inclusive{}) .wait(); // Move data from temporary buffer into results @@ -277,7 +290,7 @@ __pattern_adjacent_difference(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __ex using _Function = unseq_backend::walk_adjacent_difference; - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, __exec, + oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), _Function{__fn, static_cast(__n)}, __n, __buf1.all_view(), __buf2.all_view()) .__deferrable_wait(); diff --git a/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h index 6b0bec70faa..da083b1c5dc 100644 --- a/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/numeric_ranges_impl_hetero.h @@ -51,11 +51,15 @@ __pattern_transform_reduce(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, using _Functor = unseq_backend::walk_n<_BinaryOperation2>; using _RepackedTp = oneapi::dpl::__par_backend_hetero::__repacked_tuple_t<_Tp>; - return oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_RepackedTp, - ::std::true_type /*is_commutative*/>( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __binary_op1, _Functor{__binary_op2}, + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + + return oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_CustomName, _RepackedTp, + std::true_type /*is_commutative*/>( + _BackendTag{}, __q_local, __binary_op1, _Functor{__binary_op2}, unseq_backend::__init_value<_RepackedTp>{__init}, // initial value - ::std::forward<_Range1>(__rng1), ::std::forward<_Range2>(__rng2)) + std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2)) .get(); } @@ -75,11 +79,15 @@ __pattern_transform_reduce(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, using _Functor = unseq_backend::walk_n<_UnaryOperation>; using _RepackedTp = oneapi::dpl::__par_backend_hetero::__repacked_tuple_t<_Tp>; - return 
oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_RepackedTp, - ::std::true_type /*is_commutative*/>( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __binary_op, _Functor{__unary_op}, + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + sycl::queue __q_local = __exec.queue(); + + return oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce<_CustomName, _RepackedTp, + std::true_type /*is_commutative*/>( + _BackendTag{}, __q_local, __binary_op, _Functor{__unary_op}, unseq_backend::__init_value<_RepackedTp>{__init}, // initial value - ::std::forward<_Range>(__rng)) + std::forward<_Range>(__rng)) .get(); } @@ -97,9 +105,13 @@ __pattern_transform_scan_base(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __ex if (__n == 0) return 0; - oneapi::dpl::__par_backend_hetero::__parallel_transform_scan( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), - std::forward<_Range2>(__rng2), __n, __unary_op, __init, __binary_op, _Inclusive{}) + sycl::queue __q_local = __exec.queue(); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + oneapi::dpl::__par_backend_hetero::__parallel_transform_scan<_CustomName>( + _BackendTag{}, __q_local, std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), __n, __unary_op, __init, + __binary_op, _Inclusive{}) .__deferrable_wait(); return __n; }
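// Illustrative sketch (not part of the patch): for orientation, the contract that
// __parallel_transform_reduce fulfils above (apply a unary transform, then reduce with a
// binary operation starting from an initial value) can also be expressed with plain
// SYCL 2020 reductions; oneDPL's hand-rolled tree reduction replaces this for performance.
// transform_sum_usm is illustrative only, and assumes T is arithmetic (so sycl::plus<T> has
// a known identity) and that `in` points to device-accessible USM:
#include <cstddef>
#include <sycl/sycl.hpp>

template <typename T, typename UnaryOp>
T transform_sum_usm(sycl::queue& q, const T* in, std::size_t n, T init, UnaryOp unary_op)
{
    T* result = sycl::malloc_shared<T>(1, q);
    *result = init; // the pre-existing value participates in the reduction
    q.submit([&](sycl::handler& cgh) {
         auto red = sycl::reduction(result, sycl::plus<T>());
         cgh.parallel_for(sycl::range<1>{n}, red, [=](sycl::item<1> it, auto& acc) {
             acc.combine(unary_op(in[it.get_linear_id()]));
         });
     }).wait();
    T out = *result;
    sycl::free(result, q);
    return out;
}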