From 7c476638b0242b34c5cd07542c88393b6d3b3718 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 18 Sep 2024 12:02:09 +0200 Subject: [PATCH 01/12] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h - delete copy constructor from oneapi::dpl::__par_backend_hetero::__future class Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 9bd195a80a9..3394617cd7b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -724,6 +724,8 @@ class __future : private std::tuple<_Args...> __future(_Event __e, _Args... __args) : std::tuple<_Args...>(__args...), __my_event(__e) {} __future(_Event __e, std::tuple<_Args...> __t) : std::tuple<_Args...>(__t), __my_event(__e) {} + __future(const __future&) = delete; + auto event() const { From b74a4c10b2bebfe03bb75e11d5c6225f91d98369 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 18 Sep 2024 12:02:29 +0200 Subject: [PATCH 02/12] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h - delete copy assignment from oneapi::dpl::__par_backend_hetero::__future class Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 3394617cd7b..79a945aa267 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -725,7 +725,9 @@ class __future : private std::tuple<_Args...> __future(_Event __e, std::tuple<_Args...> __t) : 
std::tuple<_Args...>(__t), __my_event(__e) {} __future(const __future&) = delete; - + __future& + operator=(const __future&) = delete; + auto event() const { From c1c344724288b8ca881d6f079cbdb778fb0265dd Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Sep 2024 10:39:14 +0200 Subject: [PATCH 03/12] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h - move parameters in oneapi::dpl::__par_backend_hetero::__future class constructors Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 79a945aa267..d14a60f2bc2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -721,8 +721,14 @@ class __future : private std::tuple<_Args...> } public: - __future(_Event __e, _Args... __args) : std::tuple<_Args...>(__args...), __my_event(__e) {} - __future(_Event __e, std::tuple<_Args...> __t) : std::tuple<_Args...>(__t), __my_event(__e) {} + __future(_Event __e, _Args... 
__args) + : std::tuple<_Args...>(std::forward<_Args>(__args)...), __my_event(std::move(__e)) + { + } + __future(_Event __e, std::tuple<_Args...> __t) + : std::tuple<_Args...>(std::move(__t)), __my_event(std::move(__e)) + { + } __future(const __future&) = delete; __future& From 7e1c113360603f900eb314386dde89ca9a993b54 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Sep 2024 10:39:19 +0200 Subject: [PATCH 04/12] include/oneapi/dpl/internal/async_impl/async_impl_hetero.h - __future improvements + usage Signed-off-by: Sergey Kopienko --- .../dpl/internal/async_impl/async_impl_hetero.h | 12 ++++++++---- .../pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 13 +++++++------ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h b/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h index 1558919b20f..af1663a7036 100644 --- a/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h +++ b/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h @@ -69,7 +69,8 @@ __pattern_walk2_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf1.all_view(), __buf2.all_view()); - return __future.__make_future(__first2 + __n); + using _f_type = decltype(__future); + return _f_type::__make_future(std::move(__future), __first2 + __n); } template , _ExecutionPolicy&& __exec, _For unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf1.all_view(), __buf2.all_view(), __buf3.all_view()); - return __future.__make_future(__first3 + __n); + using _f_type = decltype(__future); + return _f_type::__make_future(std::move(__future), __first3 + __n); } template , _ExecutionPolicy& auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _Iterator2>(); auto __buf2 = __keep2(__result, __result + __n); - auto __res = 
oneapi::dpl::__par_backend_hetero::__parallel_transform_scan( + auto __future = oneapi::dpl::__par_backend_hetero::__parallel_transform_scan( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __buf1.all_view(), __buf2.all_view(), __n, __unary_op, __init, __binary_op, _Inclusive{}); - return __res.__make_future(__result + __n); + + using _f_type = decltype(__future); + return _f_type::__make_future(std::move(__future), __result + __n); } template //The internal API. There are cases where the implementation specifies return value "higher" than SYCL backend, //where a future is created. - template - auto - __make_future(_T __t) const + template + static __future<_OtherEvent, _AddArgs..., _OtherArgs...> + __make_future(__future<_OtherEvent, _OtherArgs...>&& __f, _AddArgs... __add_args) { - auto new_val = std::tuple<_T>(__t); - auto new_tuple = std::tuple_cat(new_val, (std::tuple<_Args...>)*this); - return __future<_Event, _T, _Args...>(__my_event, new_tuple); + auto new_vals = std::tuple<_AddArgs...>(std::forward<_AddArgs>(__add_args)...); + auto new_tuple = std::tuple_cat(std::move(new_vals), static_cast&&>(__f)); + + return {std::move(__f.__my_event), std::move(new_tuple)}; } }; From 325fe7b2dffd3c2bde09e7f233ba5ce17a3d9953 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Sep 2024 10:52:42 +0200 Subject: [PATCH 05/12] include/oneapi/dpl/internal/async_impl/async_impl_hetero.h - __make_future improvements : make code more compact Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index aa19d859a76..96f57384e10 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -781,10 +781,9 @@ class __future : private 
std::tuple<_Args...> static __future<_OtherEvent, _AddArgs..., _OtherArgs...> __make_future(__future<_OtherEvent, _OtherArgs...>&& __f, _AddArgs... __add_args) { - auto new_vals = std::tuple<_AddArgs...>(std::forward<_AddArgs>(__add_args)...); - auto new_tuple = std::tuple_cat(std::move(new_vals), static_cast&&>(__f)); - - return {std::move(__f.__my_event), std::move(new_tuple)}; + return {std::move(__f.__my_event), + std::tuple_cat(std::tuple<_AddArgs...>(std::forward<_AddArgs>(__add_args)...), + static_cast&&>(__f))}; } }; From 87e3880a1f97bfbca4ede293fdf2a8a6b9b81050 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 18 Sep 2024 14:29:05 +0200 Subject: [PATCH 06/12] Fix compile errors after delete __future copy creation and copy assignment Signed-off-by: Sergey Kopienko --- .../dpl/internal/async_impl/async_impl_hetero.h | 3 +-- .../dpl/internal/async_impl/glue_async_impl.h | 15 +++++---------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h b/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h index af1663a7036..e742299288b 100644 --- a/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h +++ b/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h @@ -42,10 +42,9 @@ __pattern_walk1_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _ForwardIterator>(); auto __buf = __keep(__first, __last); - auto __future_obj = oneapi::dpl::__par_backend_hetero::__parallel_for( + return oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf.all_view()); - return __future_obj; } template <__par_backend_hetero::access_mode __acc_mode1 = __par_backend_hetero::access_mode::read, diff --git a/include/oneapi/dpl/internal/async_impl/glue_async_impl.h 
b/include/oneapi/dpl/internal/async_impl/glue_async_impl.h index dfd4a969ec8..30bb5d037af 100644 --- a/include/oneapi/dpl/internal/async_impl/glue_async_impl.h +++ b/include/oneapi/dpl/internal/async_impl/glue_async_impl.h @@ -46,10 +46,9 @@ transform_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first, _ForwardIt const auto __dispatch_tag = oneapi::dpl::__internal::__select_backend(__exec, __first, __result); wait_for_all(::std::forward<_Events>(__dependencies)...); - auto ret_val = oneapi::dpl::__internal::__pattern_walk2_async( + return oneapi::dpl::__internal::__pattern_walk2_async( __dispatch_tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __result, oneapi::dpl::__internal::__transform_functor<_UnaryOperation>{::std::move(__op)}); - return ret_val; } template (__dependencies)...); - auto ret_val = oneapi::dpl::__internal::__pattern_walk3_async( + return oneapi::dpl::__internal::__pattern_walk3_async( __dispatch_tag, ::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __result, oneapi::dpl::__internal::__transform_functor<_BinaryOperation>(::std::move(__op))); - return ret_val; } // [async.copy] @@ -80,10 +78,9 @@ copy_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first, _ForwardIterato auto __dispatch_tag = oneapi::dpl::__internal::__select_backend(__exec, __first, __result); wait_for_all(::std::forward<_Events>(__dependencies)...); - auto ret_val = oneapi::dpl::__internal::__pattern_walk2_brick_async( + return oneapi::dpl::__internal::__pattern_walk2_brick_async( __dispatch_tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __result, oneapi::dpl::__internal::__brick_copy{}); - return ret_val; } // [async.sort] @@ -127,9 +124,8 @@ for_each_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIter const auto __dispatch_tag = oneapi::dpl::__internal::__select_backend(__exec, __first); wait_for_all(::std::forward<_Events>(__dependencies)...); - auto ret_val = 
oneapi::dpl::__internal::__pattern_walk1_async( + return oneapi::dpl::__internal::__pattern_walk1_async( __dispatch_tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __f); - return ret_val; } // [async.reduce] @@ -144,10 +140,9 @@ reduce_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterat const auto __dispatch_tag = oneapi::dpl::__internal::__select_backend(__exec, __first); wait_for_all(::std::forward<_Events>(__dependencies)...); - auto ret_val = oneapi::dpl::__internal::__pattern_transform_reduce_async( + return oneapi::dpl::__internal::__pattern_transform_reduce_async( __dispatch_tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __init, __binary_op, oneapi::dpl::__internal::__no_op()); - return ret_val; } template Date: Thu, 19 Sep 2024 15:50:42 +0200 Subject: [PATCH 07/12] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h - modify the set of __future constructors Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_utils.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 96f57384e10..52de7617223 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -721,12 +721,16 @@ class __future : private std::tuple<_Args...> } public: - __future(_Event __e, _Args... 
__args) - : std::tuple<_Args...>(std::forward<_Args>(__args)...), __my_event(std::move(__e)) + __future(_Event&& __e) + : __my_event(std::move(__e)) { } - __future(_Event __e, std::tuple<_Args...> __t) - : std::tuple<_Args...>(std::move(__t)), __my_event(std::move(__e)) + __future(_Event&& __e, const std::tuple<_Args...>& __data) + : std::tuple<_Args...>(__data), __my_event(std::move(__e)) + { + } + __future(_Event&& __e, std::tuple<_Args...>&& __data) + : std::tuple<_Args...>(std::move(__data)), __my_event(std::move(__e)) { } @@ -781,9 +785,9 @@ class __future : private std::tuple<_Args...> static __future<_OtherEvent, _AddArgs..., _OtherArgs...> __make_future(__future<_OtherEvent, _OtherArgs...>&& __f, _AddArgs... __add_args) { - return {std::move(__f.__my_event), - std::tuple_cat(std::tuple<_AddArgs...>(std::forward<_AddArgs>(__add_args)...), - static_cast&&>(__f))}; + return __future<_OtherEvent, _AddArgs..., _OtherArgs...>{ + std::move(__f.__my_event), std::tuple_cat(std::tuple<_AddArgs...>(std::forward<_AddArgs>(__add_args)...), + static_cast&&>(__f))}; } }; From 5ba1be49d345c59e1c73ed449fafc30db884d06a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Sep 2024 15:55:22 +0200 Subject: [PATCH 08/12] Use full specialization of returned __future instance and move (instead copy) data into it Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 18 ++++++++++++------ .../hetero/dpcpp/parallel_backend_sycl_fpga.h | 2 +- .../dpcpp/parallel_backend_sycl_histogram.h | 18 +++++++++--------- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- .../dpcpp/parallel_backend_sycl_merge_sort.h | 2 +- .../dpcpp/parallel_backend_sycl_radix_sort.h | 2 +- .../dpcpp/parallel_backend_sycl_reduce.h | 6 ++++-- .../parallel_backend_sycl_reduce_then_scan.h | 3 ++- 8 files changed, 31 insertions(+), 22 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index c0f3e9ad2d6..1224346e06e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -242,7 +242,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> __brick(__idx, __rngs...); }); }); - return __future(__event); + return __future(std::move(__event)); } }; @@ -372,7 +372,8 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name }); }); - return __future(__final_event, __result_and_scratch); + return __future>{ + std::move(__final_event), std::move(__result_and_scratch)}; } }; @@ -644,7 +645,8 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W } }); }); - return __future(__event, __result); + return __future>{std::move(__event), + std::move(__result)}; } }; @@ -700,7 +702,10 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend /* _IsFullGroup= */ ::std::false_type, _Inclusive, _CustomName>>>()( ::std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op); - return __future(__event, __dummy_result_and_scratch); + + return __future>{ + std::move(__event), std::move(__dummy_result_and_scratch)}; }; if (__n <= 16) return __single_group_scan_f(std::integral_constant<::std::uint16_t, 16>{}); @@ -734,7 +739,8 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend __parallel_transform_scan_dynamic_single_group_submitter<_Inclusive::value, _DynamicGroupScanKernel>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op, __max_wg_size); - return __future(__event, __dummy_result_and_scratch); + return __future>{ + std::move(__event), std::move(__dummy_result_and_scratch)}; } } @@ -1866,7 +1872,7 @@ struct 
__parallel_partial_sort_submitter<__internal::__optional_kernel_name<_Glo }); } // return future and extend lifetime of temporary buffer - return __future(__event1); + return __future(std::move(__event1)); } }; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h index 7baee78b1b1..2ced9aaec50 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h @@ -75,7 +75,7 @@ struct __parallel_for_fpga_submitter<__internal::__optional_kernel_name<_Name... } }); }); - return __future(__event); + return __future(std::move(__event)); } }; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h index 45124417ade..f3ba8672f04 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h @@ -516,19 +516,19 @@ __parallel_histogram_select_kernel(oneapi::dpl::__internal::__device_backend_tag // if bins fit into registers, use register private accumulation if (__num_bins <= __max_work_item_private_bins) { - return __future( + return __future( __histogram_general_registers_local_reduction<__iters_per_work_item, __max_work_item_private_bins>( - __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size, - ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), __binhash_manager)); + __backend_tag, std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size, + std::forward<_Range1>(__input), std::forward<_Range2>(__bins), __binhash_manager)); } // if bins fit into SLM, use local atomics else if (__num_bins * sizeof(_local_histogram_type) + __binhash_manager.get_required_SLM_elements() * sizeof(_extra_memory_type) < __local_mem_size) { - return 
__future(__histogram_general_local_atomics<__iters_per_work_item>( - __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size, - ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), __binhash_manager)); + return __future(__histogram_general_local_atomics<__iters_per_work_item>( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size, + std::forward<_Range1>(__input), std::forward<_Range2>(__bins), __binhash_manager)); } else // otherwise, use global atomics (private copies per workgroup) { @@ -537,9 +537,9 @@ __parallel_histogram_select_kernel(oneapi::dpl::__internal::__device_backend_tag // suggestion which but global memory limitations may increase this value to be able to fit the workgroup // private copies of the histogram bins in global memory. No unrolling is taken advantage of here because it // is a runtime argument. - return __future(__histogram_general_private_global_atomics( - __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __init_event, __iters_per_work_item, - __work_group_size, ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), __binhash_manager)); + return __future(__histogram_general_private_global_atomics( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), __init_event, __iters_per_work_item, + __work_group_size, std::forward<_Range1>(__input), std::forward<_Range2>(__bins), __binhash_manager)); } } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 753e32816a0..b9990958e9b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -162,7 +162,7 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N __comp); }); }); - return __future(__event); + return __future(std::move(__event)); } }; diff --git 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 19a4f25b889..1c0bd923a4b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -303,7 +303,7 @@ struct __parallel_sort_submitter<_IdType, __internal::__optional_kernel_name<_Le }); } - return __future(__event1); + return __future(std::move(__event1)); } }; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h index b6ee2c4f3b9..0c113556576 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h @@ -866,7 +866,7 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP } } - return __future(__event); + return __future(std::move(__event)); } } // namespace __par_backend_hetero diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h index ca776e94dce..fee22e7b17f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h @@ -154,7 +154,8 @@ struct __parallel_transform_reduce_small_submitter<_Tp, _Commutative, _VecSize, }); }); - return __future(__reduce_event, __scratch_container); + return __future>{ + std::move(__reduce_event), std::move(__scratch_container)}; } }; // struct __parallel_transform_reduce_small_submitter @@ -418,7 +419,8 @@ struct __parallel_transform_reduce_impl __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __size_per_work_group); } while (__n > 1); - return __future(__reduce_event, __scratch_container); + return __future>{ + std::move(__reduce_event), 
std::move(__scratch_container)}; } }; // struct __parallel_transform_reduce_impl diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 0856234985f..3d69cab952c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -863,7 +863,8 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ __inputs_per_item = __inputs_per_sub_group / __sub_group_size; } } - return __future(__event, __result_and_scratch); + return __future>{ + std::move(__event), std::move(__result_and_scratch)}; } } // namespace __par_backend_hetero From 384d0d462b7cb28f853bda0761168cd1ba953c9f Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Sep 2024 16:00:30 +0200 Subject: [PATCH 09/12] Declare full specialization of __future return type instead of auto Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 14 +++++++------- .../pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h | 4 ++-- .../hetero/dpcpp/parallel_backend_sycl_histogram.h | 2 +- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- .../dpcpp/parallel_backend_sycl_merge_sort.h | 2 +- .../dpcpp/parallel_backend_sycl_radix_sort.h | 2 +- .../hetero/dpcpp/parallel_backend_sycl_reduce.h | 10 +++++----- .../dpcpp/parallel_backend_sycl_reduce_then_scan.h | 2 +- 8 files changed, 19 insertions(+), 19 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 1224346e06e..d7134e557fb 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -228,7 +228,7 @@ template struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> { template - auto 
+ __future operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const { assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0); @@ -249,7 +249,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> //General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, //for some algorithms happens that size of processing range is n, but amount of iterations is n/2. template -auto +__future __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) { @@ -275,7 +275,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name { template - auto + __future> operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _BinaryOperation __binary_op, _InitType __init, _LocalScan __local_scan, _GroupScan __group_scan, _GlobalScan __global_scan) const { @@ -557,7 +557,7 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W { template - auto + __future> operator()(_Policy&& __policy, _InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _InitType __init, _BinaryOperation __bin_op, _UnaryOp __unary_op, _Assign __assign) { @@ -652,7 +652,7 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W template -auto +__future> __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _UnaryOperation __unary_op, _InitType __init, _BinaryOperation __binary_op, @@ -746,7 +746,7 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend template -auto +__future> __parallel_transform_scan_base(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __in_rng, _Range2&& __out_rng, _BinaryOperation __binary_op, _InitType __init, 
_LocalScan __local_scan, _GroupScan __group_scan, _GlobalScan __global_scan) @@ -1812,7 +1812,7 @@ struct __parallel_partial_sort_submitter<__internal::__optional_kernel_name<_Glo __internal::__optional_kernel_name<_CopyBackName...>> { template - auto + __future operator()(_BackendTag, _ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge, _Compare __comp) const { using _Tp = oneapi::dpl::__internal::__value_t<_Range>; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h index 2ced9aaec50..3c73eaafdfe 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h @@ -56,7 +56,7 @@ template struct __parallel_for_fpga_submitter<__internal::__optional_kernel_name<_Name...>> { template - auto + __future operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const { auto __n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); @@ -80,7 +80,7 @@ struct __parallel_for_fpga_submitter<__internal::__optional_kernel_name<_Name... }; template -auto +__future __parallel_for(oneapi::dpl::__internal::__fpga_backend_tag, _ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... 
__rngs) { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h index f3ba8672f04..dde5cee2d4a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h @@ -497,7 +497,7 @@ __histogram_general_private_global_atomics(oneapi::dpl::__internal::__device_bac template <::std::uint16_t __iters_per_work_item, typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _BinHashMgr> -auto +__future __parallel_histogram_select_kernel(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, const sycl::event& __init_event, _Range1&& __input, _Range2&& __bins, const _BinHashMgr& __binhash_manager) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index b9990958e9b..bfdfd8883d7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -137,7 +137,7 @@ template struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_Name...>> { template - auto + __future operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const { const _IdType __n1 = __rng1.size(); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 1c0bd923a4b..ef1faf2ef72 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -212,7 +212,7 @@ struct __parallel_sort_submitter<_IdType, __internal::__optional_kernel_name<_Le __internal::__optional_kernel_name<_CopyBackName...>> { template - auto 
+ __future operator()(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp, _LeafSorter& __leaf_sorter) const { using _Tp = oneapi::dpl::__internal::__value_t<_Range>; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h index 0c113556576..6d08927ea56 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h @@ -766,7 +766,7 @@ struct __parallel_radix_sort_iteration // radix sort: main function //----------------------------------------------------------------------- template -auto +__future __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range&& __in_rng, _Proj __proj) { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h index fee22e7b17f..988e2363a54 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h @@ -125,7 +125,7 @@ struct __parallel_transform_reduce_small_submitter<_Tp, _Commutative, _VecSize, { template - auto + __future> operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op, _TransformOp __transform_op, _InitType __init, _Ranges&&... 
__rngs) const @@ -161,7 +161,7 @@ struct __parallel_transform_reduce_small_submitter<_Tp, _Commutative, _VecSize, template -auto +__future> __parallel_transform_reduce_small_impl(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op, @@ -236,7 +236,7 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative { template - auto + __future> operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, sycl::event& __reduce_event, const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op, _InitType __init, const __result_and_scratch_storage<_ExecutionPolicy2, _Tp>& __scratch_container) const @@ -311,7 +311,7 @@ struct __parallel_transform_reduce_impl { template - static auto + static __future> submit(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Size __n, _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op, _TransformOp __transform_op, _InitType __init, _Ranges&&... __rngs) @@ -438,7 +438,7 @@ struct __parallel_transform_reduce_impl // reduced in each step. template -auto +__future> __parallel_transform_reduce(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, _ReduceOp __reduce_op, _TransformOp __transform_op, _InitType __init, _Ranges&&... 
__rngs) { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 3d69cab952c..a7bd22bdd95 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -747,7 +747,7 @@ __is_gpu_with_sg_32(const _ExecutionPolicy& __exec) template -auto +__future> __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _InRng&& __in_rng, _OutRng&& __out_rng, _GenReduceInput __gen_reduce_input, _ReduceOp __reduce_op, _GenScanInput __gen_scan_input, From 1b91850b333d9bb14f59e2eca5e5039e58e18931 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Sep 2024 16:01:14 +0200 Subject: [PATCH 10/12] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h - special fix for __parallel_transform_reduce_work_group_kernel_submitter::operator() Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h index 988e2363a54..e2a3c8e5668 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h @@ -237,9 +237,9 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative template __future> - operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, sycl::event& __reduce_event, + operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, sycl::event&& __reduce_event, const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op, - _InitType 
__init, const __result_and_scratch_storage<_ExecutionPolicy2, _Tp>& __scratch_container) const + _InitType __init, __result_and_scratch_storage<_ExecutionPolicy2, _Tp>& __scratch_container) const { using _NoOpFunctor = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>; auto __transform_pattern = @@ -269,7 +269,8 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative }); }); - return __future(__reduce_event, __scratch_container); + return __future>{std::move(__reduce_event), + std::move(__scratch_container)}; } }; // struct __parallel_transform_reduce_work_group_kernel_submitter @@ -301,7 +302,7 @@ __parallel_transform_reduce_mid_impl(oneapi::dpl::__internal::__device_backend_t // __n_groups preliminary results from the device kernel. return __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative, _VecSize, _ReduceWorkGroupKernel>()( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), __reduce_event, __n_groups, __work_group_size, + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::move(__reduce_event), __n_groups, __work_group_size, __iters_per_work_item_work_group_kernel, __reduce_op, __init, __scratch_container); } From 235d87a7ce82c02dd3bb305149cc5bd7f3f1aead Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Sep 2024 18:08:41 +0200 Subject: [PATCH 11/12] Apply GitHub clang format Signed-off-by: Sergey Kopienko --- include/oneapi/dpl/internal/async_impl/async_impl_hetero.h | 6 +++--- include/oneapi/dpl/internal/async_impl/glue_async_impl.h | 4 ++-- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h | 4 ++-- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 5 +---- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h b/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h index e742299288b..9841c53accc 100644 --- a/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h +++ 
b/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h @@ -42,9 +42,9 @@ __pattern_walk1_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _ForwardIterator>(); auto __buf = __keep(__first, __last); - return oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf.all_view()); + return oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, + __n, __buf.all_view()); } template <__par_backend_hetero::access_mode __acc_mode1 = __par_backend_hetero::access_mode::read, diff --git a/include/oneapi/dpl/internal/async_impl/glue_async_impl.h b/include/oneapi/dpl/internal/async_impl/glue_async_impl.h index 30bb5d037af..5be6adc101d 100644 --- a/include/oneapi/dpl/internal/async_impl/glue_async_impl.h +++ b/include/oneapi/dpl/internal/async_impl/glue_async_impl.h @@ -124,8 +124,8 @@ for_each_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIter const auto __dispatch_tag = oneapi::dpl::__internal::__select_backend(__exec, __first); wait_for_all(::std::forward<_Events>(__dependencies)...); - return oneapi::dpl::__internal::__pattern_walk1_async( - __dispatch_tag, ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __f); + return oneapi::dpl::__internal::__pattern_walk1_async(__dispatch_tag, ::std::forward<_ExecutionPolicy>(__exec), + __first, __last, __f); } // [async.reduce] diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h index e2a3c8e5668..2744251b2fc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h @@ -269,8 
+269,8 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative }); }); - return __future>{std::move(__reduce_event), - std::move(__scratch_container)}; + return __future>{ + std::move(__reduce_event), std::move(__scratch_container)}; } }; // struct __parallel_transform_reduce_work_group_kernel_submitter diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 52de7617223..d19235b62cf 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -721,10 +721,7 @@ class __future : private std::tuple<_Args...> } public: - __future(_Event&& __e) - : __my_event(std::move(__e)) - { - } + __future(_Event&& __e) : __my_event(std::move(__e)) {} __future(_Event&& __e, const std::tuple<_Args...>& __data) : std::tuple<_Args...>(__data), __my_event(std::move(__e)) { From 3afc10cf3e856517725acaf9e1fc8d41383a569e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Sep 2024 11:33:03 +0200 Subject: [PATCH 12/12] Revert "Declare full specialization of __future return type instead of auto" This reverts commit 6be8e443ede52ea5d6b863f141529fa2329dacbe. 
Signed-off-by: Sergey Kopienko # Conflicts: # include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 14 +++++++------- .../pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h | 4 ++-- .../hetero/dpcpp/parallel_backend_sycl_histogram.h | 2 +- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- .../dpcpp/parallel_backend_sycl_merge_sort.h | 2 +- .../dpcpp/parallel_backend_sycl_radix_sort.h | 2 +- .../hetero/dpcpp/parallel_backend_sycl_reduce.h | 12 ++++++------ .../dpcpp/parallel_backend_sycl_reduce_then_scan.h | 2 +- 8 files changed, 20 insertions(+), 20 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index d7134e557fb..1224346e06e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -228,7 +228,7 @@ template struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> { template - __future + auto operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const { assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0); @@ -249,7 +249,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> //General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, //for some algorithms happens that size of processing range is n, but amount of iterations is n/2. template -__future +auto __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... 
__rngs) { @@ -275,7 +275,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name { template - __future> + auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _BinaryOperation __binary_op, _InitType __init, _LocalScan __local_scan, _GroupScan __group_scan, _GlobalScan __global_scan) const { @@ -557,7 +557,7 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W { template - __future> + auto operator()(_Policy&& __policy, _InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _InitType __init, _BinaryOperation __bin_op, _UnaryOp __unary_op, _Assign __assign) { @@ -652,7 +652,7 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W template -__future> +auto __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _UnaryOperation __unary_op, _InitType __init, _BinaryOperation __binary_op, @@ -746,7 +746,7 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend template -__future> +auto __parallel_transform_scan_base(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __in_rng, _Range2&& __out_rng, _BinaryOperation __binary_op, _InitType __init, _LocalScan __local_scan, _GroupScan __group_scan, _GlobalScan __global_scan) @@ -1812,7 +1812,7 @@ struct __parallel_partial_sort_submitter<__internal::__optional_kernel_name<_Glo __internal::__optional_kernel_name<_CopyBackName...>> { template - __future + auto operator()(_BackendTag, _ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge, _Compare __comp) const { using _Tp = oneapi::dpl::__internal::__value_t<_Range>; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h index 3c73eaafdfe..2ced9aaec50 100644 --- 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h @@ -56,7 +56,7 @@ template struct __parallel_for_fpga_submitter<__internal::__optional_kernel_name<_Name...>> { template - __future + auto operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const { auto __n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); @@ -80,7 +80,7 @@ struct __parallel_for_fpga_submitter<__internal::__optional_kernel_name<_Name... }; template -__future +auto __parallel_for(oneapi::dpl::__internal::__fpga_backend_tag, _ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h index dde5cee2d4a..f3ba8672f04 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h @@ -497,7 +497,7 @@ __histogram_general_private_global_atomics(oneapi::dpl::__internal::__device_bac template <::std::uint16_t __iters_per_work_item, typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _BinHashMgr> -__future +auto __parallel_histogram_select_kernel(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, const sycl::event& __init_event, _Range1&& __input, _Range2&& __bins, const _BinHashMgr& __binhash_manager) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index bfdfd8883d7..b9990958e9b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -137,7 +137,7 @@ template struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_Name...>> { 
template - __future + auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const { const _IdType __n1 = __rng1.size(); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index ef1faf2ef72..1c0bd923a4b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -212,7 +212,7 @@ struct __parallel_sort_submitter<_IdType, __internal::__optional_kernel_name<_Le __internal::__optional_kernel_name<_CopyBackName...>> { template - __future + auto operator()(_ExecutionPolicy&& __exec, _Range&& __rng, _Compare __comp, _LeafSorter& __leaf_sorter) const { using _Tp = oneapi::dpl::__internal::__value_t<_Range>; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h index 6d08927ea56..0c113556576 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h @@ -766,7 +766,7 @@ struct __parallel_radix_sort_iteration // radix sort: main function //----------------------------------------------------------------------- template -__future +auto __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range&& __in_rng, _Proj __proj) { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h index 2744251b2fc..c0adce402e4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h @@ -125,7 +125,7 @@ struct __parallel_transform_reduce_small_submitter<_Tp, _Commutative, _VecSize, { template 
- __future> + auto operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op, _TransformOp __transform_op, _InitType __init, _Ranges&&... __rngs) const @@ -161,7 +161,7 @@ struct __parallel_transform_reduce_small_submitter<_Tp, _Commutative, _VecSize, template -__future> +auto __parallel_transform_reduce_small_impl(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op, @@ -236,8 +236,8 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative { template - __future> - operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, sycl::event&& __reduce_event, + auto + operator()(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, sycl::event& __reduce_event, const _Size __n, const _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op, _InitType __init, __result_and_scratch_storage<_ExecutionPolicy2, _Tp>& __scratch_container) const { @@ -312,7 +312,7 @@ struct __parallel_transform_reduce_impl { template - static __future> + static auto submit(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Size __n, _Size __work_group_size, const _Size __iters_per_work_item, _ReduceOp __reduce_op, _TransformOp __transform_op, _InitType __init, _Ranges&&... __rngs) @@ -439,7 +439,7 @@ struct __parallel_transform_reduce_impl // reduced in each step. template -__future> +auto __parallel_transform_reduce(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, _ReduceOp __reduce_op, _TransformOp __transform_op, _InitType __init, _Ranges&&... 
__rngs) { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index a7bd22bdd95..3d69cab952c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -747,7 +747,7 @@ __is_gpu_with_sg_32(const _ExecutionPolicy& __exec) template -__future> +auto __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _InRng&& __in_rng, _OutRng&& __out_rng, _GenReduceInput __gen_reduce_input, _ReduceOp __reduce_op, _GenScanInput __gen_scan_input,