From 5ba1be49d345c59e1c73ed449fafc30db884d06a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Sep 2024 15:55:22 +0200 Subject: [PATCH] Use full specialization of returned __future instance and move (instead of copying) data into it Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 18 ++++++++++++------ .../hetero/dpcpp/parallel_backend_sycl_fpga.h | 2 +- .../dpcpp/parallel_backend_sycl_histogram.h | 18 +++++++++--------- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- .../dpcpp/parallel_backend_sycl_merge_sort.h | 2 +- .../dpcpp/parallel_backend_sycl_radix_sort.h | 2 +- .../dpcpp/parallel_backend_sycl_reduce.h | 6 ++++-- .../parallel_backend_sycl_reduce_then_scan.h | 3 ++- 8 files changed, 31 insertions(+), 22 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index c0f3e9ad2d6..1224346e06e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -242,7 +242,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> __brick(__idx, __rngs...); }); }); - return __future(__event); + return __future(std::move(__event)); } }; @@ -372,7 +372,8 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name }); }); - return __future(__final_event, __result_and_scratch); + return __future>{ + std::move(__final_event), std::move(__result_and_scratch)}; } }; @@ -644,7 +645,8 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W } }); }); - return __future(__event, __result); + return __future>{std::move(__event), + std::move(__result)}; } }; @@ -700,7 +702,10 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend /* _IsFullGroup= */ ::std::false_type, _Inclusive, _CustomName>>>()( ::std::forward<_ExecutionPolicy>(__exec), 
std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op); - return __future(__event, __dummy_result_and_scratch); + + return __future>{ + std::move(__event), std::move(__dummy_result_and_scratch)}; }; if (__n <= 16) return __single_group_scan_f(std::integral_constant<::std::uint16_t, 16>{}); @@ -734,7 +739,8 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend __parallel_transform_scan_dynamic_single_group_submitter<_Inclusive::value, _DynamicGroupScanKernel>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, __init, __binary_op, __unary_op, __max_wg_size); - return __future(__event, __dummy_result_and_scratch); + return __future>{ + std::move(__event), std::move(__dummy_result_and_scratch)}; } } @@ -1866,7 +1872,7 @@ struct __parallel_partial_sort_submitter<__internal::__optional_kernel_name<_Glo }); } // return future and extend lifetime of temporary buffer - return __future(__event1); + return __future(std::move(__event1)); } }; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h index 7baee78b1b1..2ced9aaec50 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h @@ -75,7 +75,7 @@ struct __parallel_for_fpga_submitter<__internal::__optional_kernel_name<_Name... 
} }); }); - return __future(__event); + return __future(std::move(__event)); } }; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h index 45124417ade..f3ba8672f04 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_histogram.h @@ -516,19 +516,19 @@ __parallel_histogram_select_kernel(oneapi::dpl::__internal::__device_backend_tag // if bins fit into registers, use register private accumulation if (__num_bins <= __max_work_item_private_bins) { - return __future( + return __future( __histogram_general_registers_local_reduction<__iters_per_work_item, __max_work_item_private_bins>( - __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size, - ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), __binhash_manager)); + __backend_tag, std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size, + std::forward<_Range1>(__input), std::forward<_Range2>(__bins), __binhash_manager)); } // if bins fit into SLM, use local atomics else if (__num_bins * sizeof(_local_histogram_type) + __binhash_manager.get_required_SLM_elements() * sizeof(_extra_memory_type) < __local_mem_size) { - return __future(__histogram_general_local_atomics<__iters_per_work_item>( - __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size, - ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), __binhash_manager)); + return __future(__histogram_general_local_atomics<__iters_per_work_item>( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), __init_event, __work_group_size, + std::forward<_Range1>(__input), std::forward<_Range2>(__bins), __binhash_manager)); } else // otherwise, use global atomics (private copies per workgroup) { @@ -537,9 +537,9 @@ 
__parallel_histogram_select_kernel(oneapi::dpl::__internal::__device_backend_tag // suggestion which but global memory limitations may increase this value to be able to fit the workgroup // private copies of the histogram bins in global memory. No unrolling is taken advantage of here because it // is a runtime argument. - return __future(__histogram_general_private_global_atomics( - __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), __init_event, __iters_per_work_item, - __work_group_size, ::std::forward<_Range1>(__input), ::std::forward<_Range2>(__bins), __binhash_manager)); + return __future(__histogram_general_private_global_atomics( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), __init_event, __iters_per_work_item, + __work_group_size, std::forward<_Range1>(__input), std::forward<_Range2>(__bins), __binhash_manager)); } } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 753e32816a0..b9990958e9b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -162,7 +162,7 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N __comp); }); }); - return __future(__event); + return __future(std::move(__event)); } }; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index 19a4f25b889..1c0bd923a4b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -303,7 +303,7 @@ struct __parallel_sort_submitter<_IdType, __internal::__optional_kernel_name<_Le }); } - return __future(__event1); + return __future(std::move(__event1)); } }; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h index b6ee2c4f3b9..0c113556576 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h @@ -866,7 +866,7 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP } } - return __future(__event); + return __future(std::move(__event)); } } // namespace __par_backend_hetero diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h index ca776e94dce..fee22e7b17f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h @@ -154,7 +154,8 @@ struct __parallel_transform_reduce_small_submitter<_Tp, _Commutative, _VecSize, }); }); - return __future(__reduce_event, __scratch_container); + return __future>{ + std::move(__reduce_event), std::move(__scratch_container)}; } }; // struct __parallel_transform_reduce_small_submitter @@ -418,7 +419,8 @@ struct __parallel_transform_reduce_impl __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __size_per_work_group); } while (__n > 1); - return __future(__reduce_event, __scratch_container); + return __future>{ + std::move(__reduce_event), std::move(__scratch_container)}; } }; // struct __parallel_transform_reduce_impl diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 0856234985f..3d69cab952c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -863,7 +863,8 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ __inputs_per_item = __inputs_per_sub_group / 
__sub_group_size; } } - return __future(__event, __result_and_scratch); + return __future>{ + std::move(__event), std::move(__result_and_scratch)}; } } // namespace __par_backend_hetero