From 4a558770405381d96d806abb860e2b976239b6dc Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 6 Nov 2024 17:06:03 +0100 Subject: [PATCH 01/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - introduce new function __find_start_point_in Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 151 +++++++++++++++--- 1 file changed, 127 insertions(+), 24 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 753e32816a0..ea0c03d3365 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -45,39 +45,142 @@ namespace __par_backend_hetero // | ----> // 3 | 0 0 0 0 0 | template -auto -__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, - const _Index __n2, _Compare __comp) +std::pair<_Index, _Index> +__find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, + const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) { - //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 
1, 1] - oneapi::dpl::counting_iterator<_Index> __diag_it(0); + assert(__rng1_from <= __rng1_to); + assert(__rng2_from <= __rng2_to); + + assert(__rng1_to > 0 || __rng2_to > 0); - if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed + if constexpr (!std::is_pointer_v<_Rng1>) + assert(__rng1_to <= __rng1.size()); + if constexpr (!std::is_pointer_v<_Rng2>) + assert(__rng2_to <= __rng2.size()); + + assert(__i_elem >= 0); + + // ----------------------- EXAMPLE ------------------------ + // Let's consider the following input data: + // rng1.size() = 10 + // rng2.size() = 6 + // i_diag = 9 + // Let's define the following ranges for processing: + // rng1: [3, ..., 9) -> __rng1_from = 3, __rng1_to = 9 + // rng2: [1, ..., 4) -> __rng2_from = 1, __rng2_to = 4 + // + // The goal: required to process only X' items of the merge matrix + // as intersection of rng1[3, ..., 9) and rng2[1, ..., 4) + // + // -------------------------------------------------------- + // + // __diag_it_begin(rng1) __diag_it_end(rng1) + // (init state) (dest state) (init state, dest state) + // | | | + // V V V + // + + + + + + + // \ rng1 0 1 2 3 4 5 6 7 8 9 + // rng2 +--------------------------------------+ + // 0 | ^ ^ ^ X | <--- __diag_it_end(rng2) (init state) + // + 1 | <----------------- + + X'2 ^ | <--- __diag_it_end(rng2) (dest state) + // + 2 | <----------------- + X'1 | | + // + 3 | <----------------- X'0 | | <--- __diag_it_begin(rng2) (dest state) + // 4 | X ^ | | + // 5 | X | | | <--- __diag_it_begin(rng2) (init state) + // +-------AX-----------+-----------+-----+ + // AX | | + // AX | | + // Run lower_bound:[from = 5, to = 8) + // + // AX - absent items in rng2 + // + // We have three points on diagonal for call comparison: + // X'0 : call __comp(rng1[5], rng2[3]) // 5 + 3 == 9 - 1 == 8 + // X'1 : call __comp(rng1[6], rng2[2]) // 6 + 2 == 9 - 1 == 8 + // X'2 : call __comp(rng1[7], rng2[1]) // 7 + 1 == 9 - 1 == 8 + // - where for every 
comparing pairs idx(rng1) + idx(rng2) == i_diag - 1 + + //////////////////////////////////////////////////////////////////////////////////// + // Process the corner case: for the first diagonal with the index 0 split point + // is equal to (0, 0) regardless of the size and content of the data. + if (__i_elem > 0) { - const _Index __q = __i_elem; //diagonal index - const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); - return __zero_or_one < __value; - }); - return std::make_pair(*__res, __q - *__res); + //////////////////////////////////////////////////////////////////////////////////// + // Taking into account the specified constraints of the range of processed data + const auto __index_sum = __i_elem - 1; + + using _IndexSigned = std::make_signed_t<_Index>; + + _IndexSigned idx1_from = __rng1_from; + _IndexSigned idx1_to = __rng1_to; + assert(idx1_from <= idx1_to); + + _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); + _IndexSigned idx2_to = __index_sum - __rng1_from + 1; + assert(idx2_from <= idx2_to); + + const _IndexSigned idx2_from_diff = idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; + const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? 
idx2_to - (_IndexSigned)__rng2_to : 0; + + idx1_to -= idx2_from_diff; + idx1_from += idx2_to_diff; + + idx2_from = __index_sum - (idx1_to - 1); + idx2_to = __index_sum - idx1_from + 1; + + assert(idx1_from <= idx1_to); + assert(__rng1_from <= idx1_from && idx1_to <= __rng1_to); + + assert(idx2_from <= idx2_to); + assert(__rng2_from <= idx2_from && idx2_to <= __rng2_to); + + //////////////////////////////////////////////////////////////////////////////////// + // Run search of split point on diagonal + + using __it_t = oneapi::dpl::counting_iterator<_Index>; + + __it_t __diag_it_begin(idx1_from); + __it_t __diag_it_end(idx1_to); + + constexpr int kValue = 1; + const __it_t __res = + std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { + const auto __rng1_idx = __idx; + const auto __rng2_idx = __index_sum - __idx; + + assert(__rng1_from <= __rng1_idx && __rng1_idx < __rng1_to); + assert(__rng2_from <= __rng2_idx && __rng2_idx < __rng2_to); + assert(__rng1_idx + __rng2_idx == __index_sum); + + const auto __zero_or_one = __comp(__rng2[__rng2_idx], __rng1[__rng1_idx]); + return __zero_or_one < kValue; + }); + + const std::pair<_Index, _Index> __result = std::make_pair(*__res, __index_sum - *__res + 1); + assert(__result.first + __result.second == __i_elem); + + assert(__rng1_from <= __result.first && __result.first <= __rng1_to); + assert(__rng2_from <= __result.second && __result.second <= __rng2_to); + + return __result; } else { - const _Index __q = __i_elem - __n2; //diagonal index - const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); - return __zero_or_one < __value; - }); - return std::make_pair(__q + *__res, __n2 - *__res); + 
assert(__rng1_from == 0); + assert(__rng2_from == 0); + return std::make_pair(__rng1_from, __rng2_from); } } +template +std::pair<_Index, _Index> +__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, + const _Index __n2, _Compare __comp) +{ + return __find_start_point_in(__rng1, (_Index)0, __n1, __rng2, (_Index)0, __n2, __i_elem, __comp); +} + // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing // to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) template From 6553c46ad9302b87915e932643e037816ef7c8e8 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 6 Nov 2024 17:07:11 +0100 Subject: [PATCH 02/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - introduce __parallel_merge_submitter_large for merge of biggest data sizes Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index ea0c03d3365..10ddf37fc50 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -232,10 +232,16 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ } } +template +class _find_split_points_kernel_on_mid_diagonal; + // Please see the comment for __parallel_for_submitter for optional kernel name explanation template struct __parallel_merge_submitter; +template +struct __parallel_merge_submitter_large; + template struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_Name...>> { @@ -269,6 +275,107 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N } }; +template +struct 
__parallel_merge_submitter_large<_IdType, __internal::__optional_kernel_name<_Name...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + assert(__n1 > 0 || __n2 > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + using _FindSplitPointsKernelOnMidDiagonal = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _find_split_points_kernel_on_mid_diagonal, _CustomName, _Range1, _Range2, _IdType, _Compare>; + + // Empirical number of values to process per work-item + const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; + + const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + const _IdType __base_diag_count = 1'024 * 32; + const _IdType __base_diag_part = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); + + using _split_point_t = std::pair<_IdType, _IdType>; + + using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t>; + __result_and_scratch_storage_t __result_and_scratch{__exec, 0, __base_diag_count + 1}; + + sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { + + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); + auto __scratch_acc = __result_and_scratch.template __get_scratch_acc( + __cgh, __dpl_sycl::__no_init{}); + + __cgh.parallel_for<_FindSplitPointsKernelOnMidDiagonal>( + sycl::range(__base_diag_count + 1), [=](sycl::item __item_id) + { + auto __global_idx = __item_id.get_linear_id(); + auto __scratch_ptr = __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__scratch_acc); + + if (__global_idx == 0) + { + __scratch_ptr[0] = std::make_pair((_IdType)0, (_IdType)0); + } + else if 
(__global_idx == __base_diag_count) + { + __scratch_ptr[__base_diag_count] = std::make_pair(__n1, __n2); + } + else + { + const _IdType __i_elem = __global_idx * __base_diag_part * __chunk; + __scratch_ptr[__global_idx] = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + } + }); + }); + + __event = __exec.queue().submit([&](sycl::handler& __cgh) { + + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + auto __scratch_acc = __result_and_scratch.template __get_scratch_acc(__cgh); + + __cgh.depends_on(__event); + + __cgh.parallel_for<_Name...>(sycl::range(__steps), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + const _IdType __i_elem = __global_idx * __chunk; + + auto __scratch_ptr = __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__scratch_acc); + auto __scratch_idx = __global_idx / __base_diag_part; + + _split_point_t __start; + if (__global_idx % __base_diag_part != 0) + { + // Check that we fit into size of scratch + assert(__scratch_idx + 1 < __base_diag_count + 1); + + const _split_point_t __sp_left = __scratch_ptr[__scratch_idx]; + const _split_point_t __sp_right = __scratch_ptr[__scratch_idx + 1]; + + __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, + __rng2, __sp_left.second, __sp_right.second, + __i_elem, __comp); + } + else + { + __start = __scratch_ptr[__scratch_idx]; + } + + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + }); + }); + return __future(__event); + } +}; + template class __merge_kernel_name; From 6443f2e903d5e640b3f81b93137003e0fc101e75 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 6 Nov 2024 17:08:24 +0100 Subject: [PATCH 03/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - using __parallel_merge_submitter_large for merge data equal or greater than 4M items Signed-off-by: Sergey Kopienko --- 
.../dpcpp/parallel_backend_sycl_merge.h | 77 +++++++++++++------ 1 file changed, 54 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 10ddf37fc50..56fa56aeaec 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -232,14 +232,11 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ } } -template -class _find_split_points_kernel_on_mid_diagonal; - // Please see the comment for __parallel_for_submitter for optional kernel name explanation template struct __parallel_merge_submitter; -template +template struct __parallel_merge_submitter_large; template @@ -275,8 +272,14 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N } }; -template -struct __parallel_merge_submitter_large<_IdType, __internal::__optional_kernel_name<_Name...>> +template +class _find_split_points_kernel_on_mid_diagonal_uint32_t; + +template +class _find_split_points_kernel_on_mid_diagonal_uint64_t; + +template +struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_Name...>> { template auto @@ -290,11 +293,12 @@ struct __parallel_merge_submitter_large<_IdType, __internal::__optional_kernel_n _PRINT_INFO_IN_DEBUG_MODE(__exec); - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - - using _FindSplitPointsKernelOnMidDiagonal = + using _FindSplitPointsKernelOnMidDiagonal = std::conditional_t< + std::is_same_v<_IdType, std::uint32_t>, + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _find_split_points_kernel_on_mid_diagonal_uint32_t, _CustomName, _Range1, _Range2, _IdType, _Compare>, oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _find_split_points_kernel_on_mid_diagonal, 
_CustomName, _Range1, _Range2, _IdType, _Compare>; + _find_split_points_kernel_on_mid_diagonal_uint64_t, _CustomName, _Range1, _Range2, _IdType, _Compare>>; // Empirical number of values to process per work-item const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; @@ -379,6 +383,9 @@ struct __parallel_merge_submitter_large<_IdType, __internal::__optional_kernel_n template class __merge_kernel_name; +template +class __merge_kernel_name_large; + template auto __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, @@ -387,23 +394,47 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; const auto __n = __rng1.size() + __rng2.size(); - if (__n <= std::numeric_limits::max()) + if (__n < 4 * 1'048'576) { - using _WiIndex = std::uint32_t; - using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernel>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); + if (__n <= std::numeric_limits::max()) + { + using _WiIndex = std::uint32_t; + using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernel>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + using _WiIndex = std::uint64_t; + using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernel>()( + 
std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } } else { - using _WiIndex = std::uint64_t; - using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernel>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); + if (__n <= std::numeric_limits::max()) + { + using _WiIndex = std::uint32_t; + using _MergeKernelLarge = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _MergeKernelLarge>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + using _WiIndex = std::uint64_t; + using _MergeKernelLarge = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _MergeKernelLarge>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } } } From 4c3422b99c60b896211292a8899ea913801fec51 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 7 Nov 2024 15:07:12 +0100 Subject: [PATCH 04/76] Apply GitHUB clang format Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 56fa56aeaec..bbbdeb2a6c7 100644 --- 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -120,7 +120,8 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn _IndexSigned idx2_to = __index_sum - __rng1_from + 1; assert(idx2_from <= idx2_to); - const _IndexSigned idx2_from_diff = idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; + const _IndexSigned idx2_from_diff = + idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? idx2_to - (_IndexSigned)__rng2_to : 0; idx1_to -= idx2_from_diff; @@ -313,16 +314,15 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__opti __result_and_scratch_storage_t __result_and_scratch{__exec, 0, __base_diag_count + 1}; sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); auto __scratch_acc = __result_and_scratch.template __get_scratch_acc( __cgh, __dpl_sycl::__no_init{}); __cgh.parallel_for<_FindSplitPointsKernelOnMidDiagonal>( - sycl::range(__base_diag_count + 1), [=](sycl::item __item_id) - { + sycl::range(__base_diag_count + 1), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); - auto __scratch_ptr = __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__scratch_acc); + auto __scratch_ptr = + __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__scratch_acc); if (__global_idx == 0) { @@ -341,7 +341,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__opti }); __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); auto __scratch_acc = __result_and_scratch.template __get_scratch_acc(__cgh); @@ -360,12 +359,11 @@ struct __parallel_merge_submitter_large<_IdType, 
_CustomName, __internal::__opti // Check that we fit into size of scratch assert(__scratch_idx + 1 < __base_diag_count + 1); - const _split_point_t __sp_left = __scratch_ptr[__scratch_idx]; + const _split_point_t __sp_left = __scratch_ptr[__scratch_idx]; const _split_point_t __sp_right = __scratch_ptr[__scratch_idx + 1]; - __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, - __rng2, __sp_left.second, __sp_right.second, - __i_elem, __comp); + __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, __sp_left.second, + __sp_right.second, __i_elem, __comp); } else { From afca75a82aa2208aebee673ee987b3f615c3eb78 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 8 Nov 2024 09:39:30 +0100 Subject: [PATCH 05/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix compile error Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index bbbdeb2a6c7..34c8e962765 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -294,12 +294,14 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__opti _PRINT_INFO_IN_DEBUG_MODE(__exec); - using _FindSplitPointsKernelOnMidDiagonal = std::conditional_t< - std::is_same_v<_IdType, std::uint32_t>, - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _find_split_points_kernel_on_mid_diagonal_uint32_t, _CustomName, _Range1, _Range2, _IdType, _Compare>, - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _find_split_points_kernel_on_mid_diagonal_uint64_t, _CustomName, _Range1, _Range2, _IdType, _Compare>>; + using _FindSplitPointsOnMidDiagonalKernel = + 
std::conditional_t, + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _find_split_points_kernel_on_mid_diagonal_uint32_t, _CustomName, _ExecutionPolicy, + _Range1, _Range2, _Range3, _Compare>, + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _find_split_points_kernel_on_mid_diagonal_uint64_t, _CustomName, _ExecutionPolicy, + _Range1, _Range2, _Range3, _Compare>>; // Empirical number of values to process per work-item const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; @@ -318,7 +320,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__opti auto __scratch_acc = __result_and_scratch.template __get_scratch_acc( __cgh, __dpl_sycl::__no_init{}); - __cgh.parallel_for<_FindSplitPointsKernelOnMidDiagonal>( + __cgh.parallel_for<_FindSplitPointsOnMidDiagonalKernel>( sycl::range(__base_diag_count + 1), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); auto __scratch_ptr = From 3d3fb7d9781234b442579d675bc49bd7a4a447ff Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 8 Nov 2024 20:17:31 +0100 Subject: [PATCH 06/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix Kernel names Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 45 +++++++++---------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 34c8e962765..9aa8c79e011 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -237,9 +237,6 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ template struct __parallel_merge_submitter; -template -struct __parallel_merge_submitter_large; - template struct __parallel_merge_submitter<_IdType, 
__internal::__optional_kernel_name<_Name...>> { @@ -273,14 +270,13 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N } }; -template -class _find_split_points_kernel_on_mid_diagonal_uint32_t; - -template -class _find_split_points_kernel_on_mid_diagonal_uint64_t; +template +struct __parallel_merge_submitter_large; -template -struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_Name...>> +template +struct __parallel_merge_submitter_large<_IdType, _CustomName, + __internal::__optional_kernel_name<_DiagonalsKernelName...>, + __internal::__optional_kernel_name<_MergeKernelName...>> { template auto @@ -294,15 +290,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__opti _PRINT_INFO_IN_DEBUG_MODE(__exec); - using _FindSplitPointsOnMidDiagonalKernel = - std::conditional_t, - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _find_split_points_kernel_on_mid_diagonal_uint32_t, _CustomName, _ExecutionPolicy, - _Range1, _Range2, _Range3, _Compare>, - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _find_split_points_kernel_on_mid_diagonal_uint64_t, _CustomName, _ExecutionPolicy, - _Range1, _Range2, _Range3, _Compare>>; - // Empirical number of values to process per work-item const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; @@ -320,7 +307,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__opti auto __scratch_acc = __result_and_scratch.template __get_scratch_acc( __cgh, __dpl_sycl::__no_init{}); - __cgh.parallel_for<_FindSplitPointsOnMidDiagonalKernel>( + __cgh.parallel_for<_DiagonalsKernelName...>( sycl::range(__base_diag_count + 1), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); auto __scratch_ptr = @@ -348,7 +335,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__opti __cgh.depends_on(__event); - __cgh.parallel_for<_Name...>(sycl::range(__steps), [=](sycl::item __item_id) { + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__steps), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); const _IdType __i_elem = __global_idx * __chunk; @@ -383,6 +371,9 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__opti template class __merge_kernel_name; +template +class __diagonals_kernel_name; + template class __merge_kernel_name_large; @@ -420,18 +411,22 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy if (__n <= std::numeric_limits::max()) { using _WiIndex = std::uint32_t; - using _MergeKernelLarge = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _MergeKernelLarge>()( + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } else { 
using _WiIndex = std::uint64_t; - using _MergeKernelLarge = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _MergeKernelLarge>()( + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } From 80cfc42f6401937418d3c878cc8bdb4c8259398a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 8 Nov 2024 20:20:16 +0100 Subject: [PATCH 07/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - rename template parameter names in __parallel_merge_submitter Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 9aa8c79e011..758c09de9ad 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -234,11 +234,11 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ } // Please see the comment for __parallel_for_submitter for optional kernel name explanation -template +template struct __parallel_merge_submitter; -template -struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_Name...>> +template +struct __parallel_merge_submitter<_IdType, 
__internal::__optional_kernel_name<_MergeKernelName...>> { template auto @@ -259,12 +259,13 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_Name...>(sycl::range(__steps), [=](sycl::item __item_id) { - const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); - }); + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__steps), [=](sycl::item __item_id) { + const _IdType __i_elem = __item_id.get_linear_id() * __chunk; + const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + }); }); return __future(__event); } @@ -390,18 +391,18 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy if (__n <= std::numeric_limits::max()) { using _WiIndex = std::uint32_t; - using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernel>()( + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } else { using _WiIndex = std::uint64_t; - using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< 
__merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernel>()( + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } From d9377f3c0a69d3f45c57d5ff3da4f3faa093b88a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 11 Nov 2024 12:28:28 +0100 Subject: [PATCH 08/76] Apply GitHUB clang format Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 43 ++++++++++--------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 758c09de9ad..0c341f31de6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -338,32 +338,33 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __cgh.parallel_for<_MergeKernelName...>( sycl::range(__steps), [=](sycl::item __item_id) { - auto __global_idx = __item_id.get_linear_id(); - const _IdType __i_elem = __global_idx * __chunk; + auto __global_idx = __item_id.get_linear_id(); + const _IdType __i_elem = __global_idx * __chunk; - auto __scratch_ptr = __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__scratch_acc); - auto __scratch_idx = __global_idx / __base_diag_part; + auto __scratch_ptr = + __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__scratch_acc); + auto __scratch_idx = __global_idx / __base_diag_part; - _split_point_t __start; - if (__global_idx % __base_diag_part != 0) - { - // Check that we fit into size of scratch - assert(__scratch_idx + 1 < __base_diag_count + 1); + _split_point_t __start; + if (__global_idx % __base_diag_part != 0) + { + // Check that we fit into size of scratch + assert(__scratch_idx + 1 < 
__base_diag_count + 1); - const _split_point_t __sp_left = __scratch_ptr[__scratch_idx]; - const _split_point_t __sp_right = __scratch_ptr[__scratch_idx + 1]; + const _split_point_t __sp_left = __scratch_ptr[__scratch_idx]; + const _split_point_t __sp_right = __scratch_ptr[__scratch_idx + 1]; - __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, __sp_left.second, - __sp_right.second, __i_elem, __comp); - } - else - { - __start = __scratch_ptr[__scratch_idx]; - } + __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, + __sp_left.second, __sp_right.second, __i_elem, __comp); + } + else + { + __start = __scratch_ptr[__scratch_idx]; + } - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); - }); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + }); }); return __future(__event); } From c5923eb1d1c7adcd1c5de556b290504d2c45fbde Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 18 Nov 2024 16:23:14 +0100 Subject: [PATCH 09/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 0c341f31de6..8ae3a4b148b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -389,9 +389,9 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy const auto __n = __rng1.size() + __rng2.size(); if (__n < 4 * 1'048'576) { - if (__n <= std::numeric_limits::max()) + if (__n <= std::numeric_limits::max()) { - using _WiIndex = std::uint32_t; + 
using _WiIndex = std::uint16_t; using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( @@ -400,7 +400,7 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy } else { - using _WiIndex = std::uint64_t; + using _WiIndex = std::uint32_t; using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( From 400f695a77553d2090350e5e77bcc2f5e90fdf3b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 19 Nov 2024 09:36:53 +0100 Subject: [PATCH 10/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 24 +++++-------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 8ae3a4b148b..114be82a5ce 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -389,24 +389,12 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy const auto __n = __rng1.size() + __rng2.size(); if (__n < 4 * 1'048'576) { - if (__n <= std::numeric_limits::max()) - { - using _WiIndex = std::uint16_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - else - { - using _WiIndex = 
std::uint32_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } + using _WiIndex = std::uint32_t; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); } else { From 8994a675653cebd27bbdecc6ee99607207f7728e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 15:07:00 +0100 Subject: [PATCH 11/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - introduce __starting_size_limit_for_large_submitter into __parallel_merge Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 114be82a5ce..693a49fde64 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -386,8 +386,10 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - const auto __n = __rng1.size() + __rng2.size(); - if (__n < 4 * 1'048'576) + constexpr std::size_t __starting_size_limit_for_large_submitter = 4 * 1'048'576; // 4 MB + + const std::size_t __n = __rng1.size() + __rng2.size(); + if (__n < 
__starting_size_limit_for_large_submitter) { using _WiIndex = std::uint32_t; using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< From d29f5c9eca376b54dcb42f429e9d5555f685170a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 16:31:17 +0100 Subject: [PATCH 12/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - renames Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 693a49fde64..0627624ea5c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -300,39 +300,39 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, using _split_point_t = std::pair<_IdType, _IdType>; - using __result_and_scratch_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t>; - __result_and_scratch_storage_t __result_and_scratch{__exec, 0, __base_diag_count + 1}; + using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t>; + __base_diagonals_sp_storage_t __result_and_scratch{__exec, 0, __base_diag_count + 1}; sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); - auto __scratch_acc = __result_and_scratch.template __get_scratch_acc( + auto __base_diagonals_sp_global_acc = __result_and_scratch.template __get_scratch_acc( __cgh, __dpl_sycl::__no_init{}); __cgh.parallel_for<_DiagonalsKernelName...>( sycl::range(__base_diag_count + 1), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); - auto __scratch_ptr = - 
__result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__scratch_acc); + auto __base_diagonals_sp_global_ptr = + __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); if (__global_idx == 0) { - __scratch_ptr[0] = std::make_pair((_IdType)0, (_IdType)0); + __base_diagonals_sp_global_ptr[0] = std::make_pair((_IdType)0, (_IdType)0); } else if (__global_idx == __base_diag_count) { - __scratch_ptr[__base_diag_count] = std::make_pair(__n1, __n2); + __base_diagonals_sp_global_ptr[__base_diag_count] = std::make_pair(__n1, __n2); } else { const _IdType __i_elem = __global_idx * __base_diag_part * __chunk; - __scratch_ptr[__global_idx] = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __base_diagonals_sp_global_ptr[__global_idx] = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); } }); }); __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - auto __scratch_acc = __result_and_scratch.template __get_scratch_acc(__cgh); + auto __base_diagonals_sp_global_acc = __result_and_scratch.template __get_scratch_acc(__cgh); __cgh.depends_on(__event); @@ -341,25 +341,25 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __global_idx = __item_id.get_linear_id(); const _IdType __i_elem = __global_idx * __chunk; - auto __scratch_ptr = - __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__scratch_acc); - auto __scratch_idx = __global_idx / __base_diag_part; + auto __base_diagonals_sp_global_ptr = + __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __diagonal_idx = __global_idx / __base_diag_part; _split_point_t __start; if (__global_idx % __base_diag_part != 0) { // Check that we fit into size of scratch - assert(__scratch_idx + 1 < __base_diag_count + 1); + assert(__diagonal_idx + 1 < __base_diag_count + 1); - const _split_point_t 
__sp_left = __scratch_ptr[__scratch_idx]; - const _split_point_t __sp_right = __scratch_ptr[__scratch_idx + 1]; + const _split_point_t __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; + const _split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, __sp_left.second, __sp_right.second, __i_elem, __comp); } else { - __start = __scratch_ptr[__scratch_idx]; + __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; } __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, From 6f3e3e1ed6b92d95e0276c2c23709dab0d3c59c7 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 16:34:13 +0100 Subject: [PATCH 13/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - introduce _split_point_t type Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 0627624ea5c..696719b87ec 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -31,6 +31,8 @@ namespace dpl { namespace __par_backend_hetero { +template +using _split_point_t = std::pair<_Index, _Index>; //Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges //to serial merge. 
For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: @@ -45,7 +47,7 @@ namespace __par_backend_hetero // | ----> // 3 | 0 0 0 0 0 | template -std::pair<_Index, _Index> +_split_point_t<_Index> __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) { @@ -158,7 +160,7 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn return __zero_or_one < kValue; }); - const std::pair<_Index, _Index> __result = std::make_pair(*__res, __index_sum - *__res + 1); + const _split_point_t<_Index> __result{ *__res, __index_sum - *__res + 1 }; assert(__result.first + __result.second == __i_elem); assert(__rng1_from <= __result.first && __result.first <= __rng1_to); @@ -175,7 +177,7 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn } template -std::pair<_Index, _Index> +_split_point_t<_Index> __find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, const _Index __n2, _Compare __comp) { @@ -298,9 +300,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __base_diag_count = 1'024 * 32; const _IdType __base_diag_part = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); - using _split_point_t = std::pair<_IdType, _IdType>; - - using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t>; + using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; __base_diagonals_sp_storage_t __result_and_scratch{__exec, 0, __base_diag_count + 1}; sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { @@ -345,14 +345,14 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); auto 
__diagonal_idx = __global_idx / __base_diag_part; - _split_point_t __start; + _split_point_t<_IdType> __start; if (__global_idx % __base_diag_part != 0) { // Check that we fit into size of scratch assert(__diagonal_idx + 1 < __base_diag_count + 1); - const _split_point_t __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; - const _split_point_t __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; + const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; + const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, __sp_left.second, __sp_right.second, __i_elem, __comp); From 908b61e2bc1e9d80974c84f40063a1cd83566f8d Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 16:38:23 +0100 Subject: [PATCH 14/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove usages of std::make_pair Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 696719b87ec..9e9591493fc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -19,7 +19,7 @@ #include // std::numeric_limits #include // assert #include // std::uint8_t, ... 
-#include // std::make_pair, std::forward +#include // std::forward #include // std::min, std::lower_bound #include "sycl_defs.h" @@ -172,7 +172,7 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn { assert(__rng1_from == 0); assert(__rng2_from == 0); - return std::make_pair(__rng1_from, __rng2_from); + return { __rng1_from, __rng2_from }; } } @@ -316,11 +316,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, if (__global_idx == 0) { - __base_diagonals_sp_global_ptr[0] = std::make_pair((_IdType)0, (_IdType)0); + __base_diagonals_sp_global_ptr[0] = _split_point_t<_IdType>{ 0, 0 }; } else if (__global_idx == __base_diag_count) { - __base_diagonals_sp_global_ptr[__base_diag_count] = std::make_pair(__n1, __n2); + __base_diagonals_sp_global_ptr[__base_diag_count] = _split_point_t<_IdType>{ __n1, __n2 }; } else { From 262d65b108c5329d69de8662aa4ae3d1e147853d Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 17:09:02 +0100 Subject: [PATCH 15/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - optimize evaluation of split-points on base diagonals Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 9e9591493fc..c9c52a5c163 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -314,19 +314,15 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - if (__global_idx == 0) - { - __base_diagonals_sp_global_ptr[0] = _split_point_t<_IdType>{ 0, 0 }; - } - else if (__global_idx == 
__base_diag_count) - { - __base_diagonals_sp_global_ptr[__base_diag_count] = _split_point_t<_IdType>{ __n1, __n2 }; - } - else + _split_point_t<_IdType> __sp = __global_idx == 0 ? _split_point_t<_IdType>{ 0, 0 } : _split_point_t<_IdType>{ __n1, __n2 }; + + if (0 < __global_idx && __global_idx < __base_diag_count) { const _IdType __i_elem = __global_idx * __base_diag_part * __chunk; - __base_diagonals_sp_global_ptr[__global_idx] = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); } + + __base_diagonals_sp_global_ptr[__global_idx] = __sp; }); }); From 02671e35185a3e5942666c1d576a7eccad808414 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 17:04:25 +0100 Subject: [PATCH 16/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - renames Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index c9c52a5c163..70590c5b69e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -301,11 +301,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __base_diag_part = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; - __base_diagonals_sp_storage_t __result_and_scratch{__exec, 0, __base_diag_count + 1}; + __base_diagonals_sp_storage_t __base_diagonals_sp_global_storage{__exec, 0, __base_diag_count + 1}; sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); - auto 
__base_diagonals_sp_global_acc = __result_and_scratch.template __get_scratch_acc( + auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc( __cgh, __dpl_sycl::__no_init{}); __cgh.parallel_for<_DiagonalsKernelName...>( @@ -328,7 +328,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - auto __base_diagonals_sp_global_acc = __result_and_scratch.template __get_scratch_acc(__cgh); + auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); __cgh.depends_on(__event); From 1825df2dad7e3127aa5bc61646cf49971982c6e9 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 17:10:41 +0100 Subject: [PATCH 17/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - extract eval_split_points_for_groups function Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 90 ++++++++++++++----- 1 file changed, 66 insertions(+), 24 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 70590c5b69e..7ba814a7355 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -281,17 +281,25 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, __internal::__optional_kernel_name<_MergeKernelName...>> { - template - auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; +protected: - assert(__n1 > 0 || __n2 > 0); - - _PRINT_INFO_IN_DEBUG_MODE(__exec); 
+ struct nd_range_params + { + std::size_t base_diag_count = 0; + std::size_t base_diag_part = 0; + std::uint8_t chunk = 0; + _IdType steps = 0; + }; + + template + using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; + + // Calculate nd-range params + template + nd_range_params + eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2) const + { + const std::size_t __n = __rng1.size() + __rng2.size(); // Empirical number of values to process per work-item const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; @@ -300,8 +308,18 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __base_diag_count = 1'024 * 32; const _IdType __base_diag_part = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); - using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; - __base_diagonals_sp_storage_t __base_diagonals_sp_global_storage{__exec, 0, __base_diag_count + 1}; + return { __base_diag_count, __base_diag_part, __chunk, __steps }; + } + + // Calculation of split points on each base diagonal + template + sycl::event + eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Compare __comp, + const nd_range_params& __nd_range_params, + _Storage& __base_diagonals_sp_global_storage) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); @@ -309,16 +327,15 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __cgh, __dpl_sycl::__no_init{}); __cgh.parallel_for<_DiagonalsKernelName...>( - sycl::range(__base_diag_count + 1), [=](sycl::item __item_id) { + sycl::range(__nd_range_params.base_diag_count + 1), [=](sycl::item __item_id) { auto __global_idx = 
__item_id.get_linear_id(); - auto __base_diagonals_sp_global_ptr = - __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __base_diagonals_sp_global_ptr = _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); _split_point_t<_IdType> __sp = __global_idx == 0 ? _split_point_t<_IdType>{ 0, 0 } : _split_point_t<_IdType>{ __n1, __n2 }; - if (0 < __global_idx && __global_idx < __base_diag_count) + if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) { - const _IdType __i_elem = __global_idx * __base_diag_part * __chunk; + const _IdType __i_elem = __global_idx * __nd_range_params.base_diag_part * __nd_range_params.chunk; __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); } @@ -326,6 +343,31 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, }); }); + return __event; + } + +public: + + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + assert(__n1 > 0 || __n2 > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Calculate nd-range params + const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); + + __base_diagonals_sp_storage_t<_ExecutionPolicy> __base_diagonals_sp_global_storage{__exec, 0, __nd_range_params.base_diag_count + 1}; + + // Calculation of split points on each base diagonal + sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage); + __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); @@ -333,19 +375,19 @@ struct 
__parallel_merge_submitter_large<_IdType, _CustomName, __cgh.depends_on(__event); __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__steps), [=](sycl::item __item_id) { + sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); - const _IdType __i_elem = __global_idx * __chunk; + const _IdType __i_elem = __global_idx * __nd_range_params.chunk; auto __base_diagonals_sp_global_ptr = - __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - auto __diagonal_idx = __global_idx / __base_diag_part; + __base_diagonals_sp_storage_t<_ExecutionPolicy>::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __diagonal_idx = __global_idx / __nd_range_params.base_diag_part; _split_point_t<_IdType> __start; - if (__global_idx % __base_diag_part != 0) + if (__global_idx % __nd_range_params.base_diag_part != 0) { // Check that we fit into size of scratch - assert(__diagonal_idx + 1 < __base_diag_count + 1); + assert(__diagonal_idx + 1 < __nd_range_params.base_diag_count + 1); const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; @@ -358,7 +400,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; } - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __nd_range_params.chunk, __n1, __n2, __comp); }); }); From 6456fda3d83e6ea1993b585656a67d711adf0dc5 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 17:22:15 +0100 Subject: [PATCH 18/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - extract run_parallel_merge function Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 
59 +++++++++++-------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 7ba814a7355..96d3651e33e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -291,9 +291,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _IdType steps = 0; }; - template - using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; - // Calculate nd-range params template nd_range_params @@ -346,27 +343,17 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, return __event; } -public: - - template - auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + // Process parallel merge + template + sycl::event + run_parallel_merge(sycl::event __event, + _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, + const nd_range_params& __nd_range_params, + const _Storage& __base_diagonals_sp_global_storage) const { const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; - - assert(__n1 > 0 || __n2 > 0); - - _PRINT_INFO_IN_DEBUG_MODE(__exec); - - // Calculate nd-range params - const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); - - __base_diagonals_sp_storage_t<_ExecutionPolicy> __base_diagonals_sp_global_storage{__exec, 0, __nd_range_params.base_diag_count + 1}; - - // Calculation of split points on each base diagonal - sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage); __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, 
__rng2, __rng3); @@ -379,8 +366,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __global_idx = __item_id.get_linear_id(); const _IdType __i_elem = __global_idx * __nd_range_params.chunk; - auto __base_diagonals_sp_global_ptr = - __base_diagonals_sp_storage_t<_ExecutionPolicy>::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __base_diagonals_sp_global_ptr = _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); auto __diagonal_idx = __global_idx / __nd_range_params.base_diag_part; _split_point_t<_IdType> __start; @@ -404,6 +390,33 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __comp); }); }); + + return __event; + } + +public: + + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + assert(__rng1.size() > 0 || __rng2.size() > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Calculate nd-range params + const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); + + // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; + __base_diagonals_sp_storage_t __base_diagonals_sp_global_storage{__exec, 0, __nd_range_params.base_diag_count + 1}; + + // Calculation of split points on each base diagonal + sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage); + + // Merge data using split points on each base diagonal + __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, __base_diagonals_sp_global_storage); + return __future(__event); } }; From 1b0ecd9c46f576e87d9a3f1cf75351b39ac2f7d5 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 17:25:01 +0100 Subject: 
[PATCH 19/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - using SLM bank size to define chunk in the eval_nd_range_params function Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 96d3651e33e..dcec4586c93 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -296,10 +296,19 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, nd_range_params eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2) const { + using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; + using _RangeValueType = std::conditional_t<(sizeof(_Range1ValueType) > sizeof(_Range2ValueType)), _Range1ValueType, _Range2ValueType>; + const std::size_t __n = __rng1.size() + __rng2.size(); + constexpr std::size_t __slm_bank_size = 32; // TODO is it correct value? How to get it from hardware? + + // Calculate how many data items we can read into one SLM bank + constexpr std::size_t __data_items_in_slm_bank = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); + // Empirical number of values to process per work-item - const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; + const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
128 : __data_items_in_slm_bank; const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); const _IdType __base_diag_count = 1'024 * 32; From 3a8891f5011b340d00fb5dcaa0b828a5d47cfb82 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 17:43:34 +0100 Subject: [PATCH 20/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - using SLM bank size to define chunk in the eval_nd_range_params function (16) Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index dcec4586c93..87a80239199 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -302,7 +302,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::size_t __n = __rng1.size() + __rng2.size(); - constexpr std::size_t __slm_bank_size = 32; // TODO is it correct value? How to get it from hardware? + constexpr std::size_t __slm_bank_size = 16; // TODO is it correct value? How to get it from hardware? 
// Calculate how many data items we can read into one SLM bank constexpr std::size_t __data_items_in_slm_bank = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); From de7ab0ba2ecd3753acdffec0314bd56eceee5bb7 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 28 Nov 2024 18:02:33 +0100 Subject: [PATCH 21/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - restore old implementation of __find_start_point Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 54 ++++++++++++++++--- 1 file changed, 46 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 87a80239199..26eede8fc70 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -34,6 +34,52 @@ namespace __par_backend_hetero template using _split_point_t = std::pair<_Index, _Index>; +//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges +//to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: +// 0 1 1 2 3 +// ------------------ +// |---> +// 0 | 0 | 1 1 1 1 +// | | +// 0 | 0 | 1 1 1 1 +// | ----------> +// 2 | 0 0 0 0 | 1 +// | ----> +// 3 | 0 0 0 0 0 | +template +auto +__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, + const _Index __n2, _Compare __comp) +{ + //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 
1, 1] + oneapi::dpl::counting_iterator<_Index> __diag_it(0); + + if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed + { + const _Index __q = __i_elem; //diagonal index + const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size + auto __res = + std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, + [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable { + const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); + return __zero_or_one < __value; + }); + return std::make_pair(*__res, __q - *__res); + } + else + { + const _Index __q = __i_elem - __n2; //diagonal index + const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size + auto __res = + std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, + [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { + const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); + return __zero_or_one < __value; + }); + return std::make_pair(__q + *__res, __n2 - *__res); + } +} + //Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges //to serial merge. 
For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: // 0 1 1 2 3 @@ -176,14 +222,6 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn } } -template -_split_point_t<_Index> -__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, - const _Index __n2, _Compare __comp) -{ - return __find_start_point_in(__rng1, (_Index)0, __n1, __rng2, (_Index)0, __n2, __i_elem, __comp); -} - // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing // to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) template From e9c39fe245f3b50226b237d32ff9ec61d311b7ae Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 3 Dec 2024 17:22:31 +0100 Subject: [PATCH 22/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - rename: base_diag_part -> steps_between_two_base_diags Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 26eede8fc70..230d7ff5ace 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -324,7 +324,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, struct nd_range_params { std::size_t base_diag_count = 0; - std::size_t base_diag_part = 0; + std::size_t steps_between_two_base_diags = 0; std::uint8_t chunk = 0; _IdType steps = 0; }; @@ -350,9 +350,9 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); const _IdType __base_diag_count = 1'024 * 32; - const _IdType __base_diag_part = 
oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); + const _IdType __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); - return { __base_diag_count, __base_diag_part, __chunk, __steps }; + return { __base_diag_count, __steps_between_two_base_diags, __chunk, __steps }; } // Calculation of split points on each base diagonal @@ -379,7 +379,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) { - const _IdType __i_elem = __global_idx * __nd_range_params.base_diag_part * __nd_range_params.chunk; + const _IdType __i_elem = __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); } @@ -414,10 +414,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __i_elem = __global_idx * __nd_range_params.chunk; auto __base_diagonals_sp_global_ptr = _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - auto __diagonal_idx = __global_idx / __nd_range_params.base_diag_part; + auto __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; _split_point_t<_IdType> __start; - if (__global_idx % __nd_range_params.base_diag_part != 0) + if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) { // Check that we fit into size of scratch assert(__diagonal_idx + 1 < __nd_range_params.base_diag_count + 1); From 6b4d2cb881b0b230d94822ac1040c00fa55a3ccc Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 4 Dec 2024 08:15:36 +0100 Subject: [PATCH 23/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 230d7ff5ace..b011f6c44ad 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -19,7 +19,7 @@ #include // std::numeric_limits #include // assert #include // std::uint8_t, ... -#include // std::forward +#include // std::make_pair, std::forward #include // std::min, std::lower_bound #include "sycl_defs.h" From b29c080d356efbf1d36e69bacdf1d8a01dad154f Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 4 Dec 2024 10:59:06 +0100 Subject: [PATCH 24/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix an error in __parallel_merge_submitter_large::eval_split_points_for_groups Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index b011f6c44ad..50248c2cf16 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -364,6 +364,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, { const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); @@ -380,7 +381,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) { const _IdType __i_elem = __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; - __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, 
__comp); + if (__i_elem < __n) + __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); } __base_diagonals_sp_global_ptr[__global_idx] = __sp; From 6f54078b7c9a6f899e07736a095d3710ea13c4d8 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 4 Dec 2024 11:08:03 +0100 Subject: [PATCH 25/76] Fix an error: the life time of storage with split points on base diagonals is too short Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 13 ++++++++----- .../pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 8 +++++++- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 50248c2cf16..d1cd047cdf2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -307,7 +307,9 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M __comp); }); }); - return __future(__event); + // We should return the same thing in the second param of __future for compatibility + // with the returning value in __parallel_merge_submitter_large::operator() + return __future(__event, __result_and_scratch_storage_base_ptr{}); } }; @@ -458,15 +460,16 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; - __base_diagonals_sp_storage_t __base_diagonals_sp_global_storage{__exec, 0, __nd_range_params.base_diag_count + 1}; + auto __p_base_diagonals_sp_global_storage = new __base_diagonals_sp_storage_t(__exec, 0, __nd_range_params.base_diag_count + 1); + __result_and_scratch_storage_base_ptr 
__p_result_and_scratch_storage_base(static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); // Calculation of split points on each base diagonal - sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage); + sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); // Merge data using split points on each base diagonal - __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, __base_diagonals_sp_global_storage); + __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); - return __future(__event); + return __future(__event, std::move(__p_result_and_scratch_storage_base)); } }; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index f4eb557170e..e66e8c28089 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -518,8 +518,14 @@ struct __usm_or_buffer_accessor } }; +struct __result_and_scratch_storage_base +{ + virtual ~__result_and_scratch_storage_base() = default; +}; +using __result_and_scratch_storage_base_ptr = std::shared_ptr<__result_and_scratch_storage_base>; + template -struct __result_and_scratch_storage +struct __result_and_scratch_storage : __result_and_scratch_storage_base { private: using __sycl_buffer_t = sycl::buffer<_T, 1>; From 4292c6c89478e71532395a576c546fdc8d54c278 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 4 Dec 2024 17:20:43 +0100 Subject: [PATCH 26/76] Combine two submitters `__parallel_merge_submitter` and `__parallel_merge_submitter_large` into one `__parallel_merge_submitter` (#1956) --- 
.../dpcpp/parallel_backend_sycl_merge.h | 174 +++++++++--------- 1 file changed, 87 insertions(+), 87 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index d1cd047cdf2..d0379b07c99 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -273,53 +273,14 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ } } -// Please see the comment for __parallel_for_submitter for optional kernel name explanation -template +template struct __parallel_merge_submitter; -template -struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> -{ - template - auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; - - assert(__n1 > 0 || __n2 > 0); - - _PRINT_INFO_IN_DEBUG_MODE(__exec); - - // Empirical number of values to process per work-item - const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; - - const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - - auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__steps), [=](sycl::item __item_id) { - const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); - }); - }); - // We should return the same thing in the second param of __future for compatibility - // with the returning value in __parallel_merge_submitter_large::operator() - return __future(__event, __result_and_scratch_storage_base_ptr{}); - } -}; - -template -struct __parallel_merge_submitter_large; - -template -struct __parallel_merge_submitter_large<_IdType, _CustomName, +template +struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, - __internal::__optional_kernel_name<_MergeKernelName...>> + __internal::__optional_kernel_name<_MergeKernelName1...>, + __internal::__optional_kernel_name<_MergeKernelName2...>> { protected: @@ -351,8 +312,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : __data_items_in_slm_bank; const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - const _IdType __base_diag_count = 1'024 * 32; - const _IdType __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); + const _IdType __base_diag_count = __use_base_diags ? 32 * 1'024 : 0; + const _IdType __steps_between_two_base_diags = __use_base_diags ? 
oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count) : 0; return { __base_diag_count, __steps_between_two_base_diags, __chunk, __steps }; } @@ -394,6 +355,33 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, return __event; } + // Process parallel merge + template + sycl::event + run_parallel_merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, + const nd_range_params& __nd_range_params) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + + const auto __chunk = __nd_range_params.chunk; + + sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + + __cgh.parallel_for<_MergeKernelName1...>( + sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + const _IdType __i_elem = __global_idx * __chunk; + + _split_point_t<_IdType> __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, __comp); + }); + }); + + return __event; + } + // Process parallel merge template @@ -412,7 +400,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __cgh.depends_on(__event); - __cgh.parallel_for<_MergeKernelName...>( + __cgh.parallel_for<_MergeKernelName2...>( sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); const _IdType __i_elem = __global_idx * __nd_range_params.chunk; @@ -437,8 +425,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; } - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __nd_range_params.chunk, __n1, __n2, - __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, 
__nd_range_params.chunk, __n1, __n2, __comp); }); }); @@ -447,6 +434,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, public: + __parallel_merge_submitter(bool __use_base_diags) + : __use_base_diags(__use_base_diags) + { + } + template auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const @@ -458,29 +450,43 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Calculate nd-range params const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); - // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) - using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>; - auto __p_base_diagonals_sp_global_storage = new __base_diagonals_sp_storage_t(__exec, 0, __nd_range_params.base_diag_count + 1); - __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base(static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); + __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base; // Calculation of split points on each base diagonal - sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); + sycl::event __event; + if (__use_base_diags) + { + // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + auto __p_base_diagonals_sp_global_storage = new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>(__exec, 0, __nd_range_params.base_diag_count + 1); + __p_result_and_scratch_storage_base.reset(static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); - // Merge data using split points on each base diagonal - __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, 
__nd_range_params, *__p_base_diagonals_sp_global_storage); + __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); - return __future(__event, std::move(__p_result_and_scratch_storage_base)); + // Merge data using split points on each base diagonal + __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); + } + else + { + // Merge data using split points on each base diagonal + __event = run_parallel_merge(__exec, __rng1, __rng2, __rng3, __comp, __nd_range_params); + } + + return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); } + +private: + + const bool __use_base_diags = false; }; template -class __merge_kernel_name; +class __merge_kernel_name1; template -class __diagonals_kernel_name; +class __merge_kernel_name2; template -class __merge_kernel_name_large; +class __diagonals_kernel_name; template auto @@ -489,42 +495,36 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + const std::size_t __n = __rng1.size() + __rng2.size(); + constexpr std::size_t __starting_size_limit_for_large_submitter = 4 * 1'048'576; // 4 MB + const bool __use_base_diags = __n >= __starting_size_limit_for_large_submitter; - const std::size_t __n = __rng1.size() + __rng2.size(); - if (__n < __starting_size_limit_for_large_submitter) + if (__n <= std::numeric_limits::max()) { using _WiIndex = std::uint32_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName1 
= oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name1<_CustomName, _WiIndex>>; + using _MergeKernelName2 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name2<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName1, _MergeKernelName2>(__use_base_diags)( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } else { - if (__n <= std::numeric_limits::max()) - { - using _WiIndex = std::uint32_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - else - { - using _WiIndex = std::uint64_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } + using _WiIndex = std::uint64_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using 
_MergeKernelName1 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name1<_CustomName, _WiIndex>>; + using _MergeKernelName2 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name2<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName1, _MergeKernelName2>(__use_base_diags)( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); } } From 6ad8170b7fd4e48e9af20fabf9317937d7a84099 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 4 Dec 2024 18:32:58 +0100 Subject: [PATCH 27/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: remove extra condition check from __find_start_point_in Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 98 +++++++++---------- 1 file changed, 44 insertions(+), 54 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index d0379b07c99..fb19b08a609 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -107,7 +107,9 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn if constexpr (!std::is_pointer_v<_Rng2>) assert(__rng2_to <= __rng2.size()); - assert(__i_elem >= 0); + // We shouldn't call this function with __i_elem == 0 because we a priory know that + // split point for this case is {0, 0} + assert(__i_elem > 0); // ----------------------- EXAMPLE ------------------------ // Let's consider the following input data: @@ -150,76 +152,64 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn // - where for every comparing pairs idx(rng1) + idx(rng2) == i_diag - 1 
//////////////////////////////////////////////////////////////////////////////////// - // Process the corner case: for the first diagonal with the index 0 split point - // is equal to (0, 0) regardless of the size and content of the data. - if (__i_elem > 0) - { - //////////////////////////////////////////////////////////////////////////////////// - // Taking into account the specified constraints of the range of processed data - const auto __index_sum = __i_elem - 1; + // Taking into account the specified constraints of the range of processed data + const auto __index_sum = __i_elem - 1; - using _IndexSigned = std::make_signed_t<_Index>; + using _IndexSigned = std::make_signed_t<_Index>; - _IndexSigned idx1_from = __rng1_from; - _IndexSigned idx1_to = __rng1_to; - assert(idx1_from <= idx1_to); + _IndexSigned idx1_from = __rng1_from; + _IndexSigned idx1_to = __rng1_to; + assert(idx1_from <= idx1_to); - _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); - _IndexSigned idx2_to = __index_sum - __rng1_from + 1; - assert(idx2_from <= idx2_to); + _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); + _IndexSigned idx2_to = __index_sum - __rng1_from + 1; + assert(idx2_from <= idx2_to); - const _IndexSigned idx2_from_diff = - idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; - const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? idx2_to - (_IndexSigned)__rng2_to : 0; + const _IndexSigned idx2_from_diff = + idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; + const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? 
idx2_to - (_IndexSigned)__rng2_to : 0; - idx1_to -= idx2_from_diff; - idx1_from += idx2_to_diff; + idx1_to -= idx2_from_diff; + idx1_from += idx2_to_diff; - idx2_from = __index_sum - (idx1_to - 1); - idx2_to = __index_sum - idx1_from + 1; + idx2_from = __index_sum - (idx1_to - 1); + idx2_to = __index_sum - idx1_from + 1; - assert(idx1_from <= idx1_to); - assert(__rng1_from <= idx1_from && idx1_to <= __rng1_to); + assert(idx1_from <= idx1_to); + assert(__rng1_from <= idx1_from && idx1_to <= __rng1_to); - assert(idx2_from <= idx2_to); - assert(__rng2_from <= idx2_from && idx2_to <= __rng2_to); + assert(idx2_from <= idx2_to); + assert(__rng2_from <= idx2_from && idx2_to <= __rng2_to); - //////////////////////////////////////////////////////////////////////////////////// - // Run search of split point on diagonal + //////////////////////////////////////////////////////////////////////////////////// + // Run search of split point on diagonal - using __it_t = oneapi::dpl::counting_iterator<_Index>; + using __it_t = oneapi::dpl::counting_iterator<_Index>; - __it_t __diag_it_begin(idx1_from); - __it_t __diag_it_end(idx1_to); + __it_t __diag_it_begin(idx1_from); + __it_t __diag_it_end(idx1_to); - constexpr int kValue = 1; - const __it_t __res = - std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { - const auto __rng1_idx = __idx; - const auto __rng2_idx = __index_sum - __idx; + constexpr int kValue = 1; + const __it_t __res = + std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { + const auto __rng1_idx = __idx; + const auto __rng2_idx = __index_sum - __idx; - assert(__rng1_from <= __rng1_idx && __rng1_idx < __rng1_to); - assert(__rng2_from <= __rng2_idx && __rng2_idx < __rng2_to); - assert(__rng1_idx + __rng2_idx == __index_sum); + assert(__rng1_from <= __rng1_idx && __rng1_idx < __rng1_to); + assert(__rng2_from <= __rng2_idx && __rng2_idx < __rng2_to); + assert(__rng1_idx + 
__rng2_idx == __index_sum); - const auto __zero_or_one = __comp(__rng2[__rng2_idx], __rng1[__rng1_idx]); - return __zero_or_one < kValue; - }); + const auto __zero_or_one = __comp(__rng2[__rng2_idx], __rng1[__rng1_idx]); + return __zero_or_one < kValue; + }); - const _split_point_t<_Index> __result{ *__res, __index_sum - *__res + 1 }; - assert(__result.first + __result.second == __i_elem); + const _split_point_t<_Index> __result{ *__res, __index_sum - *__res + 1 }; + assert(__result.first + __result.second == __i_elem); - assert(__rng1_from <= __result.first && __result.first <= __rng1_to); - assert(__rng2_from <= __result.second && __result.second <= __rng2_to); + assert(__rng1_from <= __result.first && __result.first <= __rng1_to); + assert(__rng2_from <= __result.second && __result.second <= __rng2_to); - return __result; - } - else - { - assert(__rng1_from == 0); - assert(__rng2_from == 0); - return { __rng1_from, __rng2_from }; - } + return __result; } // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing From 6dd39e7f43907d460340d5175d06b0af52116661 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 4 Dec 2024 18:39:19 +0100 Subject: [PATCH 28/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: fix condition check in __find_start_point_in Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index fb19b08a609..296db3927f4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -97,15 +97,8 @@ _split_point_t<_Index> __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& 
__rng2, const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) { - assert(__rng1_from <= __rng1_to); - assert(__rng2_from <= __rng2_to); - - assert(__rng1_to > 0 || __rng2_to > 0); - - if constexpr (!std::is_pointer_v<_Rng1>) - assert(__rng1_to <= __rng1.size()); - if constexpr (!std::is_pointer_v<_Rng2>) - assert(__rng2_to <= __rng2.size()); + assert(0 <= __rng1_from && __rng1_from < __rng1_to && __rng1_to < __rng1.size()); + assert(0 <= __rng2_from && __rng2_from < __rng2_to && __rng2_to < __rng2.size()); // We shouldn't call this function with __i_elem == 0 because we a priory know that // split point for this case is {0, 0} From 1b7de915049ab236888b1f8f38e08ef6426b272e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 4 Dec 2024 18:43:56 +0100 Subject: [PATCH 29/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - apply GitHUB clang format Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 98 +++++++++++-------- 1 file changed, 55 insertions(+), 43 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 296db3927f4..5ce69d3f342 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -196,7 +196,7 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn return __zero_or_one < kValue; }); - const _split_point_t<_Index> __result{ *__res, __index_sum - *__res + 1 }; + const _split_point_t<_Index> __result{*__res, __index_sum - *__res + 1}; assert(__result.first + __result.second == __i_elem); assert(__rng1_from <= __result.first && __result.first <= __rng1_to); @@ -256,23 +256,23 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ } } -template +template struct __parallel_merge_submitter; -template -struct 
__parallel_merge_submitter<_IdType, _CustomName, - __internal::__optional_kernel_name<_DiagonalsKernelName...>, - __internal::__optional_kernel_name<_MergeKernelName1...>, - __internal::__optional_kernel_name<_MergeKernelName2...>> +template +struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, + __internal::__optional_kernel_name<_MergeKernelName1...>, + __internal::__optional_kernel_name<_MergeKernelName2...>> { -protected: - + protected: struct nd_range_params { - std::size_t base_diag_count = 0; - std::size_t steps_between_two_base_diags = 0; + std::size_t base_diag_count = 0; + std::size_t steps_between_two_base_diags = 0; std::uint8_t chunk = 0; - _IdType steps = 0; + _IdType steps = 0; }; // Calculate nd-range params @@ -282,23 +282,26 @@ struct __parallel_merge_submitter<_IdType, _CustomName, { using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; - using _RangeValueType = std::conditional_t<(sizeof(_Range1ValueType) > sizeof(_Range2ValueType)), _Range1ValueType, _Range2ValueType>; + using _RangeValueType = std::conditional_t<(sizeof(_Range1ValueType) > sizeof(_Range2ValueType)), + _Range1ValueType, _Range2ValueType>; const std::size_t __n = __rng1.size() + __rng2.size(); - constexpr std::size_t __slm_bank_size = 16; // TODO is it correct value? How to get it from hardware? + constexpr std::size_t __slm_bank_size = 16; // TODO is it correct value? How to get it from hardware? 
// Calculate how many data items we can read into one SLM bank - constexpr std::size_t __data_items_in_slm_bank = oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); + constexpr std::size_t __data_items_in_slm_bank = + oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); // Empirical number of values to process per work-item const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : __data_items_in_slm_bank; const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); const _IdType __base_diag_count = __use_base_diags ? 32 * 1'024 : 0; - const _IdType __steps_between_two_base_diags = __use_base_diags ? oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count) : 0; + const _IdType __steps_between_two_base_diags = + __use_base_diags ? oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count) : 0; - return { __base_diag_count, __steps_between_two_base_diags, __chunk, __steps }; + return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; } // Calculation of split points on each base diagonal @@ -314,19 +317,23 @@ struct __parallel_merge_submitter<_IdType, _CustomName, sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); - auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc( - __cgh, __dpl_sycl::__no_init{}); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc( + __cgh, __dpl_sycl::__no_init{}); __cgh.parallel_for<_DiagonalsKernelName...>( sycl::range(__nd_range_params.base_diag_count + 1), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); - auto __base_diagonals_sp_global_ptr = _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __base_diagonals_sp_global_ptr = + 
_Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - _split_point_t<_IdType> __sp = __global_idx == 0 ? _split_point_t<_IdType>{ 0, 0 } : _split_point_t<_IdType>{ __n1, __n2 }; + _split_point_t<_IdType> __sp = + __global_idx == 0 ? _split_point_t<_IdType>{0, 0} : _split_point_t<_IdType>{__n1, __n2}; if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) { - const _IdType __i_elem = __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; + const _IdType __i_elem = + __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; if (__i_elem < __n) __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); } @@ -358,7 +365,8 @@ struct __parallel_merge_submitter<_IdType, _CustomName, const _IdType __i_elem = __global_idx * __chunk; _split_point_t<_IdType> __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); }); }); @@ -369,9 +377,8 @@ struct __parallel_merge_submitter<_IdType, _CustomName, template sycl::event - run_parallel_merge(sycl::event __event, - _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, - const nd_range_params& __nd_range_params, + run_parallel_merge(sycl::event __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, + _Range3&& __rng3, _Compare __comp, const nd_range_params& __nd_range_params, const _Storage& __base_diagonals_sp_global_storage) const { const _IdType __n1 = __rng1.size(); @@ -379,7 +386,8 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - auto __base_diagonals_sp_global_acc 
= __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); __cgh.depends_on(__event); @@ -388,7 +396,8 @@ struct __parallel_merge_submitter<_IdType, _CustomName, auto __global_idx = __item_id.get_linear_id(); const _IdType __i_elem = __global_idx * __nd_range_params.chunk; - auto __base_diagonals_sp_global_ptr = _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); auto __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; _split_point_t<_IdType> __start; @@ -408,19 +417,16 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; } - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __nd_range_params.chunk, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, + __nd_range_params.chunk, __n1, __n2, __comp); }); }); return __event; } -public: - - __parallel_merge_submitter(bool __use_base_diags) - : __use_base_diags(__use_base_diags) - { - } + public: + __parallel_merge_submitter(bool __use_base_diags) : __use_base_diags(__use_base_diags) {} template auto @@ -440,13 +446,18 @@ struct __parallel_merge_submitter<_IdType, _CustomName, if (__use_base_diags) { // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) - auto __p_base_diagonals_sp_global_storage = new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>(__exec, 0, __nd_range_params.base_diag_count + 1); - __p_result_and_scratch_storage_base.reset(static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); + auto __p_base_diagonals_sp_global_storage = + new 
__result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( + __exec, 0, __nd_range_params.base_diag_count + 1); + __p_result_and_scratch_storage_base.reset( + static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); - __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); + __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); // Merge data using split points on each base diagonal - __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); + __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); } else { @@ -457,8 +468,7 @@ struct __parallel_merge_submitter<_IdType, _CustomName, return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); } -private: - + private: const bool __use_base_diags = false; }; @@ -492,7 +502,8 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy __merge_kernel_name1<_CustomName, _WiIndex>>; using _MergeKernelName2 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name2<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName1, _MergeKernelName2>(__use_base_diags)( + return __parallel_merge_submitter<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName1, + _MergeKernelName2>(__use_base_diags)( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } @@ -505,7 +516,8 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy __merge_kernel_name1<_CustomName, _WiIndex>>; using _MergeKernelName2 = 
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name2<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName1, _MergeKernelName2>(__use_base_diags)( + return __parallel_merge_submitter<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName1, + _MergeKernelName2>(__use_base_diags)( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } From 8e50bbfa10da9810efa06255dc154e0c4f4e6fb2 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 4 Dec 2024 19:44:38 +0100 Subject: [PATCH 30/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix compile error in sort.pass.cpp Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 5ce69d3f342..a583863e65a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -97,8 +97,16 @@ _split_point_t<_Index> __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) { - assert(0 <= __rng1_from && __rng1_from < __rng1_to && __rng1_to < __rng1.size()); - assert(0 <= __rng2_from && __rng2_from < __rng2_to && __rng2_to < __rng2.size()); + if constexpr (!std::is_pointer_v<_Rng1> && !std::is_pointer_v<_Rng2>) + { + assert(0 <= __rng1_from && __rng1_from < __rng1_to && __rng1_to < __rng1.size()); + assert(0 <= __rng2_from && __rng2_from < __rng2_to && __rng2_to < __rng2.size()); + } + else + { + assert(0 <= __rng1_from && __rng1_from < 
__rng1_to); + assert(0 <= __rng2_from && __rng2_from < __rng2_to); + } // We shouldn't call this function with __i_elem == 0 because we a priory know that // split point for this case is {0, 0} From f0ea19df47f4b2ae1402fad9d7a0d38f47560f3e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 6 Dec 2024 18:24:46 +0100 Subject: [PATCH 31/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - processing additional corner cases in __find_start_point_in Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index a583863e65a..491bc467ee5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -97,16 +97,16 @@ _split_point_t<_Index> __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) { - if constexpr (!std::is_pointer_v<_Rng1> && !std::is_pointer_v<_Rng2>) - { - assert(0 <= __rng1_from && __rng1_from < __rng1_to && __rng1_to < __rng1.size()); - assert(0 <= __rng2_from && __rng2_from < __rng2_to && __rng2_to < __rng2.size()); - } - else - { - assert(0 <= __rng1_from && __rng1_from < __rng1_to); - assert(0 <= __rng2_from && __rng2_from < __rng2_to); - } + assert(__rng1_from + __rng2_from <= __i_elem && __i_elem <= __rng1_to + __rng2_to); + + if (__i_elem == 0) + return _split_point_t<_Index>{ 0, 0 }; + + if (__rng1_from == __rng1_to) + return _split_point_t<_Index>{ __rng1_from, __rng2_from + __i_elem }; + + if (__rng2_from == __rng2_to) + return _split_point_t<_Index>{ __rng1_from + __i_elem, __rng2_to }; // We shouldn't call this function with __i_elem == 0 because we a 
priory know that // split point for this case is {0, 0} From f327800a342818dcf5ac5e287b836f775b064ed0 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 6 Dec 2024 18:25:18 +0100 Subject: [PATCH 32/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix an error in run_parallel_merge Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 491bc467ee5..9d352625411 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -372,9 +372,12 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k auto __global_idx = __item_id.get_linear_id(); const _IdType __i_elem = __global_idx * __chunk; - _split_point_t<_IdType> __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); + if (__i_elem < __n1 + __n2) + { + _split_point_t<_IdType> __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + } }); }); From 53def33cae31bbe7e372405bb1b7f365708c83e3 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 6 Dec 2024 19:33:43 +0100 Subject: [PATCH 33/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: remove assert calls from Kernel code Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 26 ------------------- 1 file changed, 26 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 9d352625411..dc9da9cc0b3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -97,8 +97,6 @@ _split_point_t<_Index> __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) { - assert(__rng1_from + __rng2_from <= __i_elem && __i_elem <= __rng1_to + __rng2_to); - if (__i_elem == 0) return _split_point_t<_Index>{ 0, 0 }; @@ -108,10 +106,6 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn if (__rng2_from == __rng2_to) return _split_point_t<_Index>{ __rng1_from + __i_elem, __rng2_to }; - // We shouldn't call this function with __i_elem == 0 because we a priory know that - // split point for this case is {0, 0} - assert(__i_elem > 0); - // ----------------------- EXAMPLE ------------------------ // Let's consider the following input data: // rng1.size() = 10 @@ -160,11 +154,9 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn _IndexSigned idx1_from = __rng1_from; _IndexSigned idx1_to = __rng1_to; - assert(idx1_from <= idx1_to); _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); _IndexSigned idx2_to = __index_sum - __rng1_from + 1; - assert(idx2_from <= idx2_to); const _IndexSigned idx2_from_diff = idx2_from < (_IndexSigned)__rng2_from ? 
(_IndexSigned)__rng2_from - idx2_from : 0; @@ -176,12 +168,6 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn idx2_from = __index_sum - (idx1_to - 1); idx2_to = __index_sum - idx1_from + 1; - assert(idx1_from <= idx1_to); - assert(__rng1_from <= idx1_from && idx1_to <= __rng1_to); - - assert(idx2_from <= idx2_to); - assert(__rng2_from <= idx2_from && idx2_to <= __rng2_to); - //////////////////////////////////////////////////////////////////////////////////// // Run search of split point on diagonal @@ -196,20 +182,11 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn const auto __rng1_idx = __idx; const auto __rng2_idx = __index_sum - __idx; - assert(__rng1_from <= __rng1_idx && __rng1_idx < __rng1_to); - assert(__rng2_from <= __rng2_idx && __rng2_idx < __rng2_to); - assert(__rng1_idx + __rng2_idx == __index_sum); - const auto __zero_or_one = __comp(__rng2[__rng2_idx], __rng1[__rng1_idx]); return __zero_or_one < kValue; }); const _split_point_t<_Index> __result{*__res, __index_sum - *__res + 1}; - assert(__result.first + __result.second == __i_elem); - - assert(__rng1_from <= __result.first && __result.first <= __rng1_to); - assert(__rng2_from <= __result.second && __result.second <= __rng2_to); - return __result; } @@ -414,9 +391,6 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k _split_point_t<_IdType> __start; if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) { - // Check that we fit into size of scratch - assert(__diagonal_idx + 1 < __nd_range_params.base_diag_count + 1); - const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; From d8d6e7419b19f084b734ebd6d7dd9417d60019df Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 6 Dec 2024 19:34:57 +0100 Subject: [PATCH 34/76] 
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove extra local variables in __find_start_point_in Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index dc9da9cc0b3..8cd262a4be2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -179,15 +179,11 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn constexpr int kValue = 1; const __it_t __res = std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { - const auto __rng1_idx = __idx; - const auto __rng2_idx = __index_sum - __idx; - - const auto __zero_or_one = __comp(__rng2[__rng2_idx], __rng1[__rng1_idx]); + const auto __zero_or_one = __comp(__rng2[__index_sum - __idx], __rng1[__idx]); return __zero_or_one < kValue; }); - const _split_point_t<_Index> __result{*__res, __index_sum - *__res + 1}; - return __result; + return {*__res, __index_sum - *__res + 1}; } // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing From f67503f555557357c3357334783adf91bdabeb82 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sun, 8 Dec 2024 15:03:38 +0100 Subject: [PATCH 35/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - processing additional corner cases in __find_start_point_in Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 8cd262a4be2..be1048d81b5 100644 
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -104,7 +104,7 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn return _split_point_t<_Index>{ __rng1_from, __rng2_from + __i_elem }; if (__rng2_from == __rng2_to) - return _split_point_t<_Index>{ __rng1_from + __i_elem, __rng2_to }; + return _split_point_t<_Index>{ __rng1_from + __i_elem, __rng2_from }; // ----------------------- EXAMPLE ------------------------ // Let's consider the following input data: From 3089f711d20e084006a35d5e4adbe9d8233d2fdc Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 10 Dec 2024 09:28:22 +0100 Subject: [PATCH 36/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove extra condition checks from __find_start_point_in Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index be1048d81b5..ed4c7df1ded 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -97,15 +97,6 @@ _split_point_t<_Index> __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) { - if (__i_elem == 0) - return _split_point_t<_Index>{ 0, 0 }; - - if (__rng1_from == __rng1_to) - return _split_point_t<_Index>{ __rng1_from, __rng2_from + __i_elem }; - - if (__rng2_from == __rng2_to) - return _split_point_t<_Index>{ __rng1_from + __i_elem, __rng2_from }; - // ----------------------- EXAMPLE ------------------------ // Let's consider the following input data: // rng1.size() = 10 From 
c033585f16f41a8466c62fb62e2565b168eed3d1 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 10:02:16 +0100 Subject: [PATCH 37/76] Revert: Combine two submitters `__parallel_merge_submitter` and `__parallel_merge_submitter_large` into one `__parallel_merge_submitter` (#1956) Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 155 ++++++++++-------- 1 file changed, 91 insertions(+), 64 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index ed4c7df1ded..08ba09098d5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -228,15 +228,53 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ } } -template +// Please see the comment for __parallel_for_submitter for optional kernel name explanation +template struct __parallel_merge_submitter; -template -struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, - __internal::__optional_kernel_name<_MergeKernelName1...>, - __internal::__optional_kernel_name<_MergeKernelName2...>> +template +struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + assert(__n1 > 0 || __n2 > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Empirical number of values to process per work-item + const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; + + const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + + auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__steps), [=](sycl::item __item_id) { + const _IdType __i_elem = __item_id.get_linear_id() * __chunk; + const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + }); + }); + // We should return the same thing in the second param of __future for compatibility + // with the returning value in __parallel_merge_submitter_large::operator() + return __future(__event, __result_and_scratch_storage_base_ptr{}); + } +}; + +template +struct __parallel_merge_submitter_large; + +template +struct __parallel_merge_submitter_large<_IdType, _CustomName, + __internal::__optional_kernel_name<_DiagonalsKernelName...>, + __internal::__optional_kernel_name<_MergeKernelName...>> { protected: struct nd_range_params @@ -269,9 +307,9 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : __data_items_in_slm_bank; const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - const _IdType __base_diag_count = __use_base_diags ? 32 * 1'024 : 0; + const _IdType __base_diag_count = 32 * 1'024; const _IdType __steps_between_two_base_diags = - __use_base_diags ? 
oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count) : 0; + oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; } @@ -331,7 +369,7 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_MergeKernelName1...>( + __cgh.parallel_for<_MergeKernelName...>( sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); const _IdType __i_elem = __global_idx * __chunk; @@ -366,7 +404,7 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k __cgh.depends_on(__event); - __cgh.parallel_for<_MergeKernelName2...>( + __cgh.parallel_for<_MergeKernelName...>( sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); const _IdType __i_elem = __global_idx * __nd_range_params.chunk; @@ -398,8 +436,6 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k } public: - __parallel_merge_submitter(bool __use_base_diags) : __use_base_diags(__use_base_diags) {} - template auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const @@ -413,42 +449,29 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base; - // Calculation of split points on each base diagonal - sycl::event __event; - if (__use_base_diags) - { - // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) - auto __p_base_diagonals_sp_global_storage = - new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( - __exec, 0, 
__nd_range_params.base_diag_count + 1); - __p_result_and_scratch_storage_base.reset( - static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); - - __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); - - // Merge data using split points on each base diagonal - __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); - } - else - { - // Merge data using split points on each base diagonal - __event = run_parallel_merge(__exec, __rng1, __rng2, __rng3, __comp, __nd_range_params); - } + // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + auto __p_base_diagonals_sp_global_storage = + new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( + __exec, 0, __nd_range_params.base_diag_count + 1); + __p_result_and_scratch_storage_base.reset( + static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); + + sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); + + // Merge data using split points on each base diagonal + __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); } - - private: - const bool __use_base_diags = false; }; template -class __merge_kernel_name1; +class __merge_kernel_name; template -class __merge_kernel_name2; +class __merge_kernel_name_large; template class __diagonals_kernel_name; @@ -460,38 +483,42 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - const std::size_t __n = 
__rng1.size() + __rng2.size(); - constexpr std::size_t __starting_size_limit_for_large_submitter = 4 * 1'048'576; // 4 MB - const bool __use_base_diags = __n >= __starting_size_limit_for_large_submitter; - if (__n <= std::numeric_limits::max()) + const std::size_t __n = __rng1.size() + __rng2.size(); + if (__n < __starting_size_limit_for_large_submitter) { using _WiIndex = std::uint32_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName1 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name1<_CustomName, _WiIndex>>; - using _MergeKernelName2 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name2<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName1, - _MergeKernelName2>(__use_base_diags)( + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } else { - using _WiIndex = std::uint64_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName1 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name1<_CustomName, _WiIndex>>; - using _MergeKernelName2 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name2<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName1, - _MergeKernelName2>(__use_base_diags)( - 
std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); + if (__n <= std::numeric_limits::max()) + { + using _WiIndex = std::uint32_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + using _WiIndex = std::uint64_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } } } From a06ac54f4a97195a477e8694ee9e834618d4119b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 10:05:15 +0100 Subject: [PATCH 38/76] Call __find_start_point_in instead of __find_start_point in the __parallel_merge_submitter_large::run_parallel_merge Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 08ba09098d5..56b60cb6bd2 
100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -376,7 +376,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, if (__i_elem < __n1 + __n2) { - _split_point_t<_IdType> __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + _split_point_t<_IdType> __start = __find_start_point_in(__rng1, (_IdType)0, __n1, __rng2, (_IdType)0, __n2, __i_elem, __comp); __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, __comp); } From c96cccfbe47c3196ce4e0d79651cbce24bb6b763 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 10:25:45 +0100 Subject: [PATCH 39/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: I would use std::pair<_Index> directly here. Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 56b60cb6bd2..8d0c4527d6e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -174,7 +174,7 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn return __zero_or_one < kValue; }); - return {*__res, __index_sum - *__res + 1}; + return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; } // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing From 2d8f480c0ea5681f9a5a36775ab0b722be3346ad Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 11:15:38 +0100 Subject: [PATCH 40/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix performance degradation for 8 Mb 
int type Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 1066 +++++++++-------- 1 file changed, 537 insertions(+), 529 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 8d0c4527d6e..895253a151e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -1,529 +1,537 @@ -// -*- C++ -*- -//===-- parallel_backend_sycl_merge.h --------------------------------===// -// -// Copyright (C) Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// This file incorporates work covered by the following copyright and permission -// notice: -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// -//===----------------------------------------------------------------------===// - -#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H -#define _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H - -#include // std::numeric_limits -#include // assert -#include // std::uint8_t, ... -#include // std::make_pair, std::forward -#include // std::min, std::lower_bound - -#include "sycl_defs.h" -#include "parallel_backend_sycl_utils.h" - -namespace oneapi -{ -namespace dpl -{ -namespace __par_backend_hetero -{ -template -using _split_point_t = std::pair<_Index, _Index>; - -//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges -//to serial merge. 
For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: -// 0 1 1 2 3 -// ------------------ -// |---> -// 0 | 0 | 1 1 1 1 -// | | -// 0 | 0 | 1 1 1 1 -// | ----------> -// 2 | 0 0 0 0 | 1 -// | ----> -// 3 | 0 0 0 0 0 | -template -auto -__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, - const _Index __n2, _Compare __comp) -{ - //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 1, 1] - oneapi::dpl::counting_iterator<_Index> __diag_it(0); - - if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed - { - const _Index __q = __i_elem; //diagonal index - const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); - return __zero_or_one < __value; - }); - return std::make_pair(*__res, __q - *__res); - } - else - { - const _Index __q = __i_elem - __n2; //diagonal index - const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); - return __zero_or_one < __value; - }); - return std::make_pair(__q + *__res, __n2 - *__res); - } -} - -//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges -//to serial merge. 
For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: -// 0 1 1 2 3 -// ------------------ -// |---> -// 0 | 0 | 1 1 1 1 -// | | -// 0 | 0 | 1 1 1 1 -// | ----------> -// 2 | 0 0 0 0 | 1 -// | ----> -// 3 | 0 0 0 0 0 | -template -_split_point_t<_Index> -__find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, - const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) -{ - // ----------------------- EXAMPLE ------------------------ - // Let's consider the following input data: - // rng1.size() = 10 - // rng2.size() = 6 - // i_diag = 9 - // Let's define the following ranges for processing: - // rng1: [3, ..., 9) -> __rng1_from = 3, __rng1_to = 9 - // rng2: [1, ..., 4) -> __rng2_from = 1, __rng2_to = 4 - // - // The goal: required to process only X' items of the merge matrix - // as intersection of rng1[3, ..., 9) and rng2[1, ..., 4) - // - // -------------------------------------------------------- - // - // __diag_it_begin(rng1) __diag_it_end(rng1) - // (init state) (dest state) (init state, dest state) - // | | | - // V V V - // + + + + + + - // \ rng1 0 1 2 3 4 5 6 7 8 9 - // rng2 +--------------------------------------+ - // 0 | ^ ^ ^ X | <--- __diag_it_end(rng2) (init state) - // + 1 | <----------------- + + X'2 ^ | <--- __diag_it_end(rng2) (dest state) - // + 2 | <----------------- + X'1 | | - // + 3 | <----------------- X'0 | | <--- __diag_it_begin(rng2) (dest state) - // 4 | X ^ | | - // 5 | X | | | <--- __diag_it_begin(rng2) (init state) - // +-------AX-----------+-----------+-----+ - // AX | | - // AX | | - // Run lower_bound:[from = 5, to = 8) - // - // AX - absent items in rng2 - // - // We have three points on diagonal for call comparison: - // X'0 : call __comp(rng1[5], rng2[3]) // 5 + 3 == 9 - 1 == 8 - // X'1 : call __comp(rng1[6], rng2[2]) // 6 + 2 == 9 - 1 == 8 - // X'3 : call __comp(rng1[7], rng2[1]) // 7 + 1 == 9 - 1 == 8 - // - where for every 
comparing pairs idx(rng1) + idx(rng2) == i_diag - 1 - - //////////////////////////////////////////////////////////////////////////////////// - // Taking into account the specified constraints of the range of processed data - const auto __index_sum = __i_elem - 1; - - using _IndexSigned = std::make_signed_t<_Index>; - - _IndexSigned idx1_from = __rng1_from; - _IndexSigned idx1_to = __rng1_to; - - _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); - _IndexSigned idx2_to = __index_sum - __rng1_from + 1; - - const _IndexSigned idx2_from_diff = - idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; - const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? idx2_to - (_IndexSigned)__rng2_to : 0; - - idx1_to -= idx2_from_diff; - idx1_from += idx2_to_diff; - - idx2_from = __index_sum - (idx1_to - 1); - idx2_to = __index_sum - idx1_from + 1; - - //////////////////////////////////////////////////////////////////////////////////// - // Run search of split point on diagonal - - using __it_t = oneapi::dpl::counting_iterator<_Index>; - - __it_t __diag_it_begin(idx1_from); - __it_t __diag_it_end(idx1_to); - - constexpr int kValue = 1; - const __it_t __res = - std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { - const auto __zero_or_one = __comp(__rng2[__index_sum - __idx], __rng1[__idx]); - return __zero_or_one < kValue; - }); - - return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; -} - -// Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing -// to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) -template -void -__serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index __start1, _Index __start2, - const _Index __start3, const std::uint8_t __chunk, const _Index __n1, const _Index __n2, _Compare __comp) -{ - if (__start1 >= __n1) - { - 
//copying a residual of the second seq - const _Index __n = std::min<_Index>(__n2 - __start2, __chunk); - for (std::uint8_t __i = 0; __i < __n; ++__i) - __rng3[__start3 + __i] = __rng2[__start2 + __i]; - } - else if (__start2 >= __n2) - { - //copying a residual of the first seq - const _Index __n = std::min<_Index>(__n1 - __start1, __chunk); - for (std::uint8_t __i = 0; __i < __n; ++__i) - __rng3[__start3 + __i] = __rng1[__start1 + __i]; - } - else - { - for (std::uint8_t __i = 0; __i < __chunk && __start1 < __n1 && __start2 < __n2; ++__i) - { - const auto& __val1 = __rng1[__start1]; - const auto& __val2 = __rng2[__start2]; - if (__comp(__val2, __val1)) - { - __rng3[__start3 + __i] = __val2; - if (++__start2 == __n2) - { - //copying a residual of the first seq - for (++__i; __i < __chunk && __start1 < __n1; ++__i, ++__start1) - __rng3[__start3 + __i] = __rng1[__start1]; - } - } - else - { - __rng3[__start3 + __i] = __val1; - if (++__start1 == __n1) - { - //copying a residual of the second seq - for (++__i; __i < __chunk && __start2 < __n2; ++__i, ++__start2) - __rng3[__start3 + __i] = __rng2[__start2]; - } - } - } - } -} - -// Please see the comment for __parallel_for_submitter for optional kernel name explanation -template -struct __parallel_merge_submitter; - -template -struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> -{ - template - auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; - - assert(__n1 > 0 || __n2 > 0); - - _PRINT_INFO_IN_DEBUG_MODE(__exec); - - // Empirical number of values to process per work-item - const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; - - const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - - auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__steps), [=](sycl::item __item_id) { - const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); - }); - }); - // We should return the same thing in the second param of __future for compatibility - // with the returning value in __parallel_merge_submitter_large::operator() - return __future(__event, __result_and_scratch_storage_base_ptr{}); - } -}; - -template -struct __parallel_merge_submitter_large; - -template -struct __parallel_merge_submitter_large<_IdType, _CustomName, - __internal::__optional_kernel_name<_DiagonalsKernelName...>, - __internal::__optional_kernel_name<_MergeKernelName...>> -{ - protected: - struct nd_range_params - { - std::size_t base_diag_count = 0; - std::size_t steps_between_two_base_diags = 0; - std::uint8_t chunk = 0; - _IdType steps = 0; - }; - - // Calculate nd-range params - template - nd_range_params - eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2) const - { - using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; - using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; - using _RangeValueType = std::conditional_t<(sizeof(_Range1ValueType) > sizeof(_Range2ValueType)), - _Range1ValueType, _Range2ValueType>; - - const std::size_t __n = __rng1.size() + __rng2.size(); - - constexpr std::size_t __slm_bank_size = 16; // TODO is it correct value? How to get it from hardware? 
- - // Calculate how many data items we can read into one SLM bank - constexpr std::size_t __data_items_in_slm_bank = - oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); - - // Empirical number of values to process per work-item - const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : __data_items_in_slm_bank; - - const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - const _IdType __base_diag_count = 32 * 1'024; - const _IdType __steps_between_two_base_diags = - oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); - - return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; - } - - // Calculation of split points on each base diagonal - template - sycl::event - eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Compare __comp, - const nd_range_params& __nd_range_params, - _Storage& __base_diagonals_sp_global_storage) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; - - sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); - auto __base_diagonals_sp_global_acc = - __base_diagonals_sp_global_storage.template __get_scratch_acc( - __cgh, __dpl_sycl::__no_init{}); - - __cgh.parallel_for<_DiagonalsKernelName...>( - sycl::range(__nd_range_params.base_diag_count + 1), [=](sycl::item __item_id) { - auto __global_idx = __item_id.get_linear_id(); - auto __base_diagonals_sp_global_ptr = - _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - - _split_point_t<_IdType> __sp = - __global_idx == 0 ? 
_split_point_t<_IdType>{0, 0} : _split_point_t<_IdType>{__n1, __n2}; - - if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) - { - const _IdType __i_elem = - __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; - if (__i_elem < __n) - __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - } - - __base_diagonals_sp_global_ptr[__global_idx] = __sp; - }); - }); - - return __event; - } - - // Process parallel merge - template - sycl::event - run_parallel_merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, - const nd_range_params& __nd_range_params) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - - const auto __chunk = __nd_range_params.chunk; - - sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { - auto __global_idx = __item_id.get_linear_id(); - const _IdType __i_elem = __global_idx * __chunk; - - if (__i_elem < __n1 + __n2) - { - _split_point_t<_IdType> __start = __find_start_point_in(__rng1, (_IdType)0, __n1, __rng2, (_IdType)0, __n2, __i_elem, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); - } - }); - }); - - return __event; - } - - // Process parallel merge - template - sycl::event - run_parallel_merge(sycl::event __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, - _Range3&& __rng3, _Compare __comp, const nd_range_params& __nd_range_params, - const _Storage& __base_diagonals_sp_global_storage) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - - __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, 
__rng2, __rng3); - auto __base_diagonals_sp_global_acc = - __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); - - __cgh.depends_on(__event); - - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { - auto __global_idx = __item_id.get_linear_id(); - const _IdType __i_elem = __global_idx * __nd_range_params.chunk; - - auto __base_diagonals_sp_global_ptr = - _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - auto __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; - - _split_point_t<_IdType> __start; - if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) - { - const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; - const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; - - __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, - __sp_left.second, __sp_right.second, __i_elem, __comp); - } - else - { - __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; - } - - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, - __nd_range_params.chunk, __n1, __n2, __comp); - }); - }); - - return __event; - } - - public: - template - auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const - { - assert(__rng1.size() > 0 || __rng2.size() > 0); - - _PRINT_INFO_IN_DEBUG_MODE(__exec); - - // Calculate nd-range params - const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); - - __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base; - - // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) - auto __p_base_diagonals_sp_global_storage = - new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( - __exec, 0, 
__nd_range_params.base_diag_count + 1); - __p_result_and_scratch_storage_base.reset( - static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); - - sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); - - // Merge data using split points on each base diagonal - __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); - - return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); - } -}; - -template -class __merge_kernel_name; - -template -class __merge_kernel_name_large; - -template -class __diagonals_kernel_name; - -template -auto -__parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, - _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) -{ - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - - constexpr std::size_t __starting_size_limit_for_large_submitter = 4 * 1'048'576; // 4 MB - - const std::size_t __n = __rng1.size() + __rng2.size(); - if (__n < __starting_size_limit_for_large_submitter) - { - using _WiIndex = std::uint32_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - else - { - if (__n <= std::numeric_limits::max()) - { - using _WiIndex = std::uint32_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - 
__merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - else - { - using _WiIndex = std::uint64_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - } -} - -} // namespace __par_backend_hetero -} // namespace dpl -} // namespace oneapi - -#endif // _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H +// -*- C++ -*- +//===-- parallel_backend_sycl_merge.h --------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H +#define _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H + +#include // std::numeric_limits +#include // assert +#include // std::uint8_t, ... 
+#include // std::make_pair, std::forward +#include // std::min, std::lower_bound + +#include "sycl_defs.h" +#include "parallel_backend_sycl_utils.h" + +namespace oneapi +{ +namespace dpl +{ +namespace __par_backend_hetero +{ +template +using _split_point_t = std::pair<_Index, _Index>; + +//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges +//to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: +// 0 1 1 2 3 +// ------------------ +// |---> +// 0 | 0 | 1 1 1 1 +// | | +// 0 | 0 | 1 1 1 1 +// | ----------> +// 2 | 0 0 0 0 | 1 +// | ----> +// 3 | 0 0 0 0 0 | +template +auto +__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, + const _Index __n2, _Compare __comp) +{ + //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 1, 1] + oneapi::dpl::counting_iterator<_Index> __diag_it(0); + + if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed + { + const _Index __q = __i_elem; //diagonal index + const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size + auto __res = + std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, + [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable { + const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); + return __zero_or_one < __value; + }); + return std::make_pair(*__res, __q - *__res); + } + else + { + const _Index __q = __i_elem - __n2; //diagonal index + const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size + auto __res = + std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, + [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { + const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); + return __zero_or_one < __value; + }); 
+ return std::make_pair(__q + *__res, __n2 - *__res); + } +} + +//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges +//to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: +// 0 1 1 2 3 +// ------------------ +// |---> +// 0 | 0 | 1 1 1 1 +// | | +// 0 | 0 | 1 1 1 1 +// | ----------> +// 2 | 0 0 0 0 | 1 +// | ----> +// 3 | 0 0 0 0 0 | +template +_split_point_t<_Index> +__find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, + const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) +{ + // ----------------------- EXAMPLE ------------------------ + // Let's consider the following input data: + // rng1.size() = 10 + // rng2.size() = 6 + // i_diag = 9 + // Let's define the following ranges for processing: + // rng1: [3, ..., 9) -> __rng1_from = 3, __rng1_to = 9 + // rng2: [1, ..., 4) -> __rng2_from = 1, __rng2_to = 4 + // + // The goal: required to process only X' items of the merge matrix + // as intersection of rng1[3, ..., 9) and rng2[1, ..., 4) + // + // -------------------------------------------------------- + // + // __diag_it_begin(rng1) __diag_it_end(rng1) + // (init state) (dest state) (init state, dest state) + // | | | + // V V V + // + + + + + + + // \ rng1 0 1 2 3 4 5 6 7 8 9 + // rng2 +--------------------------------------+ + // 0 | ^ ^ ^ X | <--- __diag_it_end(rng2) (init state) + // + 1 | <----------------- + + X'2 ^ | <--- __diag_it_end(rng2) (dest state) + // + 2 | <----------------- + X'1 | | + // + 3 | <----------------- X'0 | | <--- __diag_it_begin(rng2) (dest state) + // 4 | X ^ | | + // 5 | X | | | <--- __diag_it_begin(rng2) (init state) + // +-------AX-----------+-----------+-----+ + // AX | | + // AX | | + // Run lower_bound:[from = 5, to = 8) + // + // AX - absent items in rng2 + // + // We have three points on diagonal for call comparison: + // X'0 : call 
__comp(rng1[5], rng2[3]) // 5 + 3 == 9 - 1 == 8 + // X'1 : call __comp(rng1[6], rng2[2]) // 6 + 2 == 9 - 1 == 8 + // X'3 : call __comp(rng1[7], rng2[1]) // 7 + 1 == 9 - 1 == 8 + // - where for every comparing pairs idx(rng1) + idx(rng2) == i_diag - 1 + + //////////////////////////////////////////////////////////////////////////////////// + // Taking into account the specified constraints of the range of processed data + const auto __index_sum = __i_elem - 1; + + using _IndexSigned = std::make_signed_t<_Index>; + + _IndexSigned idx1_from = __rng1_from; + _IndexSigned idx1_to = __rng1_to; + + _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); + _IndexSigned idx2_to = __index_sum - __rng1_from + 1; + + const _IndexSigned idx2_from_diff = + idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; + const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? idx2_to - (_IndexSigned)__rng2_to : 0; + + idx1_to -= idx2_from_diff; + idx1_from += idx2_to_diff; + + idx2_from = __index_sum - (idx1_to - 1); + idx2_to = __index_sum - idx1_from + 1; + + //////////////////////////////////////////////////////////////////////////////////// + // Run search of split point on diagonal + + using __it_t = oneapi::dpl::counting_iterator<_Index>; + + __it_t __diag_it_begin(idx1_from); + __it_t __diag_it_end(idx1_to); + + constexpr int kValue = 1; + const __it_t __res = + std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { + const auto __zero_or_one = __comp(__rng2[__index_sum - __idx], __rng1[__idx]); + return __zero_or_one < kValue; + }); + + return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; +} + +// Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing +// to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) +template +void +__serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, 
_Rng3& __rng3, _Index __start1, _Index __start2, + const _Index __start3, const std::uint8_t __chunk, const _Index __n1, const _Index __n2, _Compare __comp) +{ + if (__start1 >= __n1) + { + //copying a residual of the second seq + const _Index __n = std::min<_Index>(__n2 - __start2, __chunk); + for (std::uint8_t __i = 0; __i < __n; ++__i) + __rng3[__start3 + __i] = __rng2[__start2 + __i]; + } + else if (__start2 >= __n2) + { + //copying a residual of the first seq + const _Index __n = std::min<_Index>(__n1 - __start1, __chunk); + for (std::uint8_t __i = 0; __i < __n; ++__i) + __rng3[__start3 + __i] = __rng1[__start1 + __i]; + } + else + { + for (std::uint8_t __i = 0; __i < __chunk && __start1 < __n1 && __start2 < __n2; ++__i) + { + const auto& __val1 = __rng1[__start1]; + const auto& __val2 = __rng2[__start2]; + if (__comp(__val2, __val1)) + { + __rng3[__start3 + __i] = __val2; + if (++__start2 == __n2) + { + //copying a residual of the first seq + for (++__i; __i < __chunk && __start1 < __n1; ++__i, ++__start1) + __rng3[__start3 + __i] = __rng1[__start1]; + } + } + else + { + __rng3[__start3 + __i] = __val1; + if (++__start1 == __n1) + { + //copying a residual of the second seq + for (++__i; __i < __chunk && __start2 < __n2; ++__i, ++__start2) + __rng3[__start3 + __i] = __rng2[__start2]; + } + } + } + } +} + +// Please see the comment for __parallel_for_submitter for optional kernel name explanation +template +struct __parallel_merge_submitter; + +template +struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + assert(__n1 > 0 || __n2 > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Empirical number of values to process per work-item + const std::uint8_t __chunk = 
__exec.queue().get_device().is_cpu() ? 128 : 4; + + const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + + auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__steps), [=](sycl::item __item_id) { + const _IdType __i_elem = __item_id.get_linear_id() * __chunk; + const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + }); + }); + // We should return the same thing in the second param of __future for compatibility + // with the returning value in __parallel_merge_submitter_large::operator() + return __future(__event, __result_and_scratch_storage_base_ptr{}); + } +}; + +template +struct __parallel_merge_submitter_large; + +template +struct __parallel_merge_submitter_large<_IdType, _CustomName, + __internal::__optional_kernel_name<_DiagonalsKernelName...>, + __internal::__optional_kernel_name<_MergeKernelName...>> +{ + protected: + struct nd_range_params + { + std::size_t base_diag_count = 0; + std::size_t steps_between_two_base_diags = 0; + std::uint8_t chunk = 0; + _IdType steps = 0; + }; + + // Calculate nd-range params + template + nd_range_params + eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2) const + { + using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; + using _RangeValueType = std::conditional_t<(sizeof(_Range1ValueType) > sizeof(_Range2ValueType)), + _Range1ValueType, _Range2ValueType>; + + const std::size_t __n = __rng1.size() + __rng2.size(); + + // Empirical number of values to process per work-item + const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; + + const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + const _IdType __base_diag_count = 32 * 1'024; + const _IdType __steps_between_two_base_diags = + oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); + + return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; + } + + // Calculation of split points on each base diagonal + template + sycl::event + eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Compare __comp, + const nd_range_params& __nd_range_params, + _Storage& __base_diagonals_sp_global_storage) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc( + __cgh, __dpl_sycl::__no_init{}); + + __cgh.parallel_for<_DiagonalsKernelName...>( + sycl::range(__nd_range_params.base_diag_count + 1), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + + _split_point_t<_IdType> __sp = + __global_idx == 0 ? 
_split_point_t<_IdType>{0, 0} : _split_point_t<_IdType>{__n1, __n2}; + + if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) + { + const _IdType __i_elem = + __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; + if (__i_elem < __n) + __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + } + + __base_diagonals_sp_global_ptr[__global_idx] = __sp; + }); + }); + + return __event; + } + + // Process parallel merge + template + sycl::event + run_parallel_merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, + const nd_range_params& __nd_range_params) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + + const auto __chunk = __nd_range_params.chunk; + + sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + const _IdType __i_elem = __global_idx * __chunk; + + if (__i_elem < __n1 + __n2) + { + _split_point_t<_IdType> __start = __find_start_point_in(__rng1, (_IdType)0, __n1, __rng2, (_IdType)0, __n2, __i_elem, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + } + }); + }); + + return __event; + } + + // Process parallel merge + template + sycl::event + run_parallel_merge(sycl::event __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, + _Range3&& __rng3, _Compare __comp, const nd_range_params& __nd_range_params, + const _Storage& __base_diagonals_sp_global_storage) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + + __event = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, 
__rng2, __rng3); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); + + __cgh.depends_on(__event); + + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + const _IdType __i_elem = __global_idx * __nd_range_params.chunk; + + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; + + _split_point_t<_IdType> __start; + if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) + { + const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; + const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; + + __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, + __sp_left.second, __sp_right.second, __i_elem, __comp); + } + else + { + __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; + } + + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, + __nd_range_params.chunk, __n1, __n2, __comp); + }); + }); + + return __event; + } + + public: + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + assert(__rng1.size() > 0 || __rng2.size() > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Calculate nd-range params + const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); + + __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base; + + // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + auto __p_base_diagonals_sp_global_storage = + new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( + __exec, 0, 
__nd_range_params.base_diag_count + 1); + __p_result_and_scratch_storage_base.reset( + static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); + + sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); + + // Merge data using split points on each base diagonal + __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); + + return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); + } +}; + +template +class __merge_kernel_name; + +template +class __merge_kernel_name_large; + +template +class __diagonals_kernel_name; + +template +std::size_t +starting_size_limit_for_large_submitter() +{ + return 4 * 1'048'576; // 4 MB +} + +template <> +std::size_t +starting_size_limit_for_large_submitter() +{ + return 16 * 1'048'576; // 8 MB +} + +template +auto +__parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, + _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) +{ + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + using __value_type = oneapi::dpl::__internal::__value_t<_Range3>; + + const std::size_t __n = __rng1.size() + __rng2.size(); + if (__n < starting_size_limit_for_large_submitter<__value_type>()) + { + using _WiIndex = std::uint32_t; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + if (__n <= std::numeric_limits::max()) + { + using _WiIndex = std::uint32_t; + using _DiagonalsKernelName = 
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + using _WiIndex = std::uint64_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + } +} + +} // namespace __par_backend_hetero +} // namespace dpl +} // namespace oneapi + +#endif // _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H From 05ff60fe2fb23638af569c52f8bccd60512af1a6 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 11:28:43 +0100 Subject: [PATCH 41/76] Apply GitHUB clang format Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 895253a151e..f6c7f7a3d5c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -370,9 +370,10 @@ struct 
__parallel_merge_submitter_large<_IdType, _CustomName, if (__i_elem < __n1 + __n2) { - _split_point_t<_IdType> __start = __find_start_point_in(__rng1, (_IdType)0, __n1, __rng2, (_IdType)0, __n2, __i_elem, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); + _split_point_t<_IdType> __start = + __find_start_point_in(__rng1, (_IdType)0, __n1, __rng2, (_IdType)0, __n2, __i_elem, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, + __n2, __comp); } }); }); @@ -451,11 +452,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); + *__p_base_diagonals_sp_global_storage); // Merge data using split points on each base diagonal __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); + *__p_base_diagonals_sp_global_storage); return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); } From ea47019fd4af943b33ced96ea0976e6474ede0fa Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 15:13:41 +0100 Subject: [PATCH 42/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove run_parallel_merge with old implementation Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 32 ------------------- 1 file changed, 32 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index f6c7f7a3d5c..658598af4db 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -349,38 +349,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, return __event; } - // Process parallel merge - template - sycl::event - run_parallel_merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, - const nd_range_params& __nd_range_params) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - - const auto __chunk = __nd_range_params.chunk; - - sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { - auto __global_idx = __item_id.get_linear_id(); - const _IdType __i_elem = __global_idx * __chunk; - - if (__i_elem < __n1 + __n2) - { - _split_point_t<_IdType> __start = - __find_start_point_in(__rng1, (_IdType)0, __n1, __rng2, (_IdType)0, __n2, __i_elem, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, - __n2, __comp); - } - }); - }); - - return __event; - } - // Process parallel merge template From 73bbc141cf0ccaf69e7bd949b532708dbef2748f Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 15:32:12 +0100 Subject: [PATCH 43/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix self-review comment Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 658598af4db..6b19d748400 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -440,15 +440,15 @@ 
template class __diagonals_kernel_name; template -std::size_t -starting_size_limit_for_large_submitter() +constexpr std::size_t +__get_starting_size_limit_for_large_submitter() { return 4 * 1'048'576; // 4 MB } template <> -std::size_t -starting_size_limit_for_large_submitter() +constexpr std::size_t +__get_starting_size_limit_for_large_submitter() { return 16 * 1'048'576; // 8 MB } @@ -463,9 +463,10 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy using __value_type = oneapi::dpl::__internal::__value_t<_Range3>; const std::size_t __n = __rng1.size() + __rng2.size(); - if (__n < starting_size_limit_for_large_submitter<__value_type>()) + if (__n < __get_starting_size_limit_for_large_submitter<__value_type>()) { using _WiIndex = std::uint32_t; + static_assert(__get_starting_size_limit_for_large_submitter<__value_type>() <= std::numeric_limits<_WiIndex>::max()); using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( From e0c1628adc625adfa59a5bcc3cb8bb1f79615700 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 16:24:42 +0100 Subject: [PATCH 44/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix self-review comments Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 6b19d748400..2a9221595e4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -169,7 +169,7 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn constexpr int kValue = 1; const __it_t __res = - 
std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { + std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const auto& __value) { const auto __zero_or_one = __comp(__rng2[__index_sum - __idx], __rng1[__idx]); return __zero_or_one < kValue; }); @@ -252,7 +252,7 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { + auto __event = __exec.queue().submit([&__rng1, &__rng2, &__rng3, __steps, __chunk, __n1, __n2, __comp](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); __cgh.parallel_for<_MergeKernelName...>( sycl::range(__steps), [=](sycl::item __item_id) { @@ -319,7 +319,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __n2 = __rng2.size(); const _IdType __n = __n1 + __n2; - sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { + return __exec.queue().submit([&__rng1, &__rng2, __base_diagonals_sp_global_storage, __n1, __n2, __n, __nd_range_params, __comp](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc( @@ -345,22 +345,20 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __base_diagonals_sp_global_ptr[__global_idx] = __sp; }); }); - - return __event; } // Process parallel merge template sycl::event - run_parallel_merge(sycl::event __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, + run_parallel_merge(const sycl::event& __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, const nd_range_params& __nd_range_params, const _Storage& __base_diagonals_sp_global_storage) const 
{ const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); - __event = __exec.queue().submit([&](sycl::handler& __cgh) { + return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __nd_range_params, __base_diagonals_sp_global_storage, __n1, __n2, __comp](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); @@ -394,8 +392,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __nd_range_params.chunk, __n1, __n2, __comp); }); }); - - return __event; } public: From 38166c712a1551593b0015f064e3ef9417edcd42 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 17 Dec 2024 09:44:08 +0100 Subject: [PATCH 45/76] Apply GitHUB clang format Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 41 +++++++++++-------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 2a9221595e4..6158b32dcf9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -168,11 +168,12 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn __it_t __diag_it_end(idx1_to); constexpr int kValue = 1; - const __it_t __res = - std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const auto& __value) { - const auto __zero_or_one = __comp(__rng2[__index_sum - __idx], __rng1[__idx]); - return __zero_or_one < kValue; - }); + const __it_t __res = std::lower_bound(__diag_it_begin, __diag_it_end, kValue, + [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const auto& __value) { + const auto __zero_or_one = + __comp(__rng2[__index_sum - __idx], __rng1[__idx]); + 
return __zero_or_one < kValue; + }); return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; } @@ -252,16 +253,17 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - auto __event = __exec.queue().submit([&__rng1, &__rng2, &__rng3, __steps, __chunk, __n1, __n2, __comp](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__steps), [=](sycl::item __item_id) { - const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); - }); - }); + auto __event = __exec.queue().submit( + [&__rng1, &__rng2, &__rng3, __steps, __chunk, __n1, __n2, __comp](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__steps), [=](sycl::item __item_id) { + const _IdType __i_elem = __item_id.get_linear_id() * __chunk; + const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, + __n2, __comp); + }); + }); // We should return the same thing in the second param of __future for compatibility // with the returning value in __parallel_merge_submitter_large::operator() return __future(__event, __result_and_scratch_storage_base_ptr{}); @@ -319,7 +321,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __n2 = __rng2.size(); const _IdType __n = __n1 + __n2; - return __exec.queue().submit([&__rng1, &__rng2, __base_diagonals_sp_global_storage, __n1, __n2, __n, __nd_range_params, __comp](sycl::handler& __cgh) { + return 
__exec.queue().submit([&__rng1, &__rng2, __base_diagonals_sp_global_storage, __n1, __n2, __n, + __nd_range_params, __comp](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc( @@ -358,7 +361,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); - return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __nd_range_params, __base_diagonals_sp_global_storage, __n1, __n2, __comp](sycl::handler& __cgh) { + return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __nd_range_params, + __base_diagonals_sp_global_storage, __n1, __n2, __comp](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); @@ -462,7 +466,8 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy if (__n < __get_starting_size_limit_for_large_submitter<__value_type>()) { using _WiIndex = std::uint32_t; - static_assert(__get_starting_size_limit_for_large_submitter<__value_type>() <= std::numeric_limits<_WiIndex>::max()); + static_assert(__get_starting_size_limit_for_large_submitter<__value_type>() <= + std::numeric_limits<_WiIndex>::max()); using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( From 7b5dc422a98a87d067066cf326ff5e0b5115494b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 17 Dec 2024 12:20:49 +0100 Subject: [PATCH 46/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix ordering of captured variables in submit calls Signed-off-by: Sergey Kopienko --- 
.../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 6158b32dcf9..9ead310b63c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -254,7 +254,7 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); auto __event = __exec.queue().submit( - [&__rng1, &__rng2, &__rng3, __steps, __chunk, __n1, __n2, __comp](sycl::handler& __cgh) { + [&__rng1, &__rng2, &__rng3, __comp, __chunk, __steps, __n1, __n2](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); __cgh.parallel_for<_MergeKernelName...>( sycl::range(__steps), [=](sycl::item __item_id) { @@ -321,8 +321,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __n2 = __rng2.size(); const _IdType __n = __n1 + __n2; - return __exec.queue().submit([&__rng1, &__rng2, __base_diagonals_sp_global_storage, __n1, __n2, __n, - __nd_range_params, __comp](sycl::handler& __cgh) { + return __exec.queue().submit([&__rng1, &__rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage, + __n1, __n2, __n](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc( @@ -361,8 +361,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); - return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __nd_range_params, - __base_diagonals_sp_global_storage, __n1, __n2, __comp](sycl::handler& __cgh) { + return 
__exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __comp, __nd_range_params, + __base_diagonals_sp_global_storage, __n1, __n2](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); From 93c731ac6564a1a93d54eda5807a0649c69d31f6 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 18 Dec 2024 16:50:26 +0100 Subject: [PATCH 47/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix EOL chars Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 1016 ++++++++--------- 1 file changed, 508 insertions(+), 508 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 9ead310b63c..10d4c5e7489 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -1,508 +1,508 @@ -// -*- C++ -*- -//===-- parallel_backend_sycl_merge.h --------------------------------===// -// -// Copyright (C) Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// This file incorporates work covered by the following copyright and permission -// notice: -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// -//===----------------------------------------------------------------------===// - -#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H -#define _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H - -#include // std::numeric_limits -#include // assert -#include // std::uint8_t, ... 
-#include // std::make_pair, std::forward -#include // std::min, std::lower_bound - -#include "sycl_defs.h" -#include "parallel_backend_sycl_utils.h" - -namespace oneapi -{ -namespace dpl -{ -namespace __par_backend_hetero -{ -template -using _split_point_t = std::pair<_Index, _Index>; - -//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges -//to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: -// 0 1 1 2 3 -// ------------------ -// |---> -// 0 | 0 | 1 1 1 1 -// | | -// 0 | 0 | 1 1 1 1 -// | ----------> -// 2 | 0 0 0 0 | 1 -// | ----> -// 3 | 0 0 0 0 0 | -template -auto -__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, - const _Index __n2, _Compare __comp) -{ - //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 1, 1] - oneapi::dpl::counting_iterator<_Index> __diag_it(0); - - if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed - { - const _Index __q = __i_elem; //diagonal index - const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); - return __zero_or_one < __value; - }); - return std::make_pair(*__res, __q - *__res); - } - else - { - const _Index __q = __i_elem - __n2; //diagonal index - const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); - return __zero_or_one < __value; - }); 
- return std::make_pair(__q + *__res, __n2 - *__res); - } -} - -//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges -//to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: -// 0 1 1 2 3 -// ------------------ -// |---> -// 0 | 0 | 1 1 1 1 -// | | -// 0 | 0 | 1 1 1 1 -// | ----------> -// 2 | 0 0 0 0 | 1 -// | ----> -// 3 | 0 0 0 0 0 | -template -_split_point_t<_Index> -__find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, - const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) -{ - // ----------------------- EXAMPLE ------------------------ - // Let's consider the following input data: - // rng1.size() = 10 - // rng2.size() = 6 - // i_diag = 9 - // Let's define the following ranges for processing: - // rng1: [3, ..., 9) -> __rng1_from = 3, __rng1_to = 9 - // rng2: [1, ..., 4) -> __rng2_from = 1, __rng2_to = 4 - // - // The goal: required to process only X' items of the merge matrix - // as intersection of rng1[3, ..., 9) and rng2[1, ..., 4) - // - // -------------------------------------------------------- - // - // __diag_it_begin(rng1) __diag_it_end(rng1) - // (init state) (dest state) (init state, dest state) - // | | | - // V V V - // + + + + + + - // \ rng1 0 1 2 3 4 5 6 7 8 9 - // rng2 +--------------------------------------+ - // 0 | ^ ^ ^ X | <--- __diag_it_end(rng2) (init state) - // + 1 | <----------------- + + X'2 ^ | <--- __diag_it_end(rng2) (dest state) - // + 2 | <----------------- + X'1 | | - // + 3 | <----------------- X'0 | | <--- __diag_it_begin(rng2) (dest state) - // 4 | X ^ | | - // 5 | X | | | <--- __diag_it_begin(rng2) (init state) - // +-------AX-----------+-----------+-----+ - // AX | | - // AX | | - // Run lower_bound:[from = 5, to = 8) - // - // AX - absent items in rng2 - // - // We have three points on diagonal for call comparison: - // X'0 : call 
__comp(rng1[5], rng2[3]) // 5 + 3 == 9 - 1 == 8 - // X'1 : call __comp(rng1[6], rng2[2]) // 6 + 2 == 9 - 1 == 8 - // X'3 : call __comp(rng1[7], rng2[1]) // 7 + 1 == 9 - 1 == 8 - // - where for every comparing pairs idx(rng1) + idx(rng2) == i_diag - 1 - - //////////////////////////////////////////////////////////////////////////////////// - // Taking into account the specified constraints of the range of processed data - const auto __index_sum = __i_elem - 1; - - using _IndexSigned = std::make_signed_t<_Index>; - - _IndexSigned idx1_from = __rng1_from; - _IndexSigned idx1_to = __rng1_to; - - _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); - _IndexSigned idx2_to = __index_sum - __rng1_from + 1; - - const _IndexSigned idx2_from_diff = - idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; - const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? idx2_to - (_IndexSigned)__rng2_to : 0; - - idx1_to -= idx2_from_diff; - idx1_from += idx2_to_diff; - - idx2_from = __index_sum - (idx1_to - 1); - idx2_to = __index_sum - idx1_from + 1; - - //////////////////////////////////////////////////////////////////////////////////// - // Run search of split point on diagonal - - using __it_t = oneapi::dpl::counting_iterator<_Index>; - - __it_t __diag_it_begin(idx1_from); - __it_t __diag_it_end(idx1_to); - - constexpr int kValue = 1; - const __it_t __res = std::lower_bound(__diag_it_begin, __diag_it_end, kValue, - [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const auto& __value) { - const auto __zero_or_one = - __comp(__rng2[__index_sum - __idx], __rng1[__idx]); - return __zero_or_one < kValue; - }); - - return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; -} - -// Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing -// to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) -template -void 
-__serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index __start1, _Index __start2, - const _Index __start3, const std::uint8_t __chunk, const _Index __n1, const _Index __n2, _Compare __comp) -{ - if (__start1 >= __n1) - { - //copying a residual of the second seq - const _Index __n = std::min<_Index>(__n2 - __start2, __chunk); - for (std::uint8_t __i = 0; __i < __n; ++__i) - __rng3[__start3 + __i] = __rng2[__start2 + __i]; - } - else if (__start2 >= __n2) - { - //copying a residual of the first seq - const _Index __n = std::min<_Index>(__n1 - __start1, __chunk); - for (std::uint8_t __i = 0; __i < __n; ++__i) - __rng3[__start3 + __i] = __rng1[__start1 + __i]; - } - else - { - for (std::uint8_t __i = 0; __i < __chunk && __start1 < __n1 && __start2 < __n2; ++__i) - { - const auto& __val1 = __rng1[__start1]; - const auto& __val2 = __rng2[__start2]; - if (__comp(__val2, __val1)) - { - __rng3[__start3 + __i] = __val2; - if (++__start2 == __n2) - { - //copying a residual of the first seq - for (++__i; __i < __chunk && __start1 < __n1; ++__i, ++__start1) - __rng3[__start3 + __i] = __rng1[__start1]; - } - } - else - { - __rng3[__start3 + __i] = __val1; - if (++__start1 == __n1) - { - //copying a residual of the second seq - for (++__i; __i < __chunk && __start2 < __n2; ++__i, ++__start2) - __rng3[__start3 + __i] = __rng2[__start2]; - } - } - } - } -} - -// Please see the comment for __parallel_for_submitter for optional kernel name explanation -template -struct __parallel_merge_submitter; - -template -struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> -{ - template - auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; - - assert(__n1 > 0 || __n2 > 0); - - _PRINT_INFO_IN_DEBUG_MODE(__exec); - - // Empirical number of 
values to process per work-item - const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; - - const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - - auto __event = __exec.queue().submit( - [&__rng1, &__rng2, &__rng3, __comp, __chunk, __steps, __n1, __n2](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__steps), [=](sycl::item __item_id) { - const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, - __n2, __comp); - }); - }); - // We should return the same thing in the second param of __future for compatibility - // with the returning value in __parallel_merge_submitter_large::operator() - return __future(__event, __result_and_scratch_storage_base_ptr{}); - } -}; - -template -struct __parallel_merge_submitter_large; - -template -struct __parallel_merge_submitter_large<_IdType, _CustomName, - __internal::__optional_kernel_name<_DiagonalsKernelName...>, - __internal::__optional_kernel_name<_MergeKernelName...>> -{ - protected: - struct nd_range_params - { - std::size_t base_diag_count = 0; - std::size_t steps_between_two_base_diags = 0; - std::uint8_t chunk = 0; - _IdType steps = 0; - }; - - // Calculate nd-range params - template - nd_range_params - eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2) const - { - using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; - using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; - using _RangeValueType = std::conditional_t<(sizeof(_Range1ValueType) > sizeof(_Range2ValueType)), - _Range1ValueType, _Range2ValueType>; - - const std::size_t __n = __rng1.size() + __rng2.size(); - - // Empirical number of values to 
process per work-item - const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; - - const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - const _IdType __base_diag_count = 32 * 1'024; - const _IdType __steps_between_two_base_diags = - oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); - - return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; - } - - // Calculation of split points on each base diagonal - template - sycl::event - eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Compare __comp, - const nd_range_params& __nd_range_params, - _Storage& __base_diagonals_sp_global_storage) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; - - return __exec.queue().submit([&__rng1, &__rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage, - __n1, __n2, __n](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); - auto __base_diagonals_sp_global_acc = - __base_diagonals_sp_global_storage.template __get_scratch_acc( - __cgh, __dpl_sycl::__no_init{}); - - __cgh.parallel_for<_DiagonalsKernelName...>( - sycl::range(__nd_range_params.base_diag_count + 1), [=](sycl::item __item_id) { - auto __global_idx = __item_id.get_linear_id(); - auto __base_diagonals_sp_global_ptr = - _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - - _split_point_t<_IdType> __sp = - __global_idx == 0 ? 
_split_point_t<_IdType>{0, 0} : _split_point_t<_IdType>{__n1, __n2}; - - if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) - { - const _IdType __i_elem = - __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; - if (__i_elem < __n) - __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - } - - __base_diagonals_sp_global_ptr[__global_idx] = __sp; - }); - }); - } - - // Process parallel merge - template - sycl::event - run_parallel_merge(const sycl::event& __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, - _Range3&& __rng3, _Compare __comp, const nd_range_params& __nd_range_params, - const _Storage& __base_diagonals_sp_global_storage) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - - return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __comp, __nd_range_params, - __base_diagonals_sp_global_storage, __n1, __n2](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - auto __base_diagonals_sp_global_acc = - __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); - - __cgh.depends_on(__event); - - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { - auto __global_idx = __item_id.get_linear_id(); - const _IdType __i_elem = __global_idx * __nd_range_params.chunk; - - auto __base_diagonals_sp_global_ptr = - _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - auto __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; - - _split_point_t<_IdType> __start; - if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) - { - const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; - const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; - - __start = 
__find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, - __sp_left.second, __sp_right.second, __i_elem, __comp); - } - else - { - __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; - } - - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, - __nd_range_params.chunk, __n1, __n2, __comp); - }); - }); - } - - public: - template - auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const - { - assert(__rng1.size() > 0 || __rng2.size() > 0); - - _PRINT_INFO_IN_DEBUG_MODE(__exec); - - // Calculate nd-range params - const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); - - __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base; - - // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) - auto __p_base_diagonals_sp_global_storage = - new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( - __exec, 0, __nd_range_params.base_diag_count + 1); - __p_result_and_scratch_storage_base.reset( - static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); - - sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); - - // Merge data using split points on each base diagonal - __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); - - return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); - } -}; - -template -class __merge_kernel_name; - -template -class __merge_kernel_name_large; - -template -class __diagonals_kernel_name; - -template -constexpr std::size_t -__get_starting_size_limit_for_large_submitter() -{ - return 4 * 1'048'576; // 4 MB -} - -template <> -constexpr std::size_t 
-__get_starting_size_limit_for_large_submitter() -{ - return 16 * 1'048'576; // 8 MB -} - -template -auto -__parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, - _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) -{ - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - - using __value_type = oneapi::dpl::__internal::__value_t<_Range3>; - - const std::size_t __n = __rng1.size() + __rng2.size(); - if (__n < __get_starting_size_limit_for_large_submitter<__value_type>()) - { - using _WiIndex = std::uint32_t; - static_assert(__get_starting_size_limit_for_large_submitter<__value_type>() <= - std::numeric_limits<_WiIndex>::max()); - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - else - { - if (__n <= std::numeric_limits::max()) - { - using _WiIndex = std::uint32_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - else - { - using _WiIndex = std::uint64_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName = 
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - } -} - -} // namespace __par_backend_hetero -} // namespace dpl -} // namespace oneapi - -#endif // _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H +// -*- C++ -*- +//===-- parallel_backend_sycl_merge.h --------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H +#define _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H + +#include // std::numeric_limits +#include // assert +#include // std::uint8_t, ... +#include // std::make_pair, std::forward +#include // std::min, std::lower_bound + +#include "sycl_defs.h" +#include "parallel_backend_sycl_utils.h" + +namespace oneapi +{ +namespace dpl +{ +namespace __par_backend_hetero +{ +template +using _split_point_t = std::pair<_Index, _Index>; + +//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges +//to serial merge. 
For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: +// 0 1 1 2 3 +// ------------------ +// |---> +// 0 | 0 | 1 1 1 1 +// | | +// 0 | 0 | 1 1 1 1 +// | ----------> +// 2 | 0 0 0 0 | 1 +// | ----> +// 3 | 0 0 0 0 0 | +template +auto +__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, + const _Index __n2, _Compare __comp) +{ + //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 1, 1] + oneapi::dpl::counting_iterator<_Index> __diag_it(0); + + if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed + { + const _Index __q = __i_elem; //diagonal index + const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size + auto __res = + std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, + [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable { + const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); + return __zero_or_one < __value; + }); + return std::make_pair(*__res, __q - *__res); + } + else + { + const _Index __q = __i_elem - __n2; //diagonal index + const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size + auto __res = + std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, + [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { + const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); + return __zero_or_one < __value; + }); + return std::make_pair(__q + *__res, __n2 - *__res); + } +} + +//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges +//to serial merge. 
For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: +// 0 1 1 2 3 +// ------------------ +// |---> +// 0 | 0 | 1 1 1 1 +// | | +// 0 | 0 | 1 1 1 1 +// | ----------> +// 2 | 0 0 0 0 | 1 +// | ----> +// 3 | 0 0 0 0 0 | +template +_split_point_t<_Index> +__find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, + const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) +{ + // ----------------------- EXAMPLE ------------------------ + // Let's consider the following input data: + // rng1.size() = 10 + // rng2.size() = 6 + // i_diag = 9 + // Let's define the following ranges for processing: + // rng1: [3, ..., 9) -> __rng1_from = 3, __rng1_to = 9 + // rng2: [1, ..., 4) -> __rng2_from = 1, __rng2_to = 4 + // + // The goal: required to process only X' items of the merge matrix + // as intersection of rng1[3, ..., 9) and rng2[1, ..., 4) + // + // -------------------------------------------------------- + // + // __diag_it_begin(rng1) __diag_it_end(rng1) + // (init state) (dest state) (init state, dest state) + // | | | + // V V V + // + + + + + + + // \ rng1 0 1 2 3 4 5 6 7 8 9 + // rng2 +--------------------------------------+ + // 0 | ^ ^ ^ X | <--- __diag_it_end(rng2) (init state) + // + 1 | <----------------- + + X'2 ^ | <--- __diag_it_end(rng2) (dest state) + // + 2 | <----------------- + X'1 | | + // + 3 | <----------------- X'0 | | <--- __diag_it_begin(rng2) (dest state) + // 4 | X ^ | | + // 5 | X | | | <--- __diag_it_begin(rng2) (init state) + // +-------AX-----------+-----------+-----+ + // AX | | + // AX | | + // Run lower_bound:[from = 5, to = 8) + // + // AX - absent items in rng2 + // + // We have three points on diagonal for call comparison: + // X'0 : call __comp(rng1[5], rng2[3]) // 5 + 3 == 9 - 1 == 8 + // X'1 : call __comp(rng1[6], rng2[2]) // 6 + 2 == 9 - 1 == 8 + // X'2 : call __comp(rng1[7], rng2[1]) // 7 + 1 == 9 - 1 == 8 + // - where for every 
comparing pairs idx(rng1) + idx(rng2) == i_diag - 1 + + //////////////////////////////////////////////////////////////////////////////////// + // Taking into account the specified constraints of the range of processed data + const auto __index_sum = __i_elem - 1; + + using _IndexSigned = std::make_signed_t<_Index>; + + _IndexSigned idx1_from = __rng1_from; + _IndexSigned idx1_to = __rng1_to; + + _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); + _IndexSigned idx2_to = __index_sum - __rng1_from + 1; + + const _IndexSigned idx2_from_diff = + idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; + const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? idx2_to - (_IndexSigned)__rng2_to : 0; + + idx1_to -= idx2_from_diff; + idx1_from += idx2_to_diff; + + idx2_from = __index_sum - (idx1_to - 1); + idx2_to = __index_sum - idx1_from + 1; + + //////////////////////////////////////////////////////////////////////////////////// + // Run search of split point on diagonal + + using __it_t = oneapi::dpl::counting_iterator<_Index>; + + __it_t __diag_it_begin(idx1_from); + __it_t __diag_it_end(idx1_to); + + constexpr int kValue = 1; + const __it_t __res = std::lower_bound(__diag_it_begin, __diag_it_end, kValue, + [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const auto& __value) { + const auto __zero_or_one = + __comp(__rng2[__index_sum - __idx], __rng1[__idx]); + return __zero_or_one < kValue; + }); + + return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; +} + +// Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing +// to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) +template +void +__serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index __start1, _Index __start2, + const _Index __start3, const std::uint8_t __chunk, const _Index __n1, const _Index __n2, _Compare __comp) +{ 
+ if (__start1 >= __n1) + { + //copying a residual of the second seq + const _Index __n = std::min<_Index>(__n2 - __start2, __chunk); + for (std::uint8_t __i = 0; __i < __n; ++__i) + __rng3[__start3 + __i] = __rng2[__start2 + __i]; + } + else if (__start2 >= __n2) + { + //copying a residual of the first seq + const _Index __n = std::min<_Index>(__n1 - __start1, __chunk); + for (std::uint8_t __i = 0; __i < __n; ++__i) + __rng3[__start3 + __i] = __rng1[__start1 + __i]; + } + else + { + for (std::uint8_t __i = 0; __i < __chunk && __start1 < __n1 && __start2 < __n2; ++__i) + { + const auto& __val1 = __rng1[__start1]; + const auto& __val2 = __rng2[__start2]; + if (__comp(__val2, __val1)) + { + __rng3[__start3 + __i] = __val2; + if (++__start2 == __n2) + { + //copying a residual of the first seq + for (++__i; __i < __chunk && __start1 < __n1; ++__i, ++__start1) + __rng3[__start3 + __i] = __rng1[__start1]; + } + } + else + { + __rng3[__start3 + __i] = __val1; + if (++__start1 == __n1) + { + //copying a residual of the second seq + for (++__i; __i < __chunk && __start2 < __n2; ++__i, ++__start2) + __rng3[__start3 + __i] = __rng2[__start2]; + } + } + } + } +} + +// Please see the comment for __parallel_for_submitter for optional kernel name explanation +template +struct __parallel_merge_submitter; + +template +struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + assert(__n1 > 0 || __n2 > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Empirical number of values to process per work-item + const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; + + const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + + auto __event = __exec.queue().submit( + [&__rng1, &__rng2, &__rng3, __comp, __chunk, __steps, __n1, __n2](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__steps), [=](sycl::item __item_id) { + const _IdType __i_elem = __item_id.get_linear_id() * __chunk; + const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, + __n2, __comp); + }); + }); + // We should return the same thing in the second param of __future for compatibility + // with the returning value in __parallel_merge_submitter_large::operator() + return __future(__event, __result_and_scratch_storage_base_ptr{}); + } +}; + +template +struct __parallel_merge_submitter_large; + +template +struct __parallel_merge_submitter_large<_IdType, _CustomName, + __internal::__optional_kernel_name<_DiagonalsKernelName...>, + __internal::__optional_kernel_name<_MergeKernelName...>> +{ + protected: + struct nd_range_params + { + std::size_t base_diag_count = 0; + std::size_t steps_between_two_base_diags = 0; + std::uint8_t chunk = 0; + _IdType steps = 0; + }; + + // Calculate nd-range params + template + nd_range_params + eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2) const + { + using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; + using _RangeValueType = std::conditional_t<(sizeof(_Range1ValueType) > sizeof(_Range2ValueType)), + _Range1ValueType, _Range2ValueType>; + + const std::size_t __n = __rng1.size() + __rng2.size(); + + // Empirical number of values to process per work-item + const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
128 : 4; + + const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + const _IdType __base_diag_count = 32 * 1'024; + const _IdType __steps_between_two_base_diags = + oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); + + return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; + } + + // Calculation of split points on each base diagonal + template + sycl::event + eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Compare __comp, + const nd_range_params& __nd_range_params, + _Storage& __base_diagonals_sp_global_storage) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + return __exec.queue().submit([&__rng1, &__rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage, + __n1, __n2, __n](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc( + __cgh, __dpl_sycl::__no_init{}); + + __cgh.parallel_for<_DiagonalsKernelName...>( + sycl::range(__nd_range_params.base_diag_count + 1), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + + _split_point_t<_IdType> __sp = + __global_idx == 0 ? 
_split_point_t<_IdType>{0, 0} : _split_point_t<_IdType>{__n1, __n2}; + + if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) + { + const _IdType __i_elem = + __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; + if (__i_elem < __n) + __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + } + + __base_diagonals_sp_global_ptr[__global_idx] = __sp; + }); + }); + } + + // Process parallel merge + template + sycl::event + run_parallel_merge(const sycl::event& __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, + _Range3&& __rng3, _Compare __comp, const nd_range_params& __nd_range_params, + const _Storage& __base_diagonals_sp_global_storage) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + + return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __comp, __nd_range_params, + __base_diagonals_sp_global_storage, __n1, __n2](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); + + __cgh.depends_on(__event); + + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + const _IdType __i_elem = __global_idx * __nd_range_params.chunk; + + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; + + _split_point_t<_IdType> __start; + if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) + { + const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; + const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; + + __start = 
__find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, + __sp_left.second, __sp_right.second, __i_elem, __comp); + } + else + { + __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; + } + + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, + __nd_range_params.chunk, __n1, __n2, __comp); + }); + }); + } + + public: + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + assert(__rng1.size() > 0 || __rng2.size() > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Calculate nd-range params + const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); + + __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base; + + // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + auto __p_base_diagonals_sp_global_storage = + new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( + __exec, 0, __nd_range_params.base_diag_count + 1); + __p_result_and_scratch_storage_base.reset( + static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); + + sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); + + // Merge data using split points on each base diagonal + __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); + + return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); + } +}; + +template +class __merge_kernel_name; + +template +class __merge_kernel_name_large; + +template +class __diagonals_kernel_name; + +template +constexpr std::size_t +__get_starting_size_limit_for_large_submitter() +{ + return 4 * 1'048'576; // 4 MB +} + +template <> +constexpr std::size_t 
+__get_starting_size_limit_for_large_submitter() +{ + return 16 * 1'048'576; // 16 MB +} + +template +auto +__parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, + _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) +{ + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + using __value_type = oneapi::dpl::__internal::__value_t<_Range3>; + + const std::size_t __n = __rng1.size() + __rng2.size(); + if (__n < __get_starting_size_limit_for_large_submitter<__value_type>()) + { + using _WiIndex = std::uint32_t; + static_assert(__get_starting_size_limit_for_large_submitter<__value_type>() <= + std::numeric_limits<_WiIndex>::max()); + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + if (__n <= std::numeric_limits::max()) + { + using _WiIndex = std::uint32_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + using _WiIndex = std::uint64_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = 
+ oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + } +} + +} // namespace __par_backend_hetero +} // namespace dpl +} // namespace oneapi + +#endif // _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H From ab004c56702037b4a5409f5b7c18588fee009470 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Wed, 18 Dec 2024 21:23:53 +0100 Subject: [PATCH 48/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix compile error after merge changes from main branch Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 68d86296fb5..5c71754edc5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -266,7 +266,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, { std::size_t base_diag_count = 0; std::size_t steps_between_two_base_diags = 0; - std::uint8_t chunk = 0; + _IdType chunk = 0; _IdType steps = 0; }; From c11e177789937c398d706c7084df985185726908 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 09:37:47 +0100 Subject: [PATCH 49/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: declare all internal stuff in __parallel_merge_submitter_large as private Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 5c71754edc5..3efe8888bf1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -261,7 +261,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, __internal::__optional_kernel_name<_MergeKernelName...>> { - protected: + private: struct nd_range_params { std::size_t base_diag_count = 0; From 79af1a80bc3020f15e3df20d38b401cf2e9b8133 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 09:59:38 +0100 Subject: [PATCH 50/76] test/parallel_api/algorithm/alg.merge/merge.pass.cpp - expand test for long data sizes Signed-off-by: Sergey Kopienko --- .../algorithm/alg.merge/merge.pass.cpp | 43 +++++++++++++------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/test/parallel_api/algorithm/alg.merge/merge.pass.cpp b/test/parallel_api/algorithm/alg.merge/merge.pass.cpp index 34cba9f672a..2715256f3a1 100644 --- a/test/parallel_api/algorithm/alg.merge/merge.pass.cpp +++ b/test/parallel_api/algorithm/alg.merge/merge.pass.cpp @@ -97,24 +97,18 @@ struct test_merge_compare } }; -template +template void -test_merge_by_type(Generator1 generator1, Generator2 generator2) +test_merge_by_type(Generator1 generator1, Generator2 generator2, size_t start_size, size_t max_size, FStep fstep) { using namespace std; - size_t max_size = 100000; Sequence in1(max_size, generator1); Sequence in2(max_size / 2, generator2); Sequence out(in1.size() + in2.size()); ::std::sort(in1.begin(), in1.end()); ::std::sort(in2.begin(), in2.end()); - size_t start_size = 0; -#if TEST_DPCPP_BACKEND_PRESENT - start_size = 2; -#endif - - for (size_t size = start_size; size <= max_size; size = size <= 16 ? 
size + 1 : size_t(3.1415 * size)) { + for (size_t size = start_size; size <= max_size; size = fstep(size)) { #if !TEST_DPCPP_BACKEND_PRESENT invoke_on_all_policies<0>()(test_merge(), in1.cbegin(), in1.cbegin() + size, in2.data(), in2.data() + size / 2, out.begin(), out.begin() + 1.5 * size); @@ -139,6 +133,16 @@ test_merge_by_type(Generator1 generator1, Generator2 generator2) } } +template +void +test_merge_by_type(size_t start_size, size_t max_size, FStep fstep) +{ + test_merge_by_type([](size_t v) { return (v % 2 == 0 ? v : -v) * 3; }, [](size_t v) { return v * 2; }, start_size, max_size, fstep); +#if !ONEDPL_FPGA_DEVICE + test_merge_by_type([](size_t v) { return float64_t(v); }, [](size_t v) { return float64_t(v - 100); }, start_size, max_size, fstep); +#endif +} + template struct test_non_const { @@ -166,9 +170,24 @@ struct test_merge_tuple int main() { - test_merge_by_type([](size_t v) { return (v % 2 == 0 ? v : -v) * 3; }, [](size_t v) { return v * 2; }); -#if !ONEDPL_FPGA_DEVICE - test_merge_by_type([](size_t v) { return float64_t(v); }, [](size_t v) { return float64_t(v - 100); }); +#if TEST_DPCPP_BACKEND_PRESENT + const size_t start_size_small = 2; +#else + const size_t start_size_small = 0; +#endif + const size_t max_size_small = 100000; + auto fstep_small = [](std::size_t size){ return size <= 16 ? 
size + 1 : size_t(3.1415 * size);}; + test_merge_by_type(start_size_small, max_size_small, fstep_small); + + // Large data sizes (on GPU only) +#if TEST_DPCPP_BACKEND_PRESENT + if (!TestUtils::get_test_queue().get_device().is_cpu()) + { + const size_t start_size_large = 4'000'000; + const size_t max_size_large = 8'000'000; + auto fstep_large = [](std::size_t size){ return size + 2'000'000; }; + test_merge_by_type(start_size_large, max_size_large, fstep_large); + } #endif #if !TEST_DPCPP_BACKEND_PRESENT From 1591d9d7c64d7a25526fe185a681d45592c7c91d Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 10:48:05 +0100 Subject: [PATCH 51/76] test/parallel_api/algorithm/alg.merge/merge.pass.cpp - expand test for long data sizes Signed-off-by: Sergey Kopienko --- test/parallel_api/algorithm/alg.merge/merge.pass.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/parallel_api/algorithm/alg.merge/merge.pass.cpp b/test/parallel_api/algorithm/alg.merge/merge.pass.cpp index 2715256f3a1..e1e8bc74015 100644 --- a/test/parallel_api/algorithm/alg.merge/merge.pass.cpp +++ b/test/parallel_api/algorithm/alg.merge/merge.pass.cpp @@ -141,6 +141,13 @@ test_merge_by_type(size_t start_size, size_t max_size, FStep fstep) #if !ONEDPL_FPGA_DEVICE test_merge_by_type([](size_t v) { return float64_t(v); }, [](size_t v) { return float64_t(v - 100); }, start_size, max_size, fstep); #endif + +#if !TEST_DPCPP_BACKEND_PRESENT + // Wrapper has atomic increment in ctor. It's not allowed in kernel + test_merge_by_type>([](size_t v) { return Wrapper(v % 100); }, + [](size_t v) { return Wrapper(v % 10); }, + start_size, max_size, fstep); +#endif } template @@ -191,12 +198,10 @@ main() #endif #if !TEST_DPCPP_BACKEND_PRESENT - // Wrapper has atomic increment in ctor. 
It's not allowed in kernel - test_merge_by_type>([](size_t v) { return Wrapper(v % 100); }, - [](size_t v) { return Wrapper(v % 10); }); test_algo_basic_double(run_for_rnd_fw>()); #endif + using T = std::tuple; //a pair (key, value) std::vector a = { {1, 2}, {1, 2}, {1,2}, {1,2}, {1, 2}, {1, 2} }; std::vector b = { {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1} }; From a3ea36d62e1062bbf2bceeb5504d35f8d72df466 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 10:50:36 +0100 Subject: [PATCH 52/76] test/parallel_api/algorithm/alg.merge/merge.pass.cpp - replace "::std::" to "std::" Signed-off-by: Sergey Kopienko --- .../algorithm/alg.merge/merge.pass.cpp | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/test/parallel_api/algorithm/alg.merge/merge.pass.cpp b/test/parallel_api/algorithm/alg.merge/merge.pass.cpp index e1e8bc74015..e41f74005e7 100644 --- a/test/parallel_api/algorithm/alg.merge/merge.pass.cpp +++ b/test/parallel_api/algorithm/alg.merge/merge.pass.cpp @@ -41,19 +41,19 @@ struct test_merge // for reverse iterators template void - operator()(Policy&& exec, ::std::reverse_iterator first1, ::std::reverse_iterator last1, - ::std::reverse_iterator first2, ::std::reverse_iterator last2, - ::std::reverse_iterator out_first, ::std::reverse_iterator out_last) + operator()(Policy&& exec, std::reverse_iterator first1, std::reverse_iterator last1, + std::reverse_iterator first2, std::reverse_iterator last2, + std::reverse_iterator out_first, std::reverse_iterator out_last) { using namespace std; - typedef typename ::std::iterator_traits<::std::reverse_iterator>::value_type T; - const auto res = merge(exec, first1, last1, first2, last2, out_first, ::std::greater()); + typedef typename std::iterator_traits>::value_type T; + const auto res = merge(exec, first1, last1, first2, last2, out_first, std::greater()); EXPECT_TRUE(res == out_last, "wrong return result from merge with predicate"); - 
EXPECT_TRUE(is_sorted(out_first, res, ::std::greater()), "wrong result from merge with predicate"); - EXPECT_TRUE(includes(out_first, res, first1, last1, ::std::greater()), + EXPECT_TRUE(is_sorted(out_first, res, std::greater()), "wrong result from merge with predicate"); + EXPECT_TRUE(includes(out_first, res, first1, last1, std::greater()), "first sequence is not a part of result"); - EXPECT_TRUE(includes(out_first, res, first2, last2, ::std::greater()), + EXPECT_TRUE(includes(out_first, res, first2, last2, std::greater()), "second sequence is not a part of result"); } }; @@ -79,20 +79,20 @@ struct test_merge_compare template void - operator()(Policy&& exec, ::std::reverse_iterator first1, ::std::reverse_iterator last1, - ::std::reverse_iterator first2, ::std::reverse_iterator last2, - ::std::reverse_iterator out_first, ::std::reverse_iterator out_last, + operator()(Policy&& exec, std::reverse_iterator first1, std::reverse_iterator last1, + std::reverse_iterator first2, std::reverse_iterator last2, + std::reverse_iterator out_first, std::reverse_iterator out_last, Compare /* comp */) { using namespace std; - typedef typename ::std::iterator_traits<::std::reverse_iterator>::value_type T; - const auto res = merge(exec, first1, last1, first2, last2, out_first, ::std::greater()); + typedef typename std::iterator_traits>::value_type T; + const auto res = merge(exec, first1, last1, first2, last2, out_first, std::greater()); EXPECT_TRUE(res == out_last, "wrong return result from merge with predicate"); - EXPECT_TRUE(is_sorted(out_first, res, ::std::greater()), "wrong result from merge with predicate"); - EXPECT_TRUE(includes(out_first, res, first1, last1, ::std::greater()), + EXPECT_TRUE(is_sorted(out_first, res, std::greater()), "wrong result from merge with predicate"); + EXPECT_TRUE(includes(out_first, res, first1, last1, std::greater()), "first sequence is not a part of result"); - EXPECT_TRUE(includes(out_first, res, first2, last2, ::std::greater()), + 
EXPECT_TRUE(includes(out_first, res, first2, last2, std::greater()), "second sequence is not a part of result"); } }; @@ -105,15 +105,15 @@ test_merge_by_type(Generator1 generator1, Generator2 generator2, size_t start_si Sequence in1(max_size, generator1); Sequence in2(max_size / 2, generator2); Sequence out(in1.size() + in2.size()); - ::std::sort(in1.begin(), in1.end()); - ::std::sort(in2.begin(), in2.end()); + std::sort(in1.begin(), in1.end()); + std::sort(in2.begin(), in2.end()); for (size_t size = start_size; size <= max_size; size = fstep(size)) { #if !TEST_DPCPP_BACKEND_PRESENT invoke_on_all_policies<0>()(test_merge(), in1.cbegin(), in1.cbegin() + size, in2.data(), in2.data() + size / 2, out.begin(), out.begin() + 1.5 * size); invoke_on_all_policies<1>()(test_merge_compare(), in1.cbegin(), in1.cbegin() + size, in2.data(), - in2.data() + size / 2, out.begin(), out.begin() + 1.5 * size, ::std::less()); + in2.data() + size / 2, out.begin(), out.begin() + 1.5 * size, std::less()); #endif // Currently test harness doesn't execute the testcase for inputs with more than 1000 elements for const iterators to optimize execution time, @@ -122,13 +122,13 @@ test_merge_by_type(Generator1 generator1, Generator2 generator2, size_t start_si invoke_on_all_policies<2>()(test_merge(), in1.begin(), in1.begin() + size, in2.cbegin(), in2.cbegin() + size / 2, out.begin(), out.begin() + 1.5 * size); invoke_on_all_policies<3>()(test_merge_compare(), in1.begin(), in1.begin() + size, in2.cbegin(), - in2.cbegin() + size / 2, out.begin(), out.begin() + 1.5 * size, ::std::less()); + in2.cbegin() + size / 2, out.begin(), out.begin() + 1.5 * size, std::less()); #if !TEST_DPCPP_BACKEND_PRESENT invoke_on_all_policies<4>()(test_merge(), in1.data(), in1.data() + size, in2.cbegin(), in2.cbegin() + size / 2, out.begin(), out.begin() + 3 * size / 2); invoke_on_all_policies<5>()(test_merge_compare(), in1.data(), in1.data() + size, in2.cbegin(), - in2.cbegin() + size / 2, out.begin(), out.begin() + 
3 * size / 2, ::std::less()); + in2.cbegin() + size / 2, out.begin(), out.begin() + 3 * size / 2, std::less()); #endif } } @@ -157,7 +157,7 @@ struct test_non_const void operator()(Policy&& exec, InputIterator input_iter, OutputIterator out_iter) { - merge(exec, input_iter, input_iter, input_iter, input_iter, out_iter, non_const(::std::less())); + merge(exec, input_iter, input_iter, input_iter, input_iter, out_iter, non_const(std::less())); } }; From ed32a0f15991450a5010a0950f6bdecb5b64ec60 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 11:56:52 +0100 Subject: [PATCH 53/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - small refactoring of __parallel_merge_submitter_large::operator() Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 3efe8888bf1..e202d727605 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -393,13 +393,13 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Calculate nd-range params const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); - __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base; - // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) auto __p_base_diagonals_sp_global_storage = new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( __exec, 0, __nd_range_params.base_diag_count + 1); - __p_result_and_scratch_storage_base.reset( + + // Save raw-pointer into shared_ptr for return it in __future to exted life-time of the storage + __result_and_scratch_storage_base_ptr 
__p_result_and_scratch_storage_base( static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, From 20e6bc52bc6635da9d9d56d790ade3efb91d3cf3 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 11:58:47 +0100 Subject: [PATCH 54/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix comments in __parallel_merge_submitter_large::operator() Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index e202d727605..429c2a65eb4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -402,10 +402,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base( static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); + // Find split-points on the base diagonals sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); - // Merge data using split points on each base diagonal + // Merge data using split points on each diagonal __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, *__p_base_diagonals_sp_global_storage); From 333ada9095bd94c4eb8887c7856209ab5556a90f Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 13:30:05 +0100 Subject: [PATCH 55/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix comments in __parallel_merge_submitter_large::operator() Signed-off-by: Sergey Kopienko --- 
.../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 429c2a65eb4..40d686f479e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -398,7 +398,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( __exec, 0, __nd_range_params.base_diag_count + 1); - // Save raw-pointer into shared_ptr for return it in __future to exted life-time of the storage + // Save raw-pointer into shared_ptr for return it in __future to extend life-time of the storage __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base( static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); From f938b7486251587e543c008619eaafd11828c3bd Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 14:35:50 +0100 Subject: [PATCH 56/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: The subexpression __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk is a invariant and can be calculated outside parallel_for kernel. 
Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 40d686f479e..deccb6a601b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -304,8 +304,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __n2 = __rng2.size(); const _IdType __n = __n1 + __n2; + const _IdType __base_diag_chunk = __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; + return __exec.queue().submit([&__rng1, &__rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage, - __n1, __n2, __n](sycl::handler& __cgh) { + __n1, __n2, __n, __base_diag_chunk](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc( @@ -322,8 +324,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) { - const _IdType __i_elem = - __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; + const _IdType __i_elem = __global_idx * __base_diag_chunk; if (__i_elem < __n) __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); } From da34e0338abb25a93425bf16052324d9d6a739a1 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 15:10:53 +0100 Subject: [PATCH 57/76] Fix review comment: typename __result_and_scratch_storage_base_ptr has been removed Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 4 ++-- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 1 - 2 files changed, 2 insertions(+), 3 
deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index deccb6a601b..b9878104008 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -249,7 +249,7 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M }); // We should return the same thing in the second param of __future for compatibility // with the returning value in __parallel_merge_submitter_large::operator() - return __future(__event, __result_and_scratch_storage_base_ptr{}); + return __future(__event, std::shared_ptr<__result_and_scratch_storage_base>{}); } }; @@ -400,7 +400,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __exec, 0, __nd_range_params.base_diag_count + 1); // Save raw-pointer into shared_ptr for return it in __future to extend life-time of the storage - __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base( + std::shared_ptr<__result_and_scratch_storage_base> __p_result_and_scratch_storage_base( static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); // Find split-points on the base diagonals diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index e66e8c28089..9cfd71c7fb8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -522,7 +522,6 @@ struct __result_and_scratch_storage_base { virtual ~__result_and_scratch_storage_base() = default; }; -using __result_and_scratch_storage_base_ptr = std::shared_ptr<__result_and_scratch_storage_base>; template struct __result_and_scratch_storage : __result_and_scratch_storage_base From 3e5f5bcef0238e4898f3b25f86822035ef21454d Mon Sep 17 00:00:00 
2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 15:14:17 +0100 Subject: [PATCH 58/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: Calculate nd-range params => Calculate nd-range parameters Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index b9878104008..b80b39ec4af 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -270,7 +270,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _IdType steps = 0; }; - // Calculate nd-range params + // Calculate nd-range parameters template nd_range_params eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2) const @@ -391,7 +391,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, _PRINT_INFO_IN_DEBUG_MODE(__exec); - // Calculate nd-range params + // Calculate nd-range parameters const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) From fe77c5fddbde4093e8fb78281d3ad2bda51a6c74 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 15:14:55 +0100 Subject: [PATCH 59/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: for save = > to save Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index b80b39ec4af..bcad245eb32 
100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -394,7 +394,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, // Calculate nd-range parameters const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); - // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + // Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) auto __p_base_diagonals_sp_global_storage = new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( __exec, 0, __nd_range_params.base_diag_count + 1); From e73f3bffeaca00f94370fed864b97825bcad194e Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 15:16:33 +0100 Subject: [PATCH 60/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: // Save raw-pointer into shared_ptr for return it in __future to extend life-time of the storage = > // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. 
Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index bcad245eb32..322864e60fc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -399,7 +399,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( __exec, 0, __nd_range_params.base_diag_count + 1); - // Save raw-pointer into shared_ptr for return it in __future to extend life-time of the storage + // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. std::shared_ptr<__result_and_scratch_storage_base> __p_result_and_scratch_storage_base( static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); From 2c09ccf8450325c52f3ca4809fcfbb7f90369245 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 16:05:50 +0100 Subject: [PATCH 61/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 322864e60fc..ffaad0673d0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -435,7 +435,7 @@ template <> constexpr std::size_t __get_starting_size_limit_for_large_submitter() { - return 16 * 1'048'576; // 8 MB + return 16 * 1'048'576; // 16 MB } template From 
212366437a59ced89557c379eecbfdb13fb43b42 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 22:29:04 +0100 Subject: [PATCH 62/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: remove if conditions checks from __parallel_merge_submitter_large::eval_split_points_for_groups Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index ffaad0673d0..464a8e60797 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -319,17 +319,14 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, auto __base_diagonals_sp_global_ptr = _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - _split_point_t<_IdType> __sp = - __global_idx == 0 ? _split_point_t<_IdType>{0, 0} : _split_point_t<_IdType>{__n1, __n2}; - - if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) - { - const _IdType __i_elem = __global_idx * __base_diag_chunk; - if (__i_elem < __n) - __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - } - - __base_diagonals_sp_global_ptr[__global_idx] = __sp; + const _IdType __i_elem = __global_idx * __base_diag_chunk; + + __base_diagonals_sp_global_ptr[__global_idx] = + __i_elem == 0 + ? _split_point_t<_IdType>{0, 0} + : __i_elem < __n + ? 
__find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp) + : _split_point_t<_IdType>{__n1, __n2}; }); }); } From 3013dfd7f7d8443770adbf997971493662507c8a Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 22:32:38 +0100 Subject: [PATCH 63/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h - fix review comment: apply comments for the struct __result_and_scratch_storage_base Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 9cfd71c7fb8..a81bda902ba 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -518,6 +518,10 @@ struct __usm_or_buffer_accessor } }; +// This base class is provided to allow same-typed shared pointer return values from kernels in +// a `__future` for keeping alive temporary data, while allowing run-time branches to lead to +// differently typed temporary storage for kernels. Virtual destructor is required to call +// derived class destructor when leaving scope. 
struct __result_and_scratch_storage_base { virtual ~__result_and_scratch_storage_base() = default; From 614c538d51a2f2709a41b3b6e32fd0e113ccc4d2 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 22:37:50 +0100 Subject: [PATCH 64/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: TODO comment applied for __base_diag_count value in the __parallel_merge_submitter_large::eval_nd_range_params Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 464a8e60797..6ba7fff137d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -286,6 +286,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + // TODO required to evaluate this value besed on available SLM size for each work-group. 
const _IdType __base_diag_count = 32 * 1'024; const _IdType __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); From 1389088286a9873a6fe83e5a340747f09a6c5d76 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 22:42:26 +0100 Subject: [PATCH 65/76] Update include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h Co-authored-by: Dan Hoeflinger <109972525+danhoeflinger@users.noreply.github.com> --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 6ba7fff137d..61c627bca04 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -286,7 +286,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - // TODO required to evaluate this value besed on available SLM size for each work-group. + // TODO required to evaluate this value based on available SLM size for each work-group. 
const _IdType __base_diag_count = 32 * 1'024; const _IdType __steps_between_two_base_diags = oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); From bc20ec173ee0390c5dc4aca7e1b87357635dc2ab Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Thu, 19 Dec 2024 23:29:00 +0100 Subject: [PATCH 66/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - apply GitHUB clang format + additional brackets to improve readability Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 61c627bca04..32825050df4 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -323,11 +323,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __i_elem = __global_idx * __base_diag_chunk; __base_diagonals_sp_global_ptr[__global_idx] = - __i_elem == 0 - ? _split_point_t<_IdType>{0, 0} - : __i_elem < __n - ? __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp) - : _split_point_t<_IdType>{__n1, __n2}; + __i_elem == 0 + ? _split_point_t<_IdType>{0, 0} + : (__i_elem < __n ? 
__find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp) + : _split_point_t<_IdType>{__n1, __n2}); }); }); } From 1856284d137fe58946bf8f1e36f1c785d4dd5bcb Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 10:01:02 +0100 Subject: [PATCH 67/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix self-review comment: __index_sum should be of signed type to correctly process invariant when __i_elem == 0 Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 32825050df4..d7c2a096e41 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -137,11 +137,11 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn // X'3 : call __comp(rng1[7], rng2[1]) // 7 + 1 == 9 - 1 == 8 // - where for every comparing pairs idx(rng1) + idx(rng2) == i_diag - 1 + using _IndexSigned = std::make_signed_t<_Index>; + //////////////////////////////////////////////////////////////////////////////////// // Taking into account the specified constraints of the range of processed data - const auto __index_sum = __i_elem - 1; - - using _IndexSigned = std::make_signed_t<_Index>; + const _IndexSigned __index_sum = __i_elem - 1; _IndexSigned idx1_from = __rng1_from; _IndexSigned idx1_to = __rng1_to; From 52cec85f05e05ab6f0c1976aa1d265afdaaec145 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 10:15:26 +0100 Subject: [PATCH 68/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix warning: warning C4804: '<': unsafe use of type 'bool' in operation Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 22 
++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index d7c2a096e41..10b26cde64a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -59,10 +59,9 @@ __find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_el const _Index __q = __i_elem; //diagonal index const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); - return __zero_or_one < __value; + std::lower_bound(__diag_it, __diag_it + __n_diag, false /*value to find*/, + [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const bool __value) mutable { + return __value == __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); }); return std::make_pair(*__res, __q - *__res); } @@ -71,10 +70,9 @@ __find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_el const _Index __q = __i_elem - __n2; //diagonal index const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); - return __zero_or_one < __value; + std::lower_bound(__diag_it, __diag_it + __n_diag, false /*value to find*/, + [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const bool __value) mutable { + return __value == __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); }); return 
std::make_pair(__q + *__res, __n2 - *__res); } @@ -167,12 +165,10 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn __it_t __diag_it_begin(idx1_from); __it_t __diag_it_end(idx1_to); - constexpr int kValue = 1; + constexpr bool kValue = false; const __it_t __res = std::lower_bound(__diag_it_begin, __diag_it_end, kValue, - [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const auto& __value) { - const auto __zero_or_one = - __comp(__rng2[__index_sum - __idx], __rng1[__idx]); - return __zero_or_one < kValue; + [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const bool __value) { + return __value == __comp(__rng2[__index_sum - __idx], __rng1[__idx]); }); return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; From 6f5ec48b4b4bec9b34f7f38be68c64c675b77ff4 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 10:16:18 +0100 Subject: [PATCH 69/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove extra comments before __find_start_point_in function Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 10b26cde64a..c706042b69b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -78,18 +78,6 @@ __find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_el } } -//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges -//to serial merge. 
For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: -// 0 1 1 2 3 -// ------------------ -// |---> -// 0 | 0 | 1 1 1 1 -// | | -// 0 | 0 | 1 1 1 1 -// | ----------> -// 2 | 0 0 0 0 | 1 -// | ----> -// 3 | 0 0 0 0 0 | template _split_point_t<_Index> __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, From 6dd8e51e77a10d5dfbb149325bada66b35539280 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 10:19:44 +0100 Subject: [PATCH 70/76] Remove __find_start_point implementation and usage Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 43 +++---------------- .../dpcpp/parallel_backend_sycl_merge_sort.h | 12 +++--- 2 files changed, 13 insertions(+), 42 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index c706042b69b..42bbcfdde48 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -46,38 +46,6 @@ using _split_point_t = std::pair<_Index, _Index>; // 2 | 0 0 0 0 | 1 // | ----> // 3 | 0 0 0 0 0 | -template -auto -__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, - const _Index __n2, _Compare __comp) -{ - //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 
1, 1] - oneapi::dpl::counting_iterator<_Index> __diag_it(0); - - if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed - { - const _Index __q = __i_elem; //diagonal index - const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, false /*value to find*/, - [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const bool __value) mutable { - return __value == __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); - }); - return std::make_pair(*__res, __q - *__res); - } - else - { - const _Index __q = __i_elem - __n2; //diagonal index - const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, false /*value to find*/, - [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const bool __value) mutable { - return __value == __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); - }); - return std::make_pair(__q + *__res, __n2 - *__res); - } -} - template _split_point_t<_Index> __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, @@ -226,7 +194,8 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M __cgh.parallel_for<_MergeKernelName...>( sycl::range(__steps), [=](sycl::item __item_id) { const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + const auto __start = + __find_start_point_in(__rng1, _IdType{0}, __n1, __rng2, _IdType{0}, __n2, __i_elem, __comp); __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, __comp); }); @@ -307,10 +276,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __i_elem = __global_idx * __base_diag_chunk; __base_diagonals_sp_global_ptr[__global_idx] = - __i_elem == 0 - 
? _split_point_t<_IdType>{0, 0} - : (__i_elem < __n ? __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp) - : _split_point_t<_IdType>{__n1, __n2}); + __i_elem == 0 ? _split_point_t<_IdType>{0, 0} + : (__i_elem < __n ? __find_start_point_in(__rng1, _IdType{0}, __n1, __rng2, + _IdType{0}, __n2, __i_elem, __comp) + : _split_point_t<_IdType>{__n1, __n2}); }); }); } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index a9e60b81c71..a39aa3cc052 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -27,7 +27,7 @@ #include "sycl_traits.h" // SYCL traits specialization for some oneDPL types. #include "../../utils.h" // __dpl_bit_floor, __dpl_bit_ceil #include "../../utils_ranges.h" // __difference_t -#include "parallel_backend_sycl_merge.h" // __find_start_point, __serial_merge +#include "parallel_backend_sycl_merge.h" // __find_start_point_in, __serial_merge namespace oneapi { @@ -91,8 +91,8 @@ struct __group_merge_path_sorter auto __in_ptr1 = __in_ptr + __start1; auto __in_ptr2 = __in_ptr + __start2; - const std::pair __start = - __find_start_point(__in_ptr1, __in_ptr2, __id_local, __n1, __n2, __comp); + const std::pair __start = __find_start_point_in( + __in_ptr1, std::uint32_t{0}, __n1, __in_ptr2, std::uint32_t{0}, __n2, __id_local, __comp); // TODO: copy the data into registers before the merge to halve the required amount of SLM __serial_merge(__in_ptr1, __in_ptr2, __out_ptr, __start.first, __start.second, __id, __data_per_workitem, __n1, __n2, __comp); @@ -272,7 +272,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const oneapi::dpl::__ranges::drop_view_simple __rng1(__dst, __offset); const oneapi::dpl::__ranges::drop_view_simple __rng2(__dst, __offset + __n1); - const auto start = 
__find_start_point(__rng1, __rng2, __i_elem_local, __n1, __n2, __comp); + const auto start = __find_start_point_in(__rng1, _IndexT{0}, __n1, __rng2, _IndexT{0}, __n2, + __i_elem_local, __comp); __serial_merge(__rng1, __rng2, __rng /*__rng3*/, start.first, start.second, __i_elem, __chunk, __n1, __n2, __comp); } @@ -281,7 +282,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const oneapi::dpl::__ranges::drop_view_simple __rng1(__rng, __offset); const oneapi::dpl::__ranges::drop_view_simple __rng2(__rng, __offset + __n1); - const auto start = __find_start_point(__rng1, __rng2, __i_elem_local, __n1, __n2, __comp); + const auto start = __find_start_point_in(__rng1, _IndexT{0}, __n1, __rng2, _IndexT{0}, __n2, + __i_elem_local, __comp); __serial_merge(__rng1, __rng2, __dst /*__rng3*/, start.first, start.second, __i_elem, __chunk, __n1, __n2, __comp); } From 5da98e44fd36e7bbb4d76cd40da63892c7be6cc0 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 12:54:06 +0100 Subject: [PATCH 71/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix self-review comment: we should describe lambda here as mutable 1) for compatibility with previous implementation 2) because at https://en.cppreference.com/w/cpp/algorithm/merge (for example) we see that bool cmp(const Type1& a, const Type2& b); isn't const Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 42bbcfdde48..abdf2accd2b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -123,7 +123,7 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn constexpr bool kValue = false; 
const __it_t __res = std::lower_bound(__diag_it_begin, __diag_it_end, kValue, - [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const bool __value) { + [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const bool __value) mutable { return __value == __comp(__rng2[__index_sum - __idx], __rng1[__idx]); }); From 63797c87991863ae1b763d9aeb99a0627066a928 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 12:37:45 +0100 Subject: [PATCH 72/76] Rename __find_start_point_in to __find_start_point Signed-off-by: Sergey Kopienko --- .../hetero/dpcpp/parallel_backend_sycl_merge.h | 14 +++++++------- .../dpcpp/parallel_backend_sycl_merge_sort.h | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index abdf2accd2b..aac2629adf2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -48,8 +48,8 @@ using _split_point_t = std::pair<_Index, _Index>; // 3 | 0 0 0 0 0 | template _split_point_t<_Index> -__find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, - const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) +__find_start_point(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, + const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) { // ----------------------- EXAMPLE ------------------------ // Let's consider the following input data: @@ -195,7 +195,7 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M sycl::range(__steps), [=](sycl::item __item_id) { const _IdType __i_elem = __item_id.get_linear_id() * __chunk; const auto __start = - __find_start_point_in(__rng1, _IdType{0}, __n1, __rng2, _IdType{0}, __n2, 
__i_elem, __comp); + __find_start_point(__rng1, _IdType{0}, __n1, __rng2, _IdType{0}, __n2, __i_elem, __comp); __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, __comp); }); @@ -277,8 +277,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, __base_diagonals_sp_global_ptr[__global_idx] = __i_elem == 0 ? _split_point_t<_IdType>{0, 0} - : (__i_elem < __n ? __find_start_point_in(__rng1, _IdType{0}, __n1, __rng2, - _IdType{0}, __n2, __i_elem, __comp) + : (__i_elem < __n ? __find_start_point(__rng1, _IdType{0}, __n1, __rng2, + _IdType{0}, __n2, __i_elem, __comp) : _split_point_t<_IdType>{__n1, __n2}); }); }); @@ -318,8 +318,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; - __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, - __sp_left.second, __sp_right.second, __i_elem, __comp); + __start = __find_start_point(__rng1, __sp_left.first, __sp_right.first, __rng2, + __sp_left.second, __sp_right.second, __i_elem, __comp); } else { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index a39aa3cc052..70299632223 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -27,7 +27,7 @@ #include "sycl_traits.h" // SYCL traits specialization for some oneDPL types. 
#include "../../utils.h" // __dpl_bit_floor, __dpl_bit_ceil #include "../../utils_ranges.h" // __difference_t -#include "parallel_backend_sycl_merge.h" // __find_start_point_in, __serial_merge +#include "parallel_backend_sycl_merge.h" // __find_start_point, __serial_merge namespace oneapi { @@ -91,7 +91,7 @@ struct __group_merge_path_sorter auto __in_ptr1 = __in_ptr + __start1; auto __in_ptr2 = __in_ptr + __start2; - const std::pair __start = __find_start_point_in( + const std::pair __start = __find_start_point( __in_ptr1, std::uint32_t{0}, __n1, __in_ptr2, std::uint32_t{0}, __n2, __id_local, __comp); // TODO: copy the data into registers before the merge to halve the required amount of SLM __serial_merge(__in_ptr1, __in_ptr2, __out_ptr, __start.first, __start.second, __id, __data_per_workitem, @@ -272,8 +272,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const oneapi::dpl::__ranges::drop_view_simple __rng1(__dst, __offset); const oneapi::dpl::__ranges::drop_view_simple __rng2(__dst, __offset + __n1); - const auto start = __find_start_point_in(__rng1, _IndexT{0}, __n1, __rng2, _IndexT{0}, __n2, - __i_elem_local, __comp); + const auto start = __find_start_point(__rng1, _IndexT{0}, __n1, __rng2, _IndexT{0}, __n2, + __i_elem_local, __comp); __serial_merge(__rng1, __rng2, __rng /*__rng3*/, start.first, start.second, __i_elem, __chunk, __n1, __n2, __comp); } @@ -282,8 +282,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const oneapi::dpl::__ranges::drop_view_simple __rng1(__rng, __offset); const oneapi::dpl::__ranges::drop_view_simple __rng2(__rng, __offset + __n1); - const auto start = __find_start_point_in(__rng1, _IndexT{0}, __n1, __rng2, _IndexT{0}, __n2, - __i_elem_local, __comp); + const auto start = __find_start_point(__rng1, _IndexT{0}, __n1, __rng2, _IndexT{0}, __n2, + __i_elem_local, __comp); __serial_merge(__rng1, __rng2, __dst /*__rng3*/, start.first, start.second, __i_elem, 
__chunk, __n1, __n2, __comp); } From e55ee66de725d43c5f70a8029eced764d34b7088 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 12:39:27 +0100 Subject: [PATCH 73/76] Final remove of extra changes Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index aac2629adf2..73412f448ca 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -196,8 +196,8 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M const _IdType __i_elem = __item_id.get_linear_id() * __chunk; const auto __start = __find_start_point(__rng1, _IdType{0}, __n1, __rng2, _IdType{0}, __n2, __i_elem, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, - __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); }); }); // We should return the same thing in the second param of __future for compatibility From 3f67f94729b5b3f05c107cf988207ccc7c855112 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 13:40:57 +0100 Subject: [PATCH 74/76] Final remove of extra changes Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 73412f448ca..85046fc8ef8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -165,11 +165,11 @@ 
__serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, const _I } // Please see the comment for __parallel_for_submitter for optional kernel name explanation -template +template struct __parallel_merge_submitter; -template -struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> +template +struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_Name...>> { template auto @@ -191,8 +191,7 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M auto __event = __exec.queue().submit( [&__rng1, &__rng2, &__rng3, __comp, __chunk, __steps, __n1, __n2](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__steps), [=](sycl::item __item_id) { + __cgh.parallel_for<_Name...>(sycl::range(__steps), [=](sycl::item __item_id) { const _IdType __i_elem = __item_id.get_linear_id() * __chunk; const auto __start = __find_start_point(__rng1, _IdType{0}, __n1, __rng2, _IdType{0}, __n2, __i_elem, __comp); From 297b644eadc13e3d056853582bc5368566f59eb9 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 14:36:57 +0100 Subject: [PATCH 75/76] Apply GitHUB clang format Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 85046fc8ef8..efc5e4dbe0f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -122,10 +122,11 @@ __find_start_point(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_ __it_t __diag_it_end(idx1_to); constexpr bool kValue = false; - const __it_t __res = std::lower_bound(__diag_it_begin, 
__diag_it_end, kValue, - [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const bool __value) mutable { - return __value == __comp(__rng2[__index_sum - __idx], __rng1[__idx]); - }); + const __it_t __res = + std::lower_bound(__diag_it_begin, __diag_it_end, kValue, + [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const bool __value) mutable { + return __value == __comp(__rng2[__index_sum - __idx], __rng1[__idx]); + }); return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; } @@ -192,12 +193,12 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N [&__rng1, &__rng2, &__rng3, __comp, __chunk, __steps, __n1, __n2](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); __cgh.parallel_for<_Name...>(sycl::range(__steps), [=](sycl::item __item_id) { - const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = - __find_start_point(__rng1, _IdType{0}, __n1, __rng2, _IdType{0}, __n2, __i_elem, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); - }); + const _IdType __i_elem = __item_id.get_linear_id() * __chunk; + const auto __start = + __find_start_point(__rng1, _IdType{0}, __n1, __rng2, _IdType{0}, __n2, __i_elem, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + }); }); // We should return the same thing in the second param of __future for compatibility // with the returning value in __parallel_merge_submitter_large::operator() From 0ec717db5226641fc949c9d71702ba21ce61c246 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Fri, 20 Dec 2024 15:47:38 +0100 Subject: [PATCH 76/76] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: constexpr bool kValue = false; has been removed Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 3 
+-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index efc5e4dbe0f..36860f2d449 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -121,9 +121,8 @@ __find_start_point(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_ __it_t __diag_it_begin(idx1_from); __it_t __diag_it_end(idx1_to); - constexpr bool kValue = false; const __it_t __res = - std::lower_bound(__diag_it_begin, __diag_it_end, kValue, + std::lower_bound(__diag_it_begin, __diag_it_end, false, [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const bool __value) mutable { return __value == __comp(__rng2[__index_sum - __idx], __rng1[__idx]); });