From f67503f555557357c3357334783adf91bdabeb82 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Sun, 8 Dec 2024 15:03:38 +0100 Subject: [PATCH 01/16] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - processing additional corner cases in __find_start_point_in Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 8cd262a4be2..be1048d81b5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -104,7 +104,7 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn return _split_point_t<_Index>{ __rng1_from, __rng2_from + __i_elem }; if (__rng2_from == __rng2_to) - return _split_point_t<_Index>{ __rng1_from + __i_elem, __rng2_to }; + return _split_point_t<_Index>{ __rng1_from + __i_elem, __rng2_from }; // ----------------------- EXAMPLE ------------------------ // Let's consider the following input data: From 3089f711d20e084006a35d5e4adbe9d8233d2fdc Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Tue, 10 Dec 2024 09:28:22 +0100 Subject: [PATCH 02/16] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove extra condition checks from __find_start_point_in Signed-off-by: Sergey Kopienko --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index be1048d81b5..ed4c7df1ded 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -97,15 +97,6 @@ _split_point_t<_Index> __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) { - if (__i_elem == 0) - return _split_point_t<_Index>{ 0, 0 }; - - if (__rng1_from == __rng1_to) - return _split_point_t<_Index>{ __rng1_from, __rng2_from + __i_elem }; - - if (__rng2_from == __rng2_to) - return _split_point_t<_Index>{ __rng1_from + __i_elem, __rng2_from }; - // ----------------------- EXAMPLE ------------------------ // Let's consider the following input data: // rng1.size() = 10 From c033585f16f41a8466c62fb62e2565b168eed3d1 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 10:02:16 +0100 Subject: [PATCH 03/16] Revert: Combine two submitters `__parallel_merge_submitter` and `__parallel_merge_submitter_large` into one `__parallel_merge_submitter` (#1956) Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 155 ++++++++++-------- 1 file changed, 91 insertions(+), 64 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index ed4c7df1ded..08ba09098d5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -228,15 +228,53 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index _ } } -template +// Please see the comment for __parallel_for_submitter 
for optional kernel name explanation +template struct __parallel_merge_submitter; -template -struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_kernel_name<_DiagonalsKernelName...>, - __internal::__optional_kernel_name<_MergeKernelName1...>, - __internal::__optional_kernel_name<_MergeKernelName2...>> +template +struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + assert(__n1 > 0 || __n2 > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Empirical number of values to process per work-item + const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; + + const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + + auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__steps), [=](sycl::item __item_id) { + const _IdType __i_elem = __item_id.get_linear_id() * __chunk; + const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + }); + }); + // We should return the same thing in the second param of __future for compatibility + // with the returning value in __parallel_merge_submitter_large::operator() + return __future(__event, __result_and_scratch_storage_base_ptr{}); + } +}; + +template +struct __parallel_merge_submitter_large; + +template +struct __parallel_merge_submitter_large<_IdType, _CustomName, + __internal::__optional_kernel_name<_DiagonalsKernelName...>, + __internal::__optional_kernel_name<_MergeKernelName...>> { protected: struct nd_range_params @@ -269,9 +307,9 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : __data_items_in_slm_bank; const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - const _IdType __base_diag_count = __use_base_diags ? 32 * 1'024 : 0; + const _IdType __base_diag_count = 32 * 1'024; const _IdType __steps_between_two_base_diags = - __use_base_diags ? 
oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count) : 0; + oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; } @@ -331,7 +369,7 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_MergeKernelName1...>( + __cgh.parallel_for<_MergeKernelName...>( sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); const _IdType __i_elem = __global_idx * __chunk; @@ -366,7 +404,7 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k __cgh.depends_on(__event); - __cgh.parallel_for<_MergeKernelName2...>( + __cgh.parallel_for<_MergeKernelName...>( sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { auto __global_idx = __item_id.get_linear_id(); const _IdType __i_elem = __global_idx * __nd_range_params.chunk; @@ -398,8 +436,6 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k } public: - __parallel_merge_submitter(bool __use_base_diags) : __use_base_diags(__use_base_diags) {} - template auto operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const @@ -413,42 +449,29 @@ struct __parallel_merge_submitter<_IdType, _CustomName, __internal::__optional_k __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base; - // Calculation of split points on each base diagonal - sycl::event __event; - if (__use_base_diags) - { - // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) - auto __p_base_diagonals_sp_global_storage = - new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( - __exec, 0, __nd_range_params.base_diag_count + 1); - __p_result_and_scratch_storage_base.reset( - static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); - - __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); - - // Merge data using split points on each base diagonal - __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); - } - else - { - // Merge data using split points on each base diagonal - __event = run_parallel_merge(__exec, __rng1, __rng2, __rng3, __comp, __nd_range_params); - } + // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + auto __p_base_diagonals_sp_global_storage = + new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( + __exec, 0, __nd_range_params.base_diag_count + 1); + __p_result_and_scratch_storage_base.reset( + static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); + + sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); + + // Merge data using split points on each base diagonal + __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); } - - private: 
- const bool __use_base_diags = false; }; template -class __merge_kernel_name1; +class __merge_kernel_name; template -class __merge_kernel_name2; +class __merge_kernel_name_large; template class __diagonals_kernel_name; @@ -460,38 +483,42 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - const std::size_t __n = __rng1.size() + __rng2.size(); - constexpr std::size_t __starting_size_limit_for_large_submitter = 4 * 1'048'576; // 4 MB - const bool __use_base_diags = __n >= __starting_size_limit_for_large_submitter; - if (__n <= std::numeric_limits::max()) + const std::size_t __n = __rng1.size() + __rng2.size(); + if (__n < __starting_size_limit_for_large_submitter) { using _WiIndex = std::uint32_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName1 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name1<_CustomName, _WiIndex>>; - using _MergeKernelName2 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name2<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName1, - _MergeKernelName2>(__use_base_diags)( + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name<_CustomName, _WiIndex>>; + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } else { - using _WiIndex = std::uint64_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName1 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name1<_CustomName, _WiIndex>>; - using _MergeKernelName2 = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name2<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName1, - _MergeKernelName2>(__use_base_diags)( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); + if (__n <= std::numeric_limits::max()) + { + using _WiIndex = std::uint32_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + using _WiIndex = std::uint64_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return 
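+            // (editor's note) same dispatch as the 32-bit branch above; only the
+            // work-item index type widens to std::uint64_t, because a combined input
+            // size above std::uint32_t's maximum can no longer be indexed in 32 bits.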
__parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } } } From a06ac54f4a97195a477e8694ee9e834618d4119b Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 10:05:15 +0100 Subject: [PATCH 04/16] Call __find_start_point_in instead of __find_start_point in the __parallel_merge_submitter_large::run_parallel_merge Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 08ba09098d5..56b60cb6bd2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -376,7 +376,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, if (__i_elem < __n1 + __n2) { - _split_point_t<_IdType> __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + _split_point_t<_IdType> __start = __find_start_point_in(__rng1, (_IdType)0, __n1, __rng2, (_IdType)0, __n2, __i_elem, __comp); __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, __comp); } From c96cccfbe47c3196ce4e0d79651cbce24bb6b763 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 10:25:45 +0100 Subject: [PATCH 05/16] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: I would use std::pair<_Index> directly here. Signed-off-by: Sergey Kopienko --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 56b60cb6bd2..8d0c4527d6e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -174,7 +174,7 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn return __zero_or_one < kValue; }); - return {*__res, __index_sum - *__res + 1}; + return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; } // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing From 2d8f480c0ea5681f9a5a36775ab0b722be3346ad Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 11:15:38 +0100 Subject: [PATCH 06/16] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix performance degradation for 8 Mb int type Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 1066 +++++++++-------- 1 file changed, 537 insertions(+), 529 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 8d0c4527d6e..895253a151e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -1,529 +1,537 @@ -// -*- C++ -*- -//===-- parallel_backend_sycl_merge.h --------------------------------===// -// -// Copyright (C) Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 WITH 
LLVM-exception -// -// This file incorporates work covered by the following copyright and permission -// notice: -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// -//===----------------------------------------------------------------------===// - -#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H -#define _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H - -#include // std::numeric_limits -#include // assert -#include // std::uint8_t, ... -#include // std::make_pair, std::forward -#include // std::min, std::lower_bound - -#include "sycl_defs.h" -#include "parallel_backend_sycl_utils.h" - -namespace oneapi -{ -namespace dpl -{ -namespace __par_backend_hetero -{ -template -using _split_point_t = std::pair<_Index, _Index>; - -//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges -//to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: -// 0 1 1 2 3 -// ------------------ -// |---> -// 0 | 0 | 1 1 1 1 -// | | -// 0 | 0 | 1 1 1 1 -// | ----------> -// 2 | 0 0 0 0 | 1 -// | ----> -// 3 | 0 0 0 0 0 | -template -auto -__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, - const _Index __n2, _Compare __comp) -{ - //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 1, 1] - oneapi::dpl::counting_iterator<_Index> __diag_it(0); - - if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed - { - const _Index __q = __i_elem; //diagonal index - const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); - return __zero_or_one < __value; - }); - return std::make_pair(*__res, __q - *__res); - } - else - { - const _Index __q = __i_elem - __n2; //diagonal index - const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); - return __zero_or_one < __value; - }); - return std::make_pair(__q + *__res, __n2 - *__res); - } -} - -//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges -//to serial merge. 
For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: -// 0 1 1 2 3 -// ------------------ -// |---> -// 0 | 0 | 1 1 1 1 -// | | -// 0 | 0 | 1 1 1 1 -// | ----------> -// 2 | 0 0 0 0 | 1 -// | ----> -// 3 | 0 0 0 0 0 | -template -_split_point_t<_Index> -__find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, - const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) -{ - // ----------------------- EXAMPLE ------------------------ - // Let's consider the following input data: - // rng1.size() = 10 - // rng2.size() = 6 - // i_diag = 9 - // Let's define the following ranges for processing: - // rng1: [3, ..., 9) -> __rng1_from = 3, __rng1_to = 9 - // rng2: [1, ..., 4) -> __rng2_from = 1, __rng2_to = 4 - // - // The goal: required to process only X' items of the merge matrix - // as intersection of rng1[3, ..., 9) and rng2[1, ..., 4) - // - // -------------------------------------------------------- - // - // __diag_it_begin(rng1) __diag_it_end(rng1) - // (init state) (dest state) (init state, dest state) - // | | | - // V V V - // + + + + + + - // \ rng1 0 1 2 3 4 5 6 7 8 9 - // rng2 +--------------------------------------+ - // 0 | ^ ^ ^ X | <--- __diag_it_end(rng2) (init state) - // + 1 | <----------------- + + X'2 ^ | <--- __diag_it_end(rng2) (dest state) - // + 2 | <----------------- + X'1 | | - // + 3 | <----------------- X'0 | | <--- __diag_it_begin(rng2) (dest state) - // 4 | X ^ | | - // 5 | X | | | <--- __diag_it_begin(rng2) (init state) - // +-------AX-----------+-----------+-----+ - // AX | | - // AX | | - // Run lower_bound:[from = 5, to = 8) - // - // AX - absent items in rng2 - // - // We have three points on diagonal for call comparison: - // X'0 : call __comp(rng1[5], rng2[3]) // 5 + 3 == 9 - 1 == 8 - // X'1 : call __comp(rng1[6], rng2[2]) // 6 + 2 == 9 - 1 == 8 - // X'3 : call __comp(rng1[7], rng2[1]) // 7 + 1 == 9 - 1 == 8 - // - where for every comparing pairs idx(rng1) + idx(rng2) == i_diag - 1 - - //////////////////////////////////////////////////////////////////////////////////// - // Taking into account the specified constraints of the range of processed data - const auto __index_sum = __i_elem - 1; - - using _IndexSigned = std::make_signed_t<_Index>; - - _IndexSigned idx1_from = __rng1_from; - _IndexSigned idx1_to = __rng1_to; - - _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); - _IndexSigned idx2_to = __index_sum - __rng1_from + 1; - - const _IndexSigned idx2_from_diff = - idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; - const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? 
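        // (editor's note) overshoot of the raw diagonal past __rng2_to; it is compensated
        // below by advancing idx1_from, preserving idx1 + idx2 == __index_sum on the diagonal.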
idx2_to - (_IndexSigned)__rng2_to : 0; - - idx1_to -= idx2_from_diff; - idx1_from += idx2_to_diff; - - idx2_from = __index_sum - (idx1_to - 1); - idx2_to = __index_sum - idx1_from + 1; - - //////////////////////////////////////////////////////////////////////////////////// - // Run search of split point on diagonal - - using __it_t = oneapi::dpl::counting_iterator<_Index>; - - __it_t __diag_it_begin(idx1_from); - __it_t __diag_it_end(idx1_to); - - constexpr int kValue = 1; - const __it_t __res = - std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { - const auto __zero_or_one = __comp(__rng2[__index_sum - __idx], __rng1[__idx]); - return __zero_or_one < kValue; - }); - - return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; -} - -// Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing -// to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) -template -void -__serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index __start1, _Index __start2, - const _Index __start3, const std::uint8_t __chunk, const _Index __n1, const _Index __n2, _Compare __comp) -{ - if (__start1 >= __n1) - { - //copying a residual of the second seq - const _Index __n = std::min<_Index>(__n2 - __start2, __chunk); - for (std::uint8_t __i = 0; __i < __n; ++__i) - __rng3[__start3 + __i] = __rng2[__start2 + __i]; - } - else if (__start2 >= __n2) - { - //copying a residual of the first seq - const _Index __n = std::min<_Index>(__n1 - __start1, __chunk); - for (std::uint8_t __i = 0; __i < __n; ++__i) - __rng3[__start3 + __i] = __rng1[__start1 + __i]; - } - else - { - for (std::uint8_t __i = 0; __i < __chunk && __start1 < __n1 && __start2 < __n2; ++__i) - { - const auto& __val1 = __rng1[__start1]; - const auto& __val2 = __rng2[__start2]; - if (__comp(__val2, __val1)) - { - __rng3[__start3 + __i] = __val2; - if (++__start2 == __n2) - { - //copying a residual of the first seq - for (++__i; __i < __chunk && __start1 < __n1; ++__i, ++__start1) - __rng3[__start3 + __i] = __rng1[__start1]; - } - } - else - { - __rng3[__start3 + __i] = __val1; - if (++__start1 == __n1) - { - //copying a residual of the second seq - for (++__i; __i < __chunk && __start2 < __n2; ++__i, ++__start2) - __rng3[__start3 + __i] = __rng2[__start2]; - } - } - } - } -} - -// Please see the comment for __parallel_for_submitter for optional kernel name explanation -template -struct __parallel_merge_submitter; - -template -struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> -{ - template - auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; - - assert(__n1 > 0 || __n2 > 0); - - _PRINT_INFO_IN_DEBUG_MODE(__exec); - - // Empirical number of values to process per work-item - const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
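            // (editor's note) empirically tuned values: a wide chunk amortizes per-item
            // overhead on CPU-like devices, while 4 keeps more work-items in flight on GPUs.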
128 : 4; - - const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - - auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__steps), [=](sycl::item __item_id) { - const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); - }); - }); - // We should return the same thing in the second param of __future for compatibility - // with the returning value in __parallel_merge_submitter_large::operator() - return __future(__event, __result_and_scratch_storage_base_ptr{}); - } -}; - -template -struct __parallel_merge_submitter_large; - -template -struct __parallel_merge_submitter_large<_IdType, _CustomName, - __internal::__optional_kernel_name<_DiagonalsKernelName...>, - __internal::__optional_kernel_name<_MergeKernelName...>> -{ - protected: - struct nd_range_params - { - std::size_t base_diag_count = 0; - std::size_t steps_between_two_base_diags = 0; - std::uint8_t chunk = 0; - _IdType steps = 0; - }; - - // Calculate nd-range params - template - nd_range_params - eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2) const - { - using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; - using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; - using _RangeValueType = std::conditional_t<(sizeof(_Range1ValueType) > sizeof(_Range2ValueType)), - _Range1ValueType, _Range2ValueType>; - - const std::size_t __n = __rng1.size() + __rng2.size(); - - constexpr std::size_t __slm_bank_size = 16; // TODO is it correct value? How to get it from hardware? - - // Calculate how many data items we can read into one SLM bank - constexpr std::size_t __data_items_in_slm_bank = - oneapi::dpl::__internal::__dpl_ceiling_div(__slm_bank_size, sizeof(_RangeValueType)); - - // Empirical number of values to process per work-item - const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
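        // (editor's note) unlike the small submitter, the GPU chunk here is derived from
        // the assumed 16-byte SLM bank size above: e.g. 4 items for 4-byte keys,
        // 2 items for 8-byte keys.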
128 : __data_items_in_slm_bank; - - const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - const _IdType __base_diag_count = 32 * 1'024; - const _IdType __steps_between_two_base_diags = - oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); - - return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; - } - - // Calculation of split points on each base diagonal - template - sycl::event - eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Compare __comp, - const nd_range_params& __nd_range_params, - _Storage& __base_diagonals_sp_global_storage) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - const _IdType __n = __n1 + __n2; - - sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); - auto __base_diagonals_sp_global_acc = - __base_diagonals_sp_global_storage.template __get_scratch_acc( - __cgh, __dpl_sycl::__no_init{}); - - __cgh.parallel_for<_DiagonalsKernelName...>( - sycl::range(__nd_range_params.base_diag_count + 1), [=](sycl::item __item_id) { - auto __global_idx = __item_id.get_linear_id(); - auto __base_diagonals_sp_global_ptr = - _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - - _split_point_t<_IdType> __sp = - __global_idx == 0 ? _split_point_t<_IdType>{0, 0} : _split_point_t<_IdType>{__n1, __n2}; - - if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) - { - const _IdType __i_elem = - __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; - if (__i_elem < __n) - __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - } - - __base_diagonals_sp_global_ptr[__global_idx] = __sp; - }); - }); - - return __event; - } - - // Process parallel merge - template - sycl::event - run_parallel_merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, - const nd_range_params& __nd_range_params) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - - const auto __chunk = __nd_range_params.chunk; - - sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { - auto __global_idx = __item_id.get_linear_id(); - const _IdType __i_elem = __global_idx * __chunk; - - if (__i_elem < __n1 + __n2) - { - _split_point_t<_IdType> __start = __find_start_point_in(__rng1, (_IdType)0, __n1, __rng2, (_IdType)0, __n2, __i_elem, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); - } - }); - }); - - return __event; - } - - // Process parallel merge - template - sycl::event - run_parallel_merge(sycl::event __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, - _Range3&& __rng3, _Compare __comp, const nd_range_params& __nd_range_params, - const _Storage& __base_diagonals_sp_global_storage) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - - __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - auto __base_diagonals_sp_global_acc = - __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); - - 
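            // The diagonals kernel must finish writing the base split-points before this
            // merge kernel reads them, hence the explicit dependency on __event below.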
__cgh.depends_on(__event); - - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { - auto __global_idx = __item_id.get_linear_id(); - const _IdType __i_elem = __global_idx * __nd_range_params.chunk; - - auto __base_diagonals_sp_global_ptr = - _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); - auto __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; - - _split_point_t<_IdType> __start; - if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) - { - const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; - const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; - - __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, - __sp_left.second, __sp_right.second, __i_elem, __comp); - } - else - { - __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; - } - - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, - __nd_range_params.chunk, __n1, __n2, __comp); - }); - }); - - return __event; - } - - public: - template - auto - operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const - { - assert(__rng1.size() > 0 || __rng2.size() > 0); - - _PRINT_INFO_IN_DEBUG_MODE(__exec); - - // Calculate nd-range params - const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); - - __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base; - - // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) - auto __p_base_diagonals_sp_global_storage = - new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( - __exec, 0, __nd_range_params.base_diag_count + 1); - __p_result_and_scratch_storage_base.reset( - static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); - - sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); - - // Merge data using split points on each base diagonal - __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); - - return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); - } -}; - -template -class __merge_kernel_name; - -template -class __merge_kernel_name_large; - -template -class __diagonals_kernel_name; - -template -auto -__parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, - _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) -{ - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - - constexpr std::size_t __starting_size_limit_for_large_submitter = 4 * 1'048'576; // 4 MB - - const std::size_t __n = __rng1.size() + __rng2.size(); - if (__n < __starting_size_limit_for_large_submitter) - { - using _WiIndex = std::uint32_t; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - else - { - if (__n <= std::numeric_limits::max()) - { - 
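            // (editor's note) 32-bit work-item indices are kept for as long as the combined
            // size fits, presumably because narrower index arithmetic is cheaper on device.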
using _WiIndex = std::uint32_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - else - { - using _WiIndex = std::uint64_t; - using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __diagonals_kernel_name<_CustomName, _WiIndex>>; - using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name_large<_CustomName, _WiIndex>>; - return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); - } - } -} - -} // namespace __par_backend_hetero -} // namespace dpl -} // namespace oneapi - -#endif // _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H +// -*- C++ -*- +//===-- parallel_backend_sycl_merge.h --------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H +#define _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H + +#include // std::numeric_limits +#include // assert +#include // std::uint8_t, ... +#include // std::make_pair, std::forward +#include // std::min, std::lower_bound + +#include "sycl_defs.h" +#include "parallel_backend_sycl_utils.h" + +namespace oneapi +{ +namespace dpl +{ +namespace __par_backend_hetero +{ +template +using _split_point_t = std::pair<_Index, _Index>; + +//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges +//to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: +// 0 1 1 2 3 +// ------------------ +// |---> +// 0 | 0 | 1 1 1 1 +// | | +// 0 | 0 | 1 1 1 1 +// | ----------> +// 2 | 0 0 0 0 | 1 +// | ----> +// 3 | 0 0 0 0 0 | +template +auto +__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, + const _Index __n2, _Compare __comp) +{ + //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 
1, 1]
    oneapi::dpl::counting_iterator<_Index> __diag_it(0);

    if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed
    {
        const _Index __q = __i_elem;                         //diagonal index
        const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size
        auto __res =
            std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/,
                             [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable {
                                 const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]);
                                 return __zero_or_one < __value;
                             });
        return std::make_pair(*__res, __q - *__res);
    }
    else
    {
        const _Index __q = __i_elem - __n2;                         //diagonal index
        const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size
        auto __res =
            std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/,
                             [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable {
                                 const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]);
                                 return __zero_or_one < __value;
                             });
        return std::make_pair(__q + *__res, __n2 - *__res);
    }
}

//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges
//to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below:
//     0   1   1   2   3
//    ------------------
//   |--->
// 0 | 0 | 1   1   1   1
//   |   |
// 0 | 0 | 1   1   1   1
//   |   ---------->
// 2 | 0   0   0   0 | 1
//   |               ---->
// 3 | 0   0   0   0   0 |
template <typename _Rng1, typename _Rng2, typename _Index, typename _Compare>
_split_point_t<_Index>
__find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2,
                      const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp)
{
    // ----------------------- EXAMPLE ------------------------
    // Let's consider the following input data:
    //     rng1.size() = 10
    //     rng2.size() = 6
    //     i_diag = 9
    // Let's define the following ranges for processing:
    //     rng1: [3, ..., 9) -> __rng1_from = 3, __rng1_to = 9
    //     rng2: [1, ..., 4) -> __rng2_from = 1, __rng2_to = 4
    //
    // The goal: process only the X' items of the merge matrix,
    //           i.e. the intersection of rng1[3, ..., 9) and rng2[1, ..., 4)
    //
    // --------------------------------------------------------
    //
    //         __diag_it_begin(rng1)       __diag_it_end(rng1)
    //      (init state) (dest state)  (init state, dest state)
    //            |           |            |
    //            V           V            V
    //                        +   +   +   +   +   +
    //     \ rng1    0   1    2   3   4   5   6   7   8   9
    //      rng2   +--------------------------------------+
    //        0    |                      ^   ^   ^   X   |   <--- __diag_it_end(rng2) (init state)
    //     +  1    | <----------------- + + X'2       ^   |   <--- __diag_it_end(rng2) (dest state)
    //     +  2    | <----------------- + X'1         |   |
    //     +  3    | <----------------- X'0           |   |   <--- __diag_it_begin(rng2) (dest state)
    //        4    |   X   ^                          |   |
    //        5    |   X   |                          |   |   <--- __diag_it_begin(rng2) (init state)
    //             +-------AX-----------+-----------+-----+
    //                     AX           |           |
    //                     AX           |           |
    //                 Run lower_bound:[from = 5, to = 8)
    //
    //  AX - absent items in rng2
    //
    //  We have three points on the diagonal to compare:
    //   X'0 : call __comp(rng1[5], rng2[3])  // 5 + 3 == 9 - 1 == 8
    //   X'1 : call __comp(rng1[6], rng2[2])  // 6 + 2 == 9 - 1 == 8
    //   X'2 : call __comp(rng1[7], rng2[1])  // 7 + 1 == 9 - 1 == 8
    //   - where for every compared pair idx(rng1) + idx(rng2) == i_diag - 1

    ////////////////////////////////////////////////////////////////////////////////////
    // Taking into account the specified constraints of the range of processed data
    const auto __index_sum = __i_elem - 1;

    using
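    // (editor's note) signed arithmetic is required here: before the clamping below, the
    // raw idx2_from / idx2_to bounds may fall outside [0, rng2.size()) and go negative.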
_IndexSigned = std::make_signed_t<_Index>; + + _IndexSigned idx1_from = __rng1_from; + _IndexSigned idx1_to = __rng1_to; + + _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); + _IndexSigned idx2_to = __index_sum - __rng1_from + 1; + + const _IndexSigned idx2_from_diff = + idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; + const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? idx2_to - (_IndexSigned)__rng2_to : 0; + + idx1_to -= idx2_from_diff; + idx1_from += idx2_to_diff; + + idx2_from = __index_sum - (idx1_to - 1); + idx2_to = __index_sum - idx1_from + 1; + + //////////////////////////////////////////////////////////////////////////////////// + // Run search of split point on diagonal + + using __it_t = oneapi::dpl::counting_iterator<_Index>; + + __it_t __diag_it_begin(idx1_from); + __it_t __diag_it_end(idx1_to); + + constexpr int kValue = 1; + const __it_t __res = + std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) { + const auto __zero_or_one = __comp(__rng2[__index_sum - __idx], __rng1[__idx]); + return __zero_or_one < kValue; + }); + + return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; +} + +// Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing +// to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) +template +void +__serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index __start1, _Index __start2, + const _Index __start3, const std::uint8_t __chunk, const _Index __n1, const _Index __n2, _Compare __comp) +{ + if (__start1 >= __n1) + { + //copying a residual of the second seq + const _Index __n = std::min<_Index>(__n2 - __start2, __chunk); + for (std::uint8_t __i = 0; __i < __n; ++__i) + __rng3[__start3 + __i] = __rng2[__start2 + __i]; + } + else if (__start2 >= __n2) + { + //copying a residual of the first seq + const _Index __n = std::min<_Index>(__n1 - __start1, __chunk); + for (std::uint8_t __i = 0; __i < __n; ++__i) + __rng3[__start3 + __i] = __rng1[__start1 + __i]; + } + else + { + for (std::uint8_t __i = 0; __i < __chunk && __start1 < __n1 && __start2 < __n2; ++__i) + { + const auto& __val1 = __rng1[__start1]; + const auto& __val2 = __rng2[__start2]; + if (__comp(__val2, __val1)) + { + __rng3[__start3 + __i] = __val2; + if (++__start2 == __n2) + { + //copying a residual of the first seq + for (++__i; __i < __chunk && __start1 < __n1; ++__i, ++__start1) + __rng3[__start3 + __i] = __rng1[__start1]; + } + } + else + { + __rng3[__start3 + __i] = __val1; + if (++__start1 == __n1) + { + //copying a residual of the second seq + for (++__i; __i < __chunk && __start2 < __n2; ++__i, ++__start2) + __rng3[__start3 + __i] = __rng2[__start2]; + } + } + } + } +} + +// Please see the comment for __parallel_for_submitter for optional kernel name explanation +template +struct __parallel_merge_submitter; + +template +struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + assert(__n1 > 0 || __n2 > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Empirical number of values to process per work-item + const std::uint8_t __chunk = 
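            // (illustrative) e.g. for two 512'000-element inputs on a GPU, __chunk == 4
            // below yields __steps == 256'000 independent work-items.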
__exec.queue().get_device().is_cpu() ? 128 : 4; + + const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + + auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__steps), [=](sycl::item __item_id) { + const _IdType __i_elem = __item_id.get_linear_id() * __chunk; + const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + }); + }); + // We should return the same thing in the second param of __future for compatibility + // with the returning value in __parallel_merge_submitter_large::operator() + return __future(__event, __result_and_scratch_storage_base_ptr{}); + } +}; + +template +struct __parallel_merge_submitter_large; + +template +struct __parallel_merge_submitter_large<_IdType, _CustomName, + __internal::__optional_kernel_name<_DiagonalsKernelName...>, + __internal::__optional_kernel_name<_MergeKernelName...>> +{ + protected: + struct nd_range_params + { + std::size_t base_diag_count = 0; + std::size_t steps_between_two_base_diags = 0; + std::uint8_t chunk = 0; + _IdType steps = 0; + }; + + // Calculate nd-range params + template + nd_range_params + eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2) const + { + using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; + using _RangeValueType = std::conditional_t<(sizeof(_Range1ValueType) > sizeof(_Range2ValueType)), + _Range1ValueType, _Range2ValueType>; + + const std::size_t __n = __rng1.size() + __rng2.size(); + + // Empirical number of values to process per work-item + const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; + + const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + const _IdType __base_diag_count = 32 * 1'024; + const _IdType __steps_between_two_base_diags = + oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); + + return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; + } + + // Calculation of split points on each base diagonal + template + sycl::event + eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Compare __comp, + const nd_range_params& __nd_range_params, + _Storage& __base_diagonals_sp_global_storage) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc( + __cgh, __dpl_sycl::__no_init{}); + + __cgh.parallel_for<_DiagonalsKernelName...>( + sycl::range(__nd_range_params.base_diag_count + 1), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + + _split_point_t<_IdType> __sp = + __global_idx == 0 ? 
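                        // (editor's note) the endpoints are pre-seeded: slot 0 holds {0, 0} and the
                        // extra last slot holds {__n1, __n2}, so every interior base diagonal always
                        // has a right neighbour bounding the in-window search in the merge kernel.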
_split_point_t<_IdType>{0, 0} : _split_point_t<_IdType>{__n1, __n2}; + + if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) + { + const _IdType __i_elem = + __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; + if (__i_elem < __n) + __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + } + + __base_diagonals_sp_global_ptr[__global_idx] = __sp; + }); + }); + + return __event; + } + + // Process parallel merge + template + sycl::event + run_parallel_merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, + const nd_range_params& __nd_range_params) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + + const auto __chunk = __nd_range_params.chunk; + + sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + const _IdType __i_elem = __global_idx * __chunk; + + if (__i_elem < __n1 + __n2) + { + _split_point_t<_IdType> __start = __find_start_point_in(__rng1, (_IdType)0, __n1, __rng2, (_IdType)0, __n2, __i_elem, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + } + }); + }); + + return __event; + } + + // Process parallel merge + template + sycl::event + run_parallel_merge(sycl::event __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, + _Range3&& __rng3, _Compare __comp, const nd_range_params& __nd_range_params, + const _Storage& __base_diagonals_sp_global_storage) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + + __event = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); + + __cgh.depends_on(__event); + + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + const _IdType __i_elem = __global_idx * __nd_range_params.chunk; + + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; + + _split_point_t<_IdType> __start; + if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) + { + const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; + const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; + + __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, + __sp_left.second, __sp_right.second, __i_elem, __comp); + } + else + { + __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; + } + + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, + __nd_range_params.chunk, __n1, __n2, __comp); + }); + }); + + return __event; + } + + public: + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + assert(__rng1.size() > 0 || __rng2.size() > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Calculate 
nd-range params
        const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2);

        __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base;

        // Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group)
        auto __p_base_diagonals_sp_global_storage =
            new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>(
                __exec, 0, __nd_range_params.base_diag_count + 1);
        __p_result_and_scratch_storage_base.reset(
            static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage));

        sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params,
                                                           *__p_base_diagonals_sp_global_storage);

        // Merge data using split points on each base diagonal
        __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params,
                                     *__p_base_diagonals_sp_global_storage);

        return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base));
    }
};

template <typename... _Name>
class __merge_kernel_name;

template <typename... _Name>
class __merge_kernel_name_large;

template <typename... _Name>
class __diagonals_kernel_name;

template <typename _Tp>
std::size_t
starting_size_limit_for_large_submitter()
{
    return 4 * 1'048'576; // 4 M elements
}

template <>
std::size_t
starting_size_limit_for_large_submitter<int>()
{
    return 16 * 1'048'576; // 16 M elements
}

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare>
auto
__parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1,
                 _Range2&& __rng2, _Range3&& __rng3, _Compare __comp)
{
    using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;

    using __value_type = oneapi::dpl::__internal::__value_t<_Range3>;

    const std::size_t __n = __rng1.size() + __rng2.size();
    if (__n < starting_size_limit_for_large_submitter<__value_type>())
    {
        using _WiIndex = std::uint32_t;
        using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
            __merge_kernel_name<_CustomName, _WiIndex>>;
        return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()(
            std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
            std::forward<_Range3>(__rng3), __comp);
    }
    else
    {
        if (__n <= std::numeric_limits<std::uint32_t>::max())
        {
            using _WiIndex = std::uint32_t;
            using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
                __diagonals_kernel_name<_CustomName, _WiIndex>>;
            using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
                __merge_kernel_name_large<_CustomName, _WiIndex>>;
            return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()(
                std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
                std::forward<_Range3>(__rng3), __comp);
        }
        else
        {
            using _WiIndex = std::uint64_t;
            using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
                __diagonals_kernel_name<_CustomName, _WiIndex>>;
            using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
                __merge_kernel_name_large<_CustomName, _WiIndex>>;
            return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()(
                std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
                std::forward<_Range3>(__rng3), __comp);
        }
    }
}

}
// namespace __par_backend_hetero +} // namespace dpl +} // namespace oneapi + +#endif // _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H From 05ff60fe2fb23638af569c52f8bccd60512af1a6 Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 11:28:43 +0100 Subject: [PATCH 07/16] Apply GitHUB clang format Signed-off-by: Sergey Kopienko --- .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index 895253a151e..f6c7f7a3d5c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -370,9 +370,10 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, if (__i_elem < __n1 + __n2) { - _split_point_t<_IdType> __start = __find_start_point_in(__rng1, (_IdType)0, __n1, __rng2, (_IdType)0, __n2, __i_elem, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); + _split_point_t<_IdType> __start = + __find_start_point_in(__rng1, (_IdType)0, __n1, __rng2, (_IdType)0, __n2, __i_elem, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, + __n2, __comp); } }); }); @@ -451,11 +452,11 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); + *__p_base_diagonals_sp_global_storage); // Merge data using split points on each base diagonal __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, - *__p_base_diagonals_sp_global_storage); + *__p_base_diagonals_sp_global_storage); return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); } From ea47019fd4af943b33ced96ea0976e6474ede0fa Mon Sep 17 00:00:00 2001 From: Sergey Kopienko Date: Mon, 16 Dec 2024 15:13:41 +0100 Subject: [PATCH 08/16] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - remove run_parallel_merge with old implementation Signed-off-by: Sergey Kopienko --- .../dpcpp/parallel_backend_sycl_merge.h | 32 ------------------- 1 file changed, 32 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index f6c7f7a3d5c..658598af4db 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -349,38 +349,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, return __event; } - // Process parallel merge - template - sycl::event - run_parallel_merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp, - const nd_range_params& __nd_range_params) const - { - const _IdType __n1 = __rng1.size(); - const _IdType __n2 = __rng2.size(); - - const auto __chunk = __nd_range_params.chunk; - - sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { - auto 
-                    auto __global_idx = __item_id.get_linear_id();
-                    const _IdType __i_elem = __global_idx * __chunk;
-
-                    if (__i_elem < __n1 + __n2)
-                    {
-                        _split_point_t<_IdType> __start =
-                            __find_start_point_in(__rng1, (_IdType)0, __n1, __rng2, (_IdType)0, __n2, __i_elem, __comp);
-                        __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1,
-                                       __n2, __comp);
-                    }
-                });
-        });
-
-        return __event;
-    }
-
     // Process parallel merge
     template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
               typename _Storage>

From 73bbc141cf0ccaf69e7bd949b532708dbef2748f Mon Sep 17 00:00:00 2001
From: Sergey Kopienko
Date: Mon, 16 Dec 2024 15:32:12 +0100
Subject: [PATCH 09/16] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix self-review comment

Signed-off-by: Sergey Kopienko
---
 .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h   | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
index 658598af4db..6b19d748400 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
@@ -440,15 +440,15 @@ template <typename... _Name>
 class __diagonals_kernel_name;
 
 template <typename _Tp>
-std::size_t
-starting_size_limit_for_large_submitter()
+constexpr std::size_t
+__get_starting_size_limit_for_large_submitter()
 {
     return 4 * 1'048'576; // 4 MB
 }
 
 template <>
-std::size_t
-starting_size_limit_for_large_submitter<int>()
+constexpr std::size_t
+__get_starting_size_limit_for_large_submitter<int>()
 {
     return 16 * 1'048'576; // 16 MB
 }
@@ -463,9 +463,10 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy
 
     using __value_type = oneapi::dpl::__internal::__value_t<_Range3>;
 
     const std::size_t __n = __rng1.size() + __rng2.size();
-    if (__n < starting_size_limit_for_large_submitter<__value_type>())
+    if (__n < __get_starting_size_limit_for_large_submitter<__value_type>())
     {
         using _WiIndex = std::uint32_t;
+        static_assert(__get_starting_size_limit_for_large_submitter<__value_type>() <= std::numeric_limits<_WiIndex>::max());
         using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
             __merge_kernel_name<_CustomName, _WiIndex>>;
         return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()(

From e0c1628adc625adfa59a5bcc3cb8bb1f79615700 Mon Sep 17 00:00:00 2001
From: Sergey Kopienko
Date: Mon, 16 Dec 2024 16:24:42 +0100
Subject: [PATCH 10/16] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix self-review comments

Signed-off-by: Sergey Kopienko
---
 .../hetero/dpcpp/parallel_backend_sycl_merge.h | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
index 6b19d748400..2a9221595e4 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
@@ -169,7 +169,7 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn
 
     constexpr int kValue = 1;
     const __it_t __res =
-        std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&](_Index __idx, const auto& __value) {
+        std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const auto& __value) {
             const auto __zero_or_one = __comp(__rng2[__index_sum - __idx], __rng1[__idx]);
             return __zero_or_one < kValue;
         });
@@ -252,7 +252,7 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M
         const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk);
 
-        auto __event = __exec.queue().submit([&](sycl::handler& __cgh) {
+        auto __event = __exec.queue().submit([&__rng1, &__rng2, &__rng3, __steps, __chunk, __n1, __n2, __comp](sycl::handler& __cgh) {
             oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3);
             __cgh.parallel_for<_MergeKernelName...>(
                 sycl::range</*dim =*/1>(__steps), [=](sycl::item</*dim =*/1> __item_id) {
                     const _IdType __i_elem = __item_id.get_linear_id() * __chunk;
@@ -319,7 +319,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
         const _IdType __n2 = __rng2.size();
         const _IdType __n = __n1 + __n2;
 
-        sycl::event __event = __exec.queue().submit([&](sycl::handler& __cgh) {
+        return __exec.queue().submit([&__rng1, &__rng2, __base_diagonals_sp_global_storage, __n1, __n2, __n, __nd_range_params, __comp](sycl::handler& __cgh) {
             oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2);
             auto __base_diagonals_sp_global_acc =
                 __base_diagonals_sp_global_storage.template __get_scratch_acc(
@@ -345,22 +345,20 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
                     __base_diagonals_sp_global_ptr[__global_idx] = __sp;
                 });
         });
-
-        return __event;
     }
 
     // Process parallel merge
     template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
               typename _Storage>
     sycl::event
-    run_parallel_merge(sycl::event __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2,
+    run_parallel_merge(const sycl::event& __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2,
                        _Range3&& __rng3, _Compare __comp, const nd_range_params& __nd_range_params,
                        const _Storage& __base_diagonals_sp_global_storage) const
     {
         const _IdType __n1 = __rng1.size();
         const _IdType __n2 = __rng2.size();
 
-        __event = __exec.queue().submit([&](sycl::handler& __cgh) {
+        return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __nd_range_params, __base_diagonals_sp_global_storage, __n1, __n2, __comp](sycl::handler& __cgh) {
             oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3);
             auto __base_diagonals_sp_global_acc =
                 __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh);
@@ -394,8 +392,6 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
                                __nd_range_params.chunk, __n1, __n2, __comp);
                 });
         });
-
-        return __event;
     }
 
     public:

From 38166c712a1551593b0015f064e3ef9417edcd42 Mon Sep 17 00:00:00 2001
From: Sergey Kopienko
Date: Tue, 17 Dec 2024 09:44:08 +0100
Subject: [PATCH 11/16] Apply GitHub clang format

Signed-off-by: Sergey Kopienko
---
 .../dpcpp/parallel_backend_sycl_merge.h       | 41 +++++++++++--------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
index 2a9221595e4..6158b32dcf9 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
@@ -168,11 +168,12 @@ __find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rn
     __it_t __diag_it_end(idx1_to);
 
     constexpr int kValue = 1;
-    const __it_t __res =
-        std::lower_bound(__diag_it_begin, __diag_it_end, kValue, [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const auto&
__value) { + const auto __zero_or_one = + __comp(__rng2[__index_sum - __idx], __rng1[__idx]); + return __zero_or_one < kValue; + }); return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; } @@ -252,16 +253,17 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - auto __event = __exec.queue().submit([&__rng1, &__rng2, &__rng3, __steps, __chunk, __n1, __n2, __comp](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_MergeKernelName...>( - sycl::range(__steps), [=](sycl::item __item_id) { - const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); - }); - }); + auto __event = __exec.queue().submit( + [&__rng1, &__rng2, &__rng3, __steps, __chunk, __n1, __n2, __comp](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__steps), [=](sycl::item __item_id) { + const _IdType __i_elem = __item_id.get_linear_id() * __chunk; + const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, + __n2, __comp); + }); + }); // We should return the same thing in the second param of __future for compatibility // with the returning value in __parallel_merge_submitter_large::operator() return __future(__event, __result_and_scratch_storage_base_ptr{}); @@ -319,7 +321,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __n2 = __rng2.size(); const _IdType __n = __n1 + __n2; - return __exec.queue().submit([&__rng1, &__rng2, __base_diagonals_sp_global_storage, __n1, __n2, __n, __nd_range_params, __comp](sycl::handler& __cgh) { + return __exec.queue().submit([&__rng1, &__rng2, __base_diagonals_sp_global_storage, __n1, __n2, __n, + __nd_range_params, __comp](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc( @@ -358,7 +361,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName, const _IdType __n1 = __rng1.size(); const _IdType __n2 = __rng2.size(); - return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __nd_range_params, __base_diagonals_sp_global_storage, __n1, __n2, __comp](sycl::handler& __cgh) { + return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __nd_range_params, + __base_diagonals_sp_global_storage, __n1, __n2, __comp](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); @@ -462,7 +466,8 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy if (__n < __get_starting_size_limit_for_large_submitter<__value_type>()) { using _WiIndex = std::uint32_t; - static_assert(__get_starting_size_limit_for_large_submitter<__value_type>() <= std::numeric_limits<_WiIndex>::max()); + static_assert(__get_starting_size_limit_for_large_submitter<__value_type>() <= + std::numeric_limits<_WiIndex>::max()); using _MergeKernelName = 
oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
            __merge_kernel_name<_CustomName, _WiIndex>>;
         return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()(

From 7b5dc422a98a87d067066cf326ff5e0b5115494b Mon Sep 17 00:00:00 2001
From: Sergey Kopienko
Date: Tue, 17 Dec 2024 12:20:49 +0100
Subject: [PATCH 12/16] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix ordering of captured variables in submit calls

Signed-off-by: Sergey Kopienko
---
 .../pstl/hetero/dpcpp/parallel_backend_sycl_merge.h   | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
index 6158b32dcf9..9ead310b63c 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
@@ -254,7 +254,7 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_M
         const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk);
 
         auto __event = __exec.queue().submit(
-            [&__rng1, &__rng2, &__rng3, __steps, __chunk, __n1, __n2, __comp](sycl::handler& __cgh) {
+            [&__rng1, &__rng2, &__rng3, __comp, __chunk, __steps, __n1, __n2](sycl::handler& __cgh) {
@@ -321,8 +321,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
         const _IdType __n2 = __rng2.size();
         const _IdType __n = __n1 + __n2;
 
-        return __exec.queue().submit([&__rng1, &__rng2, __base_diagonals_sp_global_storage, __n1, __n2, __n,
-                                      __nd_range_params, __comp](sycl::handler& __cgh) {
+        return __exec.queue().submit([&__rng1, &__rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage,
+                                      __n1, __n2, __n](sycl::handler& __cgh) {
@@ -361,8 +361,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
         const _IdType __n1 = __rng1.size();
         const _IdType __n2 = __rng2.size();
 
-        return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __nd_range_params,
-                                      __base_diagonals_sp_global_storage, __n1, __n2, __comp](sycl::handler& __cgh) {
+        return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __comp, __nd_range_params,
+                                      __base_diagonals_sp_global_storage, __n1, __n2](sycl::handler& __cgh) {

From 93c731ac6564a1a93d54eda5807a0649c69d31f6 Mon Sep 17 00:00:00 2001
From: Sergey Kopienko
Date: Wed, 18 Dec 2024 16:50:26 +0100
Subject: [PATCH 13/16] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix EOL chars

Signed-off-by: Sergey Kopienko
---
 .../dpcpp/parallel_backend_sycl_merge.h       | 1016 ++++++++---------
 1 file changed, 508 insertions(+), 508 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
index 9ead310b63c..10d4c5e7489 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
@@ -1,508 +1,508 @@
+// -*- C++ -*-
+//===-- parallel_backend_sycl_merge.h --------------------------------===//
+//
+// Copyright (C) Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This file incorporates work covered by the following copyright and permission
+// notice:
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
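//
// The heart of the file below is the Merge Path split-point search: for a cross
// diagonal __i_elem of the merge matrix it finds how many of the first __i_elem
// merged values come from rng1 and how many from rng2. A minimal host-only sketch
// of that search (plain std::vector<int> inputs and operator< as the comparator;
// illustrative only, not the device code):
//
//     #include <algorithm> // std::min
//     #include <cstddef>   // std::size_t
//     #include <utility>   // std::pair
//     #include <vector>
//
//     std::pair<std::size_t, std::size_t>
//     find_start_point(const std::vector<int>& rng1, const std::vector<int>& rng2, std::size_t i_elem)
//     {
//         std::size_t lo = i_elem > rng2.size() ? i_elem - rng2.size() : 0; // cannot take fewer from rng1
//         std::size_t hi = std::min(i_elem, rng1.size());                   // cannot take more from rng1
//         while (lo < hi)
//         {
//             const std::size_t mid = lo + (hi - lo) / 2;
//             if (rng2[i_elem - mid - 1] < rng1[mid])
//                 hi = mid;     // rng1[mid] sorts after rng2[i_elem - mid - 1]: take fewer from rng1
//             else
//                 lo = mid + 1; // rng1[mid] still belongs to the first i_elem merged values
//         }
//         return {lo, i_elem - lo}; // lo values from rng1, the remaining i_elem - lo from rng2
//     }
//
// E.g. for rng1 = [0,1,1,2,3], rng2 = [0,0,2,3] and i_elem = 3 this returns {1, 2},
// matching the staircase shown in the merge-matrix comment below.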
+// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H +#define _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H + +#include // std::numeric_limits +#include // assert +#include // std::uint8_t, ... +#include // std::make_pair, std::forward +#include // std::min, std::lower_bound + +#include "sycl_defs.h" +#include "parallel_backend_sycl_utils.h" + +namespace oneapi +{ +namespace dpl +{ +namespace __par_backend_hetero +{ +template +using _split_point_t = std::pair<_Index, _Index>; + +//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges +//to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: +// 0 1 1 2 3 +// ------------------ +// |---> +// 0 | 0 | 1 1 1 1 +// | | +// 0 | 0 | 1 1 1 1 +// | ----------> +// 2 | 0 0 0 0 | 1 +// | ----> +// 3 | 0 0 0 0 0 | +template +auto +__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, + const _Index __n2, _Compare __comp) +{ + //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 1, 1] + oneapi::dpl::counting_iterator<_Index> __diag_it(0); + + if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed + { + const _Index __q = __i_elem; //diagonal index + const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size + auto __res = + std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, + [&__rng2, &__rng1, __q, __comp](const auto& __i_diag, const auto& __value) mutable { + const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); + return __zero_or_one < __value; + }); + return std::make_pair(*__res, __q - *__res); + } + else + { + const _Index __q = __i_elem - __n2; //diagonal index + const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size + auto __res = + std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, + [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { + const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); + return __zero_or_one < __value; + }); + return std::make_pair(__q + *__res, __n2 - *__res); + } +} + +//Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges +//to serial merge. 
For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: +// 0 1 1 2 3 +// ------------------ +// |---> +// 0 | 0 | 1 1 1 1 +// | | +// 0 | 0 | 1 1 1 1 +// | ----------> +// 2 | 0 0 0 0 | 1 +// | ----> +// 3 | 0 0 0 0 0 | +template +_split_point_t<_Index> +__find_start_point_in(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, + const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) +{ + // ----------------------- EXAMPLE ------------------------ + // Let's consider the following input data: + // rng1.size() = 10 + // rng2.size() = 6 + // i_diag = 9 + // Let's define the following ranges for processing: + // rng1: [3, ..., 9) -> __rng1_from = 3, __rng1_to = 9 + // rng2: [1, ..., 4) -> __rng2_from = 1, __rng2_to = 4 + // + // The goal: required to process only X' items of the merge matrix + // as intersection of rng1[3, ..., 9) and rng2[1, ..., 4) + // + // -------------------------------------------------------- + // + // __diag_it_begin(rng1) __diag_it_end(rng1) + // (init state) (dest state) (init state, dest state) + // | | | + // V V V + // + + + + + + + // \ rng1 0 1 2 3 4 5 6 7 8 9 + // rng2 +--------------------------------------+ + // 0 | ^ ^ ^ X | <--- __diag_it_end(rng2) (init state) + // + 1 | <----------------- + + X'2 ^ | <--- __diag_it_end(rng2) (dest state) + // + 2 | <----------------- + X'1 | | + // + 3 | <----------------- X'0 | | <--- __diag_it_begin(rng2) (dest state) + // 4 | X ^ | | + // 5 | X | | | <--- __diag_it_begin(rng2) (init state) + // +-------AX-----------+-----------+-----+ + // AX | | + // AX | | + // Run lower_bound:[from = 5, to = 8) + // + // AX - absent items in rng2 + // + // We have three points on diagonal for call comparison: + // X'0 : call __comp(rng1[5], rng2[3]) // 5 + 3 == 9 - 1 == 8 + // X'1 : call __comp(rng1[6], rng2[2]) // 6 + 2 == 9 - 1 == 8 + // X'3 : call __comp(rng1[7], rng2[1]) // 7 + 1 == 9 - 1 == 8 + // - where for every comparing pairs idx(rng1) + idx(rng2) == i_diag - 1 + + //////////////////////////////////////////////////////////////////////////////////// + // Taking into account the specified constraints of the range of processed data + const auto __index_sum = __i_elem - 1; + + using _IndexSigned = std::make_signed_t<_Index>; + + _IndexSigned idx1_from = __rng1_from; + _IndexSigned idx1_to = __rng1_to; + + _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); + _IndexSigned idx2_to = __index_sum - __rng1_from + 1; + + const _IndexSigned idx2_from_diff = + idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; + const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? 
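    // Worked trace of this clamping with the EXAMPLE data above (rng1_from = 3, rng1_to = 9,
    // rng2_from = 1, rng2_to = 4, i_elem = 9, hence index_sum = 8):
    //   idx1_from = 3, idx1_to = 9
    //   idx2_from = 8 - (9 - 1) = 0,  idx2_to = 8 - 3 + 1 = 6
    //   idx2_from_diff = 1 - 0 = 1    (rng2 may not start before index 1)
    //   idx2_to_diff   = 6 - 4 = 2    (rng2 may not reach past index 4)
    //   idx1_to = 9 - 1 = 8,  idx1_from = 3 + 2 = 5
    // which reproduces the "Run lower_bound:[from = 5, to = 8)" range shown in the example.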
idx2_to - (_IndexSigned)__rng2_to : 0; + + idx1_to -= idx2_from_diff; + idx1_from += idx2_to_diff; + + idx2_from = __index_sum - (idx1_to - 1); + idx2_to = __index_sum - idx1_from + 1; + + //////////////////////////////////////////////////////////////////////////////////// + // Run search of split point on diagonal + + using __it_t = oneapi::dpl::counting_iterator<_Index>; + + __it_t __diag_it_begin(idx1_from); + __it_t __diag_it_end(idx1_to); + + constexpr int kValue = 1; + const __it_t __res = std::lower_bound(__diag_it_begin, __diag_it_end, kValue, + [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const auto& __value) { + const auto __zero_or_one = + __comp(__rng2[__index_sum - __idx], __rng1[__idx]); + return __zero_or_one < kValue; + }); + + return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; +} + +// Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing +// to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2) +template +void +__serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index __start1, _Index __start2, + const _Index __start3, const std::uint8_t __chunk, const _Index __n1, const _Index __n2, _Compare __comp) +{ + if (__start1 >= __n1) + { + //copying a residual of the second seq + const _Index __n = std::min<_Index>(__n2 - __start2, __chunk); + for (std::uint8_t __i = 0; __i < __n; ++__i) + __rng3[__start3 + __i] = __rng2[__start2 + __i]; + } + else if (__start2 >= __n2) + { + //copying a residual of the first seq + const _Index __n = std::min<_Index>(__n1 - __start1, __chunk); + for (std::uint8_t __i = 0; __i < __n; ++__i) + __rng3[__start3 + __i] = __rng1[__start1 + __i]; + } + else + { + for (std::uint8_t __i = 0; __i < __chunk && __start1 < __n1 && __start2 < __n2; ++__i) + { + const auto& __val1 = __rng1[__start1]; + const auto& __val2 = __rng2[__start2]; + if (__comp(__val2, __val1)) + { + __rng3[__start3 + __i] = __val2; + if (++__start2 == __n2) + { + //copying a residual of the first seq + for (++__i; __i < __chunk && __start1 < __n1; ++__i, ++__start1) + __rng3[__start3 + __i] = __rng1[__start1]; + } + } + else + { + __rng3[__start3 + __i] = __val1; + if (++__start1 == __n1) + { + //copying a residual of the second seq + for (++__i; __i < __chunk && __start2 < __n2; ++__i, ++__start2) + __rng3[__start3 + __i] = __rng2[__start2]; + } + } + } + } +} + +// Please see the comment for __parallel_for_submitter for optional kernel name explanation +template +struct __parallel_merge_submitter; + +template +struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_MergeKernelName...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + assert(__n1 > 0 || __n2 > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Empirical number of values to process per work-item + const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 
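        // (The chunk selected just below is an empirical per-work-item batch: 128 values on a
        // CPU device, 4 on a GPU. For example, __n = 1'000'000 on a GPU gives
        // __steps = ceil(1'000'000 / 4) = 250'000 work-items, each performing one split-point
        // search followed by a 4-element serial merge.)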
128 : 4; + + const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + + auto __event = __exec.queue().submit( + [&__rng1, &__rng2, &__rng3, __comp, __chunk, __steps, __n1, __n2](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__steps), [=](sycl::item __item_id) { + const _IdType __i_elem = __item_id.get_linear_id() * __chunk; + const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, + __n2, __comp); + }); + }); + // We should return the same thing in the second param of __future for compatibility + // with the returning value in __parallel_merge_submitter_large::operator() + return __future(__event, __result_and_scratch_storage_base_ptr{}); + } +}; + +template +struct __parallel_merge_submitter_large; + +template +struct __parallel_merge_submitter_large<_IdType, _CustomName, + __internal::__optional_kernel_name<_DiagonalsKernelName...>, + __internal::__optional_kernel_name<_MergeKernelName...>> +{ + protected: + struct nd_range_params + { + std::size_t base_diag_count = 0; + std::size_t steps_between_two_base_diags = 0; + std::uint8_t chunk = 0; + _IdType steps = 0; + }; + + // Calculate nd-range params + template + nd_range_params + eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2) const + { + using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; + using _RangeValueType = std::conditional_t<(sizeof(_Range1ValueType) > sizeof(_Range2ValueType)), + _Range1ValueType, _Range2ValueType>; + + const std::size_t __n = __rng1.size() + __rng2.size(); + + // Empirical number of values to process per work-item + const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; + + const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + const _IdType __base_diag_count = 32 * 1'024; + const _IdType __steps_between_two_base_diags = + oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); + + return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; + } + + // Calculation of split points on each base diagonal + template + sycl::event + eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Compare __comp, + const nd_range_params& __nd_range_params, + _Storage& __base_diagonals_sp_global_storage) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + return __exec.queue().submit([&__rng1, &__rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage, + __n1, __n2, __n](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc( + __cgh, __dpl_sycl::__no_init{}); + + __cgh.parallel_for<_DiagonalsKernelName...>( + sycl::range(__nd_range_params.base_diag_count + 1), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + + _split_point_t<_IdType> __sp = + __global_idx == 0 ? 
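                    // (The ternary below seeds the boundary diagonals: index 0 always gets {0, 0}
                    // and the extra "+ 1" diagonal, plus any diagonal past __n, keeps {__n1, __n2},
                    // so every work-group later finds a valid left and right split point to search
                    // between; only the interior base diagonals, handled by the if-branch below,
                    // need an actual __find_start_point call.)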
_split_point_t<_IdType>{0, 0} : _split_point_t<_IdType>{__n1, __n2}; + + if (0 < __global_idx && __global_idx < __nd_range_params.base_diag_count) + { + const _IdType __i_elem = + __global_idx * __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; + if (__i_elem < __n) + __sp = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); + } + + __base_diagonals_sp_global_ptr[__global_idx] = __sp; + }); + }); + } + + // Process parallel merge + template + sycl::event + run_parallel_merge(const sycl::event& __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, + _Range3&& __rng3, _Compare __comp, const nd_range_params& __nd_range_params, + const _Storage& __base_diagonals_sp_global_storage) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + + return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __comp, __nd_range_params, + __base_diagonals_sp_global_storage, __n1, __n2](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); + + __cgh.depends_on(__event); + + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + const _IdType __i_elem = __global_idx * __nd_range_params.chunk; + + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; + + _split_point_t<_IdType> __start; + if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) + { + const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; + const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; + + __start = __find_start_point_in(__rng1, __sp_left.first, __sp_right.first, __rng2, + __sp_left.second, __sp_right.second, __i_elem, __comp); + } + else + { + __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; + } + + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, + __nd_range_params.chunk, __n1, __n2, __comp); + }); + }); + } + + public: + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + assert(__rng1.size() > 0 || __rng2.size() > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Calculate nd-range params + const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); + + __result_and_scratch_storage_base_ptr __p_result_and_scratch_storage_base; + + // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + auto __p_base_diagonals_sp_global_storage = + new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( + __exec, 0, __nd_range_params.base_diag_count + 1); + __p_result_and_scratch_storage_base.reset( + static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); + + sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); + + // Merge data using split points on each base diagonal + __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, + 
*__p_base_diagonals_sp_global_storage);
+
+        return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base));
+    }
+};
+
+template <typename... _Name>
+class __merge_kernel_name;
+
+template <typename... _Name>
+class __merge_kernel_name_large;
+
+template <typename... _Name>
+class __diagonals_kernel_name;
+
+template <typename _Tp>
+constexpr std::size_t
+__get_starting_size_limit_for_large_submitter()
+{
+    return 4 * 1'048'576; // 4 MB
+}
+
+template <>
+constexpr std::size_t
+__get_starting_size_limit_for_large_submitter<int>()
+{
+    return 16 * 1'048'576; // 16 MB
+}
+
+template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare>
+auto
+__parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1,
+                 _Range2&& __rng2, _Range3&& __rng3, _Compare __comp)
+{
+    using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
+
+    using __value_type = oneapi::dpl::__internal::__value_t<_Range3>;
+
+    const std::size_t __n = __rng1.size() + __rng2.size();
+    if (__n < __get_starting_size_limit_for_large_submitter<__value_type>())
+    {
+        using _WiIndex = std::uint32_t;
+        static_assert(__get_starting_size_limit_for_large_submitter<__value_type>() <=
+                      std::numeric_limits<_WiIndex>::max());
+        using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
+            __merge_kernel_name<_CustomName, _WiIndex>>;
+        return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()(
+            std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
+            std::forward<_Range3>(__rng3), __comp);
+    }
+    else
+    {
+        if (__n <= std::numeric_limits<std::uint32_t>::max())
+        {
+            using _WiIndex = std::uint32_t;
+            using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
+                __diagonals_kernel_name<_CustomName, _WiIndex>>;
+            using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
+                __merge_kernel_name_large<_CustomName, _WiIndex>>;
+            return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()(
+                std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
+                std::forward<_Range3>(__rng3), __comp);
+        }
+        else
+        {
+            using _WiIndex = std::uint64_t;
+            using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
+                __diagonals_kernel_name<_CustomName, _WiIndex>>;
+            using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
+                __merge_kernel_name_large<_CustomName, _WiIndex>>;
+            return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()(
+                std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
+                std::forward<_Range3>(__rng3), __comp);
+        }
+    }
+}
+
+} // namespace __par_backend_hetero
+} // namespace dpl
+} // namespace oneapi
+
+#endif // _ONEDPL_PARALLEL_BACKEND_SYCL_MERGE_H

From ab004c56702037b4a5409f5b7c18588fee009470 Mon Sep 17 00:00:00 2001
From: Sergey Kopienko
Date: Wed, 18 Dec 2024 21:23:53 +0100
Subject: [PATCH 14/16] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix compile error after merge changes from main branch

Signed-off-by: Sergey Kopienko
---
 .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
index 68d86296fb5..5c71754edc5 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
@@ -266,7 +266,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
     {
         std::size_t base_diag_count = 0;
         std::size_t steps_between_two_base_diags = 0;
-        std::uint8_t chunk = 0;
+        _IdType chunk = 0;
         _IdType steps = 0;
     };

From c11e177789937c398d706c7084df985185726908 Mon Sep 17 00:00:00 2001
From: Sergey Kopienko
Date: Thu, 19 Dec 2024 09:37:47 +0100
Subject: [PATCH 15/16] include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - fix review comment: declare all internal stuff in __parallel_merge_submitter_large as private

Signed-off-by: Sergey Kopienko
---
 .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
index 5c71754edc5..3efe8888bf1 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
@@ -261,7 +261,7 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
                                         __internal::__optional_kernel_name<_DiagonalsKernelName...>,
                                         __internal::__optional_kernel_name<_MergeKernelName...>>
 {
-    protected:
+    private:
     struct nd_range_params
     {
         std::size_t base_diag_count = 0;

From 79af1a80bc3020f15e3df20d38b401cf2e9b8133 Mon Sep 17 00:00:00 2001
From: Sergey Kopienko
Date: Thu, 19 Dec 2024 09:59:38 +0100
Subject: [PATCH 16/16] test/parallel_api/algorithm/alg.merge/merge.pass.cpp - expand test for long data sizes

Signed-off-by: Sergey Kopienko
---
 .../algorithm/alg.merge/merge.pass.cpp        | 43 +++++++++++-------
 1 file changed, 31 insertions(+), 12 deletions(-)

diff --git a/test/parallel_api/algorithm/alg.merge/merge.pass.cpp b/test/parallel_api/algorithm/alg.merge/merge.pass.cpp
index 34cba9f672a..2715256f3a1 100644
--- a/test/parallel_api/algorithm/alg.merge/merge.pass.cpp
+++ b/test/parallel_api/algorithm/alg.merge/merge.pass.cpp
@@ -97,24 +97,18 @@ struct test_merge_compare
 }
 };
 
-template <typename T, typename Generator1, typename Generator2>
+template <typename T, typename Generator1, typename Generator2, typename FStep>
 void
-test_merge_by_type(Generator1 generator1, Generator2 generator2)
+test_merge_by_type(Generator1 generator1, Generator2 generator2, size_t start_size, size_t max_size, FStep fstep)
 {
     using namespace std;
-    size_t max_size = 100000;
     Sequence<T> in1(max_size, generator1);
     Sequence<T> in2(max_size / 2, generator2);
     Sequence<T> out(in1.size() + in2.size());
     ::std::sort(in1.begin(), in1.end());
     ::std::sort(in2.begin(), in2.end());
-    size_t start_size = 0;
-#if TEST_DPCPP_BACKEND_PRESENT
-    start_size = 2;
-#endif
-
-    for (size_t size = start_size; size <= max_size; size = size <= 16 ? size + 1 : size_t(3.1415 * size)) {
+    for (size_t size = start_size; size <= max_size; size = fstep(size)) {
 #if !TEST_DPCPP_BACKEND_PRESENT
         invoke_on_all_policies<0>()(test_merge<T>(), in1.cbegin(), in1.cbegin() + size, in2.data(),
                                     in2.data() + size / 2, out.begin(), out.begin() + 1.5 * size);
@@ -139,6 +133,16 @@ test_merge_by_type(Generator1 generator1, Generator2 generator2)
     }
 }
 
+template <typename T, typename FStep>
+void
+test_merge_by_type(size_t start_size, size_t max_size, FStep fstep)
+{
+    test_merge_by_type<T>([](size_t v) { return (v % 2 == 0 ?
v : -v) * 3; }, [](size_t v) { return v * 2; }, start_size, max_size, fstep);
+#if !ONEDPL_FPGA_DEVICE
+    test_merge_by_type<T>([](size_t v) { return float64_t(v); }, [](size_t v) { return float64_t(v - 100); }, start_size, max_size, fstep);
+#endif
+}
+
 template <typename T>
 struct test_non_const
 {
@@ -166,9 +170,24 @@ struct test_merge_tuple
 int main()
 {
-    test_merge_by_type([](size_t v) { return (v % 2 == 0 ? v : -v) * 3; }, [](size_t v) { return v * 2; });
-#if !ONEDPL_FPGA_DEVICE
-    test_merge_by_type([](size_t v) { return float64_t(v); }, [](size_t v) { return float64_t(v - 100); });
+#if TEST_DPCPP_BACKEND_PRESENT
+    const size_t start_size_small = 2;
+#else
+    const size_t start_size_small = 0;
+#endif
+    const size_t max_size_small = 100000;
+    auto fstep_small = [](std::size_t size){ return size <= 16 ? size + 1 : size_t(3.1415 * size);};
+    test_merge_by_type(start_size_small, max_size_small, fstep_small);
+
+    // Large data sizes (on GPU only)
+#if TEST_DPCPP_BACKEND_PRESENT
+    if (!TestUtils::get_test_queue().get_device().is_cpu())
+    {
+        const size_t start_size_large = 4'000'000;
+        const size_t max_size_large = 8'000'000;
+        auto fstep_large = [](std::size_t size){ return size + 2'000'000; };
+        test_merge_by_type(start_size_large, max_size_large, fstep_large);
+    }
 #endif
 
 #if !TEST_DPCPP_BACKEND_PRESENT
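    // For reference: starting from start_size_small = 2, fstep_small visits the sizes
    // 2, 3, ..., 17 and then grows roughly geometrically through size_t(3.1415 * size):
    // 53, 166, 521, 1636, 5139, 16144, 50716, stopping once max_size_small is exceeded.
    // fstep_large visits exactly 4'000'000, 6'000'000 and 8'000'000; with in2 sized at
    // half of in1, the merged total reaches 6'000'000..12'000'000 elements, which is
    // above the default 4 * 1'048'576 element limit of
    // __get_starting_size_limit_for_large_submitter, so for element types using that
    // default limit the large (two-kernel) merge path is exercised on GPU.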