Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimization of __serial_merge function #1970

Merged
merged 18 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
be021ac
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 17, 2024
d3d863d
Specify __serial_merge by unroll factor template param
SergeyKopienko Dec 17, 2024
6bfe0ee
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 18, 2024
726bd2e
Apply GitHub clang format
SergeyKopienko Dec 18, 2024
ab54931
Remove unroll from __serial_merge
SergeyKopienko Dec 18, 2024
764a494
Merge branch 'dev/skopienko/serial_merge_without_unroll' into dev/sko…
SergeyKopienko Dec 18, 2024
b30d296
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 18, 2024
288784a
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 18, 2024
2d065ac
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 18, 2024
cafc522
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 18, 2024
bf18ec8
Fix review comment: change __chunk parameter type of __serial_merge t…
SergeyKopienko Dec 18, 2024
879b5d2
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 18, 2024
00421d4
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 18, 2024
3e94b36
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 18, 2024
f82443f
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 18, 2024
10d0dd1
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort…
SergeyKopienko Dec 18, 2024
cbba810
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort…
SergeyKopienko Dec 18, 2024
a43f427
Merge branch 'main' into dev/skopienko/serial_merge
SergeyKopienko Dec 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 26 additions & 43 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,50 +82,33 @@ __find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_el
// to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2)
template <typename _Rng1, typename _Rng2, typename _Rng3, typename _Index, typename _Compare>
void
__serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index __start1, _Index __start2,
const _Index __start3, const std::uint8_t __chunk, const _Index __n1, const _Index __n2, _Compare __comp)
__serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, const _Index __start1, const _Index __start2,
const _Index __start3, const _Index __chunk, const _Index __n1, const _Index __n2, _Compare __comp)
{
if (__start1 >= __n1)
{
//copying a residual of the second seq
const _Index __n = std::min<_Index>(__n2 - __start2, __chunk);
for (std::uint8_t __i = 0; __i < __n; ++__i)
__rng3[__start3 + __i] = __rng2[__start2 + __i];
}
else if (__start2 >= __n2)
{
//copying a residual of the first seq
const _Index __n = std::min<_Index>(__n1 - __start1, __chunk);
for (std::uint8_t __i = 0; __i < __n; ++__i)
__rng3[__start3 + __i] = __rng1[__start1 + __i];
}
else
const _Index __rng1_size = std::min<_Index>(__n1 > __start1 ? __n1 - __start1 : _Index{0}, __chunk);
const _Index __rng2_size = std::min<_Index>(__n2 > __start2 ? __n2 - __start2 : _Index{0}, __chunk);
const _Index __rng3_size = std::min<_Index>(__rng1_size + __rng2_size, __chunk);

const _Index __rng1_idx_end = __start1 + __rng1_size;
const _Index __rng2_idx_end = __start2 + __rng2_size;
const _Index __rng3_idx_end = __start3 + __rng3_size;

_Index __rng1_idx = __start1;
_Index __rng2_idx = __start2;

for (_Index __rng3_idx = __start3; __rng3_idx < __rng3_idx_end; ++__rng3_idx)
{
for (std::uint8_t __i = 0; __i < __chunk && __start1 < __n1 && __start2 < __n2; ++__i)
{
const auto& __val1 = __rng1[__start1];
const auto& __val2 = __rng2[__start2];
if (__comp(__val2, __val1))
{
__rng3[__start3 + __i] = __val2;
if (++__start2 == __n2)
{
//copying a residual of the first seq
for (++__i; __i < __chunk && __start1 < __n1; ++__i, ++__start1)
__rng3[__start3 + __i] = __rng1[__start1];
}
}
else
{
__rng3[__start3 + __i] = __val1;
if (++__start1 == __n1)
{
//copying a residual of the second seq
for (++__i; __i < __chunk && __start2 < __n2; ++__i, ++__start2)
__rng3[__start3 + __i] = __rng2[__start2];
}
}
}
const bool __rng1_idx_less_n1 = __rng1_idx < __rng1_idx_end;
const bool __rng2_idx_less_n2 = __rng2_idx < __rng2_idx_end;

// One of __rng1_idx_less_n1 and __rng2_idx_less_n2 should be true here
SergeyKopienko marked this conversation as resolved.
Show resolved Hide resolved
// because 1) we should fill output data with elements from one of the input ranges
// 2) we calculate __rng3_idx_end as std::min<_Index>(__rng1_size + __rng2_size, __chunk).
__rng3[__rng3_idx] =
((__rng1_idx_less_n1 && __rng2_idx_less_n2 && __comp(__rng2[__rng2_idx], __rng1[__rng1_idx])) ||
!__rng1_idx_less_n1)
? __rng2[__rng2_idx++]
: __rng1[__rng1_idx++];
SergeyKopienko marked this conversation as resolved.
Show resolved Hide resolved
}
}

Expand All @@ -149,7 +132,7 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N
_PRINT_INFO_IN_DEBUG_MODE(__exec);

// Empirical number of values to process per work-item
const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4;
const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4;

const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ struct __group_merge_path_sorter
template <typename _StorageAcc, typename _Compare>
bool
sort(const sycl::nd_item<1>& __item, const _StorageAcc& __storage_acc, _Compare __comp, std::uint32_t __start,
std::uint32_t __end, std::uint32_t __sorted, std::uint16_t __data_per_workitem,
std::uint32_t __end, std::uint32_t __sorted, std::uint32_t __data_per_workitem,
SergeyKopienko marked this conversation as resolved.
Show resolved Hide resolved
std::uint32_t __workgroup_size) const
{
const std::uint32_t __sorted_final = __data_per_workitem * __workgroup_size;
Expand Down Expand Up @@ -91,7 +91,7 @@ struct __group_merge_path_sorter
auto __in_ptr1 = __in_ptr + __start1;
auto __in_ptr2 = __in_ptr + __start2;

const auto __start = __find_start_point(__in_ptr1, __in_ptr2, __id_local, __n1, __n2, __comp);
const std::pair<std::uint32_t, std::uint32_t> __start = __find_start_point(__in_ptr1, __in_ptr2, __id_local, __n1, __n2, __comp);
// TODO: copy the data into registers before the merge to halve the required amount of SLM
__serial_merge(__in_ptr1, __in_ptr2, __out_ptr, __start.first, __start.second, __id, __data_per_workitem,
__n1, __n2, __comp);
Expand Down Expand Up @@ -241,7 +241,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name
const _IndexT __n = __rng.size();
_IndexT __n_sorted = __leaf_size;
const bool __is_cpu = __q.get_device().is_cpu();
const std::uint32_t __chunk = __is_cpu ? 32 : 4;
const _IndexT __chunk = __is_cpu ? 32 : 4;
const std::size_t __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk);
bool __data_in_temp = false;

Expand Down
Loading