Skip to content

Commit

Permalink
Enabling reduce_then_scan for "Set" family of scan APIs (#1879)
Browse files Browse the repository at this point in the history
Signed-off-by: Dan Hoeflinger <[email protected]>
Co-authored-by: Adam Fidel <[email protected]>
  • Loading branch information
danhoeflinger and adamfidel authored Oct 28, 2024
1 parent 66ead74 commit 28cb633
Show file tree
Hide file tree
Showing 4 changed files with 196 additions and 58 deletions.
46 changes: 5 additions & 41 deletions include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
Original file line number Diff line number Diff line change
Expand Up @@ -1690,32 +1690,11 @@ template <typename _BackendTag, typename _ExecutionPolicy, typename _ForwardIter
_OutputIterator
__pattern_hetero_set_op(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIterator1 __first1,
_ForwardIterator1 __last1, _ForwardIterator2 __first2, _ForwardIterator2 __last2,
_OutputIterator __result, _Compare __comp, _IsOpDifference)
_OutputIterator __result, _Compare __comp, _IsOpDifference __is_op_difference)
{
typedef typename std::iterator_traits<_ForwardIterator1>::difference_type _Size1;
typedef typename std::iterator_traits<_ForwardIterator2>::difference_type _Size2;

const _Size1 __n1 = __last1 - __first1;
const _Size2 __n2 = __last2 - __first2;

//Algo is based on the recommended approach of set_intersection algo for GPU: binary search + scan (copying by mask).
using _ReduceOp = std::plus<_Size1>;
using _Assigner = unseq_backend::__scan_assigner;
using _NoAssign = unseq_backend::__scan_no_assign;
using _MaskAssigner = unseq_backend::__mask_assigner<2>;
using _InitType = unseq_backend::__no_init_value<_Size1>;
using _DataAcc = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>;

_ReduceOp __reduce_op;
_Assigner __assign_op;
_DataAcc __get_data_op;
unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign, /*inclusive*/ std::true_type, 2>
__copy_by_mask_op;
unseq_backend::__brick_set_op<_ExecutionPolicy, _Compare, _Size1, _Size2, _IsOpDifference> __create_mask_op{
__comp, __n1, __n2};

// temporary buffer to store boolean mask
oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __n1);

auto __keep1 =
oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _ForwardIterator1>();
Expand All @@ -1727,25 +1706,10 @@ __pattern_hetero_set_op(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _F
auto __keep3 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _OutputIterator>();
auto __buf3 = __keep3(__result, __result + __n1);

auto __result_size =
__par_backend_hetero::__parallel_transform_scan_base(
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec),
oneapi::dpl::__ranges::make_zip_view(
__buf1.all_view(), __buf2.all_view(),
oneapi::dpl::__ranges::all_view<int32_t, __par_backend_hetero::access_mode::read_write>(
__mask_buf.get_buffer())),
__buf3.all_view(), _InitType{},
// local scan
unseq_backend::__scan</*inclusive*/ std::true_type, _ExecutionPolicy, _ReduceOp, _DataAcc, _Assigner,
_MaskAssigner, decltype(__create_mask_op), _InitType>{
__reduce_op, __get_data_op, __assign_op, _MaskAssigner{}, __create_mask_op},
// scan between groups
unseq_backend::__scan</*inclusive=*/std::true_type, _ExecutionPolicy, _ReduceOp, _DataAcc, _NoAssign,
_Assigner, _DataAcc, _InitType>{__reduce_op, __get_data_op, _NoAssign{}, __assign_op,
__get_data_op},
// global scan
__copy_by_mask_op)
.get();
auto __result_size = __par_backend_hetero::__parallel_set_op(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec),
__buf1.all_view(), __buf2.all_view(),
__buf3.all_view(), __comp, __is_op_difference)
.get();

return __result + __result_size;
}
Expand Down
193 changes: 183 additions & 10 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -741,12 +741,12 @@ __parallel_transform_scan_single_group(oneapi::dpl::__internal::__device_backend
}
}

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _InitType,
typename _LocalScan, typename _GroupScan, typename _GlobalScan>
template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _InitType, typename _LocalScan,
typename _GroupScan, typename _GlobalScan>
auto
__parallel_transform_scan_base(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec,
_Range1&& __in_rng, _Range2&& __out_rng, _InitType __init,
_LocalScan __local_scan, _GroupScan __group_scan, _GlobalScan __global_scan)
_Range1&& __in_rng, _Range2&& __out_rng, _InitType __init, _LocalScan __local_scan,
_GroupScan __group_scan, _GlobalScan __global_scan)
{
using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;

Expand Down Expand Up @@ -801,16 +801,17 @@ struct __simple_write_to_id
}
};

template <typename _Predicate>
template <typename _Predicate, typename _RangeTransform = oneapi::dpl::__internal::__no_op>
struct __gen_mask
{
template <typename _InRng>
bool
operator()(const _InRng& __in_rng, std::size_t __id) const
operator()(_InRng&& __in_rng, std::size_t __id) const
{
return __pred(__in_rng[__id]);
return __pred((__rng_transform(std::forward<_InRng>(__in_rng)))[__id]);
}
_Predicate __pred;
_RangeTransform __rng_transform;
};

template <typename _BinaryPredicate>
Expand All @@ -828,6 +829,70 @@ struct __gen_unique_mask
_BinaryPredicate __pred;
};

template <typename _IsOpDifference, typename _Compare>
struct __gen_set_mask
{
template <typename _InRng>
bool
operator()(const _InRng& __in_rng, std::size_t __id) const
{
// First we must extract individual sequences from zip iterator because they may not have the same length,
// dereferencing is dangerous
auto __set_a = std::get<0>(__in_rng.tuple()); // first sequence
auto __set_b = std::get<1>(__in_rng.tuple()); // second sequence
auto __set_mask = std::get<2>(__in_rng.tuple()); // mask sequence

std::size_t __nb = __set_b.size();

auto __val_a = __set_a[__id];

auto __res = oneapi::dpl::__internal::__pstl_lower_bound(__set_b, std::size_t{0}, __nb, __val_a, __comp);

bool bres =
_IsOpDifference::value; //initialization is true in case of difference operation; false - intersection.
if (__res == __nb || __comp(__val_a, __set_b[__res]))
{
// there is no __val_a in __set_b, so __set_b in the difference {__set_a}/{__set_b};
}
else
{
auto __val_b = __set_b[__res];

//Difference operation logic: if number of duplication in __set_a on left side from __id > total number of
//duplication in __set_b then a mask is 1

//Intersection operation logic: if number of duplication in __set_a on left side from __id <= total number of
//duplication in __set_b then a mask is 1

const std::size_t __count_a_left =
__id - oneapi::dpl::__internal::__pstl_left_bound(__set_a, std::size_t{0}, __id, __val_a, __comp) + 1;

const std::size_t __count_b =
oneapi::dpl::__internal::__pstl_right_bound(__set_b, __res, __nb, __val_b, __comp) -
oneapi::dpl::__internal::__pstl_left_bound(__set_b, std::size_t{0}, __res, __val_b, __comp);

if constexpr (_IsOpDifference::value)
bres = __count_a_left > __count_b; /*difference*/
else
bres = __count_a_left <= __count_b; /*intersection*/
}
__set_mask[__id] = bres;
return bres;
}
_Compare __comp;
};

template <std::size_t _EleId>
struct __extract_range_from_zip
{
template <typename _InRng>
auto
operator()(const _InRng& __in_rng) const
{
return std::get<_EleId>(__in_rng.tuple());
}
};

template <typename _GenMask>
struct __gen_count_mask
{
Expand All @@ -840,22 +905,24 @@ struct __gen_count_mask
_GenMask __gen_mask;
};

template <typename _GenMask>
template <typename _GenMask, typename _RangeTransform = oneapi::dpl::__internal::__no_op>
struct __gen_expand_count_mask
{
template <typename _InRng, typename _SizeType>
auto
operator()(_InRng&& __in_rng, _SizeType __id) const
{
auto __transformed_input = __rng_transform(__in_rng);
// Explicitly creating this element type is necessary to avoid modifying the input data when _InRng is a
// zip_iterator which will return a tuple of references when dereferenced. With this explicit type, we copy
// the values of zipped input types rather than their references.
using _ElementType = oneapi::dpl::__internal::__value_t<_InRng>;
_ElementType ele = __in_rng[__id];
using _ElementType = oneapi::dpl::__internal::__value_t<decltype(__transformed_input)>;
_ElementType ele = __transformed_input[__id];
bool mask = __gen_mask(std::forward<_InRng>(__in_rng), __id);
return std::tuple(mask ? _SizeType{1} : _SizeType{0}, mask, ele);
}
_GenMask __gen_mask;
_RangeTransform __rng_transform;
};

struct __get_zeroth_element
Expand Down Expand Up @@ -1200,6 +1267,112 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
}
}

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
typename _IsOpDifference>
auto
__parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp,
_IsOpDifference)
{
// fill in reduce then scan impl
using _GenMaskReduce = oneapi::dpl::__par_backend_hetero::__gen_set_mask<_IsOpDifference, _Compare>;
using _MaskRangeTransform = oneapi::dpl::__par_backend_hetero::__extract_range_from_zip<2>;
using _MaskPredicate = oneapi::dpl::__internal::__no_op;
using _GenMaskScan = oneapi::dpl::__par_backend_hetero::__gen_mask<_MaskPredicate, _MaskRangeTransform>;
using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<0, oneapi::dpl::__internal::__pstl_assign>;
using _Size = oneapi::dpl::__internal::__difference_t<_Range3>;
using _ScanRangeTransform = oneapi::dpl::__par_backend_hetero::__extract_range_from_zip<0>;

using _GenReduceInput = oneapi::dpl::__par_backend_hetero::__gen_count_mask<_GenMaskReduce>;
using _ReduceOp = std::plus<_Size>;
using _GenScanInput = oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMaskScan, _ScanRangeTransform>;
using _ScanInputTransform = oneapi::dpl::__par_backend_hetero::__get_zeroth_element;

oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, std::int32_t> __mask_buf(__exec, __rng1.size());

return __parallel_transform_reduce_then_scan(
__backend_tag, std::forward<_ExecutionPolicy>(__exec),
oneapi::dpl::__ranges::make_zip_view(
std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
oneapi::dpl::__ranges::all_view<std::int32_t, __par_backend_hetero::access_mode::read_write>(
__mask_buf.get_buffer())),
std::forward<_Range3>(__result), _GenReduceInput{_GenMaskReduce{__comp}}, _ReduceOp{},
_GenScanInput{_GenMaskScan{_MaskPredicate{}, _MaskRangeTransform{}}, _ScanRangeTransform{}},
_ScanInputTransform{}, _WriteOp{}, oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
/*_Inclusive=*/std::true_type{}, /*__is_unique_pattern=*/std::false_type{});
}

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
typename _IsOpDifference>
auto
__parallel_set_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp,
_IsOpDifference __is_op_difference)
{
using _Size1 = oneapi::dpl::__internal::__difference_t<_Range1>;
using _Size2 = oneapi::dpl::__internal::__difference_t<_Range2>;

_Size1 __n1 = __rng1.size();
_Size2 __n2 = __rng2.size();

//Algo is based on the recommended approach of set_intersection algo for GPU: binary search + scan (copying by mask).
using _ReduceOp = std::plus<_Size1>;
using _Assigner = unseq_backend::__scan_assigner;
using _NoAssign = unseq_backend::__scan_no_assign;
using _MaskAssigner = unseq_backend::__mask_assigner<2>;
using _InitType = unseq_backend::__no_init_value<_Size1>;
using _DataAcc = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>;

_ReduceOp __reduce_op;
_Assigner __assign_op;
_DataAcc __get_data_op;
unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign, /*inclusive*/ std::true_type, 2>
__copy_by_mask_op;
unseq_backend::__brick_set_op<_ExecutionPolicy, _Compare, _Size1, _Size2, _IsOpDifference> __create_mask_op{
__comp, __n1, __n2};

// temporary buffer to store boolean mask
oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, int32_t> __mask_buf(__exec, __n1);

return __par_backend_hetero::__parallel_transform_scan_base(
__backend_tag, std::forward<_ExecutionPolicy>(__exec),
oneapi::dpl::__ranges::make_zip_view(
std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
oneapi::dpl::__ranges::all_view<int32_t, __par_backend_hetero::access_mode::read_write>(
__mask_buf.get_buffer())),
std::forward<_Range3>(__result), _InitType{},
// local scan
unseq_backend::__scan</*inclusive*/ std::true_type, _ExecutionPolicy, _ReduceOp, _DataAcc, _Assigner,
_MaskAssigner, decltype(__create_mask_op), _InitType>{
__reduce_op, __get_data_op, __assign_op, _MaskAssigner{}, __create_mask_op},
// scan between groups
unseq_backend::__scan</*inclusive=*/std::true_type, _ExecutionPolicy, _ReduceOp, _DataAcc, _NoAssign, _Assigner,
_DataAcc, _InitType>{__reduce_op, __get_data_op, _NoAssign{}, __assign_op, __get_data_op},
// global scan
__copy_by_mask_op);
}

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare,
typename _IsOpDifference>
auto
__parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec,
_Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp,
_IsOpDifference __is_op_difference)
{
if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
{
return __parallel_set_reduce_then_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
std::forward<_Range3>(__result), __comp, __is_op_difference);
}
else
{
return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp,
__is_op_difference);
}
}

//------------------------------------------------------------------------
// find_or tags
//------------------------------------------------------------------------
Expand Down
13 changes: 7 additions & 6 deletions include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ namespace oneapi::dpl::__par_backend_hetero
template <typename _UnaryOp>
struct __gen_transform_input;

template <typename _Predicate>
template <typename _Predicate, typename _RangeTransform>
struct __gen_mask;

template <typename _BinaryPredicate>
Expand All @@ -245,7 +245,7 @@ struct __gen_unique_mask;
template <typename _GenMask>
struct __gen_count_mask;

template <typename _GenMask>
template <typename _GenMask, typename _RangeTransform>
struct __gen_expand_count_mask;

template <int32_t __offset, typename _Assign>
Expand All @@ -266,8 +266,9 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen
{
};

template <typename _Predicate>
struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_mask, _Predicate)>
template <typename _Predicate, typename _RangeTransform>
struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_mask, _Predicate,
_RangeTransform)>
: oneapi::dpl::__internal::__are_all_device_copyable<_Predicate>
{
};
Expand All @@ -284,9 +285,9 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen
{
};

template <typename _GenMask>
template <typename _GenMask, typename _RangeTransform>
struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask,
_GenMask)>
_GenMask, _RangeTransform)>
: oneapi::dpl::__internal::__are_all_device_copyable<_GenMask>
{
};
Expand Down
2 changes: 1 addition & 1 deletion include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -910,7 +910,7 @@ struct __brick_includes
auto __val_a = __a[__res];

//searching number of duplication
const auto __count_a = __internal::__pstl_right_bound(__a, __res, __a_end, __val_a, __comp) - __res + __res -
const auto __count_a = __internal::__pstl_right_bound(__a, __res, __a_end, __val_a, __comp) -
__internal::__pstl_left_bound(__a, __a_beg, __res, __val_a, __comp);

const auto __count_b = __internal::__pstl_right_bound(__b, _Size2(__idx_b), __b_end, __val_b, __comp) -
Expand Down

0 comments on commit 28cb633

Please sign in to comment.