Address several PR comments
* Move __stride_recommender into __parallel_for_large_submitter
* Use {} to invoke constructor
* Simplify if-else statements in for dispatch

Signed-off-by: Matthew Michel <[email protected]>
mmichel11 committed Nov 6, 2024
1 parent e3e05a7 commit 8f9c5bb
Showing 1 changed file with 37 additions and 12 deletions.
49 changes: 37 additions & 12 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -275,6 +275,39 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...
     // Limit the work-group size to 512 which has empirically yielded the best results across different architectures.
     static constexpr std::uint16_t __max_work_group_size = 512;
 
+    // SPIR-V compilation targets show best performance with a stride of the sub-group size.
+    // Other compilation targets perform best with a work-group size stride. This utility can only be called from the
+    // device.
+    static inline std::tuple<std::size_t, std::size_t, bool>
+    __stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item,
+                         std::size_t __work_group_size)
+    {
+        if constexpr (oneapi::dpl::__internal::__is_spirv_target_v)
+        {
+            const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group();
+            const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range();
+            const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id();
+            const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id();
+            const std::size_t __work_group_id = __item.get_group().get_group_linear_id();
+
+            const std::size_t __sub_group_start_idx =
+                __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id);
+            const bool __is_full_sub_group =
+                __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count;
+            const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id;
+            return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group);
+        }
+        else
+        {
+            const std::size_t __work_group_start_idx =
+                __item.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item;
+            const std::size_t __work_item_idx = __work_group_start_idx + __item.get_local_linear_id();
+            const bool __is_full_work_group =
+                __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count;
+            return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group);
+        }
+    }
+
     // Once there is enough work to launch a group on each compute unit with our chosen __iters_per_item,
     // then we should start using this code path.
     template <typename _ExecutionPolicy>
@@ -357,22 +390,14 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&&
     // then only compile the basic kernel as the two versions are effectively the same.
     if constexpr (__large_submitter::__iters_per_work_item > 1)
     {
-        if (__count < __large_submitter::__estimate_best_start_size(__exec))
+        if (__count >= __large_submitter::__estimate_best_start_size(__exec))
         {
-            return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
+            return __large_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
                                        std::forward<_Ranges>(__rngs)...);
         }
-        else
-        {
-            return __large_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
-                                       std::forward<_Ranges>(__rngs)...);
-        }
     }
-    else
-    {
-        return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
-                                   std::forward<_Ranges>(__rngs)...);
-    }
+    return __small_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
+                               std::forward<_Ranges>(__rngs)...);
 }
 
 //------------------------------------------------------------------------
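
To make the index math in __stride_recommender concrete, here is a host-side sketch that recomputes its SPIR-V branch. It is an illustration only, not part of the commit: the name stride_for_spirv and the geometry values (sub-group size 32, work-group size 512, 4 iterations per work item, count 2^20) are all made up for the example.

    // Illustration only (not part of this commit): the SPIR-V branch of
    // __stride_recommender, recomputed on the host with example geometry.
    #include <cstddef>
    #include <cstdio>
    #include <tuple>

    std::tuple<std::size_t, std::size_t, bool>
    stride_for_spirv(std::size_t wg_id, std::size_t sg_id, std::size_t sg_local_id,
                     std::size_t sg_size, std::size_t wg_size, std::size_t iters, std::size_t count)
    {
        // Each sub-group owns a contiguous block of iters * sg_size elements;
        // within it, work items advance with a stride of sg_size, so that
        // neighboring work items touch neighboring elements on each iteration.
        const std::size_t sg_start = iters * (wg_id * wg_size + sg_size * sg_id);
        const bool full = sg_start + iters * sg_size <= count;
        return {sg_start + sg_local_id, sg_size, full};
    }

    int main()
    {
        // Work item 0 of sub-group 1 in work-group 0 (all example values):
        const auto [idx, stride, full] = stride_for_spirv(0, 1, 0, 32, 512, 4, std::size_t{1} << 20);
        // This work item processes elements 128, 160, 192, 224.
        std::printf("start=%zu stride=%zu full=%d\n", idx, stride, int(full));
    }

The non-SPIR-V branch is the same computation with the work-group size in place of the sub-group size, which is why the function returns the chosen stride alongside the start index and the full-group flag.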
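
And a minimal sketch of the dispatch shape this commit arrives at, with stand-in submitter types. The names, the iters_per_work_item value, and the 1 << 16 threshold are invented for illustration; the real __estimate_best_start_size takes the execution policy, as shown in the diff above.

    // Illustration only: one early return for the large path, with the small
    // submitter as the shared fall-through (the simplification in this commit).
    #include <cstddef>
    #include <cstdio>

    struct small_submitter
    {
        void operator()(std::size_t n) const { std::printf("small path, n=%zu\n", n); }
    };

    struct large_submitter
    {
        static constexpr std::size_t iters_per_work_item = 4;              // example value
        static std::size_t estimate_best_start_size() { return 1u << 16; } // example threshold
        void operator()(std::size_t n) const { std::printf("large path, n=%zu\n", n); }
    };

    void dispatch(std::size_t count)
    {
        // In the real (templated) __parallel_for, if constexpr keeps the large
        // kernel from being instantiated when one iteration per work item makes
        // the two versions identical.
        if constexpr (large_submitter::iters_per_work_item > 1)
        {
            if (count >= large_submitter::estimate_best_start_size())
                return large_submitter{}(count); // {} to invoke the constructor, per review
        }
        return small_submitter{}(count);
    }

    int main()
    {
        dispatch(100);      // below the threshold -> small path
        dispatch(1u << 20); // above the threshold -> large path
    }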
