Signed-off-by: Sergey Kopienko <[email protected]>
SergeyKopienko committed Nov 20, 2024
1 parent 253ca8d commit 1fbe771
Showing 1 changed file with 91 additions and 24 deletions.
115 changes: 91 additions & 24 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
@@ -26,6 +26,8 @@
#include "sycl_defs.h"
#include "parallel_backend_sycl_utils.h"

//#define DUMP_DATA_LOADING 1

namespace oneapi
{
namespace dpl
@@ -280,6 +282,20 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
__internal::__optional_kernel_name<_DiagonalsKernelName...>,
__internal::__optional_kernel_name<_MergeKernelName...>>
{
#if DUMP_DATA_LOADING
template <typename _Range, typename _Index, typename _Data>
static void
__load_item_into_slm(_Range&& __rng, _Index __idx_from, _Data* __slm, _Index __idx_to, std::size_t __range_index,
bool __b_check, std::size_t __group_linear_id, std::size_t __local_id)
{
// Debugger breakpoint (BP) template:
// condition: __b_check
// action: __range_index = {__range_index}, __rng[{__idx_from}] -> __slm[{__idx_to}], __group_linear_id = {__group_linear_id}, __local_id = {__local_id}
// action: {__range_index}, {__idx_from}, {__idx_to}, {__group_linear_id}, {__local_id}
__slm[__idx_to] = __rng[__idx_from];
}
#endif
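
// Hedged note: the helper above only wraps the plain "__slm[__idx_to] = __rng[__idx_from]" store so that a
// conditional data-loading breakpoint (the BP template in its body) can be attached during debugging.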

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare>
auto
operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const
@@ -294,37 +310,50 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
const _IdType __n2 = __rng2.size();
const _IdType __n = __n1 + __n2;

#if DUMP_DATA_LOADING
//const bool __b_check = __n1 == 16144 && __n2 == 8072;
const bool __b_check = __n1 == 50716 && __n2 == 25358; // __wi_in_one_wg = 51 __wg_count = 12

if (__b_check)
{
// No-op block, presumably kept as an anchor for a host-side debugger breakpoint on the data sizes of interest
int i = 0;
i = i;
}
#endif

assert(__n1 > 0 || __n2 > 0);

_PRINT_INFO_IN_DEBUG_MODE(__exec);

// Empirical number of values to process per work-item
const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4;
assert(__chunk > 0);

// Define SLM bank size
constexpr std::size_t __slm_bank_size = 32; // TODO: is this the correct value? How can we get it from the hardware?

// Calculate how many data items we can read into one SLM bank
constexpr std::size_t __data_items_in_slm_bank = std::max((std::size_t)1, __slm_bank_size / sizeof(_RangeValueType));

// Pessimistically only use 4/5 of the memory to take into account the memory used by the compiled kernel
const auto __slm_adjusted_work_group_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_RangeValueType));
const auto __slm_adjusted_work_group_size_x_part = __slm_adjusted_work_group_size * 4 / 5;
// Empirical number of values to process per work-item
const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : __data_items_in_slm_bank; //4;
assert(__chunk > 0);
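
// Hedged worked example (assuming _RangeValueType is a 4-byte integer):
// __data_items_in_slm_bank == max(1, 32 / 4) == 8, so on a non-CPU device __chunk becomes 8
// instead of the previously hard-coded 4; on a CPU device __chunk stays 128.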

// The amount of data must be a multiple of the chunk size.
const std::size_t __max_source_data_items_fit_into_slm = __slm_adjusted_work_group_size_x_part - __slm_adjusted_work_group_size_x_part % __chunk;
assert(__max_source_data_items_fit_into_slm > 0);
assert(__max_source_data_items_fit_into_slm % __chunk == 0);
// Get the size of the local memory arena in bytes.
const std::size_t __slm_mem_size = __exec.queue().get_device().template get_info<sycl::info::device::local_mem_size>();

// Pessimistically only use 4/5 of the memory to take into account the memory used by the compiled kernel
const std::size_t __slm_mem_size_x_part = __slm_mem_size * 4 / 5;

// Calculate how many items we can place into SLM memory
const auto __slm_cached_items_count = __slm_mem_size_x_part / sizeof(_RangeValueType);

// The number of work-items in each work-group is the number of diagonals processed between two work-groups + 1 (for the left base diagonal in the work-group)
const std::size_t __wi_in_one_wg = __max_source_data_items_fit_into_slm / __chunk;
const std::size_t __wi_in_one_wg = __slm_cached_items_count / __chunk;
assert(__wi_in_one_wg > 0);

// The number of base diagonals is equal to the number of work-groups
// - also, the distance between two base diagonals is equal to the number of work-items in each work-group
const std::size_t __wg_count = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk * __wi_in_one_wg);

assert(__wg_count * __wi_in_one_wg * __chunk >= __n);
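
// Hedged worked example using the sizes from the debug check above (__n1 == 50716, __n2 == 25358):
// __n == 76074; with __chunk == 128 and __wi_in_one_wg == 51 (from the __b_check note above),
// __wg_count == ceil(76074 / (128 * 51)) == ceil(76074 / 6528) == 12,
// which matches the "__wi_in_one_wg = 51 __wg_count = 12" note next to __b_check.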

// Create storage to save the split-points on each base diagonal + 1 (for the right base diagonal in the last work-group)
// - in GLOBAL coordinates
using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>;
@@ -365,8 +394,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc<sycl::access_mode::read>(__cgh);
auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc);

const std::size_t __slm_cached_data_size = __wi_in_one_wg * __chunk;
__dpl_sycl::__local_accessor<_RangeValueType> __loc_acc(2 * __slm_cached_data_size, __cgh);
// __wi_in_one_wg * __chunk == 51 * 128 == 6528
__dpl_sycl::__local_accessor<_RangeValueType> __loc_acc(__wi_in_one_wg * __chunk, __cgh);

// Run an nd_range parallel_for to process all the data
// - each work-group caches its source data in SLM and processes the diagonals between two base diagonals;
@@ -393,46 +422,83 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
_RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]);
_RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size;
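
// Hedged note: both caches share the single SLM arena of __wi_in_one_wg * __chunk elements allocated above;
// __rng2_cache_slm simply starts __rng1_wg_data_size elements after __rng1_cache_slm.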

const _IdType __chunk_of_data_reading = std::max(__data_items_in_slm_bank, oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg));

const _IdType __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading);
const _IdType __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading);
const _IdType __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk);
const _IdType __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk);

// Only the first __how_many_wi_reads_rng1 work-items read data from __rng1
if (__local_id < __how_many_wi_reads_rng1)
{
const _IdType __idx_begin = __local_id * __chunk_of_data_reading;
const _IdType __idx_begin = __local_id * __chunk;

// Cooperative data load from __rng1 to __rng1_cache_slm
if (__idx_begin < __rng1_wg_data_size)
{
const _IdType __idx_end = std::min(__idx_begin + __chunk_of_data_reading, __rng1_wg_data_size);
const _IdType __idx_end = std::min(__idx_begin + __chunk, __rng1_wg_data_size);

_ONEDPL_PRAGMA_UNROLL
for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx)
#if !DUMP_DATA_LOADING
__rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx];
#else
__load_item_into_slm(__rng1, __sp_base_left_global.first + __idx, __rng1_cache_slm, __idx, 1, __b_check, __group_linear_id, __local_id);
#endif
}
}

const std::size_t __first_wi_local_id_for_read_rng2 = __wi_in_one_wg - __how_many_wi_reads_rng2 - 1;
// const bool __b_check = __n1 == 50716 && __n2 == 25358;
// __chunk == 128
// __wi_in_one_wg = 51, __wg_count = 12, __how_many_wi_reads_rng2 = 7
const std::size_t __first_wi_local_id_for_read_rng2 = __wi_in_one_wg - __how_many_wi_reads_rng2; // - 1;
// wi to read from __rng2 : 44, 45, 46, 47, 48, 49, 50
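// Hedged arithmetic check with the debug sizes above: 51 - 7 == 44, so work-items 44..50 read from __rng2,
// matching the list in the previous comment.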
if (__local_id >= __first_wi_local_id_for_read_rng2)
{
const _IdType __idx_begin = (__local_id - __first_wi_local_id_for_read_rng2) * __chunk_of_data_reading;
const _IdType __idx_begin = (__local_id - __first_wi_local_id_for_read_rng2) * __chunk;

// Cooperative data load from __rng2 to __rng2_cache_slm
if (__idx_begin < __rng2_wg_data_size)
{
const _IdType __idx_end = std::min(__idx_begin + __chunk_of_data_reading, __rng2_wg_data_size);
const _IdType __idx_end = std::min(__idx_begin + __chunk, __rng2_wg_data_size);

_ONEDPL_PRAGMA_UNROLL
for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx)
#if !DUMP_DATA_LOADING
__rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx];
#else
__load_item_into_slm(__rng2, __sp_base_left_global.second + __idx, __rng2_cache_slm, __idx, 2, __b_check, __group_linear_id, __local_id);
#endif
}
}

// Wait until all the data is loaded
__dpl_sycl::__group_barrier(__nd_item);

#if DUMP_DATA_LOADING
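// Debug-only verification: work-item 0 re-reads both cached slices and asserts that the SLM contents
// match the corresponding elements of the source ranges.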
if (__local_id == 0)
{
for (auto i = __sp_base_left_global.first; i < __sp_base_right_global.first; ++i)
{
auto _idx_slm = i - __sp_base_left_global.first;
if (__rng1_cache_slm[_idx_slm] != __rng1[i])
{
// No-op self-assignment, presumably kept so __group_linear_id can be inspected in a debugger before the assert fires
auto __group_linear_id_tmp = __group_linear_id;
__group_linear_id_tmp = __group_linear_id_tmp;
assert(false);
}
}

for (auto i = __sp_base_left_global.second; i < __sp_base_right_global.second; ++i)
{
auto _idx_slm = i - __sp_base_left_global.second;
if (__rng2_cache_slm[_idx_slm] != __rng2[i])
{
// Same no-op debugger anchor as in the __rng1 verification above
auto __group_linear_id_tmp = __group_linear_id;
__group_linear_id_tmp = __group_linear_id_tmp;
assert(false);
}
}
}
#endif

// Is the current diagonal inside the merge matrix?
if (__global_linear_id * __chunk < __n)
{
@@ -477,7 +543,8 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy
{
using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;

constexpr std::size_t __starting_size_limit_for_large_submitter = 1 * 1'048'576; // 1 Mb
//constexpr std::size_t __starting_size_limit_for_large_submitter = 1 * 1'048'576; // 1 Mb
constexpr std::size_t __starting_size_limit_for_large_submitter = 1'024;
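// Hedged note: with this debug value, inputs of 1'024 elements or more already take the large-submitter path,
// instead of the previous 1'048'576-element ("1 Mb") threshold.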

using _Range1ValueType = typename std::iterator_traits<decltype(__rng1.begin())>::value_type;
using _Range2ValueType = typename std::iterator_traits<decltype(__rng2.begin())>::value_type;
