diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
index e73c5ee569..f86951df70 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
@@ -26,6 +26,8 @@
 #include "sycl_defs.h"
 #include "parallel_backend_sycl_utils.h"
 
+//#define DUMP_DATA_LOADING 1
+
 namespace oneapi
 {
 namespace dpl
@@ -280,6 +282,20 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
                                         __internal::__optional_kernel_name<_DiagonalsKernelName...>,
                                         __internal::__optional_kernel_name<_MergeKernelName...>>
 {
+#if DUMP_DATA_LOADING
+    template <typename _Range, typename _Index, typename _Data>
+    static void
+    __load_item_into_slm(_Range&& __rng, _Index __idx_from, _Data* __slm, _Index __idx_to, std::size_t __range_index,
+                         bool __b_check, std::size_t __group_linear_id, std::size_t __local_id)
+    {
+        // BP
+        //  condition: __b_check
+        //  action: __range_index = {__range_index}, __rng[{__idx_from}] -> __slm[{__idx_to}], __group_linear_id = {__group_linear_id}, __local_id = {__local_id}
+        //  action: {__range_index}, {__idx_from}, {__idx_to}, {__group_linear_id}, {__local_id}
+        __slm[__idx_to] = __rng[__idx_from];
+    }
+#endif
+
     template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare>
     auto
     operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const
@@ -294,37 +310,50 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
         const _IdType __n1 = __rng1.size();
         const _IdType __n2 = __rng2.size();
         const _IdType __n = __n1 + __n2;
 
+#if DUMP_DATA_LOADING
+        //const bool __b_check = __n1 == 16144 && __n2 == 8072;
+        const bool __b_check = __n1 == 50716 && __n2 == 25358; // __wi_in_one_wg = 51 __wg_count = 12
+
+        if (__b_check)
+        {
+            int i = 0;
+            i = i;
+        }
+#endif
+
         assert(__n1 > 0 || __n2 > 0);
 
         _PRINT_INFO_IN_DEBUG_MODE(__exec);
 
-        // Empirical number of values to process per work-item
-        const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4;
-        assert(__chunk > 0);
-
         // Define SLM bank size
         constexpr std::size_t __slm_bank_size = 32; // TODO is it correct value? How to get it from hardware?
 
         // Calculate how many data items we can read into one SLM bank
         constexpr std::size_t __data_items_in_slm_bank = std::max((std::size_t)1, __slm_bank_size / sizeof(_RangeValueType));
 
-        // Pessimistically only use 2/3 of the memory to take into account memory used by compiled kernel
-        const auto __slm_adjusted_work_group_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_RangeValueType));
-        const auto __slm_adjusted_work_group_size_x_part = __slm_adjusted_work_group_size * 4 / 5;
+        // Empirical number of values to process per work-item
+        const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : __data_items_in_slm_bank; //4;
+        assert(__chunk > 0);
 
-        // The amount of data must be a multiple of the chunk size.
-        const std::size_t __max_source_data_items_fit_into_slm = __slm_adjusted_work_group_size_x_part - __slm_adjusted_work_group_size_x_part % __chunk;
-        assert(__max_source_data_items_fit_into_slm > 0);
-        assert(__max_source_data_items_fit_into_slm % __chunk == 0);
+        // Get the size of the local memory arena in bytes.
+        const std::size_t __slm_mem_size = __exec.queue().get_device().template get_info<sycl::info::device::local_mem_size>();
+
+        // Pessimistically only use 4/5 of the memory to take into account memory used by compiled kernel
+        const std::size_t __slm_mem_size_x_part = __slm_mem_size * 4 / 5;
+
+        // Calculate how many items we may place into SLM memory
+        const auto __slm_cached_items_count = __slm_mem_size_x_part / sizeof(_RangeValueType);
 
         // The amount of items in the each work-group is the amount of diagonals processing between two work-groups + 1 (for the left base diagonal in work-group)
-        const std::size_t __wi_in_one_wg = __max_source_data_items_fit_into_slm / __chunk;
+        const std::size_t __wi_in_one_wg = __slm_cached_items_count / __chunk;
         assert(__wi_in_one_wg > 0);
 
         // The amount of the base diagonals is the amount of the work-groups
         // - also it's the distance between two base diagonals is equal to the amount of work-items in each work-group
         const std::size_t __wg_count = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk * __wi_in_one_wg);
 
+        assert(__wg_count * __wi_in_one_wg * __chunk >= __n);
+
         // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group)
         // - in GLOBAL coordinates
         using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>;
@@ -365,8 +394,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
             auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc<sycl::access_mode::read>(__cgh);
             auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc);
 
-            const std::size_t __slm_cached_data_size = __wi_in_one_wg * __chunk;
-            __dpl_sycl::__local_accessor<_RangeValueType> __loc_acc(2 * __slm_cached_data_size, __cgh);
+            // __wi_in_one_wg * __chunk == 51 * 128 == 6528
+            __dpl_sycl::__local_accessor<_RangeValueType> __loc_acc(__wi_in_one_wg * __chunk, __cgh);
 
             // Run nd_range parallel_for to process all the data
             // - each work-group caching source data in SLM and processing diagonals between two base diagonals;
@@ -393,46 +422,83 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
                     _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]);
                     _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size;
 
-                    const _IdType __chunk_of_data_reading = std::max(__data_items_in_slm_bank, oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg));
-
-                    const _IdType __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading);
-                    const _IdType __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading);
+                    const _IdType __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk);
+                    const _IdType __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk);
 
                     // Calculate the amount of WI for read data from rng1
                     if (__local_id < __how_many_wi_reads_rng1)
                     {
-                        const _IdType __idx_begin = __local_id * __chunk_of_data_reading;
+                        const _IdType __idx_begin = __local_id * __chunk;
 
                         // Cooperative data load from __rng1 to __rng1_cache_slm
                         if (__idx_begin < __rng1_wg_data_size)
                         {
-                            const _IdType __idx_end = std::min(__idx_begin + __chunk_of_data_reading, __rng1_wg_data_size);
+                            const _IdType __idx_end = std::min(__idx_begin + __chunk, __rng1_wg_data_size);
 
                            _ONEDPL_PRAGMA_UNROLL
                            for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx)
+#if !DUMP_DATA_LOADING
                                __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx];
+#else
+                                __load_item_into_slm(__rng1, __sp_base_left_global.first + __idx, __rng1_cache_slm, __idx, 1, __b_check, __group_linear_id, __local_id);
+#endif
                        }
                    }
 
-                    const std::size_t __first_wi_local_id_for_read_rng2 = __wi_in_one_wg - __how_many_wi_reads_rng2 - 1;
+                    // const bool __b_check = __n1 == 50716 && __n2 == 25358;
+                    // __chunk == 128
+                    // __wi_in_one_wg = 51, __wg_count = 12, __how_many_wi_reads_rng2 = 7
+                    const std::size_t __first_wi_local_id_for_read_rng2 = __wi_in_one_wg - __how_many_wi_reads_rng2; // - 1;
+                    // wi to read from __rng2 : 44, 45, 46, 47, 48, 49, 50
                    if (__local_id >= __first_wi_local_id_for_read_rng2)
                    {
-                        const _IdType __idx_begin = (__local_id - __first_wi_local_id_for_read_rng2) * __chunk_of_data_reading;
+                        const _IdType __idx_begin = (__local_id - __first_wi_local_id_for_read_rng2) * __chunk;
 
                        // Cooperative data load from __rng2 to __rng2_cache_slm
                        if (__idx_begin < __rng2_wg_data_size)
                        {
-                            const _IdType __idx_end = std::min(__idx_begin + __chunk_of_data_reading, __rng2_wg_data_size);
+                            const _IdType __idx_end = std::min(__idx_begin + __chunk, __rng2_wg_data_size);
 
                            _ONEDPL_PRAGMA_UNROLL
                            for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx)
+#if !DUMP_DATA_LOADING
                                __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx];
+#else
+                                __load_item_into_slm(__rng2, __sp_base_left_global.second + __idx, __rng2_cache_slm, __idx, 2, __b_check, __group_linear_id, __local_id);
+#endif
                        }
                    }
 
                    // Wait until all the data is loaded
                    __dpl_sycl::__group_barrier(__nd_item);
 
+#if DUMP_DATA_LOADING
+                    if (__local_id == 0)
+                    {
+                        for (auto i = __sp_base_left_global.first; i < __sp_base_right_global.first; ++i)
+                        {
+                            auto _idx_slm = i - __sp_base_left_global.first;
+                            if (__rng1_cache_slm[_idx_slm] != __rng1[i])
+                            {
+                                auto __group_linear_id_tmp = __group_linear_id;
+                                __group_linear_id_tmp = __group_linear_id_tmp;
+                                assert(false);
+                            }
+                        }
+
+                        for (auto i = __sp_base_left_global.second; i < __sp_base_right_global.second; ++i)
+                        {
+                            auto _idx_slm = i - __sp_base_left_global.second;
+                            if (__rng2_cache_slm[_idx_slm] != __rng2[i])
+                            {
+                                auto __group_linear_id_tmp = __group_linear_id;
+                                __group_linear_id_tmp = __group_linear_id_tmp;
+                                assert(false);
+                            }
+                        }
+                    }
+#endif
+
                    // Current diagonal inside of the merge matrix?
                    if (__global_linear_id * __chunk < __n)
                    {
@@ -477,7 +543,8 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy
 {
     using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
 
-    constexpr std::size_t __starting_size_limit_for_large_submitter = 1 * 1'048'576; // 1 Mb
+    //constexpr std::size_t __starting_size_limit_for_large_submitter = 1 * 1'048'576; // 1 Mb
+    constexpr std::size_t __starting_size_limit_for_large_submitter = 1'024;
 
     using _Range1ValueType = typename std::iterator_traits<decltype(__rng1.begin())>::value_type;
     using _Range2ValueType = typename std::iterator_traits<decltype(__rng2.begin())>::value_type;
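
For reference, a minimal host-side sketch of the work-group sizing math this patch switches to: query the local memory arena size, keep 4/5 of it, convert bytes to cached items, and derive the work-items per work-group and the work-group count. The 64 KiB arena and the 8-byte element size are illustrative assumptions only (the real code queries the device and uses sizeof(_RangeValueType)), chosen so the output reproduces the __wi_in_one_wg = 51 / __wg_count = 12 case noted in the debug comments.

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    int main()
    {
        // Example inputs (assumed for illustration; the kernel code queries these at runtime).
        const std::size_t __slm_mem_size = 65536;   // sycl::info::device::local_mem_size, in bytes
        const std::size_t __value_size = 8;         // stands in for sizeof(_RangeValueType)
        const std::size_t __n = 50716 + 25358;      // __n1 + __n2 from the debug condition
        const bool __is_cpu = true;                 // the 128-element chunk is the CPU branch

        // Same SLM bank constants as in the patch.
        const std::size_t __slm_bank_size = 32;
        const std::size_t __data_items_in_slm_bank = std::max<std::size_t>(1, __slm_bank_size / __value_size);

        // Values per work-item: 128 on CPU, otherwise one SLM bank worth of items.
        const std::size_t __chunk = __is_cpu ? 128 : __data_items_in_slm_bank;

        // Keep 4/5 of the local memory arena, then convert bytes to items.
        const std::size_t __slm_mem_size_x_part = __slm_mem_size * 4 / 5;
        const std::size_t __slm_cached_items_count = __slm_mem_size_x_part / __value_size;

        // Work-items per work-group and work-group count (ceiling division, as __dpl_ceiling_div does).
        const std::size_t __wi_in_one_wg = __slm_cached_items_count / __chunk;
        const std::size_t __wg_count = (__n + __chunk * __wi_in_one_wg - 1) / (__chunk * __wi_in_one_wg);

        // Prints: wi_in_one_wg = 51, wg_count = 12
        std::printf("wi_in_one_wg = %zu, wg_count = %zu\n", __wi_in_one_wg, __wg_count);
        return 0;
    }

With these figures the new assert in the patch also holds: 12 * 51 * 128 = 78336 >= 76074.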
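
Similarly, a small sketch of how the cooperative SLM load is partitioned across work-items after the switch from __chunk_of_data_reading to __chunk and the removal of the trailing "- 1": the last __how_many_wi_reads_rng2 work-items of the group copy __rng2, one __chunk each. The concrete per-work-group size below is an assumption picked to match the figures in the debug comments (__wi_in_one_wg = 51, __chunk = 128, 7 readers, local ids 44..50).

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    int main()
    {
        // Assumed per-work-group sizes (illustrative only).
        const std::size_t __wi_in_one_wg = 51;
        const std::size_t __chunk = 128;
        const std::size_t __rng2_wg_data_size = 850;   // needs ceil(850 / 128) = 7 reading work-items

        const std::size_t __how_many_wi_reads_rng2 = (__rng2_wg_data_size + __chunk - 1) / __chunk;   // 7
        const std::size_t __first_wi = __wi_in_one_wg - __how_many_wi_reads_rng2;                      // 44

        // Work-items 44..50 each copy one chunk of __rng2 into the SLM cache.
        for (std::size_t __local_id = __first_wi; __local_id < __wi_in_one_wg; ++__local_id)
        {
            const std::size_t __idx_begin = (__local_id - __first_wi) * __chunk;
            if (__idx_begin < __rng2_wg_data_size)
            {
                const std::size_t __idx_end = std::min(__idx_begin + __chunk, __rng2_wg_data_size);
                std::printf("local id %zu loads __rng2[%zu, %zu)\n", __local_id, __idx_begin, __idx_end);
            }
        }
        return 0;
    }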