diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
index e73c5ee569..f86951df70 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
@@ -26,6 +26,8 @@
 #include "sycl_defs.h"
 #include "parallel_backend_sycl_utils.h"
 
+//#define DUMP_DATA_LOADING 1
+
 namespace oneapi
 {
 namespace dpl
@@ -280,6 +282,20 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
                                         __internal::__optional_kernel_name<_DiagonalsKernelName...>,
                                         __internal::__optional_kernel_name<_MergeKernelName...>>
 {
+#if DUMP_DATA_LOADING
+    template <typename _Range, typename _Index, typename _Data>
+    static void
+    __load_item_into_slm(_Range&& __rng, _Index __idx_from, _Data* __slm, _Index __idx_to, std::size_t __range_index,
+                         bool __b_check, std::size_t __group_linear_id, std::size_t __local_id)
+    {
+        // BP
+        //  condition: __b_check
+        //  action: __range_index = {__range_index}, __rng[{__idx_from}] -> __slm[{__idx_to}], __group_linear_id = {__group_linear_id}, __local_id = {__local_id}
+        //  action: {__range_index}, {__idx_from}, {__idx_to}, {__group_linear_id}, {__local_id}
+        __slm[__idx_to] = __rng[__idx_from];
+    }
+#endif
+
     template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare>
     auto
     operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const
@@ -294,37 +310,50 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
         const _IdType __n1 = __rng1.size();
         const _IdType __n2 = __rng2.size();
         const _IdType __n = __n1 + __n2;
 
+#if DUMP_DATA_LOADING
+        //const bool __b_check = __n1 == 16144 && __n2 == 8072;
+        const bool __b_check = __n1 == 50716 && __n2 == 25358; // __wi_in_one_wg = 51 __wg_count = 12
+
+        if (__b_check)
+        {
+            int i = 0;
+            i = i;
+        }
+#endif
+
         assert(__n1 > 0 || __n2 > 0);
 
         _PRINT_INFO_IN_DEBUG_MODE(__exec);
 
-        // Empirical number of values to process per work-item
-        const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4;
-        assert(__chunk > 0);
-
         // Define SLM bank size
         constexpr std::size_t __slm_bank_size = 32; // TODO is it correct value? How to get it from hardware?
 
         // Calculate how many data items we can read into one SLM bank
         constexpr std::size_t __data_items_in_slm_bank = std::max((std::size_t)1, __slm_bank_size / sizeof(_RangeValueType));
 
-        // Pessimistically only use 2/3 of the memory to take into account memory used by compiled kernel
-        const auto __slm_adjusted_work_group_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(_RangeValueType));
-        const auto __slm_adjusted_work_group_size_x_part = __slm_adjusted_work_group_size * 4 / 5;
+        // Empirical number of values to process per work-item
+        const _IdType __chunk = __exec.queue().get_device().is_cpu() ? 128 : __data_items_in_slm_bank; //4;
+        assert(__chunk > 0);
 
-        // The amount of data must be a multiple of the chunk size.
-        const std::size_t __max_source_data_items_fit_into_slm = __slm_adjusted_work_group_size_x_part - __slm_adjusted_work_group_size_x_part % __chunk;
-        assert(__max_source_data_items_fit_into_slm > 0);
-        assert(__max_source_data_items_fit_into_slm % __chunk == 0);
+        // Get the size of the local memory arena in bytes.
+        const std::size_t __slm_mem_size = __exec.queue().get_device().template get_info<sycl::info::device::local_mem_size>();
+
+        // Pessimistically only use 4/5 of the memory to take into account memory used by compiled kernel
+        const std::size_t __slm_mem_size_x_part = __slm_mem_size * 4 / 5;
+
+        // Calculate how many items we may place into SLM memory
+        const auto __slm_cached_items_count = __slm_mem_size_x_part / sizeof(_RangeValueType);
 
         // The amount of items in the each work-group is the amount of diagonals processing between two work-groups + 1 (for the left base diagonal in work-group)
-        const std::size_t __wi_in_one_wg = __max_source_data_items_fit_into_slm / __chunk;
+        const std::size_t __wi_in_one_wg = __slm_cached_items_count / __chunk;
         assert(__wi_in_one_wg > 0);
 
         // The amount of the base diagonals is the amount of the work-groups
         // - also it's the distance between two base diagonals is equal to the amount of work-items in each work-group
         const std::size_t __wg_count = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk * __wi_in_one_wg);
 
+        assert(__wg_count * __wi_in_one_wg * __chunk >= __n);
+
         // Create storage for save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group)
         // - in GLOBAL coordinates
         using __base_diagonals_sp_storage_t = __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>;
@@ -365,8 +394,8 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
             auto __base_diagonals_sp_global_acc = __base_diagonals_sp_global_storage.template __get_scratch_acc<sycl::access_mode::read>(__cgh);
             auto __base_diagonals_sp_global_ptr = __base_diagonals_sp_storage_t::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc);
 
-            const std::size_t __slm_cached_data_size = __wi_in_one_wg * __chunk;
-            __dpl_sycl::__local_accessor<_RangeValueType> __loc_acc(2 * __slm_cached_data_size, __cgh);
+            // __wi_in_one_wg * __chunk == 51 * 128 == 6528
+            __dpl_sycl::__local_accessor<_RangeValueType> __loc_acc(__wi_in_one_wg * __chunk, __cgh);
 
             // Run nd_range parallel_for to process all the data
             // - each work-group caching source data in SLM and processing diagonals between two base diagonals;
@@ -393,46 +422,83 @@ struct __parallel_merge_submitter_large<_IdType, _CustomName,
                     _RangeValueType* __rng1_cache_slm = std::addressof(__loc_acc[0]);
                     _RangeValueType* __rng2_cache_slm = std::addressof(__loc_acc[0]) + __rng1_wg_data_size;
 
-                    const _IdType __chunk_of_data_reading = std::max(__data_items_in_slm_bank, oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size + __rng2_wg_data_size, __wi_in_one_wg));
-
-                    const _IdType __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk_of_data_reading);
-                    const _IdType __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk_of_data_reading);
+                    const _IdType __how_many_wi_reads_rng1 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng1_wg_data_size, __chunk);
+                    const _IdType __how_many_wi_reads_rng2 = oneapi::dpl::__internal::__dpl_ceiling_div(__rng2_wg_data_size, __chunk);
 
                     // Calculate the amount of WI for read data from rng1
                     if (__local_id < __how_many_wi_reads_rng1)
                     {
-                        const _IdType __idx_begin = __local_id * __chunk_of_data_reading;
+                        const _IdType __idx_begin = __local_id * __chunk;
 
                         // Cooperative data load from __rng1 to __rng1_cache_slm
                         if (__idx_begin < __rng1_wg_data_size)
                         {
-                            const _IdType __idx_end = std::min(__idx_begin + __chunk_of_data_reading, __rng1_wg_data_size);
+                            const _IdType __idx_end = std::min(__idx_begin + __chunk, __rng1_wg_data_size);
 
                            _ONEDPL_PRAGMA_UNROLL
                            for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx)
+#if !DUMP_DATA_LOADING
                                __rng1_cache_slm[__idx] = __rng1[__sp_base_left_global.first + __idx];
+#else
+                                __load_item_into_slm(__rng1, __sp_base_left_global.first + __idx, __rng1_cache_slm, __idx, 1, __b_check, __group_linear_id, __local_id);
+#endif
                        }
                    }
 
-                    const std::size_t __first_wi_local_id_for_read_rng2 = __wi_in_one_wg - __how_many_wi_reads_rng2 - 1;
+                    // const bool __b_check = __n1 == 50716 && __n2 == 25358;
+                    // __chunk == 128
+                    // __wi_in_one_wg = 51, __wg_count = 12, __how_many_wi_reads_rng2 = 7
+                    const std::size_t __first_wi_local_id_for_read_rng2 = __wi_in_one_wg - __how_many_wi_reads_rng2; // - 1;
+                    // wi to read from __rng2 : 44, 45, 46, 47, 48, 49, 50
                    if (__local_id >= __first_wi_local_id_for_read_rng2)
                    {
-                        const _IdType __idx_begin = (__local_id - __first_wi_local_id_for_read_rng2) * __chunk_of_data_reading;
+                        const _IdType __idx_begin = (__local_id - __first_wi_local_id_for_read_rng2) * __chunk;
 
                        // Cooperative data load from __rng2 to __rng2_cache_slm
                        if (__idx_begin < __rng2_wg_data_size)
                        {
-                            const _IdType __idx_end = std::min(__idx_begin + __chunk_of_data_reading, __rng2_wg_data_size);
+                            const _IdType __idx_end = std::min(__idx_begin + __chunk, __rng2_wg_data_size);
 
                            _ONEDPL_PRAGMA_UNROLL
                            for (_IdType __idx = __idx_begin; __idx < __idx_end; ++__idx)
+#if !DUMP_DATA_LOADING
                                __rng2_cache_slm[__idx] = __rng2[__sp_base_left_global.second + __idx];
+#else
+                                __load_item_into_slm(__rng2, __sp_base_left_global.second + __idx, __rng2_cache_slm, __idx, 2, __b_check, __group_linear_id, __local_id);
+#endif
                        }
                    }
 
                    // Wait until all the data is loaded
                    __dpl_sycl::__group_barrier(__nd_item);
 
+#if DUMP_DATA_LOADING
+                    if (__local_id == 0)
+                    {
+                        for (auto i = __sp_base_left_global.first; i < __sp_base_right_global.first; ++i)
+                        {
+                            auto _idx_slm = i - __sp_base_left_global.first;
+                            if (__rng1_cache_slm[_idx_slm] != __rng1[i])
+                            {
+                                auto __group_linear_id_tmp = __group_linear_id;
+                                __group_linear_id_tmp = __group_linear_id_tmp;
+                                assert(false);
+                            }
+                        }
+
+                        for (auto i = __sp_base_left_global.second; i < __sp_base_right_global.second; ++i)
+                        {
+                            auto _idx_slm = i - __sp_base_left_global.second;
+                            if (__rng2_cache_slm[_idx_slm] != __rng2[i])
+                            {
+                                auto __group_linear_id_tmp = __group_linear_id;
+                                __group_linear_id_tmp = __group_linear_id_tmp;
+                                assert(false);
+                            }
+                        }
+                    }
+#endif
+
                    // Current diagonal inside of the merge matrix?
                    if (__global_linear_id * __chunk < __n)
                    {
@@ -477,7 +543,8 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy
 {
     using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
 
-    constexpr std::size_t __starting_size_limit_for_large_submitter = 1 * 1'048'576; // 1 Mb
+    //constexpr std::size_t __starting_size_limit_for_large_submitter = 1 * 1'048'576; // 1 Mb
+    constexpr std::size_t __starting_size_limit_for_large_submitter = 1'024;
 
     using _Range1ValueType = typename std::iterator_traits<decltype(__rng1.begin())>::value_type;
     using _Range2ValueType = typename std::iterator_traits<decltype(__rng2.begin())>::value_type;
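
For reference, a minimal host-side sketch of the work-group sizing math this patch switches to: query the local memory arena size, keep 4/5 of it, convert bytes to cached items, and derive the work-items per work-group and the work-group count. The 64 KiB arena and the 8-byte element size are illustrative assumptions only (the real code queries the device and uses sizeof(_RangeValueType)), chosen so the output reproduces the __wi_in_one_wg = 51 / __wg_count = 12 case noted in the debug comments.

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    int main()
    {
        // Example inputs (assumed for illustration; the kernel code queries these at runtime).
        const std::size_t __slm_mem_size = 65536;   // sycl::info::device::local_mem_size, in bytes
        const std::size_t __value_size = 8;         // stands in for sizeof(_RangeValueType)
        const std::size_t __n = 50716 + 25358;      // __n1 + __n2 from the debug condition
        const bool __is_cpu = true;                 // the 128-element chunk is the CPU branch

        // Same SLM bank constants as in the patch.
        const std::size_t __slm_bank_size = 32;
        const std::size_t __data_items_in_slm_bank = std::max<std::size_t>(1, __slm_bank_size / __value_size);

        // Values per work-item: 128 on CPU, otherwise one SLM bank worth of items.
        const std::size_t __chunk = __is_cpu ? 128 : __data_items_in_slm_bank;

        // Keep 4/5 of the local memory arena, then convert bytes to items.
        const std::size_t __slm_mem_size_x_part = __slm_mem_size * 4 / 5;
        const std::size_t __slm_cached_items_count = __slm_mem_size_x_part / __value_size;

        // Work-items per work-group and work-group count (ceiling division, as __dpl_ceiling_div does).
        const std::size_t __wi_in_one_wg = __slm_cached_items_count / __chunk;
        const std::size_t __wg_count = (__n + __chunk * __wi_in_one_wg - 1) / (__chunk * __wi_in_one_wg);

        // Prints: wi_in_one_wg = 51, wg_count = 12
        std::printf("wi_in_one_wg = %zu, wg_count = %zu\n", __wi_in_one_wg, __wg_count);
        return 0;
    }

With these figures the new assert in the patch also holds: 12 * 51 * 128 = 78336 >= 76074.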
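
Similarly, a small sketch of how the cooperative SLM load is partitioned across work-items after the switch from __chunk_of_data_reading to __chunk and the removal of the trailing "- 1": the last __how_many_wi_reads_rng2 work-items of the group copy __rng2, one __chunk each. The concrete per-work-group size below is an assumption picked to match the figures in the debug comments (__wi_in_one_wg = 51, __chunk = 128, 7 readers, local ids 44..50).

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    int main()
    {
        // Assumed per-work-group sizes (illustrative only).
        const std::size_t __wi_in_one_wg = 51;
        const std::size_t __chunk = 128;
        const std::size_t __rng2_wg_data_size = 850;   // needs ceil(850 / 128) = 7 reading work-items

        const std::size_t __how_many_wi_reads_rng2 = (__rng2_wg_data_size + __chunk - 1) / __chunk;   // 7
        const std::size_t __first_wi = __wi_in_one_wg - __how_many_wi_reads_rng2;                      // 44

        // Work-items 44..50 each copy one chunk of __rng2 into the SLM cache.
        for (std::size_t __local_id = __first_wi; __local_id < __wi_in_one_wg; ++__local_id)
        {
            const std::size_t __idx_begin = (__local_id - __first_wi) * __chunk;
            if (__idx_begin < __rng2_wg_data_size)
            {
                const std::size_t __idx_end = std::min(__idx_begin + __chunk, __rng2_wg_data_size);
                std::printf("local id %zu loads __rng2[%zu, %zu)\n", __local_id, __idx_begin, __idx_end);
            }
        }
        return 0;
    }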