From 9764a57d78e6531101e9a0c43bda0242e4962eff Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Thu, 5 Sep 2024 12:08:52 -0700 Subject: [PATCH 01/65] Optimize memory transactions in SYCL backend parallel for Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 54 +++++++++++++++++-- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 96d63e33aee..652db0c65f5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -237,13 +237,57 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> { assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0); _PRINT_INFO_IN_DEBUG_MODE(__exec); - auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) { + auto __event = __exec.queue().submit([&__rngs..., &__brick, &__exec, __count](sycl::handler& __cgh) { //get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - - __cgh.parallel_for<_Name...>(sycl::range(__count), [=](sycl::item __item_id) { - auto __idx = __item_id.get_linear_id(); - __brick(__idx, __rngs...); + std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec); + + // For target architectures, 512 bytes is the maximum amount of data that can be performed in a single load / store + // transaction. Assuming a sub-group size of 32, 512 / 32 = 16 which is the number of bytes we wish to load / store + // per work-item. For architectures that do not support load / stores of 512 bytes (e.g. 128 bytes), several smaller + // but coalesced transactions will be made and performance should still be maximized. + // Grab the value type of the first range to estimate the optimal iters per work item. + using _ValueType = oneapi::dpl::__internal::__value_t>>>; + constexpr std::uint16_t __max_bytes_per_transaction = 512; + constexpr std::uint16_t __predicted_sub_group_size = 32; + constexpr std::uint16_t __bytes_per_work_item = __max_bytes_per_transaction / __predicted_sub_group_size; + // If the _ValueType > 128 bytes (unlikely), then perform a single iteration per work item. 
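+            // e.g. a 4-byte _ValueType (float) gives 16 / 4 = 4 iterations per work item, while a
+            // 32-byte _ValueType gives std::max(1, 16 / 32) = 1 iteration per work item.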
+ constexpr std::uint16_t __iters_per_work_item = std::max(std::size_t{1}, __bytes_per_work_item / sizeof(_ValueType)); + std::size_t __num_items = std::max(static_cast<_Index>(__work_group_size), oneapi::dpl::__internal::__dpl_ceiling_div(__count, __iters_per_work_item)); + // TODO: optimize for small data sizes that do not saturate the device with this scheme + __cgh.parallel_for<_Name...>(sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), [=](sycl::nd_item __ndi) { + __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); + std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); + + std::size_t __sub_group_start_idx = + __iters_per_work_item * (__work_group_id * __work_group_size + + __sub_group_size * __sub_group_id); + bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; + std::size_t __idx = __sub_group_start_idx + __sub_group_local_id; + if (__is_full_sub_group) + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) + { + __brick(__idx, __rngs...); + __idx += __sub_group_size; + } + } + else + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) + { + if (__idx < __count) + { + __brick(__idx, __rngs...); + __idx += __sub_group_size; + } + } + } }); }); return __future(__event); From c836b1d20486f7b3b6ca783937a9848884b54cde Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Thu, 5 Sep 2024 14:12:08 -0500 Subject: [PATCH 02/65] clang-format Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 68 ++++++++++--------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 652db0c65f5..1b077268117 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -247,48 +247,54 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> // per work-item. For architectures that do not support load / stores of 512 bytes (e.g. 128 bytes), several smaller // but coalesced transactions will be made and performance should still be maximized. // Grab the value type of the first range to estimate the optimal iters per work item. - using _ValueType = oneapi::dpl::__internal::__value_t>>>; + using _ValueType = + oneapi::dpl::__internal::__value_t>>>; constexpr std::uint16_t __max_bytes_per_transaction = 512; constexpr std::uint16_t __predicted_sub_group_size = 32; constexpr std::uint16_t __bytes_per_work_item = __max_bytes_per_transaction / __predicted_sub_group_size; // If the _ValueType > 128 bytes (unlikely), then perform a single iteration per work item. 
- constexpr std::uint16_t __iters_per_work_item = std::max(std::size_t{1}, __bytes_per_work_item / sizeof(_ValueType)); - std::size_t __num_items = std::max(static_cast<_Index>(__work_group_size), oneapi::dpl::__internal::__dpl_ceiling_div(__count, __iters_per_work_item)); + constexpr std::uint16_t __iters_per_work_item = + std::max(std::size_t{1}, __bytes_per_work_item / sizeof(_ValueType)); + std::size_t __num_items = + std::max(static_cast<_Index>(__work_group_size), + oneapi::dpl::__internal::__dpl_ceiling_div(__count, __iters_per_work_item)); // TODO: optimize for small data sizes that do not saturate the device with this scheme - __cgh.parallel_for<_Name...>(sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), [=](sycl::nd_item __ndi) { - __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); - std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); - std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); - std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); - std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); - - std::size_t __sub_group_start_idx = - __iters_per_work_item * (__work_group_id * __work_group_size + - __sub_group_size * __sub_group_id); - bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; - std::size_t __idx = __sub_group_start_idx + __sub_group_local_id; - if (__is_full_sub_group) - { - _ONEDPL_PRAGMA_UNROLL - for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) - { - __brick(__idx, __rngs...); - __idx += __sub_group_size; - } - } - else - { - _ONEDPL_PRAGMA_UNROLL - for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) + __cgh.parallel_for<_Name...>( + sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), + [=](sycl::nd_item __ndi) { + __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); + std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); + + std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + + __sub_group_size * __sub_group_id); + bool __is_full_sub_group = + __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; + std::size_t __idx = __sub_group_start_idx + __sub_group_local_id; + if (__is_full_sub_group) { - if (__idx < __count) + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) { __brick(__idx, __rngs...); __idx += __sub_group_size; } } - } - }); + else + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) + { + if (__idx < __count) + { + __brick(__idx, __rngs...); + __idx += __sub_group_size; + } + } + } + }); }); return __future(__event); } From 55f33a4c274542f814ec3c715430cd2c6ca9ac69 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 6 Sep 2024 09:42:06 -0700 Subject: [PATCH 03/65] Correct comment and error handling. 128 byte memory operations are performed instead of 512 after inspecting the assembly. Processing 512 bytes per sub-group still seems to be the best value after experimentation. 
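
For reference, the arithmetic this heuristic encodes, as a standalone
sketch (the names below are illustrative, not the oneDPL identifiers):

    #include <algorithm>
    #include <cstddef>

    // 512 bytes per sub-group / 32 work-items per sub-group = 16 bytes per
    // work-item; larger element types clamp to one element per work-item.
    constexpr std::size_t bytes_per_sub_group = 512;
    constexpr std::size_t sub_group_size = 32;
    constexpr std::size_t bytes_per_work_item = bytes_per_sub_group / sub_group_size;

    template <typename _T>
    constexpr std::size_t
    iters_per_work_item()
    {
        return std::max(std::size_t{1}, bytes_per_work_item / sizeof(_T));
    }

    static_assert(iters_per_work_item<float>() == 4);
    static_assert(iters_per_work_item<double>() == 2);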
Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 1b077268117..70405bedce6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -242,22 +242,21 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec); - // For target architectures, 512 bytes is the maximum amount of data that can be performed in a single load / store - // transaction. Assuming a sub-group size of 32, 512 / 32 = 16 which is the number of bytes we wish to load / store - // per work-item. For architectures that do not support load / stores of 512 bytes (e.g. 128 bytes), several smaller - // but coalesced transactions will be made and performance should still be maximized. + // Processing 512 bytes per sub-group has shown the best performance on target architectures. // Grab the value type of the first range to estimate the optimal iters per work item. using _ValueType = oneapi::dpl::__internal::__value_t>>>; - constexpr std::uint16_t __max_bytes_per_transaction = 512; + + constexpr std::uint16_t __max_bytes_per_sub_group = 512; constexpr std::uint16_t __predicted_sub_group_size = 32; - constexpr std::uint16_t __bytes_per_work_item = __max_bytes_per_transaction / __predicted_sub_group_size; + constexpr std::uint16_t __bytes_per_work_item = __max_bytes_per_sub_group / __predicted_sub_group_size; // If the _ValueType > 128 bytes (unlikely), then perform a single iteration per work item. 
constexpr std::uint16_t __iters_per_work_item = std::max(std::size_t{1}, __bytes_per_work_item / sizeof(_ValueType)); - std::size_t __num_items = - std::max(static_cast<_Index>(__work_group_size), - oneapi::dpl::__internal::__dpl_ceiling_div(__count, __iters_per_work_item)); + std::size_t __num_groups = + std::max(__work_group_size, + oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item))); + std::size_t __num_items = __num_groups * __work_group_size; // TODO: optimize for small data sizes that do not saturate the device with this scheme __cgh.parallel_for<_Name...>( sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), From adadd56dfe608b537682f74f0131784e657a5d88 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 10 Sep 2024 08:29:09 -0700 Subject: [PATCH 04/65] __num_groups bugfix Signed-off-by: Matthew Michel --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 70405bedce6..e54b2c5f300 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -254,7 +254,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> constexpr std::uint16_t __iters_per_work_item = std::max(std::size_t{1}, __bytes_per_work_item / sizeof(_ValueType)); std::size_t __num_groups = - std::max(__work_group_size, + std::max(std::size_t{1}, oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item))); std::size_t __num_items = __num_groups * __work_group_size; // TODO: optimize for small data sizes that do not saturate the device with this scheme From 71d7bccc1f5f39406d11541dfc7de397c6abc9e2 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 16 Sep 2024 08:58:22 -0700 Subject: [PATCH 05/65] Introduce stride recommender for different targets and better distribute work for small inputs Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 48 ++++++++----------- .../dpcpp/parallel_backend_sycl_utils.h | 34 +++++++++++++ 2 files changed, 54 insertions(+), 28 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index e54b2c5f300..88a62ffdc65 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -240,56 +240,48 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> auto __event = __exec.queue().submit([&__rngs..., &__brick, &__exec, __count](sycl::handler& __cgh) { //get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec); + + // Limit the work-group size to 512 which has empirically yielded the best results. + std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 512); + __work_group_size = std::min(__work_group_size, static_cast(__count)); // Processing 512 bytes per sub-group has shown the best performance on target architectures. // Grab the value type of the first range to estimate the optimal iters per work item. 
using _ValueType = oneapi::dpl::__internal::__value_t>>>; - constexpr std::uint16_t __max_bytes_per_sub_group = 512; - constexpr std::uint16_t __predicted_sub_group_size = 32; - constexpr std::uint16_t __bytes_per_work_item = __max_bytes_per_sub_group / __predicted_sub_group_size; - // If the _ValueType > 128 bytes (unlikely), then perform a single iteration per work item. - constexpr std::uint16_t __iters_per_work_item = - std::max(std::size_t{1}, __bytes_per_work_item / sizeof(_ValueType)); + constexpr std::size_t __bytes_per_work_item = 16; + constexpr std::size_t __max_iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_ValueType)); + auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); + std::size_t __elems_per_compute_unit = oneapi::dpl::__internal::__dpl_ceiling_div(__count, __max_cu * __work_group_size); + // For small data sizes, distribute the work evenly among compute units. + std::size_t __iters_per_work_item = std::min(__elems_per_compute_unit, __max_iters_per_work_item); std::size_t __num_groups = - std::max(std::size_t{1}, - oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item))); + oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); std::size_t __num_items = __num_groups * __work_group_size; - // TODO: optimize for small data sizes that do not saturate the device with this scheme __cgh.parallel_for<_Name...>( sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), [=](sycl::nd_item __ndi) { - __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); - std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); - std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); - std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); - std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); - - std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + - __sub_group_size * __sub_group_id); - bool __is_full_sub_group = - __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; - std::size_t __idx = __sub_group_start_idx + __sub_group_local_id; - if (__is_full_sub_group) + auto [__idx, __stride, __is_full] = __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); + // TODO: Investigate using a vectorized approach similar to reduce. + // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but + // performance regressions for out-of-place (e.g. std::copy). 
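+                    // A "full" chunk owns exactly __iters_per_work_item strided elements, so the
+                    // loop below can run without bounds checks; the non-full path re-checks
+                    // __idx < __count on every iteration.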
+ if (__is_full) { - _ONEDPL_PRAGMA_UNROLL - for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) + for (std::uint16_t __i = 0; __i < __iters_per_work_item; ++__i) { __brick(__idx, __rngs...); - __idx += __sub_group_size; + __idx += __stride; } } else { - _ONEDPL_PRAGMA_UNROLL - for (std::uint32_t i = 0; i < __iters_per_work_item; ++i) + for (std::uint16_t __i = 0; __i < __iters_per_work_item; ++__i) { if (__idx < __count) { __brick(__idx, __rngs...); - __idx += __sub_group_size; + __idx += __stride; } } } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index f4eb557170e..a9c195368b6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -834,6 +834,40 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, } }; +// Utility to recommend a stride for the best-performing memory access pattern from empirical testing on different +// devices. This utility can only be called from the device. +// +// SPIR-V compilation targets show best performance with a stride of the sub-group size. +// Other compilation targets perform best with a work-group size stride. +template +std::tuple +__stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) +{ + if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) + { + __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); + std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); + + std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + + __sub_group_size * __sub_group_id); + bool __is_full_sub_group = + __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; + std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; + return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); + } + else + { + std::size_t __work_group_start_idx = __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; + std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id(); + bool __is_full_work_group = + __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; + return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); + } +} + } // namespace __par_backend_hetero } // namespace dpl } // namespace oneapi From ebb3d569ca6d91d31ab7705790751d70350b15bc Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 16 Sep 2024 14:27:52 -0700 Subject: [PATCH 06/65] Cleanup Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 22 +++++++++---------- .../dpcpp/parallel_backend_sycl_utils.h | 22 +++++++++---------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 88a62ffdc65..17eda57cd09 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -245,20 +245,20 @@ struct 
__parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 512); __work_group_size = std::min(__work_group_size, static_cast(__count)); - // Processing 512 bytes per sub-group has shown the best performance on target architectures. - // Grab the value type of the first range to estimate the optimal iters per work item. using _ValueType = oneapi::dpl::__internal::__value_t>>>; - constexpr std::size_t __bytes_per_work_item = 16; - constexpr std::size_t __max_iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_ValueType)); - auto __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); - std::size_t __elems_per_compute_unit = oneapi::dpl::__internal::__dpl_ceiling_div(__count, __max_cu * __work_group_size); + // Process up to 16 bytes per work-item. This results in 512 bytes loaded input range per size 32 sub-group which + // has yielded best performance on target architectures. For larger data types, load a single element. + constexpr std::uint8_t __bytes_per_work_item = 16; + constexpr std::uint8_t __max_iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_ValueType)); + const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); + const std::size_t __iters_per_compute_unit = oneapi::dpl::__internal::__dpl_ceiling_div(__count, __max_cu * __work_group_size); // For small data sizes, distribute the work evenly among compute units. - std::size_t __iters_per_work_item = std::min(__elems_per_compute_unit, __max_iters_per_work_item); - std::size_t __num_groups = + const std::uint8_t __iters_per_work_item = std::min(__iters_per_compute_unit, static_cast(__max_iters_per_work_item)); + const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); - std::size_t __num_items = __num_groups * __work_group_size; + const std::size_t __num_items = __num_groups * __work_group_size; __cgh.parallel_for<_Name...>( sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), [=](sycl::nd_item __ndi) { @@ -268,7 +268,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> // performance regressions for out-of-place (e.g. std::copy). 
if (__is_full) { - for (std::uint16_t __i = 0; __i < __iters_per_work_item; ++__i) + for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i) { __brick(__idx, __rngs...); __idx += __stride; @@ -276,7 +276,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> } else { - for (std::uint16_t __i = 0; __i < __iters_per_work_item; ++__i) + for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i) { if (__idx < __count) { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index a9c195368b6..1c18eeeda8f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -845,24 +845,24 @@ __stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __ite { if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) { - __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); - std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); - std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); - std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); - std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); + const __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); + const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + const std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); - std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + + const std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); - bool __is_full_sub_group = + const bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; - std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; + const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); } else { - std::size_t __work_group_start_idx = __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; - std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id(); - bool __is_full_work_group = + const std::size_t __work_group_start_idx = __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; + const std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id(); + const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); } From 2c4ecd0ddec719b9ec1e6fc8d97fe74499e56bfc Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 18 Sep 2024 12:52:47 -0700 Subject: [PATCH 07/65] Unroll loop if possible Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 17eda57cd09..c4e23190bf6 100644 --- 
a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -256,6 +256,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
         const std::size_t __iters_per_compute_unit = oneapi::dpl::__internal::__dpl_ceiling_div(__count, __max_cu * __work_group_size);
         // For small data sizes, distribute the work evenly among compute units.
         const std::uint8_t __iters_per_work_item = std::min(__iters_per_compute_unit, static_cast<std::size_t>(__max_iters_per_work_item));
+        const bool __can_unroll_loop = __max_iters_per_work_item == __iters_per_work_item;
         const std::size_t __num_groups =
             oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item));
         const std::size_t __num_items = __num_groups * __work_group_size;
@@ -266,7 +267,16 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
             // TODO: Investigate using a vectorized approach similar to reduce.
             // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but
             // performance regressions for out-of-place (e.g. std::copy).
-            if (__is_full)
+            if (__is_full && __can_unroll_loop)
+            {
+                _ONEDPL_PRAGMA_UNROLL
+                for (std::uint8_t __i = 0; __i < __max_iters_per_work_item; ++__i)
+                {
+                    __brick(__idx, __rngs...);
+                    __idx += __stride;
+                }
+            }
+            else if (__is_full)
             {
                 for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i)
                 {

From dc6bd0c6b94e81aeb9601ee0078c9d2a8c35402c Mon Sep 17 00:00:00 2001
From: Matthew Michel
Date: Wed, 18 Sep 2024 13:15:44 -0700
Subject: [PATCH 08/65] Revert "Unroll loop if possible"

This reverts commit e4cbcebf6ec43c2eced1a90124e6306883793da0. Small
sizes are slightly slower, and no real benefit is observed with
horizontal vectorization.
---
 .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index c4e23190bf6..17eda57cd09 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -256,7 +256,6 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
         const std::size_t __iters_per_compute_unit = oneapi::dpl::__internal::__dpl_ceiling_div(__count, __max_cu * __work_group_size);
         // For small data sizes, distribute the work evenly among compute units.
         const std::uint8_t __iters_per_work_item = std::min(__iters_per_compute_unit, static_cast<std::size_t>(__max_iters_per_work_item));
-        const bool __can_unroll_loop = __max_iters_per_work_item == __iters_per_work_item;
         const std::size_t __num_groups =
             oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item));
         const std::size_t __num_items = __num_groups * __work_group_size;
@@ -267,16 +266,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
             // TODO: Investigate using a vectorized approach similar to reduce.
             // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but
             // performance regressions for out-of-place (e.g. std::copy).
-            if (__is_full && __can_unroll_loop)
-            {
-                _ONEDPL_PRAGMA_UNROLL
-                for (std::uint8_t __i = 0; __i < __max_iters_per_work_item; ++__i)
-                {
-                    __brick(__idx, __rngs...);
-                    __idx += __stride;
-                }
-            }
-            else if (__is_full)
+            if (__is_full)
             {
                 for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i)
                 {

From d5126b2df69fbc482375c9fd01ff5edc5e3c60c8 Mon Sep 17 00:00:00 2001
From: Matthew Michel
Date: Thu, 19 Sep 2024 20:02:21 -0700
Subject: [PATCH 09/65] Use a small and large kernel in parallel for

Small but measurable overheads can be observed for small inputs where
runtime dispatch in the kernel is present to check for the correct path
to take. Letting the compiler handle the small input case in the
original kernel shows the best performance.

Signed-off-by: Matthew Michel
---
 .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 70 +++++++++++++++----
 1 file changed, 57 insertions(+), 13 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index 17eda57cd09..2e4869fdb6b 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -218,6 +218,12 @@ class __scan_single_wg_dynamic_kernel;
 template <typename... _Name>
 class __scan_copy_single_wg_kernel;
 
+template <typename... _Name>
+class __parallel_for_small_kernel;
+
+template <typename... _Name>
+class __parallel_for_large_kernel;
+
 //------------------------------------------------------------------------
 // parallel_for - async pattern
 //------------------------------------------------------------------------
@@ -226,10 +232,35 @@ class __scan_copy_single_wg_kernel;
 // as the parameter pack that can be empty (for unnamed kernels) or contain exactly one
 // type (for explicitly specified name by the user)
 template <typename _KernelName>
-struct __parallel_for_submitter;
+struct __parallel_for_small_submitter;
 
 template <typename... _Name>
-struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
+struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name...>>
+{
+    template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
+    auto
+    operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const
+    {
+        assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0);
+        _PRINT_INFO_IN_DEBUG_MODE(__exec);
+        auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) {
+            //get an access to data under SYCL buffer:
+            oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
+
+            __cgh.parallel_for<_Name...>(sycl::range</*dim=*/1>(__count), [=](sycl::item</*dim=*/1> __item_id) {
+                auto __idx = __item_id.get_linear_id();
+                __brick(__idx, __rngs...);
+            });
+        });
+        return __future(__event);
+    }
+};
+
+template <typename _KernelName>
+struct __parallel_for_large_submitter;
+
+template <typename... _Name>
+struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>>
 {
     template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
     auto
@@ -248,14 +279,13 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>>
             using _ValueType =
                 oneapi::dpl::__internal::__value_t<std::decay_t<std::tuple_element_t<0, std::tuple<_Ranges...>>>>;
 
-            // Process up to 16 bytes per work-item. This results in 512 bytes loaded input range per size 32 sub-group which
-            // has yielded best performance on target architectures. For larger data types, load a single element.
+            // Process up to 16 bytes per work-item per input range. This value has been the empirically determined minimum
+            // number of bytes for a single input range to saturate HW bandwidth on target architectures.
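+            // (With a size-32 sub-group, 16 bytes per work-item again amounts to 512 bytes per sub-group per range.)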
constexpr std::uint8_t __bytes_per_work_item = 16; - constexpr std::uint8_t __max_iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_ValueType)); - const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); - const std::size_t __iters_per_compute_unit = oneapi::dpl::__internal::__dpl_ceiling_div(__count, __max_cu * __work_group_size); - // For small data sizes, distribute the work evenly among compute units. - const std::uint8_t __iters_per_work_item = std::min(__iters_per_compute_unit, static_cast(__max_iters_per_work_item)); + // TODO: Better handle this heuristic for the case where the input is a zip iterator + constexpr std::uint8_t __iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_ValueType)); + const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); const std::size_t __num_items = __num_groups * __work_group_size; @@ -268,6 +298,7 @@ struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> // performance regressions for out-of-place (e.g. std::copy). if (__is_full) { + _ONEDPL_PRAGMA_UNROLL for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i) { __brick(__idx, __rngs...); @@ -299,10 +330,23 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& _Ranges&&... __rngs) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using _ForKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<_CustomName>; - - return __parallel_for_submitter<_ForKernel>()(::std::forward<_ExecutionPolicy>(__exec), __brick, __count, - ::std::forward<_Ranges>(__rngs)...); + using _ForKernelSmall = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_small_kernel<_CustomName>>; + using _ForKernelLarge = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_large_kernel<_CustomName>>; + + // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a single + // kernel that worsen performance for small cases. + if (__count <= 262144) + { + return __parallel_for_small_submitter<_ForKernelSmall>()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } + else + { + return __parallel_for_large_submitter<_ForKernelLarge>()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } } //------------------------------------------------------------------------ From 6433a5004234e56ae8132274537d3a02c1c5d6aa Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 20 Sep 2024 10:46:24 -0500 Subject: [PATCH 10/65] Improve __iters_per_work_item heuristic. We now flatten the user-provided ranges and find the minimum sized type to estimate the best __iters_per_work_item. This benefits performance in calls that wrap multiple buffers in a single input / output through a zip_iterator (e.g. dpct::scatter_if in SYCLomatic compatibility headers). 
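
As a sketch of the type computation (the element types here are
hypothetical; the traits are the ones added in this patch):

    // A zip of a float range and a std::uint8_t range flattens to
    // std::tuple<float, std::uint8_t>; the 1-byte minimum type then yields
    // __iters_per_work_item = 16 / 1 = 16.
    using _Flattened = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple<
        std::tuple<float, std::tuple<std::uint8_t>>>::type;
    using _Min = typename oneapi::dpl::__internal::__min_tuple_type<_Flattened>::type;
    static_assert(std::is_same_v<_Min, std::uint8_t>);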
Signed-off-by: Matthew Michel
---
 .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 59 ++++++++++++-------
 include/oneapi/dpl/pstl/tuple_impl.h          | 20 +++++++
 include/oneapi/dpl/pstl/utils.h               | 20 +++++++
 3 files changed, 77 insertions(+), 22 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index 2e4869fdb6b..65d358cb045 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -256,13 +256,35 @@ struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name..
     }
 };
 
-template <typename _KernelName>
+template <typename _KernelName, typename... _Ranges>
 struct __parallel_for_large_submitter;
 
-template <typename... _Name>
-struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>>
-{
-    template <typename _ExecutionPolicy, typename _Fp, typename _Index, typename... _Ranges>
+template <typename... _Name, typename... _Ranges>
+struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _Ranges...>
+{
+    static constexpr std::uint8_t __bytes_per_work_item = 16;
+    // Flatten the range as std::tuple value types in the range are likely coming from separate ranges in a zip
+    // iterator.
+    using _FlattenedRangesTuple = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple<
+        std::tuple<oneapi::dpl::__internal::__value_t<_Ranges>...>>::type;
+    using _MinValueType = typename oneapi::dpl::__internal::__min_tuple_type<_FlattenedRangesTuple>::type;
+    // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the
+    // flattened ranges. This allows us to launch enough work per item to saturate device memory.
+    static constexpr std::uint8_t __iters_per_work_item =
+        oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_MinValueType));
+
+    // Once there is enough work to launch a group on each compute unit with our __iters_per_item,
+    // then we should start using this code path.
+    template <typename _ExecutionPolicy>
+    static std::size_t
+    __estimate_best_start_size(const _ExecutionPolicy& __exec)
+    {
+        std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 512);
+        const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec);
+        return __work_group_size * __iters_per_work_item * __max_cu;
+    }
+
+    template <typename _ExecutionPolicy, typename _Fp, typename _Index>
     auto
     operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const
     {
@@ -274,25 +296,16 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name..
             //get an access to data under SYCL buffer:
             oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
 
             // Limit the work-group size to 512 which has empirically yielded the best results.
             std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 512);
-            __work_group_size = std::min(__work_group_size, static_cast<std::size_t>(__count));
-
-            using _ValueType =
-                oneapi::dpl::__internal::__value_t<std::decay_t<std::tuple_element_t<0, std::tuple<_Ranges...>>>>;
-
-            // Process up to 16 bytes per work-item per input range. This value has been the empirically determined minimum
-            // number of bytes for a single input range to saturate HW bandwidth on target architectures.
- constexpr std::uint8_t __bytes_per_work_item = 16; // TODO: Better handle this heuristic for the case where the input is a zip iterator - constexpr std::uint8_t __iters_per_work_item = - oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_ValueType)); - const std::size_t __num_groups = - oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); + oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); const std::size_t __num_items = __num_groups * __work_group_size; __cgh.parallel_for<_Name...>( sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), [=](sycl::nd_item __ndi) { - auto [__idx, __stride, __is_full] = __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); + auto [__idx, __stride, __is_full] = + __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); // TODO: Investigate using a vectorized approach similar to reduce. // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but // performance regressions for out-of-place (e.g. std::copy). @@ -335,17 +348,19 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& using _ForKernelLarge = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_large_kernel<_CustomName>>; + using __small_submitter = __parallel_for_small_submitter<_ForKernelSmall>; + using __large_submitter = __parallel_for_large_submitter<_ForKernelLarge, _Ranges...>; // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a single // kernel that worsen performance for small cases. - if (__count <= 262144) + if (__count < __large_submitter::__estimate_best_start_size(__exec)) { - return __parallel_for_small_submitter<_ForKernelSmall>()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, - std::forward<_Ranges>(__rngs)...); + return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); } else { - return __parallel_for_large_submitter<_ForKernelLarge>()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, - std::forward<_Ranges>(__rngs)...); + return __large_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); } } diff --git a/include/oneapi/dpl/pstl/tuple_impl.h b/include/oneapi/dpl/pstl/tuple_impl.h index 239734d4861..0c528b0d15e 100644 --- a/include/oneapi/dpl/pstl/tuple_impl.h +++ b/include/oneapi/dpl/pstl/tuple_impl.h @@ -793,6 +793,26 @@ struct __decay_with_tuple_specialization<::std::tuple<_Args...>> template using __decay_with_tuple_specialization_t = typename __decay_with_tuple_specialization<_Args...>::type; + +// Flatten nested std::tuple or oneapi::dpl::__internal::tuple types into a single std::tuple. 
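+// For example, std::tuple<int, oneapi::dpl::__internal::tuple<char, double>> flattens to
+// std::tuple<int, char, double>.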
+template +struct __flatten_std_or_internal_tuple +{ + using type = std::tuple<_T>; +}; + +template +struct __flatten_std_or_internal_tuple> +{ + using type = decltype(std::tuple_cat(std::declval::type>()...)); +}; + +template +struct __flatten_std_or_internal_tuple> +{ + using type = decltype(std::tuple_cat(std::declval::type>()...)); +}; + } // namespace __internal } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index 8a8dfdae1bc..1168dc76586 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -784,6 +784,26 @@ union __lazy_ctor_storage } }; +// Utility that returns the smallest type in tuple. +template +class __min_tuple_type; + +template +class __min_tuple_type> +{ + public: + using type = _T; +}; + +template +class __min_tuple_type> +{ + using __min_type_ts = typename __min_tuple_type>::type; + + public: + using type = std::conditional_t<(sizeof(_T) < sizeof(__min_type_ts)), _T, __min_type_ts>; +}; + } // namespace __internal } // namespace dpl } // namespace oneapi From d376124b827f2232bf0600b9efe8b5236d074ff8 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 20 Sep 2024 16:00:45 -0500 Subject: [PATCH 11/65] Code cleanup Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 22 ++++++++++--------- .../dpcpp/parallel_backend_sycl_utils.h | 16 +++++++------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 65d358cb045..04c1c3a07c6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -262,24 +262,29 @@ struct __parallel_for_large_submitter; template struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _Ranges...> { - static constexpr std::uint8_t __bytes_per_work_item = 16; // Flatten the range as std::tuple value types in the range are likely coming from separate ranges in a zip // iterator. using _FlattenedRangesTuple = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple< std::tuple...>>::type; using _MinValueType = typename oneapi::dpl::__internal::__min_tuple_type<_FlattenedRangesTuple>::type; // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the - // flattened ranges. This allows us to launch enough work per item to saturate device memory. + // flattened ranges. This allows us to launch enough work per item to saturate the device's memory + // bandwidth. This heuristic errs on the side of launching more work per item than what is needed to + // achieve full bandwidth utilization for algorithms that have multiple ranges as this has shown the + // best general performance. + static constexpr std::uint8_t __bytes_per_work_item = 16; static constexpr std::uint8_t __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_MinValueType)); + // Limit the work-group size to 512 which has empirically yielded the best results across different architectures. + static constexpr std::uint16_t __max_work_group_size = 512; - // Once there is enough work to launch a group on each compute unit with our __iters_per_item, + // Once there is enough work to launch a group on each compute unit with our chosen __iters_per_item, // then we should start using this code path. 
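+    // i.e. the large submitter is preferred once __count reaches
+    // __work_group_size * __iters_per_work_item * __max_compute_units.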
template static std::size_t __estimate_best_start_size(const _ExecutionPolicy& __exec) { - std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 512); + std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); return __work_group_size * __iters_per_work_item * __max_cu; } @@ -294,10 +299,7 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. //get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - // Limit the work-group size to 512 which has empirically yielded the best results. - std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, 512); - - // TODO: Better handle this heuristic for the case where the input is a zip iterator + std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); const std::size_t __num_items = __num_groups * __work_group_size; @@ -368,7 +370,7 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& // parallel_transform_scan - async pattern //------------------------------------------------------------------------ -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment for __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_scan_submitter; @@ -2284,7 +2286,7 @@ struct __partial_merge_kernel } }; -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment for __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_partial_sort_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 1c18eeeda8f..0b382b5e248 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -841,7 +841,8 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // Other compilation targets perform best with a work-group size stride. 
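+// For example, with __iters_per_work_item == 4 a work-item processes __idx, __idx + __stride,
+// __idx + 2 * __stride, and __idx + 3 * __stride, so neighboring work-items always touch
+// adjacent elements within each strided pass.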
template std::tuple -__stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) +__stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __iters_per_work_item, + std::size_t __work_group_size) { if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) { @@ -851,19 +852,18 @@ __stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __ite const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); const std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); - const std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + - __sub_group_size * __sub_group_id); - const bool __is_full_sub_group = - __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; + const std::size_t __sub_group_start_idx = + __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); + const bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); } else { - const std::size_t __work_group_start_idx = __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; + const std::size_t __work_group_start_idx = + __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; const std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id(); - const bool __is_full_work_group = - __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; + const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); } } From a7c7606af0985fff5caf66862fcaf23b4df57905 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 23 Sep 2024 13:29:09 -0500 Subject: [PATCH 12/65] Clang format Signed-off-by: Matthew Michel --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 3 ++- include/oneapi/dpl/pstl/tuple_impl.h | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 04c1c3a07c6..cc79df785b9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -299,7 +299,8 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. 
//get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); + std::size_t __work_group_size = + oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); const std::size_t __num_items = __num_groups * __work_group_size; diff --git a/include/oneapi/dpl/pstl/tuple_impl.h b/include/oneapi/dpl/pstl/tuple_impl.h index 0c528b0d15e..c758a4a3f1b 100644 --- a/include/oneapi/dpl/pstl/tuple_impl.h +++ b/include/oneapi/dpl/pstl/tuple_impl.h @@ -793,12 +793,11 @@ struct __decay_with_tuple_specialization<::std::tuple<_Args...>> template using __decay_with_tuple_specialization_t = typename __decay_with_tuple_specialization<_Args...>::type; - // Flatten nested std::tuple or oneapi::dpl::__internal::tuple types into a single std::tuple. template struct __flatten_std_or_internal_tuple { - using type = std::tuple<_T>; + using type = std::tuple<_T>; }; template From b8aa15cadbcf8a02986ab4a18ba39baa69865f4e Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 23 Sep 2024 14:02:21 -0500 Subject: [PATCH 13/65] Update comments Signed-off-by: Matthew Michel --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index cc79df785b9..4fe18421543 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -270,8 +270,8 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the // flattened ranges. This allows us to launch enough work per item to saturate the device's memory // bandwidth. This heuristic errs on the side of launching more work per item than what is needed to - // achieve full bandwidth utilization for algorithms that have multiple ranges as this has shown the - // best general performance. + // achieve full bandwidth utilization. 16 bytes per input range per work item has been found as a good + // value across the different for-based algorithms. static constexpr std::uint8_t __bytes_per_work_item = 16; static constexpr std::uint8_t __iters_per_work_item = oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_MinValueType)); @@ -309,9 +309,10 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. [=](sycl::nd_item __ndi) { auto [__idx, __stride, __is_full] = __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); - // TODO: Investigate using a vectorized approach similar to reduce. + // TODO: Investigate adding a vectorized path similar to reduce. // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but - // performance regressions for out-of-place (e.g. std::copy). + // performance regressions for out-of-place (e.g. std::copy) where the compiler was unable to + // vectorize our code. 
if (__is_full) { _ONEDPL_PRAGMA_UNROLL From b45a7c2f6dfe2ad8b59bf8cb1589c0cfcc6e2c9f Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 23 Sep 2024 14:05:09 -0500 Subject: [PATCH 14/65] Bugfix in comment Signed-off-by: Matthew Michel --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 4fe18421543..6613606800a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -270,7 +270,7 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the // flattened ranges. This allows us to launch enough work per item to saturate the device's memory // bandwidth. This heuristic errs on the side of launching more work per item than what is needed to - // achieve full bandwidth utilization. 16 bytes per input range per work item has been found as a good + // achieve full bandwidth utilization. 16 bytes per range per work item has been found as a good // value across the different for-based algorithms. static constexpr std::uint8_t __bytes_per_work_item = 16; static constexpr std::uint8_t __iters_per_work_item = From 4f9a3606ea95af41b8a8b5fa443fa3111a91e854 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 23 Sep 2024 15:10:06 -0500 Subject: [PATCH 15/65] More cleanup and better handle non-full case Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 31 ++++++++++++------- .../dpcpp/parallel_backend_sycl_utils.h | 6 ++-- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 6613606800a..ab96e782d7a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -284,7 +284,8 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. static std::size_t __estimate_best_start_size(const _ExecutionPolicy& __exec) { - std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); + const std::size_t __work_group_size = + oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); return __work_group_size * __iters_per_work_item * __max_cu; } @@ -298,8 +299,7 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. auto __event = __exec.queue().submit([&__rngs..., &__brick, &__exec, __count](sycl::handler& __cgh) { //get an access to data under SYCL buffer: oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - - std::size_t __work_group_size = + const std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * __iters_per_work_item)); @@ -307,12 +307,13 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. 
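+            // __num_items is rounded up to a whole number of work-groups; the kernel's non-full
+            // path handles the final partial chunk.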
__cgh.parallel_for<_Name...>( sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), [=](sycl::nd_item __ndi) { - auto [__idx, __stride, __is_full] = - __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); // TODO: Investigate adding a vectorized path similar to reduce. // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but // performance regressions for out-of-place (e.g. std::copy) where the compiler was unable to - // vectorize our code. + // vectorize our code. Vectorization may also improve performance of for-algorithms over small data + // types. + auto [__idx, __group_start_idx, __stride, __is_full] = + __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); if (__is_full) { _ONEDPL_PRAGMA_UNROLL @@ -324,13 +325,19 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. } else { - for (std::uint8_t __i = 0; __i < __iters_per_work_item; ++__i) + // Recompute iters per item and manually unroll last loop iteration to remove most branching. + if (__group_start_idx >= __count) + return; + const std::uint8_t __adjusted_iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__count - __group_start_idx, __stride); + for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item - 1; ++__i) { - if (__idx < __count) - { - __brick(__idx, __rngs...); - __idx += __stride; - } + __brick(__idx, __rngs...); + __idx += __stride; + } + if (__idx < __count) + { + __brick(__idx, __rngs...); } } }); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 0b382b5e248..39f5052382a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -840,7 +840,7 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // SPIR-V compilation targets show best performance with a stride of the sub-group size. // Other compilation targets perform best with a work-group size stride. 
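Concretely, on a SPIR-V target with a sub-group size of 32 and __iters_per_work_item = 4, the work item with sub-group local id j in a sub-group whose block starts at index S touches S + j, S + j + 32, S + j + 64, and S + j + 96. On each pass the 32 items of a sub-group therefore access 32 adjacent elements, which is what keeps each load / store transaction coalesced; the other branch applies the same pattern at work-group granularity.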
template -std::tuple +std::tuple __stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) { @@ -856,7 +856,7 @@ __stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __ite __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); const bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; - return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); + return std::make_tuple(__work_item_idx, __sub_group_start_idx, __sub_group_size, __is_full_sub_group); } else { @@ -864,7 +864,7 @@ __stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __ite __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; const std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id(); const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; - return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); + return std::make_tuple(__work_item_idx, __work_group_start_idx, __work_group_size, __is_full_work_group); } } From 7bb1d2b81d11161cbce2009318559fba18fa0a63 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 08:38:40 -0500 Subject: [PATCH 16/65] Rename __ndi to __item for consistency with codebase Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 ++-- .../pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ab96e782d7a..6c0cf593f3c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -306,14 +306,14 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. const std::size_t __num_items = __num_groups * __work_group_size; __cgh.parallel_for<_Name...>( sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)), - [=](sycl::nd_item __ndi) { + [=](sycl::nd_item __item) { // TODO: Investigate adding a vectorized path similar to reduce. // Initial investigation showed benefits for in-place for-based algorithms (e.g. std::for_each) but // performance regressions for out-of-place (e.g. std::copy) where the compiler was unable to // vectorize our code. Vectorization may also improve performance of for-algorithms over small data // types. 
auto [__idx, __group_start_idx, __stride, __is_full] = - __stride_recommender(__ndi, __count, __iters_per_work_item, __work_group_size); + __stride_recommender(__item, __count, __iters_per_work_item, __work_group_size); if (__is_full) { _ONEDPL_PRAGMA_UNROLL diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 39f5052382a..fc98cc86db2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -839,18 +839,18 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // // SPIR-V compilation targets show best performance with a stride of the sub-group size. // Other compilation targets perform best with a work-group size stride. -template +template std::tuple -__stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __iters_per_work_item, +__stride_recommender(const _NdItem& __item, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) { if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) { - const __dpl_sycl::__sub_group __sub_group = __ndi.get_sub_group(); + const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group(); const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); - const std::size_t __work_group_id = __ndi.get_group().get_group_linear_id(); + const std::size_t __work_group_id = __item.get_group().get_group_linear_id(); const std::size_t __sub_group_start_idx = __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); @@ -861,8 +861,8 @@ __stride_recommender(const NdItem& __ndi, std::size_t __count, std::size_t __ite else { const std::size_t __work_group_start_idx = - __ndi.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; - const std::size_t __work_item_idx = __work_group_start_idx + __ndi.get_local_linear_id(); + __item.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; + const std::size_t __work_item_idx = __work_group_start_idx + __item.get_local_linear_id(); const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; return std::make_tuple(__work_item_idx, __work_group_start_idx, __work_group_size, __is_full_work_group); } From a2ad92041e603f69cd075c6d8f08142038fbc2e4 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 08:45:52 -0500 Subject: [PATCH 17/65] Update all comments on kernel naming trick Signed-off-by: Matthew Michel --- .../experimental/kt/internal/esimd_radix_sort_submitters.h | 2 +- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 ++-- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h | 2 +- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h | 2 +- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h index 4d7b81e6a2e..4fc274f2445 100644 --- a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h +++ 
b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h @@ -27,7 +27,7 @@ namespace oneapi::dpl::experimental::kt::gpu::esimd::__impl { //------------------------------------------------------------------------ -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation //------------------------------------------------------------------------ template struct __parallel_scan_submitter; @@ -2295,7 +2295,7 @@ struct __partial_merge_kernel } }; -// Please see the comment for __parallel_for_small_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_partial_sort_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h index 7baee78b1b1..3be82fdc623 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h @@ -48,7 +48,7 @@ namespace __par_backend_hetero //General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, //for some algorithms happens that size of processing range is n, but amount of iterations is n/2. -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_for_fpga_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index cadff26a15d..9c331148f3c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -112,7 +112,7 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, const _I } } -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_merge_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h index edad63d2a79..23e38268bf9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h @@ -111,7 +111,7 @@ __device_reduce_kernel(const _NDItemId __item_id, const _Size __n, const _Size _ //------------------------------------------------------------------------ // parallel_transform_reduce - async patterns -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation //------------------------------------------------------------------------ // Parallel_transform_reduce for a small arrays using a single work group. 
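Stepping back from the renames above, the index arithmetic of __stride_recommender can be modeled on the host. This is a sketch with assumed standalone signatures, reduced to the essential index, stride, and fullness flag; the real utility is a device-side function driven by a sycl::nd_item:

    #include <cstddef>
    #include <tuple>

    // Non-SPIR-V branch: each work group owns a contiguous block of
    // work_group_size * iters elements and strides through it by work_group_size.
    std::tuple<std::size_t, std::size_t, bool>
    work_group_stride(std::size_t group_id, std::size_t local_id,
                      std::size_t work_group_size, std::size_t iters, std::size_t count)
    {
        const std::size_t group_start = group_id * work_group_size * iters;
        const bool is_full = group_start + iters * work_group_size <= count;
        return std::make_tuple(group_start + local_id, work_group_size, is_full);
    }

    // SPIR-V branch: the owned block and the stride shrink to sub-group granularity.
    std::tuple<std::size_t, std::size_t, bool>
    sub_group_stride(std::size_t group_id, std::size_t sub_group_id,
                     std::size_t sub_group_local_id, std::size_t sub_group_size,
                     std::size_t work_group_size, std::size_t iters, std::size_t count)
    {
        const std::size_t sub_group_start =
            iters * (group_id * work_group_size + sub_group_size * sub_group_id);
        const bool is_full = sub_group_start + iters * sub_group_size <= count;
        return std::make_tuple(sub_group_start + sub_group_local_id, sub_group_size, is_full);
    }

In both branches the returned flag is true only when the full block fits below count, which is what lets the kernel skip bounds checks on the hot path.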
From 47fe214c25cde24305bd118a689dbdba1a385e4b Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 09:29:54 -0500 Subject: [PATCH 18/65] Handle non-full case in a cleaner way Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 16 +++++----------- .../hetero/dpcpp/parallel_backend_sycl_utils.h | 6 +++--- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ba4a4537da2..282b787448a 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -312,7 +312,7 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // performance regressions for out-of-place (e.g. std::copy) where the compiler was unable to // vectorize our code. Vectorization may also improve performance of for-algorithms over small data // types. - auto [__idx, __group_start_idx, __stride, __is_full] = + auto [__idx, __stride, __is_full] = __stride_recommender(__item, __count, __iters_per_work_item, __work_group_size); if (__is_full) { @@ -323,22 +323,16 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. __idx += __stride; } } - else + // If we are not full, then take this branch only if there is work to process. + else if (__idx < __count) { - // Recompute iters per item and manually unroll last loop iteration to remove most branching. - if (__group_start_idx >= __count) - return; const std::uint8_t __adjusted_iters_per_work_item = - oneapi::dpl::__internal::__dpl_ceiling_div(__count - __group_start_idx, __stride); - for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item - 1; ++__i) + oneapi::dpl::__internal::__dpl_ceiling_div(__count - __idx, __stride); + for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i) { __brick(__idx, __rngs...); __idx += __stride; } - if (__idx < __count) - { - __brick(__idx, __rngs...); - } } }); }); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index fc98cc86db2..c447f9b592f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -840,7 +840,7 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // SPIR-V compilation targets show best performance with a stride of the sub-group size. // Other compilation targets perform best with a work-group size stride. 
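As a worked instance of the remainder branch introduced just above: with assumed values __count = 1000 and __stride = 32, a work item entering at __idx = 940 computes __adjusted_iters_per_work_item = ceil((1000 - 940) / 32) = 2, processes indices 940 and 972, and stops before reaching 1004, with no per-iteration bounds check left inside the loop.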
template -std::tuple +std::tuple __stride_recommender(const _NdItem& __item, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) { @@ -856,7 +856,7 @@ __stride_recommender(const _NdItem& __item, std::size_t __count, std::size_t __i __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); const bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; - return std::make_tuple(__work_item_idx, __sub_group_start_idx, __sub_group_size, __is_full_sub_group); + return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); } else { @@ -864,7 +864,7 @@ __stride_recommender(const _NdItem& __item, std::size_t __count, std::size_t __i __item.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; const std::size_t __work_item_idx = __work_group_start_idx + __item.get_local_linear_id(); const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; - return std::make_tuple(__work_item_idx, __work_group_start_idx, __work_group_size, __is_full_work_group); + return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); } } From 79a18e9545c84c1727f283f8fd358dbacc7f9da6 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 11:17:14 -0500 Subject: [PATCH 19/65] Switch min tuple type utility to return size of type Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 5 +++-- include/oneapi/dpl/pstl/utils.h | 17 ++++++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 282b787448a..dd963a5e591 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -266,7 +266,8 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // iterator. using _FlattenedRangesTuple = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple< std::tuple...>>::type; - using _MinValueType = typename oneapi::dpl::__internal::__min_tuple_type<_FlattenedRangesTuple>::type; + static constexpr std::size_t __min_type_size = + oneapi::dpl::__internal::__min_tuple_type_size_v<_FlattenedRangesTuple>; // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the // flattened ranges. This allows us to launch enough work per item to saturate the device's memory // bandwidth. This heuristic errs on the side of launching more work per item than what is needed to @@ -274,7 +275,7 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // value across the different for-based algorithms. static constexpr std::uint8_t __bytes_per_work_item = 16; static constexpr std::uint8_t __iters_per_work_item = - oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, sizeof(_MinValueType)); + oneapi::dpl::__internal::__dpl_ceiling_div(__bytes_per_work_item, __min_type_size); // Limit the work-group size to 512 which has empirically yielded the best results across different architectures. 
static constexpr std::uint16_t __max_work_group_size = 512; diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index 1168dc76586..10d60d8c5d6 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -784,26 +784,29 @@ union __lazy_ctor_storage } }; -// Utility that returns the smallest type in tuple. +// Utility that returns the smallest type size in a tuple. template -class __min_tuple_type; +class __min_tuple_type_size; template -class __min_tuple_type> +class __min_tuple_type_size> { public: - using type = _T; + static constexpr std::size_t value = sizeof(_T); }; template -class __min_tuple_type> +class __min_tuple_type_size> { - using __min_type_ts = typename __min_tuple_type>::type; + static constexpr std::size_t __min_type_value_ts = __min_tuple_type_size>::value; public: - using type = std::conditional_t<(sizeof(_T) < sizeof(__min_type_ts)), _T, __min_type_ts>; + static constexpr std::size_t value = std::min(sizeof(_T), __min_type_value_ts); }; +template +inline constexpr std::size_t __min_tuple_type_size_v = __min_tuple_type_size<_Tuple>::value; + } // namespace __internal } // namespace dpl } // namespace oneapi From 3ab8c75aeab4a05becd8da49879ff0003f6701e3 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 11:59:53 -0500 Subject: [PATCH 20/65] Remove unnecessary template parameter Signed-off-by: Matthew Michel --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index c447f9b592f..77cc8ad1671 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -839,9 +839,8 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // // SPIR-V compilation targets show best performance with a stride of the sub-group size. // Other compilation targets perform best with a work-group size stride. -template std::tuple -__stride_recommender(const _NdItem& __item, std::size_t __count, std::size_t __iters_per_work_item, +__stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) { if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) From 4a70fe2155c3bc100bca7e47c4a3047a01f32291 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 13:07:30 -0500 Subject: [PATCH 21/65] Make non-template function inline for ODR compliance Signed-off-by: Matthew Michel --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 77cc8ad1671..685b4760daa 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -839,7 +839,7 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // // SPIR-V compilation targets show best performance with a stride of the sub-group size. // Other compilation targets perform best with a work-group size stride. 
-std::tuple +inline std::tuple __stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item, std::size_t __work_group_size) { From 5530209445e7a824b70a2d8201edcbaf6a8cce44 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 24 Sep 2024 14:21:40 -0700 Subject: [PATCH 22/65] If the iters per work item is 1, then only compile the basic pfor kernel Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index dd963a5e591..aee0c73fdf6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -356,16 +356,25 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& using __small_submitter = __parallel_for_small_submitter<_ForKernelSmall>; using __large_submitter = __parallel_for_large_submitter<_ForKernelLarge, _Ranges...>; - // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a single - // kernel that worsen performance for small cases. - if (__count < __large_submitter::__estimate_best_start_size(__exec)) + // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a + // single kernel that worsen performance for small cases. If the number of iterations of the large submitter is 1, + // then only compile the basic kernel as the two versions are effectively the same. + if constexpr (__large_submitter::__iters_per_work_item > 1) { - return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, - std::forward<_Ranges>(__rngs)...); + if (__count < __large_submitter::__estimate_best_start_size(__exec)) + { + return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } + else + { + return __large_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } } else { - return __large_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, std::forward<_Ranges>(__rngs)...); } } From 90f19d4c62facaa3211e8af12826ef3ea036b861 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 25 Sep 2024 09:11:44 -0500 Subject: [PATCH 23/65] Address several PR comments * Move __stride_recommender into __parallel_for_large_submitter * Use {} to invoke constructor * Simplify if-else statements in for dispatch Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 49 ++++++++++++++----- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index aee0c73fdf6..30f983c8a72 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -279,6 +279,39 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. // Limit the work-group size to 512 which has empirically yielded the best results across different architectures. 
static constexpr std::uint16_t __max_work_group_size = 512; + // SPIR-V compilation targets show best performance with a stride of the sub-group size. + // Other compilation targets perform best with a work-group size stride. This utility can only be called from the + // device. + static inline std::tuple + __stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item, + std::size_t __work_group_size) + { + if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) + { + const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group(); + const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + const std::size_t __work_group_id = __item.get_group().get_group_linear_id(); + + const std::size_t __sub_group_start_idx = + __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); + const bool __is_full_sub_group = + __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; + const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; + return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); + } + else + { + const std::size_t __work_group_start_idx = + __item.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; + const std::size_t __work_item_idx = __work_group_start_idx + __item.get_local_linear_id(); + const bool __is_full_work_group = + __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; + return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); + } + } + // Once there is enough work to launch a group on each compute unit with our chosen __iters_per_item, // then we should start using this code path. template @@ -361,22 +394,14 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& // then only compile the basic kernel as the two versions are effectively the same. 
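The dispatch being simplified in this patch can be modeled in isolation. A sketch with invented stand-ins (the real submitters are templates taking policies and ranges, and the threshold comes from __estimate_best_start_size):

    #include <cstddef>
    #include <cstdint>

    struct small_submitter
    {
        void operator()(std::size_t /*count*/) const { /* basic one-element-per-item kernel */ }
    };

    struct large_submitter
    {
        static constexpr std::uint8_t iters_per_work_item = 4;             // assumed heuristic result
        static std::size_t estimate_best_start_size() { return 1u << 20; } // assumed threshold
        void operator()(std::size_t /*count*/) const { /* strided multi-element kernel */ }
    };

    void parallel_for_dispatch(std::size_t count)
    {
        // When iters_per_work_item == 1 the large path degenerates into the small one;
        // in the templated submitter the `if constexpr` prunes that branch so only one
        // kernel is instantiated.
        if constexpr (large_submitter::iters_per_work_item > 1)
        {
            if (count >= large_submitter::estimate_best_start_size())
            {
                large_submitter{}(count);
                return;
            }
        }
        small_submitter{}(count);
    }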
if constexpr (__large_submitter::__iters_per_work_item > 1) { - if (__count < __large_submitter::__estimate_best_start_size(__exec)) + if (__count >= __large_submitter::__estimate_best_start_size(__exec)) { - return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + return __large_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count, std::forward<_Ranges>(__rngs)...); } - else - { - return __large_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, - std::forward<_Ranges>(__rngs)...); - } - } - else - { - return __small_submitter()(std::forward<_ExecutionPolicy>(__exec), __brick, __count, - std::forward<_Ranges>(__rngs)...); } + return __small_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); } //------------------------------------------------------------------------ From 1ac65b927db4944fca994c30240157e7fb201434 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 25 Sep 2024 09:19:31 -0500 Subject: [PATCH 24/65] Remove free function __stride_recommender Signed-off-by: Matthew Michel --- .../dpcpp/parallel_backend_sycl_utils.h | 33 ------------------- 1 file changed, 33 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 685b4760daa..f4eb557170e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -834,39 +834,6 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, } }; -// Utility to recommend a stride for the best-performing memory access pattern from empirical testing on different -// devices. This utility can only be called from the device. -// -// SPIR-V compilation targets show best performance with a stride of the sub-group size. -// Other compilation targets perform best with a work-group size stride. 
-inline std::tuple -__stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item, - std::size_t __work_group_size) -{ - if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) - { - const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group(); - const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); - const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); - const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); - const std::size_t __work_group_id = __item.get_group().get_group_linear_id(); - - const std::size_t __sub_group_start_idx = - __iters_per_work_item * (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); - const bool __is_full_sub_group = __sub_group_start_idx + __iters_per_work_item * __sub_group_size <= __count; - const std::size_t __work_item_idx = __sub_group_start_idx + __sub_group_local_id; - return std::make_tuple(__work_item_idx, __sub_group_size, __is_full_sub_group); - } - else - { - const std::size_t __work_group_start_idx = - __item.get_group().get_group_linear_id() * __work_group_size * __iters_per_work_item; - const std::size_t __work_item_idx = __work_group_start_idx + __item.get_local_linear_id(); - const bool __is_full_work_group = __work_group_start_idx + __iters_per_work_item * __work_group_size <= __count; - return std::make_tuple(__work_item_idx, __work_group_size, __is_full_work_group); - } -} - } // namespace __par_backend_hetero } // namespace dpl } // namespace oneapi From 6a5a562aaa3140b55e7b93752658104c487ee5ef Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 25 Sep 2024 13:17:30 -0500 Subject: [PATCH 25/65] Accept ranges as forwarding references in __parallel_for_large_submitter Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 30f983c8a72..8798b50b126 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -256,16 +256,16 @@ struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name.. } }; -template +template struct __parallel_for_large_submitter; -template -struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _Ranges...> +template +struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _RangeTypes...> { // Flatten the range as std::tuple value types in the range are likely coming from separate ranges in a zip // iterator. using _FlattenedRangesTuple = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple< - std::tuple...>>::type; + std::tuple...>>::type; static constexpr std::size_t __min_type_size = oneapi::dpl::__internal::__min_tuple_type_size_v<_FlattenedRangesTuple>; // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the @@ -324,7 +324,7 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name.. return __work_group_size * __iters_per_work_item * __max_cu; } - template + template auto operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... 
__rngs) const { From 357032f663e39a91d2fe3e8943f9a3d5465f4c87 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 6 Nov 2024 13:44:38 -0600 Subject: [PATCH 26/65] Address reviewer comments Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 9 +++---- include/oneapi/dpl/pstl/tuple_impl.h | 19 -------------- include/oneapi/dpl/pstl/utils.h | 25 +++++++------------ 3 files changed, 12 insertions(+), 41 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 8798b50b126..d029aea0de7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -28,6 +28,7 @@ #include #include #include +#include #include "../../iterator_impl.h" #include "../../execution_impl.h" @@ -262,12 +263,8 @@ struct __parallel_for_large_submitter; template struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _RangeTypes...> { - // Flatten the range as std::tuple value types in the range are likely coming from separate ranges in a zip - // iterator. - using _FlattenedRangesTuple = typename oneapi::dpl::__internal::__flatten_std_or_internal_tuple< - std::tuple...>>::type; - static constexpr std::size_t __min_type_size = - oneapi::dpl::__internal::__min_tuple_type_size_v<_FlattenedRangesTuple>; + static constexpr std::size_t __min_type_size = oneapi::dpl::__internal::__min_nested_type_size< + std::tuple...>>::value; // __iters_per_work_item is set to 1, 2, 4, 8, or 16 depending on the smallest type in the // flattened ranges. This allows us to launch enough work per item to saturate the device's memory // bandwidth. This heuristic errs on the side of launching more work per item than what is needed to diff --git a/include/oneapi/dpl/pstl/tuple_impl.h b/include/oneapi/dpl/pstl/tuple_impl.h index c758a4a3f1b..239734d4861 100644 --- a/include/oneapi/dpl/pstl/tuple_impl.h +++ b/include/oneapi/dpl/pstl/tuple_impl.h @@ -793,25 +793,6 @@ struct __decay_with_tuple_specialization<::std::tuple<_Args...>> template using __decay_with_tuple_specialization_t = typename __decay_with_tuple_specialization<_Args...>::type; -// Flatten nested std::tuple or oneapi::dpl::__internal::tuple types into a single std::tuple. -template -struct __flatten_std_or_internal_tuple -{ - using type = std::tuple<_T>; -}; - -template -struct __flatten_std_or_internal_tuple> -{ - using type = decltype(std::tuple_cat(std::declval::type>()...)); -}; - -template -struct __flatten_std_or_internal_tuple> -{ - using type = decltype(std::tuple_cat(std::declval::type>()...)); -}; - } // namespace __internal } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index 10d60d8c5d6..1848d33eaea 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -25,6 +25,7 @@ #include #include #include +#include #if _ONEDPL_BACKEND_SYCL # include "hetero/dpcpp/sycl_defs.h" @@ -784,29 +785,21 @@ union __lazy_ctor_storage } }; -// Utility that returns the smallest type size in a tuple. -template -class __min_tuple_type_size; - +// Returns the smallest type within a set of potentially nested template types. +// E.g. If we consider the type: T = tuple, int, double>, +// then __min_nested_type_size::value returns sizeof(short). 
 template <typename _T>
-class __min_tuple_type_size<std::tuple<_T>>
+struct __min_nested_type_size
 {
-  public:
-    static constexpr std::size_t value = sizeof(_T);
+    constexpr static std::size_t value = sizeof(_T);
 };

-template <typename _T, typename... _Ts>
+template <template <typename...> typename _WrapperType, typename... _Ts>
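To check the recursion this final hunk is building toward, here is a standalone re-creation of the nested-minimum utility (assumed names, mirroring the behavior the new comment describes with T = std::tuple<std::tuple<short, long>, int, double>):

    #include <algorithm>
    #include <cstddef>
    #include <tuple>

    // Primary template: a non-template type contributes its own size.
    template <typename T>
    struct min_nested_type_size
    {
        static constexpr std::size_t value = sizeof(T);
    };

    // Any template wrapper (std::tuple, the internal tuple, etc.) contributes
    // the minimum over its nested type arguments, recursively.
    template <template <typename...> typename Wrapper, typename... Ts>
    struct min_nested_type_size<Wrapper<Ts...>>
    {
        static constexpr std::size_t value = std::min({min_nested_type_size<Ts>::value...});
    };

    // The smallest nested type is short, so the recursion bottoms out at sizeof(short).
    static_assert(min_nested_type_size<std::tuple<std::tuple<short, long>, int, double>>::value ==
                  sizeof(short));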