diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h
index 6cd548eeda..d91f787d56 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h
@@ -192,8 +192,20 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&&
    {
        if (__count >= __large_submitter::__estimate_best_start_size(__exec, __brick))
        {
-            return __large_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
-                                       std::forward<_Ranges>(__rngs)...);
+            // Passing begin() of each range is needed for the icpx compiler to vectorize. The indirection introduced
+            // by our all / guard views interferes with compiler vectorization. At this point, we have ensured that
+            // input is contiguous and can be operated on directly. The begin() function for these views will return a
+            // pointer which is passed to the kernel.
+            if constexpr (_Fp::__can_vectorize)
+            {
+                return __large_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
+                                           std::forward<_Ranges>(__rngs).begin()...);
+            }
+            else
+            {
+                return __large_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
+                                           std::forward<_Ranges>(__rngs)...);
+            }
        }
    }
    return __small_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count,
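To see why the vectorized branch above passes begin() instead of the view itself, here is a minimal, standalone sketch. The toy_view type and both functions are hypothetical illustrations, not oneDPL's all / guard views: the point is only that a loop over a raw contiguous pointer is an access pattern icpx can auto-vectorize readily, while the extra member indirection of a view wrapper can get in the way.

```cpp
#include <cstddef>

// Hypothetical stand-in for a view that wraps a pointer behind an extra layer
// of indirection (oneDPL's real all_view / guard_view are more involved).
template <typename T>
struct toy_view
{
    T* __data;
    T& operator[](std::size_t __i) const { return __data[__i]; }
    T* begin() const { return __data; }
};

// Access through the view: the member indirection can hinder auto-vectorization.
template <typename T>
void scale_via_view(toy_view<T> __v, std::size_t __n)
{
    for (std::size_t __i = 0; __i < __n; ++__i)
        __v[__i] *= 2;
}

// Access through the raw pointer returned by begin(): a plain contiguous loop
// that the compiler can vectorize.
template <typename T>
void scale_via_pointer(T* __p, std::size_t __n)
{
    for (std::size_t __i = 0; __i < __n; ++__i)
        __p[__i] *= 2;
}
```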
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
index 90e55b8f66..2b5466f991 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
@@ -163,32 +161,30 @@ struct walk1_vector_or_scalar : public walk_vector_or_scalar_base<_Range>
  public:
    walk1_vector_or_scalar(_F __f, std::size_t __n) : __f(__f), __n(__n) {}

-    template <typename _IsFull, typename _ItemId, typename _Range>
+    template <typename _IsFull, typename _ItemId, typename _Acc>
    void
-    __vector_path(_IsFull __is_full, const _ItemId __idx, _Range __rng) const
+    __vector_path(_IsFull __is_full, const _ItemId __idx, _Acc __acc) const
    {
-        // This is needed to enable vectorization
-        auto __raw_ptr = __rng.begin();
        oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}(__is_full, __idx, __f,
-            __raw_ptr);
+            __acc);
    }

    // _IsFull is ignored here. We assume that boundary checking has been already performed for this index.
-    template <typename _IsFull, typename _ItemId, typename _Range>
+    template <typename _IsFull, typename _ItemId, typename _Acc>
    void
-    __scalar_path(_IsFull, const _ItemId __idx, _Range __rng) const
+    __scalar_path(_IsFull, const _ItemId __idx, _Acc __acc) const
    {
-        __f(__rng[__idx]);
+        __f(__acc[__idx]);
    }

-    template <typename _IsFull, typename _ItemId, typename _Range>
+    template <typename _IsFull, typename _ItemId, typename _Acc>
    void
-    operator()(_IsFull __is_full, const _ItemId __idx, _Range __rng) const
+    operator()(_IsFull __is_full, const _ItemId __idx, _Acc __acc) const
    {
        if constexpr (__base_t::__can_vectorize)
-            __vector_path(__is_full, __idx, __rng);
+            __vector_path(__is_full, __idx, __acc);
        else
-            __scalar_path(__is_full, __idx, __rng);
+            __scalar_path(__is_full, __idx, __acc);
    }
 };
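The walk1 vector path above forwards __acc directly to __vector_walk. Conceptually, that helper applies the functor over one vector-sized chunk of the range, with the _IsFull flag selecting either a fixed-trip-count loop (which the compiler can unroll and vectorize) or a bounds-checked remainder loop for the last partial chunk. A rough standalone analogue, with hypothetical names (vec_size, walk_chunk) and no claim to match the oneDPL implementation:

```cpp
#include <algorithm>
#include <cstddef>

constexpr std::size_t vec_size = 4; // stand-in for __preferred_vector_size

template <typename F, typename T>
void walk_chunk(bool is_full, std::size_t start, std::size_t n, F f, T* p)
{
    if (is_full)
    {
        // Full chunk: fixed trip count, no bounds checks inside the loop.
        for (std::size_t i = 0; i < vec_size; ++i)
            f(p[start + i]);
    }
    else
    {
        // Partial chunk at the end of the range: clamp to n.
        const std::size_t end = std::min(start + vec_size, n);
        for (std::size_t i = start; i < end; ++i)
            f(p[i]);
    }
}
```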
@@ -196,6 +194,7 @@ template <typename _F, typename _Range1, typename _Range2>
 {
  private:
+    using _ValueType1 = oneapi::dpl::__internal::__value_t<_Range1>;
    using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>;
    _F __f;
    std::size_t __n;
@@ -203,47 +202,44 @@ struct walk2_vectors_or_scalars : public walk_vector_or_scalar_base<_Range1, _Ra
  public:
    walk2_vectors_or_scalars(_F __f, std::size_t __n) : __f(__f), __n(__n) {}

-    template <typename _IsFull, typename _ItemId, typename _Range1, typename _Range2>
+    template <typename _IsFull, typename _ItemId, typename _Acc1, typename _Acc2>
    void
-    __vector_path(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2) const
+    __vector_path(_IsFull __is_full, const _ItemId __idx, _Acc1 __acc1, _Acc2 __acc2) const
    {
-        using _ValueType1 = oneapi::dpl::__internal::__value_t<_Range1>;
        // This is needed for the icpx compiler to vectorize. The indirection introduced by our all / guard views interfere
        // with compiler vectorization. At this point, we have ensured that input is contiguous and can be operated on as a raw pointer. The
        // begin() function for these views will return a pointer.
-        auto __raw_ptr1 = __rng1.begin();
-        auto __raw_ptr2 = __rng2.begin();
-        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1> __rng1_vector[__base_t::__preferred_vector_size];
+        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1> __acc1_vector[__base_t::__preferred_vector_size];
        // 1. Load input into a vector
        oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}(
-            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __raw_ptr1, __rng1_vector);
+            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __acc1, __acc1_vector);
        // 2. Apply functor to vector and store into global memory
        oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}(
-            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __rng1_vector,
-            __raw_ptr2);
+            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __acc1_vector,
+            __acc2);
        // 3. Explicitly call destructor of lazy union type
        oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}(
            __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1>::__get_callable_deleter(),
-            __rng1_vector);
+            __acc1_vector);
    }

    // _IsFull is ignored here. We assume that boundary checking has been already performed for this index.
-    template <typename _IsFull, typename _ItemId, typename _Range1, typename _Range2>
+    template <typename _IsFull, typename _ItemId, typename _Acc1, typename _Acc2>
    void
-    __scalar_path(_IsFull, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2) const
+    __scalar_path(_IsFull, const _ItemId __idx, _Acc1 __acc1, _Acc2 __acc2) const
    {
-        __f(__rng1[__idx], __rng2[__idx]);
+        __f(__acc1[__idx], __acc2[__idx]);
    }

-    template <typename _IsFull, typename _ItemId, typename _Range1, typename _Range2>
+    template <typename _IsFull, typename _ItemId, typename _Acc1, typename _Acc2>
    void
-    operator()(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2) const
+    operator()(_IsFull __is_full, const _ItemId __idx, _Acc1 __acc1, _Acc2 __acc2) const
    {
        if constexpr (__base_t::__can_vectorize)
-            __vector_path(__is_full, __idx, __rng1, __rng2);
+            __vector_path(__is_full, __idx, __acc1, __acc2);
        else
-            __scalar_path(__is_full, __idx, __rng1, __rng2);
+            __scalar_path(__is_full, __idx, __acc1, __acc2);
    }
 };
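The walk2 vector path follows a load / apply / store / destroy sequence built on __lazy_ctor_storage, which holds uninitialized storage and therefore needs an explicit destructor call once the staged values are no longer needed. The sketch below reproduces only that lifetime pattern in a self-contained form; lazy_slot and transform_chunk are hypothetical names, a full chunk is assumed, and real device code would run this per work item rather than in a plain function:

```cpp
#include <cstddef>
#include <new>

// Uninitialized storage with explicit construction and destruction, in the
// spirit of oneDPL's __lazy_ctor_storage (simplified).
template <typename T>
struct lazy_slot
{
    alignas(T) unsigned char raw[sizeof(T)];
    T& ref() { return *std::launder(reinterpret_cast<T*>(raw)); }
    void setup(const T& v) { ::new (static_cast<void*>(raw)) T(v); } // construct on load
    void destroy() { ref().~T(); }                                   // must be called explicitly
};

template <typename T, typename F, std::size_t V = 4>
void transform_chunk(const T* in, T* out, F f)
{
    lazy_slot<T> staged[V];
    for (std::size_t i = 0; i < V; ++i) // 1. load input into the staged "registers"
        staged[i].setup(in[i]);
    for (std::size_t i = 0; i < V; ++i) // 2. apply the functor and store the result
        out[i] = f(staged[i].ref());
    for (std::size_t i = 0; i < V; ++i) // 3. end the lifetime of every staged element
        staged[i].destroy();
}
```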
@@ -251,6 +247,8 @@ template <typename _F, typename _Range1, typename _Range2, typename _Range3>
 {
  private:
+    using _ValueType1 = oneapi::dpl::__internal::__value_t<_Range1>;
+    using _ValueType2 = oneapi::dpl::__internal::__value_t<_Range2>;
    using __base_t = walk_vector_or_scalar_base<_Range1, _Range2, _Range3>;
    _F __f;
    std::size_t __n;
@@ -258,56 +256,47 @@ struct walk3_vectors_or_scalars : public walk_vector_or_scalar_base<_Range1, _Ra
  public:
    walk3_vectors_or_scalars(_F __f, std::size_t __n) : __f(__f), __n(__n) {}

-    template <typename _IsFull, typename _ItemId, typename _Range1, typename _Range2, typename _Range3>
+    template <typename _IsFull, typename _ItemId, typename _Acc1, typename _Acc2, typename _Acc3>
    void
-    __vector_path(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2, _Range3 __rng3) const
-    {
-        using _ValueType1 = oneapi::dpl::__internal::__value_t<_Range1>;
-        using _ValueType2 = oneapi::dpl::__internal::__value_t<_Range2>;
-        // This is needed for the icpx compiler to vectorize. The indirection introduced by our views interfere
-        // with compiler vectorization. At this point, we have ensured that input is contiguous and can be operated on
-        // as a raw pointer.
-        auto __raw_ptr1 = __rng1.begin();
-        auto __raw_ptr2 = __rng2.begin();
-        auto __raw_ptr3 = __rng3.begin();
-
-        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1> __rng1_vector[__base_t::__preferred_vector_size];
-        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType2> __rng2_vector[__base_t::__preferred_vector_size];
+    __vector_path(_IsFull __is_full, const _ItemId __idx, _Acc1 __acc1, _Acc2 __acc2, _Acc3 __acc3) const
+    {
+        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1> __acc1_vector[__base_t::__preferred_vector_size];
+        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType2> __acc2_vector[__base_t::__preferred_vector_size];
        // 1. Load inputs into vectors
        oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}(
-            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __raw_ptr1, __rng1_vector);
+            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __acc1, __acc1_vector);
        oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}(
-            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __raw_ptr2, __rng2_vector);
+            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __acc2, __acc2_vector);
        // 2. Apply binary functor to vector and store into global memory
        oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}(
-            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __rng1_vector,
-            __rng2_vector, __raw_ptr3);
+            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __acc1_vector,
+            __acc2_vector, __acc3);
        // 3. Explicitly call destructors of lazy union type
        oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}(
            __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1>::__get_callable_deleter(),
-            __rng1_vector);
+            __acc1_vector);
        oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}(
            __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType2>::__get_callable_deleter(),
-            __rng2_vector);
+            __acc2_vector);
    }

    // _IsFull is ignored here. We assume that boundary checking has been already performed for this index.
-    template <typename _IsFull, typename _ItemId, typename _Range1, typename _Range2, typename _Range3>
+    template <typename _IsFull, typename _ItemId, typename _Acc1, typename _Acc2, typename _Acc3>
    void
-    __scalar_path(_IsFull, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2, _Range3 __rng3) const
+    __scalar_path(_IsFull, const _ItemId __idx, _Acc1 __acc1, _Acc2 __acc2, _Acc3 __acc3) const
    {
-        __f(__rng1[__idx], __rng2[__idx], __rng3[__idx]);
+        __f(__acc1[__idx], __acc2[__idx], __acc3[__idx]);
    }

-    template <typename _IsFull, typename _ItemId, typename _Range1, typename _Range2, typename _Range3>
+    template <typename _IsFull, typename _ItemId, typename _Acc1, typename _Acc2, typename _Acc3>
    void
-    operator()(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2, _Range3 __rng3) const
+    operator()(_IsFull __is_full, const _ItemId __idx, _Acc1 __acc1, _Acc2 __acc2, _Acc3 __acc3) const
    {
        if constexpr (__base_t::__can_vectorize)
-            __vector_path(__is_full, __idx, __rng1, __rng2, __rng3);
+            __vector_path(__is_full, __idx, __acc1, __acc2, __acc3);
        else
-            __scalar_path(__is_full, __idx, __rng1, __rng2, __rng3);
+            __scalar_path(__is_full, __idx, __acc1, __acc2, __acc3);
    }
 };
@@ -342,54 +331,50 @@ struct walk_adjacent_difference : public walk_vector_or_scalar_base<_Range1, _Ra
  public:
    walk_adjacent_difference(_F __f, std::size_t __n) : __f(__f), __n(__n) {}

-    template <typename _IsFull, typename _ItemId, typename _Range1, typename _Range2>
+    template <typename _IsFull, typename _ItemId, typename _Acc1, typename _Acc2>
    void
-    __scalar_path(_IsFull, const _ItemId __idx, const _Range1 __rng1, _Range2 __rng2) const
+    __scalar_path(_IsFull, const _ItemId __idx, const _Acc1 __acc1, _Acc2 __acc2) const
    {
        // just copy an element if it is the first one
        if (__idx == 0)
-            __rng2[__idx] = __rng1[__idx];
+            __acc2[__idx] = __acc1[__idx];
        else
-            __f(__rng1[__idx + (-1)], __rng1[__idx], __rng2[__idx]);
+            __f(__acc1[__idx + (-1)], __acc1[__idx], __acc2[__idx]);
    }

-    template <typename _IsFull, typename _ItemId, typename _Range1, typename _Range2>
+    template <typename _IsFull, typename _ItemId, typename _Acc1, typename _Acc2>
    void
-    __vector_path(_IsFull __is_full, const _ItemId __idx, const _Range1 __rng1, _Range2 __rng2) const
+    __vector_path(_IsFull __is_full, const _ItemId __idx, const _Acc1 __acc1, _Acc2 __acc2) const
    {
        using _ValueType = oneapi::dpl::__internal::__value_t<_Range1>;
-        auto __rng1_ptr = __rng1.begin();
-        auto __rng2_ptr = __rng2.begin();
-        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __rng1_vector[__base_t::__preferred_vector_size + 1];
+        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __acc1_vector[__base_t::__preferred_vector_size + 1];
        // 1. Establish a vector of __preferred_vector_size + 1 where a scalar load is performed on the first element
        // followed by a vector load of the specified length.
        if (__idx != 0)
-            __rng1_vector[0].__setup(__rng1_ptr[__idx - 1]);
+            __acc1_vector[0].__setup(__acc1[__idx - 1]);
        else
-            __rng1_vector[0].__setup(__rng1_ptr[0]);
+            __acc1_vector[0].__setup(__acc1[0]);
        oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}(
-            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __rng1_ptr,
-            &__rng1_vector[1]);
+            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __acc1, &__acc1_vector[1]);
        // 2. Perform a vector store of __preferred_vector_size adjacent differences.
        oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}(
-            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __rng1_vector,
-            &__rng1_vector[1], __rng2_ptr);
+            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __acc1_vector,
+            &__acc1_vector[1], __acc2);
        // A dummy value is first written to global memory followed by an overwrite for the first index. Pulling the vector loads / stores into an if branch
        // to better handle this results in performance degradation.
        if (__idx == 0)
-            __rng2[0] = __rng1_vector[0].__v;
+            __acc2[0] = __acc1_vector[0].__v;
        // 3. Delete temporary storage
        oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}(
-            __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(),
-            __rng1_vector);
+            __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(), __acc1_vector);
    }

-    template <typename _IsFull, typename _ItemId, typename _Range1, typename _Range2>
+    template <typename _IsFull, typename _ItemId, typename _Acc1, typename _Acc2>
    void
-    operator()(_IsFull __is_full, const _ItemId __idx, const _Range1 __rng1, _Range2 __rng2) const
+    operator()(_IsFull __is_full, const _ItemId __idx, const _Acc1 __acc1, _Acc2 __acc2) const
    {
        if constexpr (__base_t::__can_vectorize)
-            __vector_path(__is_full, __idx, __rng1, __rng2);
+            __vector_path(__is_full, __idx, __acc1, __acc2);
        else
-            __scalar_path(__is_full, __idx, __rng1, __rng2);
+            __scalar_path(__is_full, __idx, __acc1, __acc2);
    }
 };
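walk_adjacent_difference's vector path stages __preferred_vector_size + 1 values so that every output element can see its left neighbour, and then patches index 0 after the fact, exactly as the comment about the dummy write describes. A simplified, single-threaded sketch of the same indexing (adj_diff_chunk is a hypothetical name, the three-argument functor is reduced to a value-returning diff such as cur - prev, and a full chunk is assumed):

```cpp
#include <cstddef>

template <typename T, typename F, std::size_t V = 4>
void adj_diff_chunk(const T* in, T* out, std::size_t idx, F diff)
{
    T staged[V + 1];
    staged[0] = (idx != 0) ? in[idx - 1] : in[0]; // scalar load of the left neighbour
    for (std::size_t i = 0; i < V; ++i)           // vector load of the current chunk
        staged[i + 1] = in[idx + i];
    for (std::size_t i = 0; i < V; ++i)           // V adjacent differences, diff(prev, cur)
        out[idx + i] = diff(staged[i], staged[i + 1]);
    if (idx == 0)                                 // the first element is copied, not differenced
        out[0] = staged[0];
}
```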
@@ -1180,11 +1165,10 @@ struct __reverse_functor : public walk_vector_or_scalar_base<_Range>
  public:
    __reverse_functor(_Size __size) : __size(__size) {}

-    template <typename _IsFull, typename _Idx, typename _Range>
+    template <typename _IsFull, typename _Idx, typename _Acc>
    void
-    __vector_path(_IsFull __is_full, const _Idx __left_start_idx, _Range __rng) const
+    __vector_path(_IsFull __is_full, const _Idx __left_start_idx, _Acc __acc) const
    {
-        auto __rng_pointer = __rng.begin();
        std::size_t __n = __size;
        std::size_t __midpoint = __size / 2;
        // If our start is passed the midpoint, then immediately leave as it is guaranteed to be processed by another
@@ -1197,52 +1181,52 @@ struct __reverse_functor : public walk_vector_or_scalar_base<_Range>
        // 1. Load two vectors that we want to swap: one from the left half of the buffer and one from the right
        const _Idx __right_start_idx = __size - __left_start_idx - __base_t::__preferred_vector_size;
-        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __rng_left_vector[__base_t::__preferred_vector_size];
-        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __rng_right_vector[__base_t::__preferred_vector_size];
+        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __acc_left_vector[__base_t::__preferred_vector_size];
+        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __acc_right_vector[__base_t::__preferred_vector_size];
        oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}(
-            __is_full, __left_start_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __rng_pointer,
-            __rng_left_vector);
+            __is_full, __left_start_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __acc,
+            __acc_left_vector);
        oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}(
-            __is_full, __right_start_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __rng_pointer,
-            __rng_right_vector);
+            __is_full, __right_start_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __acc,
+            __acc_right_vector);
        // 2. Reverse vectors in registers. Note that due to indices we have chosen, there will always be a full
        // vector of elements to load
        oneapi::dpl::__par_backend_hetero::__vector_reverse<__base_t::__preferred_vector_size>{}(
-            std::true_type{}, __left_start_idx, __rng_left_vector);
+            std::true_type{}, __left_start_idx, __acc_left_vector);
        oneapi::dpl::__par_backend_hetero::__vector_reverse<__base_t::__preferred_vector_size>{}(
-            std::true_type{}, __right_start_idx, __rng_right_vector);
+            std::true_type{}, __right_start_idx, __acc_right_vector);
        // 3. Store the left-half vector to the corresponding right-half indices and vice versa
        oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}(
            __is_full, __right_start_idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op{},
-            __rng_left_vector, __rng_pointer);
+            __acc_left_vector, __acc);
        oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}(
            __is_full, __left_start_idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op{},
-            __rng_right_vector, __rng_pointer);
+            __acc_right_vector, __acc);
        // 4. Call destructors of temporary storage
        oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}(
            __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(),
-            __rng_left_vector);
+            __acc_left_vector);
        oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}(
            __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(),
-            __rng_right_vector);
+            __acc_right_vector);
    }

-    template <typename _IsFull, typename _Idx, typename _Range>
+    template <typename _IsFull, typename _Idx, typename _Acc>
    void
-    __scalar_path(_IsFull, const _Idx __idx, _Range __rng) const
+    __scalar_path(_IsFull, const _Idx __idx, _Acc __acc) const
    {
-        using ::std::swap;
-        swap(__rng[__idx], __rng[__size - __idx - 1]);
+        using std::swap;
+        swap(__acc[__idx], __acc[__size - __idx - 1]);
    }

-    template <typename _IsFull, typename _Idx, typename _Range>
+    template <typename _IsFull, typename _Idx, typename _Acc>
    void
-    operator()(_IsFull __is_full, const _Idx __idx, _Range __rng) const
+    operator()(_IsFull __is_full, const _Idx __idx, _Acc __acc) const
    {
        if constexpr (__base_t::__can_vectorize)
-            __vector_path(__is_full, __idx, __rng);
+            __vector_path(__is_full, __idx, __acc);
        else
-            __scalar_path(__is_full, __idx, __rng);
+            __scalar_path(__is_full, __idx, __acc);
    }
 };
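__reverse_functor pairs a left-half block with its mirrored right-half block, reverses both in registers, and stores each one at the other's position; blocks starting at or past the midpoint return immediately because their mirror image handles them. A standalone, single-block sketch of that index arithmetic (reverse_block is a hypothetical name; __size >= the block width is assumed so both blocks stay in bounds):

```cpp
#include <algorithm>
#include <cstddef>

template <typename T, std::size_t V = 4>
void reverse_block(T* data, std::size_t size, std::size_t left_start)
{
    if (left_start >= size / 2) // the mirrored block's owner processes these elements
        return;
    const std::size_t right_start = size - left_start - V;

    T left[V], right[V];
    std::copy(data + left_start, data + left_start + V, left);    // 1. load both blocks
    std::copy(data + right_start, data + right_start + V, right);
    std::reverse(left, left + V);                                  // 2. reverse in registers
    std::reverse(right, right + V);
    std::copy(right, right + V, data + left_start);                // 3. store each block at the
    std::copy(left, left + V, data + right_start);                 //    mirrored position
}
```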
@@ -1260,58 +1244,55 @@ struct __reverse_copy : public walk_vector_or_scalar_base<_Range1, _Range2>
  public:
    __reverse_copy(_Size __size) : __size(__size) {}

-    template <typename _IsFull, typename _Idx, typename _Range1, typename _Range2>
+    template <typename _IsFull, typename _Idx, typename _Acc1, typename _Acc2>
    void
-    __scalar_path(_IsFull, const _Idx __idx, const _Range1 __rng1, _Range2 __rng2) const
+    __scalar_path(_IsFull, const _Idx __idx, const _Acc1 __acc1, _Acc2 __acc2) const
    {
-        __rng2[__idx] = __rng1[__size - __idx - 1];
+        __acc2[__idx] = __acc1[__size - __idx - 1];
    }

-    template <typename _IsFull, typename _Idx, typename _Range1, typename _Range2>
+    template <typename _IsFull, typename _Idx, typename _Acc1, typename _Acc2>
    void
-    __vector_path(_IsFull __is_full, const _Idx __idx, const _Range1 __rng1, _Range2 __rng2) const
+    __vector_path(_IsFull __is_full, const _Idx __idx, const _Acc1 __acc1, _Acc2 __acc2) const
    {
-        auto __rng1_pointer = __rng1.begin();
-        auto __rng2_pointer = __rng2.begin();
        std::size_t __n = __size;
        std::size_t __remaining_elements = __idx >= __n ? 0 : __n - __idx;
        std::size_t __elements_to_process =
            std::min(static_cast<std::size_t>(__base_t::__preferred_vector_size), __remaining_elements);
        const _Idx __output_start = __size - __idx - __elements_to_process;
        // 1. Load vector to reverse
-        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __rng1_vector[__base_t::__preferred_vector_size];
+        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __acc1_vector[__base_t::__preferred_vector_size];
        oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}(
-            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __rng1_pointer,
-            __rng1_vector);
+            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __acc1, __acc1_vector);
        // 2, 3. Reverse in registers and flip the location of the vector in the output buffer
        if (__elements_to_process == __base_t::__preferred_vector_size)
        {
            oneapi::dpl::__par_backend_hetero::__vector_reverse<__base_t::__preferred_vector_size>{}(
-                std::true_type{}, __elements_to_process, __rng1_vector);
+                std::true_type{}, __elements_to_process, __acc1_vector);
            oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}(
                __is_full, __output_start, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op{},
-                __rng1_vector, __rng2_pointer);
+                __acc1_vector, __acc2);
        }
        else
        {
            oneapi::dpl::__par_backend_hetero::__vector_reverse<__base_t::__preferred_vector_size>{}(
-                std::false_type{}, __elements_to_process, __rng1_vector);
+                std::false_type{}, __elements_to_process, __acc1_vector);
            for (std::uint8_t __i = 0; __i < __elements_to_process; ++__i)
-                __rng2_pointer[__output_start + __i] = __rng1_vector[__i].__v;
+                __acc2[__output_start + __i] = __acc1_vector[__i].__v;
        }
        // 3. Cleanup
        oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}(
            __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(),
-            __rng1_vector);
+            __acc1_vector);
    }

-    template <typename _IsFull, typename _Idx, typename _Range1, typename _Range2>
+    template <typename _IsFull, typename _Idx, typename _Acc1, typename _Acc2>
    void
-    operator()(_IsFull __is_full, const _Idx __idx, const _Range1 __rng1, _Range2 __rng2) const
+    operator()(_IsFull __is_full, const _Idx __idx, const _Acc1 __acc1, _Acc2 __acc2) const
    {
        if constexpr (__base_t::__can_vectorize)
-            __vector_path(__is_full, __idx, __rng1, __rng2);
+            __vector_path(__is_full, __idx, __acc1, __acc2);
        else
-            __scalar_path(__is_full, __idx, __rng1, __rng2);
+            __scalar_path(__is_full, __idx, __acc1, __acc2);
    }
 };
@@ -1330,22 +1311,20 @@ struct __rotate_copy : public walk_vector_or_scalar_base<_Range1, _Range2>
  public:
    __rotate_copy(_Size __size, _Size __shift) : __size(__size), __shift(__shift) {}

-    template <typename _IsFull, typename _Idx, typename _Range1, typename _Range2>
+    template <typename _IsFull, typename _Idx, typename _Acc1, typename _Acc2>
    void
-    __vector_path(_IsFull __is_full, const _Idx __idx, const _Range1 __rng1, _Range2 __rng2) const
+    __vector_path(_IsFull __is_full, const _Idx __idx, const _Acc1 __acc1, _Acc2 __acc2) const
    {
-        auto __rng1_pointer = __rng1.begin();
-        auto __rng2_pointer = __rng2.begin();
        _Idx __shifted_idx = __shift + __idx;
        _Idx __wrapped_idx = __shifted_idx % __size;
        std::size_t __n = __size;
-        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __rng1_vector[__base_t::__preferred_vector_size];
+        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __acc1_vector[__base_t::__preferred_vector_size];
        //1. Vectorize loads only if we know the wrap around point is beyond the current vector elements to process
        if (__wrapped_idx + __base_t::__preferred_vector_size <= __size)
        {
            oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}(
-                __is_full, __wrapped_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __rng1_pointer,
-                __rng1_vector);
+                __is_full, __wrapped_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __acc1,
+                __acc1_vector);
        }
        else
        {
@@ -1353,32 +1332,32 @@ struct __rotate_copy : public walk_vector_or_scalar_base<_Range1, _Range2>
            std::size_t __elements_to_process =
                std::min(static_cast<std::size_t>(__base_t::__preferred_vector_size), __remaining_elements);
            for (std::uint16_t __i = 0; __i != __elements_to_process; ++__i)
-                __rng1_vector[__i].__setup(__rng1_pointer[(__shifted_idx + __i) % __size]);
+                __acc1_vector[__i].__setup(__acc1[(__shifted_idx + __i) % __size]);
        }
        // 2. Store the rotation
        oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}(
            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op{},
-            __rng1_vector, __rng2_pointer);
+            __acc1_vector, __acc2);
        // 3. Delete temporary storage
        oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}(
            __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(),
-            __rng1_vector);
+            __acc1_vector);
    }

-    template <typename _IsFull, typename _Idx, typename _Range1, typename _Range2>
+    template <typename _IsFull, typename _Idx, typename _Acc1, typename _Acc2>
    void
-    __scalar_path(_IsFull, const _Idx __idx, const _Range1 __rng1, _Range2 __rng2) const
+    __scalar_path(_IsFull, const _Idx __idx, const _Acc1 __acc1, _Acc2 __acc2) const
    {
-        __rng2[__idx] = __rng1[(__shift + __idx) % __size];
+        __acc2[__idx] = __acc1[(__shift + __idx) % __size];
    }

-    template <typename _IsFull, typename _Idx, typename _Range1, typename _Range2>
+    template <typename _IsFull, typename _Idx, typename _Acc1, typename _Acc2>
    void
-    operator()(_IsFull __is_full, const _Idx __idx, const _Range1 __rng1, _Range2 __rng2) const
+    operator()(_IsFull __is_full, const _Idx __idx, const _Acc1 __acc1, _Acc2 __acc2) const
    {
        if constexpr (__base_t::__can_vectorize)
-            __vector_path(__is_full, __idx, __rng1, __rng2);
+            __vector_path(__is_full, __idx, __acc1, __acc2);
        else
-            __scalar_path(__is_full, __idx, __rng1, __rng2);
+            __scalar_path(__is_full, __idx, __acc1, __acc2);
    }
 };
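__rotate_copy issues a single vector load only when the wrap-around point lies beyond the current block; otherwise it gathers element by element with a modulo, while the store to the output is contiguous in both cases. A minimal sketch of that indexing (rotate_copy_block is a hypothetical name; a full output block, idx + V <= size, is assumed):

```cpp
#include <cstddef>

template <typename T, std::size_t V = 4>
void rotate_copy_block(const T* in, T* out, std::size_t size, std::size_t shift, std::size_t idx)
{
    const std::size_t wrapped = (shift + idx) % size;
    if (wrapped + V <= size)
    {
        for (std::size_t i = 0; i < V; ++i) // contiguous source: vectorizable load
            out[idx + i] = in[wrapped + i];
    }
    else
    {
        for (std::size_t i = 0; i < V; ++i) // source straddles the wrap point: per-element gather
            out[idx + i] = in[(shift + idx + i) % size];
    }
}
```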
@@ -1546,6 +1525,8 @@ template <typename _F, typename _Range1, typename _Range2>
 {
  private:
+    using _ValueType1 = oneapi::dpl::__internal::__value_t<_Range1>;
+    using _ValueType2 = oneapi::dpl::__internal::__value_t<_Range2>;
    using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>;
    _F __f;
    std::size_t __n;
@@ -1553,55 +1534,48 @@ struct __brick_swap : public walk_vector_or_scalar_base<_Range1, _Range2>
  public:
    __brick_swap(_F __f, std::size_t __n) : __f(__f), __n(__n) {}

-    template <typename _IsFull, typename _ItemId, typename _Range1, typename _Range2>
+    template <typename _IsFull, typename _ItemId, typename _Acc1, typename _Acc2>
    void
-    __vector_path(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2) const
+    __vector_path(_IsFull __is_full, const _ItemId __idx, _Acc1 __acc1, _Acc2 __acc2) const
    {
-        using _ValueType1 = oneapi::dpl::__internal::__value_t<_Range1>;
-        using _ValueType2 = oneapi::dpl::__internal::__value_t<_Range2>;
-        // This is needed for the icpx compiler to vectorize. The indirection introduced by our all / guard views interfere
-        // with compiler vectorization. At this point, we have ensured that input is contiguous and can be operated on as a raw pointer. The
-        // begin() function for these views will return a pointer.
-        auto __raw_ptr1 = __rng1.begin();
-        auto __raw_ptr2 = __rng2.begin();
-        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1> __rng1_vector[__base_t::__preferred_vector_size];
-        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1> __rng2_vector[__base_t::__preferred_vector_size];
+        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1> __acc1_vector[__base_t::__preferred_vector_size];
+        oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1> __acc2_vector[__base_t::__preferred_vector_size];
        // 1. Load inputs into vectors
        oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}(
-            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __raw_ptr1, __rng1_vector);
+            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __acc1, __acc1_vector);
        oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}(
-            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __raw_ptr2, __rng2_vector);
+            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_transform_op{}, __acc2, __acc2_vector);
        // 2. Swap the two ranges
        oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}(
-            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __rng2_vector,
-            __raw_ptr1);
+            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __acc2_vector,
+            __acc1);
        oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}(
-            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __rng1_vector,
-            __raw_ptr2);
+            __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __acc1_vector,
+            __acc2);
        // 3. Explicitly call destructor of lazy union type
        oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}(
            __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1>::__get_callable_deleter(),
-            __rng1_vector);
+            __acc1_vector);
        oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}(
            __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType2>::__get_callable_deleter(),
-            __rng2_vector);
+            __acc2_vector);
    }

-    template <typename _IsFull, typename _Idx, typename _Range1, typename _Range2>
+    template <typename _IsFull, typename _Idx, typename _Acc1, typename _Acc2>
    void
-    __scalar_path(_IsFull __is_full, const _Idx __idx, const _Range1 __rng1, _Range2 __rng2) const
+    __scalar_path(_IsFull __is_full, const _Idx __idx, const _Acc1 __acc1, _Acc2 __acc2) const
    {
-        __f(__rng1[__idx], __rng2[__idx]);
+        __f(__acc1[__idx], __acc2[__idx]);
    }

-    template <typename _IsFull, typename _Idx, typename _Range1, typename _Range2>
+    template <typename _IsFull, typename _Idx, typename _Acc1, typename _Acc2>
    void
-    operator()(_IsFull __is_full, const _Idx __idx, const _Range1 __rng1, _Range2 __rng2) const
+    operator()(_IsFull __is_full, const _Idx __idx, const _Acc1 __acc1, _Acc2 __acc2) const
    {
        if constexpr (__base_t::__can_vectorize)
-            __vector_path(__is_full, __idx, __rng1, __rng2);
+            __vector_path(__is_full, __idx, __acc1, __acc2);
        else
-            __scalar_path(__is_full, __idx, __rng1, __rng2);
+            __scalar_path(__is_full, __idx, __acc1, __acc2);
    }
 };