From 71e5673e5a6b73eb25aba82ed26b1490c8c6b5ea Mon Sep 17 00:00:00 2001 From: Dmitriy Sobolev Date: Mon, 9 Dec 2024 13:43:28 +0000 Subject: [PATCH] Make sycl feature selection logic more scalable --- .../dpl/internal/reduce_by_segment_impl.h | 16 +- .../dpl/internal/scan_by_segment_impl.h | 8 +- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 8 +- .../dpcpp/parallel_backend_sycl_radix_sort.h | 17 +- .../dpcpp/parallel_backend_sycl_reduce.h | 4 +- .../dpcpp/parallel_backend_sycl_utils.h | 14 +- .../oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h | 196 ++++++++++-------- .../pstl/hetero/dpcpp/unseq_backend_sycl.h | 2 +- .../dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h | 6 +- include/oneapi/dpl/pstl/utils.h | 2 +- 10 files changed, 145 insertions(+), 128 deletions(-) diff --git a/include/oneapi/dpl/internal/reduce_by_segment_impl.h b/include/oneapi/dpl/internal/reduce_by_segment_impl.h index 4ddce6d3473..567ca73aa74 100644 --- a/include/oneapi/dpl/internal/reduce_by_segment_impl.h +++ b/include/oneapi/dpl/internal/reduce_by_segment_impl.h @@ -281,12 +281,12 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy auto __seg_end_identification = __exec.queue().submit([&](sycl::handler& __cgh) { oneapi::dpl::__ranges::__require_access(__cgh, __keys); auto __seg_ends_acc = __seg_ends.template get_access(__cgh); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __cgh.use_kernel_bundle(__seg_reduce_count_kernel.get_kernel_bundle()); #endif __cgh.parallel_for<_SegReduceCountKernel>( sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=]( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __seg_reduce_count_kernel, #endif sycl::nd_item<1> __item) { @@ -319,11 +319,11 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy __cgh.depends_on(__seg_end_identification); auto __seg_ends_acc = __seg_ends.template get_access(__cgh); auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __cgh.use_kernel_bundle(__seg_reduce_offset_kernel.get_kernel_bundle()); #endif __cgh.parallel_for<_SegReduceOffsetKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __seg_reduce_offset_kernel, #endif sycl::nd_range<1>{__wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { @@ -342,11 +342,11 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy auto __partials_acc = __partials.template get_access(__cgh); auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); __dpl_sycl::__local_accessor<__val_type> __loc_acc(2 * __wgroup_size, __cgh); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __cgh.use_kernel_bundle(__seg_reduce_wg_kernel.get_kernel_bundle()); #endif __cgh.parallel_for<_SegReduceWgKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __seg_reduce_wg_kernel, #endif sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { @@ -465,11 +465,11 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy __dpl_sycl::__local_accessor<__diff_type> __loc_seg_ends_acc(__wgroup_size, __cgh); __cgh.depends_on(__wg_reduce); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __cgh.use_kernel_bundle(__seg_reduce_prefix_kernel.get_kernel_bundle()); #endif __cgh.parallel_for<_SegReducePrefixKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __seg_reduce_prefix_kernel, #endif sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { diff --git a/include/oneapi/dpl/internal/scan_by_segment_impl.h b/include/oneapi/dpl/internal/scan_by_segment_impl.h index a8199e227dd..139448f6944 100644 --- a/include/oneapi/dpl/internal/scan_by_segment_impl.h +++ b/include/oneapi/dpl/internal/scan_by_segment_impl.h @@ -164,11 +164,11 @@ struct __sycl_scan_by_segment_impl __dpl_sycl::__local_accessor<__val_type> __loc_acc(2 * __wgroup_size, __cgh); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __cgh.use_kernel_bundle(__seg_scan_wg_kernel.get_kernel_bundle()); #endif __cgh.parallel_for<_SegScanWgKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __seg_scan_wg_kernel, #endif sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { @@ -268,11 +268,11 @@ struct __sycl_scan_by_segment_impl __dpl_sycl::__local_accessor<__val_type> __loc_partials_acc(__wgroup_size, __cgh); __dpl_sycl::__local_accessor __loc_seg_ends_acc(__wgroup_size, __cgh); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __cgh.use_kernel_bundle(__seg_scan_prefix_kernel.get_kernel_bundle()); #endif __cgh.parallel_for<_SegScanPrefixKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __seg_scan_prefix_kernel, #endif sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 330df59d020..597b23e9c59 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -324,11 +324,11 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name auto __temp_acc = __result_and_scratch.template __get_scratch_acc( __cgh, __dpl_sycl::__no_init{}); __dpl_sycl::__local_accessor<_Type> __local_acc(__wgroup_size, __cgh); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __cgh.use_kernel_bundle(__kernel_1.get_kernel_bundle()); #endif __cgh.parallel_for<_LocalScanKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __kernel_1, #endif sycl::nd_range<1>(__n_groups * __wgroup_size, __wgroup_size), [=](sycl::nd_item<1> __item) { @@ -345,11 +345,11 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name __cgh.depends_on(__submit_event); auto __temp_acc = __result_and_scratch.template __get_scratch_acc(__cgh); __dpl_sycl::__local_accessor<_Type> __local_acc(__wgroup_size, __cgh); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __cgh.use_kernel_bundle(__kernel_2.get_kernel_bundle()); #endif __cgh.parallel_for<_GroupScanKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __kernel_2, #endif // TODO: try to balance work between several workgroups instead of one diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h index d8b95a5808d..28fd5753652 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h @@ -28,6 +28,9 @@ #include "sycl_traits.h" //SYCL traits specialization for some oneDPL types. +#define _ONEDPL_SYCL2020_SUBGROUP_BARRIER_PRESENT (_ONEDPL_LIBSYCL_VERSION == 0 || _ONEDPL_LIBSYCL_VERSION >= 50700) +#define _ONEDPL_LIBSYCL_SUB_GROUP_MASK_PRESENT (_ONEDPL_LIBSYCL_VERSION >= 50700) + #define _ONEDPL_RADIX_WORKLOAD_TUNING 1 //To achieve better performance, number of segments and work-group size are variated depending on a number of elements: //1. 32K...512K - number of segments is increased up to 8 times @@ -198,11 +201,11 @@ __radix_sort_count_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments, : oneapi::dpl::__ranges::__require_access(__hdl, __val_rng, __count_rng); // an accessor per work-group with value counters from each work-item auto __count_lacc = __dpl_sycl::__local_accessor<_CountT>(__wg_size * __radix_states, __hdl); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __hdl.use_kernel_bundle(__kernel.get_kernel_bundle()); #endif __hdl.parallel_for<_KernelName>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __kernel, #endif sycl::nd_range<1>(__segments * __wg_size, __wg_size), [=](sycl::nd_item<1> __self_item) { @@ -299,11 +302,11 @@ __radix_sort_scan_submit(_ExecutionPolicy&& __exec, ::std::size_t __scan_wg_size __hdl.depends_on(__dependency_event); // access the counters for all work groups oneapi::dpl::__ranges::__require_access(__hdl, __count_rng); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __hdl.use_kernel_bundle(__kernel.get_kernel_bundle()); #endif __hdl.parallel_for<_KernelName>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __kernel, #endif sycl::nd_range<1>(__radix_states * __scan_wg_size, __scan_wg_size), [=](sycl::nd_item<1> __self_item) { @@ -346,8 +349,6 @@ enum class __peer_prefix_algo template struct __peer_prefix_helper; -#define _ONEDPL_SYCL2020_SUBGROUP_BARRIER_PRESENT (_ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 50700) - #if _ONEDPL_SYCL2020_SUBGROUP_BARRIER_PRESENT template struct __peer_prefix_helper<__radix_states, _OffsetT, __peer_prefix_algo::atomic_fetch_or> @@ -546,11 +547,11 @@ __radix_sort_reorder_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments, typename _PeerHelper::_TempStorageT __peer_temp(1, __hdl); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __hdl.use_kernel_bundle(__kernel.get_kernel_bundle()); #endif __hdl.parallel_for<_KernelName>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __kernel, #endif //Each SYCL work group processes one data segment. diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h index a29060a9cca..40c24dc9f7f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h @@ -370,11 +370,11 @@ struct __parallel_transform_reduce_impl oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); std::size_t __local_mem_size = __reduce_pattern.local_mem_req(__work_group_size); __dpl_sycl::__local_accessor<_Tp> __temp_local(sycl::range<1>(__local_mem_size), __cgh); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __cgh.use_kernel_bundle(__kernel.get_kernel_bundle()); #endif __cgh.parallel_for<_ReduceKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT __kernel, #endif sycl::nd_range<1>(sycl::range<1>(__n_groups * __work_group_size), diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index e98a34a212c..6a5525adb08 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -110,7 +110,7 @@ __supports_sub_group_size(const _ExecutionPolicy& __exec, std::size_t __target_s // 20201214 value corresponds to Intel(R) oneAPI C++ Compiler Classic 2021.1.2 Patch release #define _ONEDPL_SYCL2020_KERNEL_DEVICE_API_PRESENT \ - (_ONEDPL_GENERIC_SYCL_LIBRARY || __SYCL_COMPILER_VERSION > 20201214 || _ONEDPL_LIBSYCL_VERSION >= 50700) + (__SYCL_COMPILER_VERSION > 20201214 || (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION >= 50700)) template ::std::size_t @@ -268,7 +268,7 @@ class __kernel_compiler static_assert(__kernel_count > 0, "At least one kernel name should be provided"); public: -#if _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if !_ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT template static auto __compile(_Exec&& __exec) @@ -543,7 +543,7 @@ struct __result_and_scratch_storage inline bool __use_USM_host_allocations(sycl::queue __queue) { -#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT +#if !_ONEDPL_SYCL2020_UNIFIED_USM_BUFFER_ABSENT auto __device = __queue.get_device(); if (!__device.is_gpu()) return false; @@ -560,7 +560,7 @@ struct __result_and_scratch_storage inline bool __use_USM_allocations(sycl::queue __queue) { -#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT +#if !_ONEDPL_SYCL2020_UNIFIED_USM_BUFFER_ABSENT return __queue.get_device().has(sycl::aspect::usm_device_allocations); #else return false; @@ -613,7 +613,7 @@ struct __result_and_scratch_storage static auto __get_usm_or_buffer_accessor_ptr(const _Acc& __acc, std::size_t __scratch_n = 0) { -#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT +#if !_ONEDPL_SYCL2020_UNIFIED_USM_BUFFER_ABSENT return __acc.__get_pointer(); #else return &__acc[__scratch_n]; @@ -624,7 +624,7 @@ struct __result_and_scratch_storage auto __get_result_acc(sycl::handler& __cgh, const sycl::property_list& __prop_list = {}) const { -#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT +#if !_ONEDPL_SYCL2020_UNIFIED_USM_BUFFER_ABSENT if (__use_USM_host && __supports_USM_device) return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __result_buf.get(), __prop_list); else if (__supports_USM_device) @@ -640,7 +640,7 @@ struct __result_and_scratch_storage auto __get_scratch_acc(sycl::handler& __cgh, const sycl::property_list& __prop_list = {}) const { -#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT +#if !_ONEDPL_SYCL2020_UNIFIED_USM_BUFFER_ABSENT if (__use_USM_host || __supports_USM_device) return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __scratch_buf.get(), __prop_list); return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __sycl_buf.get(), __prop_list); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h index 089ab4c7794..d7be7e32d4e 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h @@ -39,9 +39,8 @@ #if defined(__LIBSYCL_MAJOR_VERSION) && defined(__LIBSYCL_MINOR_VERSION) && defined(__LIBSYCL_PATCH_VERSION) # define _ONEDPL_LIBSYCL_VERSION \ (__LIBSYCL_MAJOR_VERSION * 10000 + __LIBSYCL_MINOR_VERSION * 100 + __LIBSYCL_PATCH_VERSION) -#endif -#if !defined(_ONEDPL_LIBSYCL_VERSION) -# define _ONEDPL_GENERIC_SYCL_LIBRARY 1 +#else +# define _ONEDPL_LIBSYCL_VERSION 0 #endif #if _ONEDPL_FPGA_DEVICE @@ -52,27 +51,52 @@ # endif #endif -// Macros to check the new SYCL features -#define _ONEDPL_SYCL2020_NO_INIT_PRESENT \ - (_ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 50300) -#define _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT \ - (_ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 50300) -#define _ONEDPL_SYCL2020_COLLECTIVES_PRESENT \ - (_ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 50300) -#define _ONEDPL_SYCL2020_KNOWN_IDENTITY_PRESENT \ - (_ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 50300) -#define _ONEDPL_SYCL2020_FUNCTIONAL_OBJECTS_PRESENT \ - (_ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 50300) -#define _ONEDPL_SYCL2020_ATOMIC_REF_PRESENT \ - (_ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 50500) -#define _ONEDPL_SYCL2020_SUB_GROUP_PRESENT \ - (_ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 50700) - -#define _ONEDPL_SYCL2020_PLACEHOLDER_HOST_ACCESSOR_DEPRECATED \ - (_ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 60200) - -#define _ONEDPL_LIBSYCL_SUB_GROUP_MASK_PRESENT (_ONEDPL_LIBSYCL_VERSION >= 50700) -#define _ONEDPL_LIBSYCL_KNOWN_IDENTITY_PRESENT (_ONEDPL_LIBSYCL_VERSION == 50200) +// Feature macros, opt-out logic is stems from the assumption that SYCL2020 is supported by default +#define _ONEDPL_SYCL2020_NO_INIT_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 50300) +#define _ONEDPL_SYCL2020_KERNEL_BUNDLE_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 50300) +#define _ONEDPL_SYCL2020_COLLECTIVES_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 50300) +#define _ONEDPL_SYCL2020_KNOWN_IDENTITY_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 50300) +#define _ONEDPL_SYCL2020_FUNCTIONAL_OBJECTS_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 50300) +#define _ONEDPL_SYCL2020_ATOMIC_REF_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 50500) +#define _ONEDPL_SYCL2020_SUB_GROUP_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 50700) +#define _ONEDPL_SYCL2020_HOST_ACCESSOR_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 50700) +// The unified future supporting USM memory and buffers is only supported after DPC++ 2023.1 but not by 2023.2. +#define _ONEDPL_SYCL2020_UNIFIED_USM_BUFFER_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && (_ONEDPL_LIBSYCL_VERSION < 60100 || _ONEDPL_LIBSYCL_VERSION == 60200)) +#define _ONEDPL_SYCL2020_TARGET_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 50400) +#define _ONEDPL_SYCL2020_TARGET_DEVICE_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 50400) +#define _ONEDPL_SYCL2020_HOST_TARGET_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 60200) +#define _ONEDPL_SYCL2020_BUFFER_ALLOCATOR_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 60000) +#define _ONEDPL_SYCL2020_LOCAL_ACCESSOR_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 60000) +#define _ONEDPL_SYCL2020_GET_HOST_ACCESS_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 60200) +#define _ONEDPL_SYCL2020_LOCAL_ACC_GET_MULTI_PTR_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 70000) +#define _ONEDPL_SYCL2020_REQD_SUB_GROUP_SIZE_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 50300) +#define _ONEDPL_SYCL2020_BUFFER_SIZE_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 50300) +#define _ONEDPL_SYCL2020_ACCESSOR_SIZE_ABSENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 50300) + +// Feature macros for DPC++ SYCL runtime library alternatives to non-supported SYCL2020 features +#define _ONEDPL_LIBSYCL_COLLECTIVES_PRESENT \ + (_ONEDPL_LIBSYCL_VERSION != 0 && _ONEDPL_LIBSYCL_VERSION < 50300) +#define _ONEDPL_LIBSYCL_KNOWN_IDENTITY_PRESENT \ + (_ONEDPL_LIBSYCL_VERSION == 50200) #define _ONEDPL_SYCL_DEVICE_COPYABLE_SPECIALIZATION_BROKEN (_ONEDPL_LIBSYCL_VERSION < 70100) @@ -89,9 +113,9 @@ # endif #endif // _ONEDPL_DETECT_SPIRV_COMPILATION -#if _ONEDPL_LIBSYCL_VERSION >= 50300 || _ONEDPL_GENERIC_SYCL_LIBRARY +#if !_ONEDPL_SYCL2020_REQD_SUB_GROUP_SIZE_ABSENT # define _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE(SIZE) sycl::reqd_sub_group_size(SIZE) -#else +#elif _ONEDPL_LIBSYCL_VERSION < 50300 # define _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE(SIZE) intel::reqd_sub_group_size(SIZE) #endif @@ -104,25 +128,17 @@ # define _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE_IF_SUPPORTED(SIZE) #endif -// The unified future supporting USM memory and buffers is only supported after DPCPP 2023.1 -// but not by 2023.2. -#if _ONEDPL_GENERIC_SYCL_LIBRARY || (_ONEDPL_LIBSYCL_VERSION >= 50300 && _ONEDPL_LIBSYCL_VERSION != 60200) -# define _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT 1 -#else -# define _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT 0 -#endif - namespace __dpl_sycl { using __no_init = -#if _ONEDPL_SYCL2020_NO_INIT_PRESENT +#if !_ONEDPL_SYCL2020_NO_INIT_ABSENT sycl::property::no_init; -#else +#elif _ONEDPL_LIBSYCL_VERSION < 50300 sycl::property::noinit; #endif -#if _ONEDPL_SYCL2020_KNOWN_IDENTITY_PRESENT +#if !_ONEDPL_SYCL2020_KNOWN_IDENTITY_ABSENT template using __known_identity = sycl::known_identity<_BinaryOp, _T>; @@ -135,12 +151,12 @@ using __known_identity = sycl::ONEAPI::known_identity<_BinaryOp, _T>; template using __has_known_identity = sycl::ONEAPI::has_known_identity<_BinaryOp, _T>; -#endif // _ONEDPL_SYCL2020_KNOWN_IDENTITY_PRESENT +#endif // !_ONEDPL_SYCL2020_KNOWN_IDENTITY_ABSENT template inline constexpr auto __known_identity_v = __known_identity<_BinaryOp, _T>::value; -#if _ONEDPL_SYCL2020_FUNCTIONAL_OBJECTS_PRESENT +#if !_ONEDPL_SYCL2020_FUNCTIONAL_OBJECTS_ABSENT template using __plus = sycl::plus<_T>; @@ -149,7 +165,7 @@ using __maximum = sycl::maximum<_T>; template using __minimum = sycl::minimum<_T>; -#else // _ONEDPL_SYCL2020_FUNCTIONAL_OBJECTS_PRESENT +#elif _ONEDPL_LIBSYCL_VERSION < 50300 template using __plus = sycl::ONEAPI::plus<_T>; @@ -158,11 +174,11 @@ using __maximum = sycl::ONEAPI::maximum<_T>; template using __minimum = sycl::ONEAPI::minimum<_T>; -#endif // _ONEDPL_SYCL2020_FUNCTIONAL_OBJECTS_PRESENT +#endif // !_ONEDPL_SYCL2020_FUNCTIONAL_OBJECTS_ABSENT -#if _ONEDPL_SYCL2020_SUB_GROUP_PRESENT +#if !_ONEDPL_SYCL2020_SUB_GROUP_ABSENT using __sub_group = sycl::sub_group; -#else +#elif _ONEDPL_LIBSYCL_VERSION < 50700 using __sub_group = sycl::ONEAPI::sub_group; #endif @@ -170,9 +186,9 @@ template constexpr auto __get_buffer_size(const _Buffer& __buffer) { -#if _ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 50300 +#if !_ONEDPL_SYCL2020_BUFFER_SIZE_ABSENT return __buffer.size(); -#else +#elif _ONEDPL_LIBSYCL_VERSION < 50300 return __buffer.get_count(); #endif } @@ -181,9 +197,9 @@ template constexpr auto __get_accessor_size(const _Accessor& __accessor) { -#if _ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 50300 +#if !_ONEDPL_SYCL2020_ACCESSOR_SIZE_ABSENT return __accessor.size(); -#else +#elif _ONEDPL_LIBSYCL_VERSION < 50300 return __accessor.get_count(); #endif } @@ -192,7 +208,7 @@ template constexpr void __group_barrier(_Item __item) { -#if 0 //_ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 50300 +#if 0 //_ONEDPL_LIBSYCL_VERSION == 0 || _ONEDPL_LIBSYCL_VERSION >= 50300 //TODO: usage of sycl::group_barrier: probably, we have to revise SYCL parallel patterns which use a group_barrier. // 1) sycl::group_barrier() implementation is not ready // 2) sycl::group_barrier and sycl::item::group_barrier are not quite equivalent @@ -206,9 +222,9 @@ template constexpr auto __group_broadcast(_Args... __args) { -#if _ONEDPL_SYCL2020_COLLECTIVES_PRESENT +#if !_ONEDPL_SYCL2020_COLLECTIVES_ABSENT return sycl::group_broadcast(__args...); -#else +#elif _ONEDPL_LIBSYCL_COLLECTIVES_PRESENT return sycl::ONEAPI::broadcast(__args...); #endif } @@ -217,9 +233,9 @@ template constexpr auto __exclusive_scan_over_group(_Args... __args) { -#if _ONEDPL_SYCL2020_COLLECTIVES_PRESENT +#if !_ONEDPL_SYCL2020_COLLECTIVES_ABSENT return sycl::exclusive_scan_over_group(__args...); -#else +#elif _ONEDPL_LIBSYCL_COLLECTIVES_PRESENT return sycl::ONEAPI::exclusive_scan(__args...); #endif } @@ -228,9 +244,9 @@ template constexpr auto __inclusive_scan_over_group(_Args... __args) { -#if _ONEDPL_SYCL2020_COLLECTIVES_PRESENT +#if !_ONEDPL_SYCL2020_COLLECTIVES_ABSENT return sycl::inclusive_scan_over_group(__args...); -#else +#elif _ONEDPL_LIBSYCL_COLLECTIVES_PRESENT return sycl::ONEAPI::inclusive_scan(__args...); #endif } @@ -239,9 +255,9 @@ template constexpr auto __reduce_over_group(_Args... __args) { -#if _ONEDPL_SYCL2020_COLLECTIVES_PRESENT +#if !_ONEDPL_SYCL2020_COLLECTIVES_ABSENT return sycl::reduce_over_group(__args...); -#else +#elif _ONEDPL_LIBSYCL_COLLECTIVES_PRESENT return sycl::ONEAPI::reduce(__args...); #endif } @@ -250,9 +266,9 @@ template constexpr auto __any_of_group(_Args&&... __args) { -#if _ONEDPL_SYCL2020_COLLECTIVES_PRESENT +#if !_ONEDPL_SYCL2020_COLLECTIVES_ABSENT return sycl::any_of_group(::std::forward<_Args>(__args)...); -#else +#elif _ONEDPL_LIBSYCL_COLLECTIVES_PRESENT return sycl::ONEAPI::any_of(::std::forward<_Args>(__args)...); #endif } @@ -261,9 +277,9 @@ template constexpr auto __all_of_group(_Args&&... __args) { -#if _ONEDPL_SYCL2020_COLLECTIVES_PRESENT +#if !_ONEDPL_SYCL2020_COLLECTIVES_ABSENT return sycl::all_of_group(::std::forward<_Args>(__args)...); -#else +#elif _ONEDPL_LIBSYCL_COLLECTIVES_PRESENT return sycl::ONEAPI::all_of(::std::forward<_Args>(__args)...); #endif } @@ -272,9 +288,9 @@ template constexpr auto __none_of_group(_Args&&... __args) { -#if _ONEDPL_SYCL2020_COLLECTIVES_PRESENT +#if !_ONEDPL_SYCL2020_COLLECTIVES_ABSENT return sycl::none_of_group(::std::forward<_Args>(__args)...); -#else +#elif _ONEDPL_LIBSYCL_COLLECTIVES_PRESENT return sycl::ONEAPI::none_of(::std::forward<_Args>(__args)...); #endif } @@ -283,9 +299,9 @@ template constexpr auto __joint_exclusive_scan(_Args&&... __args) { -#if _ONEDPL_SYCL2020_COLLECTIVES_PRESENT +#if !_ONEDPL_SYCL2020_COLLECTIVES_ABSENT return sycl::joint_exclusive_scan(::std::forward<_Args>(__args)...); -#else +#elif _ONEDPL_LIBSYCL_COLLECTIVES_PRESENT return sycl::ONEAPI::exclusive_scan(::std::forward<_Args>(__args)...); #endif } @@ -294,9 +310,9 @@ template constexpr auto __joint_inclusive_scan(_Args&&... __args) { -#if _ONEDPL_SYCL2020_COLLECTIVES_PRESENT +#if !_ONEDPL_SYCL2020_COLLECTIVES_ABSENT return sycl::joint_inclusive_scan(::std::forward<_Args>(__args)...); -#else +#elif _ONEDPL_LIBSYCL_COLLECTIVES_PRESENT return sycl::ONEAPI::inclusive_scan(::std::forward<_Args>(__args)...); #endif } @@ -305,9 +321,9 @@ template constexpr auto __joint_reduce(_Args&&... __args) { -#if _ONEDPL_SYCL2020_COLLECTIVES_PRESENT +#if !_ONEDPL_SYCL2020_COLLECTIVES_ABSENT return sycl::joint_reduce(::std::forward<_Args>(__args)...); -#else +#elif _ONEDPL_LIBSYCL_COLLECTIVES_PRESENT return sycl::ONEAPI::reduce(::std::forward<_Args>(__args)...); #endif } @@ -316,9 +332,9 @@ template constexpr auto __joint_any_of(_Args&&... __args) { -#if _ONEDPL_SYCL2020_COLLECTIVES_PRESENT +#if !_ONEDPL_SYCL2020_COLLECTIVES_ABSENT return sycl::joint_any_of(::std::forward<_Args>(__args)...); -#else +#elif _ONEDPL_LIBSYCL_COLLECTIVES_PRESENT return sycl::ONEAPI::any_of(::std::forward<_Args>(__args)...); #endif } @@ -327,9 +343,9 @@ template constexpr auto __joint_all_of(_Args&&... __args) { -#if _ONEDPL_SYCL2020_COLLECTIVES_PRESENT +#if !_ONEDPL_SYCL2020_COLLECTIVES_ABSENT return sycl::joint_all_of(::std::forward<_Args>(__args)...); -#else +#elif _ONEDPL_LIBSYCL_COLLECTIVES_PRESENT return sycl::ONEAPI::all_of(::std::forward<_Args>(__args)...); #endif } @@ -338,9 +354,9 @@ template constexpr auto __joint_none_of(_Args&&... __args) { -#if _ONEDPL_SYCL2020_COLLECTIVES_PRESENT +#if !_ONEDPL_SYCL2020_COLLECTIVES_ABSENT return sycl::joint_none_of(::std::forward<_Args>(__args)...); -#else +#elif _ONEDPL_LIBSYCL_COLLECTIVES_PRESENT return sycl::ONEAPI::none_of(::std::forward<_Args>(__args)...); #endif } @@ -378,38 +394,38 @@ inline auto __fpga_selector() #endif // _ONEDPL_FPGA_DEVICE using __target = -#if _ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 50400 +#if !_ONEDPL_SYCL2020_TARGET_ABSENT sycl::target; -#else +#elif _ONEDPL_LIBSYCL_VERSION < 50400 sycl::access::target; #endif constexpr __target __target_device = -#if _ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 50400 +#if !_ONEDPL_SYCL2020_TARGET_DEVICE_ABSENT __target::device; -#else +#elif _ONEDPL_LIBSYCL_VERSION < 50400 __target::global_buffer; #endif constexpr __target __host_target = -#if _ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 60200 +#if !_ONEDPL_SYCL2020_HOST_TARGET_ABSENT __target::host_task; -#else +#elif _ONEDPL_LIBSYCL_VERSION < 60200 __target::host_buffer; #endif template using __buffer_allocator = -#if _ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 60000 +#if !_ONEDPL_SYCL2020_BUFFER_ALLOCATOR_ABSENT sycl::buffer_allocator<_DataT>; -#else +#elif _ONEDPL_LIBSYCL_VERSION < 60000 sycl::buffer_allocator; #endif template -#if _ONEDPL_SYCL2020_ATOMIC_REF_PRESENT +#if !_ONEDPL_SYCL2020_ATOMIC_REF_ABSENT using __atomic_ref = sycl::atomic_ref<_AtomicType, sycl::memory_order::relaxed, sycl::memory_scope::work_group, _Space>; -#else +#elif _ONEDPL_LIBSYCL_VERSION < 50500 struct __atomic_ref : sycl::atomic<_AtomicType, _Space> { explicit __atomic_ref(_AtomicType& ref) @@ -419,9 +435,9 @@ struct __atomic_ref : sycl::atomic<_AtomicType, _Space> template using __local_accessor = -#if _ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 60000 +#if !_ONEDPL_SYCL2020_LOCAL_ACCESSOR_ABSENT sycl::local_accessor<_DataT, _Dimensions>; -#else +#elif _ONEDPL_LIBSYCL_VERSION < 60000 sycl::accessor<_DataT, _Dimensions, sycl::access::mode::read_write, __dpl_sycl::__target::local>; #endif @@ -429,9 +445,9 @@ template auto __get_host_access(_Buf&& __buf) { -#if _ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 60200 +#if !_ONEDPL_SYCL2020_GET_HOST_ACCESS_ABSENT return ::std::forward<_Buf>(__buf).get_host_access(sycl::read_only); -#else +#elif _ONEDPL_LIBSYCL_VERSION < 60200 return ::std::forward<_Buf>(__buf).template get_access(); #endif } @@ -440,9 +456,9 @@ template auto __get_accessor_ptr(const _Acc& __acc) { -#if _ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 70000 +#if !_ONEDPL_SYCL2020_LOCAL_ACC_GET_MULTI_PTR_ABSENT return __acc.template get_multi_ptr().get(); -#else +#elif _ONEDPL_LIBSYCL_VERSION < 70000 return __acc.get_pointer(); #endif } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index bd2e9e418fd..e67cf11d7d3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -24,7 +24,7 @@ #include "sycl_defs.h" #define _ONEDPL_SYCL_KNOWN_IDENTITY_PRESENT \ - (_ONEDPL_SYCL2020_KNOWN_IDENTITY_PRESENT || _ONEDPL_LIBSYCL_KNOWN_IDENTITY_PRESENT) + (!_ONEDPL_SYCL2020_KNOWN_IDENTITY_ABSENT || _ONEDPL_LIBSYCL_KNOWN_IDENTITY_PRESENT) namespace oneapi { diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h index aba7dacd191..80e282493db 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h @@ -129,7 +129,7 @@ struct all_view_fn } }; -#if _ONEDPL_SYCL2020_PLACEHOLDER_HOST_ACCESSOR_DEPRECATED +#if !_ONEDPL_SYCL2020_HOST_ACCESSOR_ABSENT struct all_host_view_fn { // An overload for sycl::buffer template type @@ -163,9 +163,9 @@ inline constexpr all_view_fn all_write; -#if _ONEDPL_SYCL2020_PLACEHOLDER_HOST_ACCESSOR_DEPRECATED +#if !_ONEDPL_SYCL2020_HOST_ACCESSOR_ABSENT inline constexpr all_host_view_fn -#else +#elif _ONEDPL_LIBSYCL_VERSION < 50700 inline constexpr all_view_fn #endif diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index 45e4d145a02..e377ee01193 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -505,7 +505,7 @@ __dpl_bit_cast(const _Src& __src) noexcept { #if __cpp_lib_bit_cast >= 201806L return ::std::bit_cast<_Dst>(__src); -#elif _ONEDPL_BACKEND_SYCL && (_ONEDPL_GENERIC_SYCL_LIBRARY || _ONEDPL_LIBSYCL_VERSION >= 50300) +#elif _ONEDPL_BACKEND_SYCL && (_ONEDPL_LIBSYCL_VERSION == 0 || _ONEDPL_LIBSYCL_VERSION >= 50300) return sycl::bit_cast<_Dst>(__src); #elif __has_builtin(__builtin_bit_cast) return __builtin_bit_cast(_Dst, __src);