diff --git a/benchmark/utils/dpcpp_timer.dp.cpp b/benchmark/utils/dpcpp_timer.dp.cpp index ddd5fe7a698..1aa12f9287d 100644 --- a/benchmark/utils/dpcpp_timer.dp.cpp +++ b/benchmark/utils/dpcpp_timer.dp.cpp @@ -74,30 +74,29 @@ class SyclTimer : public Timer { { exec_->synchronize(); // Currently, gko::SyclExecutor always use default stream. - start_ = exec_->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(1, [=](sycl::id<1> id) {}); + start_ = exec_->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(1, [=](::sycl::id<1> id) {}); }); } double toc_impl() override { - auto stop = exec_->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(1, [=](sycl::id<1> id) {}); + auto stop = exec_->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(1, [=](::sycl::id<1> id) {}); }); stop.wait_and_throw(); // get the start time of stop auto stop_time = stop.get_profiling_info< - sycl::info::event_profiling::command_start>(); + ::sycl::info::event_profiling::command_start>(); // get the end time of start - auto start_time = - start_ - .get_profiling_info(); + auto start_time = start_.get_profiling_info< + ::sycl::info::event_profiling::command_end>(); return (stop_time - start_time) / double{1.0e9}; } private: std::shared_ptr exec_; - sycl::event start_; + ::sycl::event start_; int id_; }; diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp index 465c5cf2215..9f67d80901b 100644 --- a/core/device_hooks/dpcpp_hooks.cpp +++ b/core/device_hooks/dpcpp_hooks.cpp @@ -149,7 +149,7 @@ namespace kernels { namespace sycl { -void destroy_event(sycl::event* event) GKO_NOT_COMPILED(sycl); +void destroy_event(::sycl::event* event) GKO_NOT_COMPILED(sycl); } // namespace sycl diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp index 42c6263651d..93683f228ed 100644 --- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp +++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp @@ -79,16 +79,16 @@ void scale(std::shared_ptr exec, const auto num_batches = x_ub.num_batch_items; auto device = exec->get_queue()->get_device(); auto group_size = - device.get_info(); + device.get_info<::sycl::info::device::max_work_group_size>(); const dim3 block(group_size); const dim3 grid(num_batches); // Launch a kernel that has nbatches blocks, each block has max group size if (alpha->get_common_size()[1] == 1) { - exec->get_queue()->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); const auto alpha_b = @@ -99,9 +99,9 @@ void scale(std::shared_ptr exec, }); }); } else { - exec->get_queue()->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); const auto alpha_b = @@ -130,7 +130,7 @@ void add_scaled(std::shared_ptr exec, const auto num_batches = x->get_num_batch_items(); auto device = exec->get_queue()->get_device(); auto group_size = - device.get_info(); + device.get_info<::sycl::info::device::max_work_group_size>(); const dim3 block(group_size); const dim3 grid(num_batches); @@ -138,9 +138,9 @@ 
void add_scaled(std::shared_ptr exec, const auto x_ub = get_batch_struct(x); const auto y_ub = get_batch_struct(y); if (alpha->get_common_size()[1] == 1) { - exec->get_queue()->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); const auto alpha_b = @@ -152,9 +152,9 @@ void add_scaled(std::shared_ptr exec, }); }); } else { - exec->get_queue()->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); const auto alpha_b = @@ -185,16 +185,16 @@ void compute_dot(std::shared_ptr exec, const auto num_batches = x_ub.num_batch_items; auto device = exec->get_queue()->get_device(); auto group_size = - device.get_info(); + device.get_info<::sycl::info::device::max_work_group_size>(); const dim3 block(group_size); const dim3 grid(num_batches); // TODO: Remove reqd_sub_group size and use sycl::reduce_over_group - exec->get_queue()->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { cgh.parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( config::warp_size)]] { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); @@ -224,15 +224,15 @@ void compute_conj_dot(std::shared_ptr exec, const auto num_batches = x_ub.num_batch_items; auto device = exec->get_queue()->get_device(); auto group_size = - device.get_info(); + device.get_info<::sycl::info::device::max_work_group_size>(); const dim3 block(group_size); const dim3 grid(num_batches); - exec->get_queue()->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { cgh.parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(config::warp_size)]] { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); @@ -262,14 +262,14 @@ void compute_norm2(std::shared_ptr exec, const auto num_batches = x_ub.num_batch_items; auto device = exec->get_queue()->get_device(); auto group_size = - device.get_info(); + device.get_info<::sycl::info::device::max_work_group_size>(); const dim3 block(group_size); const dim3 grid(num_batches); - exec->get_queue()->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(config::warp_size)]] { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); @@ -297,14 +297,14 @@ void copy(std::shared_ptr exec, const auto num_batches = x_ub.num_batch_items; auto device = exec->get_queue()->get_device(); auto group_size = - device.get_info(); + device.get_info<::sycl::info::device::max_work_group_size>(); const dim3 block(group_size); const dim3 grid(num_batches); - exec->get_queue()->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), 
[=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { auto group = item_ct1.get_group(); auto group_id = group.get_group_linear_id(); const auto x_b = batch::extract_batch_item(x_ub, group_id); diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp.inc b/dpcpp/base/batch_multi_vector_kernels.hpp.inc index 75e1981d659..4445acb6e96 100644 --- a/dpcpp/base/batch_multi_vector_kernels.hpp.inc +++ b/dpcpp/base/batch_multi_vector_kernels.hpp.inc @@ -34,7 +34,7 @@ template __dpct_inline__ void scale_kernel( const gko::batch::multi_vector::batch_item& alpha, const gko::batch::multi_vector::batch_item& x, - sycl::nd_item<3>& item_ct1, Mapping map) + ::sycl::nd_item<3>& item_ct1, Mapping map) { const int max_li = x.num_rows * x.num_rhs; for (int li = item_ct1.get_local_linear_id(); li < max_li; @@ -53,7 +53,7 @@ __dpct_inline__ void add_scaled_kernel( const gko::batch::multi_vector::batch_item& alpha, const gko::batch::multi_vector::batch_item& x, const gko::batch::multi_vector::batch_item& y, - sycl::nd_item<3>& item_ct1, Mapping map) + ::sycl::nd_item<3>& item_ct1, Mapping map) { const int max_li = x.num_rows * x.num_rhs; for (int li = item_ct1.get_local_id(2); li < max_li; @@ -72,12 +72,12 @@ __dpct_inline__ void compute_gen_dot_product_kernel( const gko::batch::multi_vector::batch_item& x, const gko::batch::multi_vector::batch_item& y, const gko::batch::multi_vector::batch_item& result, - sycl::nd_item<3>& item_ct1, Mapping conj_map) + ::sycl::nd_item<3>& item_ct1, Mapping conj_map) { constexpr auto tile_size = config::warp_size; auto subg = group::tiled_partition(group::this_thread_block(item_ct1)); - const auto subgroup = static_cast(subg); + const auto subgroup = static_cast<::sycl::sub_group>(subg); const int subgroup_id = subgroup.get_group_id(); const int subgroup_size = subgroup.get_local_range().size(); const int num_subgroups = subgroup.get_group_range().size(); @@ -107,12 +107,12 @@ __dpct_inline__ void compute_norm2_kernel( const gko::batch::multi_vector::batch_item& x, const gko::batch::multi_vector::batch_item>& result, - sycl::nd_item<3>& item_ct1) + ::sycl::nd_item<3>& item_ct1) { constexpr auto tile_size = config::warp_size; auto subg = group::tiled_partition(group::this_thread_block(item_ct1)); - const auto subgroup = static_cast(subg); + const auto subgroup = static_cast<::sycl::sub_group>(subg); const int subgroup_id = subgroup.get_group_id(); const int subgroup_size = subgroup.get_local_range().size(); const int num_subgroups = subgroup.get_group_range().size(); @@ -140,7 +140,7 @@ template __dpct_inline__ void copy_kernel( const gko::batch::multi_vector::batch_item& in, const gko::batch::multi_vector::batch_item& out, - sycl::nd_item<3>& item_ct1) + ::sycl::nd_item<3>& item_ct1) { for (int iz = item_ct1.get_local_linear_id(); iz < in.num_rows * in.num_rhs; iz += item_ct1.get_local_range().size()) { diff --git a/dpcpp/base/device.hpp b/dpcpp/base/device.hpp index a603bf4a395..db847af323c 100644 --- a/dpcpp/base/device.hpp +++ b/dpcpp/base/device.hpp @@ -43,7 +43,7 @@ namespace sycl { /** calls delete on the given event. 
*/ -void destroy_event(sycl::event* event); +void destroy_event(::sycl::event* event); } // namespace sycl diff --git a/dpcpp/base/dim3.dp.hpp b/dpcpp/base/dim3.dp.hpp index 525fd8a3b90..8a8d93ecdfb 100644 --- a/dpcpp/base/dim3.dp.hpp +++ b/dpcpp/base/dim3.dp.hpp @@ -66,9 +66,9 @@ struct dim3 { * get_range returns the range for sycl with correct ordering (reverse of * cuda) * - * @return sycl::range<3> + * @return ::sycl::range<3> */ - sycl::range<3> get_range() { return sycl::range<3>(z, y, x); } + ::sycl::range<3> get_range() { return ::sycl::range<3>(z, y, x); } }; @@ -80,11 +80,11 @@ struct dim3 { * * @return sycl::nd_range<3> */ -inline sycl::nd_range<3> sycl_nd_range(dim3 grid, dim3 block) +inline ::sycl::nd_range<3> sycl_nd_range(dim3 grid, dim3 block) { auto local_range = block.get_range(); auto global_range = grid.get_range() * local_range; - return sycl::nd_range<3>(global_range, local_range); + return ::sycl::nd_range<3>(global_range, local_range); } diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp index ff29d9f1d54..6bf3e735ba0 100644 --- a/dpcpp/base/executor.dp.cpp +++ b/dpcpp/base/executor.dp.cpp @@ -53,12 +53,12 @@ namespace detail { const std::vector get_devices(std::string device_type) { - std::map device_type_map{ - {"accelerator", sycl::info::device_type::accelerator}, - {"all", sycl::info::device_type::all}, - {"cpu", sycl::info::device_type::cpu}, - {"host", sycl::info::device_type::host}, - {"gpu", sycl::info::device_type::gpu}}; + std::map device_type_map{ + {"accelerator", ::sycl::info::device_type::accelerator}, + {"all", ::sycl::info::device_type::all}, + {"cpu", ::sycl::info::device_type::cpu}, + {"host", ::sycl::info::device_type::host}, + {"gpu", ::sycl::info::device_type::gpu}}; std::for_each(device_type.begin(), device_type.end(), [](char& c) { c = std::tolower(c); }); return sycl::device::get_devices(device_type_map.at(device_type)); @@ -224,7 +224,7 @@ bool SyclExecutor::verify_memory_to(const SyclExecutor* dest_exec) const namespace detail { -void delete_queue(sycl::queue* queue) +void delete_queue(::sycl::queue* queue) { queue->wait(); delete queue; @@ -257,7 +257,7 @@ void SyclExecutor::set_device_property(sycl_queue_property property) if (!device.is_host()) { try { auto subgroup_sizes = - device.get_info(); + device.get_info<::sycl::info::device::sub_group_sizes>(); for (auto& i : subgroup_sizes) { this->get_exec_info().subgroup_sizes.push_back(i); } @@ -266,26 +266,26 @@ void SyclExecutor::set_device_property(sycl_queue_property property) } } this->get_exec_info().num_computing_units = static_cast( - device.get_info()); + device.get_info<::sycl::info::device::max_compute_units>()); const auto subgroup_sizes = this->get_exec_info().subgroup_sizes; if (subgroup_sizes.size()) { this->get_exec_info().max_subgroup_size = static_cast( *std::max_element(subgroup_sizes.begin(), subgroup_sizes.end())); } this->get_exec_info().max_workgroup_size = static_cast( - device.get_info()); + device.get_info<::sycl::info::device::max_work_group_size>()); // They change the max_work_item_size with template parameter Dimension after // major version 6 and adding the default = 3 is not in the same release. 
#if GINKGO_DPCPP_MAJOR_VERSION >= 6 auto max_workitem_sizes = - device.get_info>(); + device.get_info<::sycl::info::device::max_work_item_sizes<3>>(); #else auto max_workitem_sizes = - device.get_info(); + device.get_info<::sycl::info::device::max_work_item_sizes>(); #endif - // Get the max dimension of a sycl::id object + // Get the max dimension of a ::sycl::id object auto max_work_item_dimensions = - device.get_info(); + device.get_info<::sycl::info::device::max_work_item_dimensions>(); for (uint32 i = 0; i < max_work_item_dimensions; i++) { this->get_exec_info().max_workitem_sizes.push_back( max_workitem_sizes[i]); @@ -295,10 +295,10 @@ void SyclExecutor::set_device_property(sycl_queue_property property) if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) { #if GINKGO_DPCPP_MAJOR_VERSION >= 6 this->get_exec_info().num_pu_per_cu = device.get_info< - sycl::ext::intel::info::device::gpu_hw_threads_per_eu>(); + ::sycl::ext::intel::info::device::gpu_hw_threads_per_eu>(); #else this->get_exec_info().num_pu_per_cu = device.get_info< - sycl::info::device::ext_intel_gpu_hw_threads_per_eu>(); + ::sycl::info::device::ext_intel_gpu_hw_threads_per_eu>(); #endif } else { // To make the usage still valid. @@ -311,8 +311,10 @@ void SyclExecutor::set_device_property(sycl_queue_property property) // `wait()` would be needed after every call to a DPC++ function or kernel. // For example, without `in_order`, doing a copy, a kernel, and a copy, will // not necessarily happen in that order by default, which we need to avoid. - auto* queue = new sycl::queue{device, detail::get_property_list(property)}; - queue_ = std::move(queue_manager{queue, detail::delete_queue}); + auto* queue = + new ::sycl::queue{device, detail::get_property_list(property)}; + queue_ = + std::move(queue_manager<::sycl::queue>{queue, detail::delete_queue}); } @@ -320,7 +322,7 @@ namespace kernels { namespace sycl { -void destroy_event(sycl::event* event) { delete event; } +void destroy_event(::sycl::event* event) { delete event; } } // namespace sycl diff --git a/dpcpp/base/helper.dp.cpp b/dpcpp/base/helper.dp.cpp index e43fe3fd6b4..7ec4ce3da00 100644 --- a/dpcpp/base/helper.dp.cpp +++ b/dpcpp/base/helper.dp.cpp @@ -41,14 +41,14 @@ namespace kernels { namespace sycl { -bool validate(sycl::queue* queue, unsigned int workgroup_size, +bool validate(::sycl::queue* queue, unsigned int workgroup_size, unsigned int subgroup_size) { auto device = queue->get_device(); auto subgroup_size_list = - device.get_info(); + device.get_info<::sycl::info::device::sub_group_sizes>(); auto max_workgroup_size = - device.get_info(); + device.get_info<::sycl::info::device::max_work_group_size>(); bool allowed = false; for (auto& i : subgroup_size_list) { allowed |= (i == subgroup_size); diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index edc7695c296..830573db6c3 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -56,19 +56,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * @param name_ the name of the host function with config * @param kernel_ the kernel name */ -#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ - template \ - void name_(dim3 grid, dim3 block, gko::size_type, sycl::queue* queue, \ - InferredArgs... 
args) \ - { \ - queue->submit([&](sycl::handler& cgh) { \ - cgh.parallel_for( \ - sycl_nd_range(grid, block), \ - [=](sycl::nd_item<3> item_ct1) \ - [[sycl::reqd_sub_group_size(config::warp_size)]] { \ - kernel_(args..., item_ct1); \ - }); \ - }); \ +#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ + template \ + void name_(dim3 grid, dim3 block, gko::size_type, ::sycl::queue* queue, \ + InferredArgs... args) \ + { \ + queue->submit([&](::sycl::handler& cgh) { \ + cgh.parallel_for( \ + sycl_nd_range(grid, block), \ + [=](::sycl::nd_item<3> item_ct1) \ + [[sycl::reqd_sub_group_size(config::warp_size)]] { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ } @@ -83,40 +83,40 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ template \ inline void name_(dim3 grid, dim3 block, gko::size_type, \ - sycl::queue* queue, InferredArgs... args) \ + ::sycl::queue* queue, InferredArgs... args) \ { \ - queue->submit([&](sycl::handler& cgh) { \ + queue->submit([&](::sycl::handler& cgh) { \ if constexpr (DCFG_1D::decode<1>(encoded) > 1) { \ cgh.parallel_for(sycl_nd_range(grid, block), \ - [=](sycl::nd_item<3> item_ct1) \ + [=](::sycl::nd_item<3> item_ct1) \ [[sycl::reqd_sub_group_size( \ DCFG_1D::decode<1>(encoded))]] { \ kernel_(args..., item_ct1); \ }); \ } else { \ cgh.parallel_for(sycl_nd_range(grid, block), \ - [=](sycl::nd_item<3> item_ct1) { \ + [=](::sycl::nd_item<3> item_ct1) { \ kernel_(args..., item_ct1); \ }); \ } \ }); \ } -#define GKO_ENABLE_DEFAULT_HOST_CONFIG_TYPE(name_, kernel_) \ - template \ - inline void name_(dim3 grid, dim3 block, gko::size_type, \ - sycl::queue* queue, InferredArgs... args) \ - { \ - queue->submit([&](sycl::handler& cgh) { \ - cgh.parallel_for( \ - sycl_nd_range(grid, block), \ - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( \ - DeviceConfig:: \ - subgroup_size)]] __WG_BOUND__(DeviceConfig:: \ - block_size) { \ - kernel_(args..., item_ct1); \ - }); \ - }); \ +#define GKO_ENABLE_DEFAULT_HOST_CONFIG_TYPE(name_, kernel_) \ + template \ + inline void name_(dim3 grid, dim3 block, gko::size_type, \ + ::sycl::queue* queue, InferredArgs... args) \ + { \ + queue->submit([&](::sycl::handler& cgh) { \ + cgh.parallel_for( \ + sycl_nd_range(grid, block), \ + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( \ + DeviceConfig:: \ + subgroup_size)]] __WG_BOUND__(DeviceConfig:: \ + block_size) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ } /** @@ -133,7 +133,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_ENABLE_DEFAULT_CONFIG_CALL(name_, callable_, list_) \ template \ void name_(std::uint32_t desired_cfg, dim3 grid, dim3 block, \ - gko::size_type dynamic_shared_memory, sycl::queue* queue, \ + gko::size_type dynamic_shared_memory, ::sycl::queue* queue, \ InferredArgs... args) \ { \ callable_( \ @@ -195,7 +195,7 @@ namespace sycl { * * @return the given arguments are valid or not in given queue. 
*/ -bool validate(sycl::queue* queue, unsigned workgroup_size, +bool validate(::sycl::queue* queue, unsigned workgroup_size, unsigned subgroup_size); diff --git a/dpcpp/base/kernel_launch.dp.hpp b/dpcpp/base/kernel_launch.dp.hpp index 744f086dbc8..0141b38037d 100644 --- a/dpcpp/base/kernel_launch.dp.hpp +++ b/dpcpp/base/kernel_launch.dp.hpp @@ -51,11 +51,11 @@ namespace device_std = std; template -void generic_kernel_1d(sycl::handler& cgh, int64 size, KernelFunction fn, +void generic_kernel_1d(::sycl::handler& cgh, int64 size, KernelFunction fn, KernelArgs... args) { - cgh.parallel_for(sycl::range<1>{static_cast(size)}, - [=](sycl::id<1> idx_id) { + cgh.parallel_for(::sycl::range<1>{static_cast(size)}, + [=](::sycl::id<1> idx_id) { auto idx = static_cast(idx_id[0]); fn(idx, args...); }); @@ -63,11 +63,11 @@ void generic_kernel_1d(sycl::handler& cgh, int64 size, KernelFunction fn, template -void generic_kernel_2d(sycl::handler& cgh, int64 rows, int64 cols, +void generic_kernel_2d(::sycl::handler& cgh, int64 rows, int64 cols, KernelFunction fn, KernelArgs... args) { - cgh.parallel_for(sycl::range<1>{static_cast(rows * cols)}, - [=](sycl::id<1> idx) { + cgh.parallel_for(::sycl::range<1>{static_cast(rows * cols)}, + [=](::sycl::id<1> idx) { auto row = static_cast(idx[0]) / cols; auto col = static_cast(idx[0]) % cols; fn(row, col, args...); @@ -79,7 +79,7 @@ template void run_kernel(std::shared_ptr exec, KernelFunction fn, size_type size, KernelArgs&&... args) { - exec->get_queue()->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { generic_kernel_1d(cgh, static_cast(size), fn, map_to_device(args)...); }); @@ -89,7 +89,7 @@ template void run_kernel(std::shared_ptr exec, KernelFunction fn, dim<2> size, KernelArgs&&... 
args) { - exec->get_queue()->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { generic_kernel_2d(cgh, static_cast(size[0]), static_cast(size[1]), fn, map_to_device(args)...); diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index 713fc02a34f..db5f29d7ec5 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -60,7 +60,7 @@ static constexpr auto dcfg_1d_list_simple_reduction = dcfg_1d_list_t(); template -void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, +void generic_kernel_reduction_1d(::sycl::handler& cgh, int64 size, int64 num_workgroups, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* storage, @@ -69,15 +69,16 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, constexpr auto wg_size = DeviceConfig::block_size; constexpr auto sg_size = DeviceConfig::subgroup_size; constexpr auto num_partials = wg_size / sg_size; - sycl::accessor, 0, - sycl::access_mode::read_write, sycl::access::target::local> + ::sycl::accessor, 0, + ::sycl::access_mode::read_write, + ::sycl::access::target::local> subgroup_partial_acc(cgh); const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); const auto global_size = num_workgroups * wg_size; cgh.parallel_for( range, - [=](sycl::nd_item<3> idx) [[sycl::reqd_sub_group_size(sg_size)]] { + [=](::sycl::nd_item<3> idx) [[sycl::reqd_sub_group_size(sg_size)]] { auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0]; const auto tidx = thread::get_thread_id_flat(idx); const auto local_tidx = static_cast(tidx % wg_size); @@ -91,7 +92,7 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, if (subgroup.thread_rank() == 0) { subgroup_partial[local_tidx / sg_size] = partial; } - idx.barrier(sycl::access::fence_space::local_space); + idx.barrier(::sycl::access::fence_space::local_space); if (local_tidx < sg_size) { partial = identity; for (int64 i = local_tidx; i < num_partials; i += sg_size) { @@ -109,7 +110,7 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, template -void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, +void generic_kernel_reduction_2d(::sycl::handler& cgh, int64 rows, int64 cols, int64 num_workgroups, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* storage, @@ -118,15 +119,16 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, constexpr auto wg_size = DeviceConfig::block_size; constexpr auto sg_size = DeviceConfig::subgroup_size; constexpr auto num_partials = wg_size / sg_size; - sycl::accessor, 0, - sycl::access_mode::read_write, sycl::access::target::local> + ::sycl::accessor, 0, + ::sycl::access_mode::read_write, + ::sycl::access::target::local> subgroup_partial_acc(cgh); const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); const auto global_size = num_workgroups * wg_size; cgh.parallel_for( range, - [=](sycl::nd_item<3> idx) [[sycl::reqd_sub_group_size(sg_size)]] { + [=](::sycl::nd_item<3> idx) [[sycl::reqd_sub_group_size(sg_size)]] { auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0]; const auto tidx = thread::get_thread_id_flat(idx); const auto local_tidx = static_cast(tidx % wg_size); @@ -142,7 +144,7 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, if (subgroup.thread_rank() == 0) { subgroup_partial[local_tidx / sg_size] = partial; } - 
idx.barrier(sycl::access::fence_space::local_space); + idx.barrier(::sycl::access::fence_space::local_space); if (local_tidx < sg_size) { partial = identity; for (int64 i = local_tidx; i < num_partials; i += sg_size) { @@ -178,13 +180,13 @@ void run_kernel_reduction_impl(std::shared_ptr exec, if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { generic_kernel_reduction_1d( cgh, static_cast(size), num_workgroups, fn, op, [](auto v) { return v; }, identity, reinterpret_cast(tmp.get_data()), args...); }); - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { generic_kernel_reduction_1d( cgh, static_cast(num_workgroups), 1, [](auto i, auto v) { return v[i]; }, op, finalize, identity, @@ -192,7 +194,7 @@ void run_kernel_reduction_impl(std::shared_ptr exec, reinterpret_cast(tmp.get_const_data())); }); } else { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { generic_kernel_reduction_1d( cgh, static_cast(size), 1, fn, op, finalize, identity, result, args...); @@ -225,13 +227,13 @@ void run_kernel_reduction_impl(std::shared_ptr exec, if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { generic_kernel_reduction_2d( cgh, rows, cols, num_workgroups, fn, op, [](auto v) { return v; }, identity, reinterpret_cast(tmp.get_data()), args...); }); - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { generic_kernel_reduction_1d( cgh, static_cast(num_workgroups), 1, [](auto i, auto v) { return v[i]; }, op, finalize, identity, @@ -239,7 +241,7 @@ void run_kernel_reduction_impl(std::shared_ptr exec, reinterpret_cast(tmp.get_const_data())); }); } else { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { generic_kernel_reduction_2d(cgh, rows, cols, 1, fn, op, finalize, identity, result, args...); @@ -315,10 +317,10 @@ void generic_kernel_row_reduction_2d(syn::value_list, static_assert(ssg_size <= sg_size, "ssg must be smaller than sg"); const auto num_workgroups = ceildiv(rows * col_blocks * ssg_size, wg_size); const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); - exec->get_queue()->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { cgh.parallel_for( range, - [=](sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] { + [=](::sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] { const auto idx = thread::get_subwarp_id_flat(id); const auto row = idx % rows; @@ -359,7 +361,7 @@ template void generic_kernel_col_reduction_2d_small( - sycl::handler& cgh, int64 rows, int64 cols, int64 row_blocks, + ::sycl::handler& cgh, int64 rows, int64 cols, int64 row_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, MappedKernelArgs... 
args) { @@ -369,12 +371,14 @@ void generic_kernel_col_reduction_2d_small( constexpr auto subgroups_per_workgroup = wg_size / sg_size; // stores the subwarp_size partial sums from each warp, grouped by warp constexpr auto shared_storage = subgroups_per_workgroup * ssg_size; - sycl::accessor, 0, - sycl::access_mode::read_write, sycl::access::target::local> + ::sycl::accessor, 0, + ::sycl::access_mode::read_write, + ::sycl::access::target::local> block_partial_acc(cgh); const auto range = sycl_nd_range(dim3(row_blocks), dim3(wg_size)); cgh.parallel_for( - range, [=](sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] { + range, + [=](::sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] { auto block_partial = &(*block_partial_acc.get_pointer())[0]; const auto ssg_id = thread::get_subwarp_id_flat(id); @@ -434,7 +438,7 @@ template void generic_kernel_col_reduction_2d_blocked( - sycl::handler& cgh, int64 rows, int64 cols, int64 row_blocks, + ::sycl::handler& cgh, int64 rows, int64 cols, int64 row_blocks, int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, MappedKernelArgs... args) { @@ -442,11 +446,13 @@ void generic_kernel_col_reduction_2d_blocked( constexpr auto sg_size = cfg::subgroup_size; const auto range = sycl_nd_range(dim3(row_blocks, col_blocks), dim3(wg_size)); - sycl::accessor, 0, - sycl::access_mode::read_write, sycl::access::target::local> + ::sycl::accessor, 0, + ::sycl::access_mode::read_write, + ::sycl::access::target::local> block_partial_acc(cgh); cgh.parallel_for( - range, [=](sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] { + range, + [=](::sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] { const auto sg_id = thread::get_subwarp_id_flat(id); const auto sg_num = thread::get_subwarp_num_flat(id); @@ -483,12 +489,12 @@ void generic_kernel_col_reduction_2d_blocked( template void generic_kernel_reduction_finalize_2d( - sycl::handler& cgh, int64 num_results, int64 num_blocks, ReductionOp op, + ::sycl::handler& cgh, int64 num_results, int64 num_blocks, ReductionOp op, FinalizeOp finalize, ValueType identity, const ValueType* input, int64 result_stride, ValueType* result) { - cgh.parallel_for(sycl::range<1>{static_cast(num_results)}, - [=](sycl::id<1> id) { + cgh.parallel_for(::sycl::range<1>{static_cast(num_results)}, + [=](::sycl::id<1> id) { auto partial = identity; for (int64 block = 0; block < num_blocks; block++) { partial = op(partial, @@ -519,7 +525,7 @@ void run_generic_col_reduction_small(syn::value_list, std::min(ceildiv(rows * ssg_size, wg_size), max_workgroups); auto queue = exec->get_queue(); if (row_blocks <= 1) { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { generic_kernel_col_reduction_2d_small( cgh, rows, cols, 1, fn, op, finalize, identity, result, args...); @@ -529,13 +535,13 @@ void run_generic_col_reduction_small(syn::value_list, if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { generic_kernel_col_reduction_2d_small( cgh, rows, cols, row_blocks, fn, op, [](auto v) { return v; }, identity, reinterpret_cast(tmp.get_data()), args...); }); - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( cgh, cols, row_blocks, op, finalize, identity, reinterpret_cast(tmp.get_const_data()), 1, @@ -579,7 +585,7 @@ void 
run_kernel_row_reduction_stage1(std::shared_ptr exec, syn::value_list{}, exec, rows, cols, col_blocks, fn, op, [](auto v) { return v; }, identity, reinterpret_cast(tmp.get_data()), 1, args...); - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( cgh, rows, col_blocks, op, finalize, identity, reinterpret_cast(tmp.get_const_data()), @@ -638,7 +644,7 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, col_blocks); auto queue = exec->get_queue(); if (row_blocks <= 1) { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { generic_kernel_col_reduction_2d_blocked( cgh, rows, cols, 1, col_blocks, fn, op, finalize, identity, result, args...); @@ -648,13 +654,13 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { generic_kernel_col_reduction_2d_blocked( cgh, rows, cols, row_blocks, col_blocks, fn, op, [](auto v) { return v; }, identity, reinterpret_cast(tmp.get_data()), args...); }); - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( cgh, cols, row_blocks, op, finalize, identity, reinterpret_cast(tmp.get_const_data()), 1, diff --git a/dpcpp/base/kernel_launch_solver.dp.hpp b/dpcpp/base/kernel_launch_solver.dp.hpp index 2c41fd5c3e6..088ac90ae7a 100644 --- a/dpcpp/base/kernel_launch_solver.dp.hpp +++ b/dpcpp/base/kernel_launch_solver.dp.hpp @@ -42,12 +42,12 @@ namespace sycl { template -void generic_kernel_2d_solver(sycl::handler& cgh, int64 rows, int64 cols, +void generic_kernel_2d_solver(::sycl::handler& cgh, int64 rows, int64 cols, int64 default_stride, KernelFunction fn, KernelArgs... args) { - cgh.parallel_for(sycl::range<1>{static_cast(rows * cols)}, - [=](sycl::id<1> idx) { + cgh.parallel_for(::sycl::range<1>{static_cast(rows * cols)}, + [=](::sycl::id<1> idx) { auto row = static_cast(idx[0] / cols); auto col = static_cast(idx[0] % cols); fn(row, col, @@ -62,7 +62,7 @@ void run_kernel_solver(std::shared_ptr exec, KernelFunction fn, dim<2> size, size_type default_stride, KernelArgs&&... 
args) { - exec->get_queue()->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { kernels::sycl::generic_kernel_2d_solver( cgh, static_cast(size[0]), static_cast(size[1]), static_cast(default_stride), fn, diff --git a/dpcpp/base/onemkl_bindings.hpp b/dpcpp/base/onemkl_bindings.hpp index 0d0e9e70827..50ecc6f1768 100644 --- a/dpcpp/base/onemkl_bindings.hpp +++ b/dpcpp/base/onemkl_bindings.hpp @@ -95,7 +95,7 @@ struct is_supported> : std::true_type {}; #define GKO_BIND_DOT(ValueType, Name, Func) \ - inline void Name(sycl::queue& exec_queue, std::int64_t n, \ + inline void Name(::sycl::queue& exec_queue, std::int64_t n, \ const ValueType* x, std::int64_t incx, \ const ValueType* y, std::int64_t incy, ValueType* result) \ { \ diff --git a/dpcpp/base/timer.dp.cpp b/dpcpp/base/timer.dp.cpp index 874bb82dba2..96b2af24d43 100644 --- a/dpcpp/base/timer.dp.cpp +++ b/dpcpp/base/timer.dp.cpp @@ -56,7 +56,7 @@ SyclTimer::SyclTimer(std::shared_ptr exec) void SyclTimer::init_time_point(time_point& time) { time.type_ = time_point::type::sycl; - time.data_.sycl_event = new sycl::event{}; + time.data_.sycl_event = new ::sycl::event{}; } @@ -64,8 +64,8 @@ void SyclTimer::record(time_point& time) { GKO_ASSERT(time.type_ == time_point::type::sycl); *time.data_.sycl_event = - exec_->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(1, [=](sycl::id<1> id) {}); + exec_->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(1, [=](::sycl::id<1> id) {}); }); } @@ -83,12 +83,11 @@ std::chrono::nanoseconds SyclTimer::difference_async(const time_point& start, GKO_ASSERT(start.type_ == time_point::type::sycl); GKO_ASSERT(stop.type_ == time_point::type::sycl); stop.data_.sycl_event->wait_and_throw(); - auto stop_time = - stop.data_.sycl_event - ->get_profiling_info(); + auto stop_time = stop.data_.sycl_event->get_profiling_info< + ::sycl::info::event_profiling::command_start>(); auto start_time = start.data_.sycl_event - ->get_profiling_info(); + ->get_profiling_info<::sycl::info::event_profiling::command_end>(); return std::chrono::nanoseconds{static_cast(stop_time - start_time)}; } diff --git a/dpcpp/components/atomic.dp.hpp b/dpcpp/components/atomic.dp.hpp index 6fb793cf6a8..b7922875076 100644 --- a/dpcpp/components/atomic.dp.hpp +++ b/dpcpp/components/atomic.dp.hpp @@ -49,8 +49,8 @@ namespace sycl { namespace atomic { -constexpr auto local_space = sycl::access::address_space::local_space; -constexpr auto global_space = sycl::access::address_space::global_space; +constexpr auto local_space = ::sycl::access::address_space::local_space; +constexpr auto global_space = ::sycl::access::address_space::global_space; } // namespace atomic @@ -58,50 +58,52 @@ constexpr auto global_space = sycl::access::address_space::global_space; namespace { -template T atomic_compare_exchange_strong( - sycl::multi_ptr addr, T expected, T desired, - sycl::memory_order success = sycl::memory_order::relaxed, - sycl::memory_order fail = sycl::memory_order::relaxed) + ::sycl::multi_ptr addr, T expected, T desired, + ::sycl::memory_order success = ::sycl::memory_order::relaxed, + ::sycl::memory_order fail = ::sycl::memory_order::relaxed) { - sycl::atomic obj(addr); + ::sycl::atomic obj(addr); obj.compare_exchange_strong(expected, desired, success, fail); return expected; } -template T atomic_compare_exchange_strong( T* addr, T expected, T desired, - sycl::memory_order success = sycl::memory_order::relaxed, - sycl::memory_order fail = sycl::memory_order::relaxed) + ::sycl::memory_order 
success = ::sycl::memory_order::relaxed, + ::sycl::memory_order fail = ::sycl::memory_order::relaxed) { return atomic_compare_exchange_strong( - sycl::multi_ptr(addr), expected, desired, success, + ::sycl::multi_ptr(addr), expected, desired, success, fail); } -template inline T atomic_fetch_add( T* addr, T operand, - sycl::memory_order memoryOrder = sycl::memory_order::relaxed) + ::sycl::memory_order memoryOrder = ::sycl::memory_order::relaxed) { - sycl::atomic obj((sycl::multi_ptr(addr))); - return sycl::atomic_fetch_add(obj, operand, memoryOrder); + ::sycl::atomic obj( + (::sycl::multi_ptr(addr))); + return ::sycl::atomic_fetch_add(obj, operand, memoryOrder); } -template inline T atomic_fetch_max( T* addr, T operand, - sycl::memory_order memoryOrder = sycl::memory_order::relaxed) + ::sycl::memory_order memoryOrder = ::sycl::memory_order::relaxed) { - sycl::atomic obj((sycl::multi_ptr(addr))); - return sycl::atomic_fetch_max(obj, operand, memoryOrder); + ::sycl::atomic obj( + (::sycl::multi_ptr(addr))); + return ::sycl::atomic_fetch_max(obj, operand, memoryOrder); } @@ -111,7 +113,7 @@ inline T atomic_fetch_max( namespace detail { -template struct atomic_helper { __dpct_inline__ static ValueType atomic_add(ValueType*, ValueType) @@ -124,7 +126,7 @@ struct atomic_helper { }; -template struct atomic_max_helper { __dpct_inline__ static ValueType atomic_max(ValueType*, ValueType) @@ -147,27 +149,27 @@ __dpct_inline__ ResultType reinterpret(ValueType val) } -#define GKO_BIND_ATOMIC_HELPER_STRUCTURE(CONVERTER_TYPE) \ - template \ - struct atomic_helper< \ - addressSpace, ValueType, \ - std::enable_if_t<(sizeof(ValueType) == sizeof(CONVERTER_TYPE))>> { \ - __dpct_inline__ static ValueType atomic_add( \ - ValueType* __restrict__ addr, ValueType val) \ - { \ - CONVERTER_TYPE* address_as_converter = \ - reinterpret_cast(addr); \ - CONVERTER_TYPE old = *address_as_converter; \ - CONVERTER_TYPE assumed; \ - do { \ - assumed = old; \ - old = atomic_compare_exchange_strong( \ - address_as_converter, assumed, \ - reinterpret( \ - val + reinterpret(assumed))); \ - } while (assumed != old); \ - return reinterpret(old); \ - } \ +#define GKO_BIND_ATOMIC_HELPER_STRUCTURE(CONVERTER_TYPE) \ + template <::sycl::access::address_space addressSpace, typename ValueType> \ + struct atomic_helper< \ + addressSpace, ValueType, \ + std::enable_if_t<(sizeof(ValueType) == sizeof(CONVERTER_TYPE))>> { \ + __dpct_inline__ static ValueType atomic_add( \ + ValueType* __restrict__ addr, ValueType val) \ + { \ + CONVERTER_TYPE* address_as_converter = \ + reinterpret_cast(addr); \ + CONVERTER_TYPE old = *address_as_converter; \ + CONVERTER_TYPE assumed; \ + do { \ + assumed = old; \ + old = atomic_compare_exchange_strong( \ + address_as_converter, assumed, \ + reinterpret( \ + val + reinterpret(assumed))); \ + } while (assumed != old); \ + return reinterpret(old); \ + } \ }; // Support 64-bit ATOMIC_ADD @@ -179,7 +181,7 @@ GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int); #undef GKO_BIND_ATOMIC_HELPER_STRUCTURE #define GKO_BIND_ATOMIC_HELPER_VALUETYPE(ValueType) \ - template \ + template <::sycl::access::address_space addressSpace> \ struct atomic_helper> { \ __dpct_inline__ static ValueType atomic_add( \ ValueType* __restrict__ addr, ValueType val) \ @@ -195,7 +197,7 @@ GKO_BIND_ATOMIC_HELPER_VALUETYPE(unsigned long long int); #undef GKO_BIND_ATOMIC_HELPER_VALUETYPE -template +template <::sycl::access::address_space addressSpace, typename ValueType> struct atomic_helper< addressSpace, ValueType, std::enable_if_t() && 
sizeof(ValueType) >= 16>> { @@ -214,28 +216,28 @@ struct atomic_helper< }; -#define GKO_BIND_ATOMIC_MAX_STRUCTURE(CONVERTER_TYPE) \ - template \ - struct atomic_max_helper< \ - addressSpace, ValueType, \ - std::enable_if_t<(sizeof(ValueType) == sizeof(CONVERTER_TYPE))>> { \ - __dpct_inline__ static ValueType atomic_max( \ - ValueType* __restrict__ addr, ValueType val) \ - { \ - CONVERTER_TYPE* address_as_converter = \ - reinterpret_cast(addr); \ - CONVERTER_TYPE old = *address_as_converter; \ - CONVERTER_TYPE assumed; \ - do { \ - assumed = old; \ - if (reinterpret(assumed) < val) { \ - old = atomic_compare_exchange_strong( \ - address_as_converter, assumed, \ - reinterpret(val)); \ - } \ - } while (assumed != old); \ - return reinterpret(old); \ - } \ +#define GKO_BIND_ATOMIC_MAX_STRUCTURE(CONVERTER_TYPE) \ + template <::sycl::access::address_space addressSpace, typename ValueType> \ + struct atomic_max_helper< \ + addressSpace, ValueType, \ + std::enable_if_t<(sizeof(ValueType) == sizeof(CONVERTER_TYPE))>> { \ + __dpct_inline__ static ValueType atomic_max( \ + ValueType* __restrict__ addr, ValueType val) \ + { \ + CONVERTER_TYPE* address_as_converter = \ + reinterpret_cast(addr); \ + CONVERTER_TYPE old = *address_as_converter; \ + CONVERTER_TYPE assumed; \ + do { \ + assumed = old; \ + if (reinterpret(assumed) < val) { \ + old = atomic_compare_exchange_strong( \ + address_as_converter, assumed, \ + reinterpret(val)); \ + } \ + } while (assumed != old); \ + return reinterpret(old); \ + } \ }; // Support 64-bit ATOMIC_ADD @@ -247,7 +249,7 @@ GKO_BIND_ATOMIC_MAX_STRUCTURE(unsigned int); #undef GKO_BIND_ATOMIC_MAX_STRUCTURE #define GKO_BIND_ATOMIC_MAX_VALUETYPE(ValueType) \ - template \ + template <::sycl::access::address_space addressSpace> \ struct atomic_max_helper> { \ __dpct_inline__ static ValueType atomic_max( \ @@ -267,7 +269,7 @@ GKO_BIND_ATOMIC_MAX_VALUETYPE(unsigned long long int); } // namespace detail -template __dpct_inline__ T atomic_add(T* __restrict__ addr, T val) { @@ -275,7 +277,7 @@ __dpct_inline__ T atomic_add(T* __restrict__ addr, T val) } -template __dpct_inline__ T atomic_max(T* __restrict__ addr, T val) { diff --git a/dpcpp/components/cooperative_groups.dp.hpp b/dpcpp/components/cooperative_groups.dp.hpp index 435419bbdf4..0ab4ee5e55d 100644 --- a/dpcpp/components/cooperative_groups.dp.hpp +++ b/dpcpp/components/cooperative_groups.dp.hpp @@ -168,8 +168,8 @@ namespace detail { * This is a limited implementation of the SYCL thread_block_tile. */ template -class thread_block_tile : public sycl::sub_group { - using sub_group = sycl::sub_group; +class thread_block_tile : public ::sycl::sub_group { + using sub_group = ::sycl::sub_group; using id_type = sub_group::id_type; using mask_type = config::lane_mask_type; @@ -239,9 +239,9 @@ class thread_block_tile : public sycl::sub_group { __dpct_inline__ mask_type ballot(int predicate) const noexcept { // todo: change it when OneAPI update the mask related api - return sycl::reduce_over_group( + return ::sycl::reduce_over_group( *this, (predicate != 0) ? 
mask_type(1) << data_.rank : mask_type(0), - sycl::plus()); + ::sycl::plus()); } /** @@ -250,7 +250,7 @@ class thread_block_tile : public sycl::sub_group { */ __dpct_inline__ bool any(int predicate) const noexcept { - return sycl::any_of_group(*this, (predicate != 0)); + return ::sycl::any_of_group(*this, (predicate != 0)); } /** @@ -259,7 +259,7 @@ class thread_block_tile : public sycl::sub_group { */ __dpct_inline__ bool all(int predicate) const noexcept { - return sycl::all_of_group(*this, (predicate != 0)); + return ::sycl::all_of_group(*this, (predicate != 0)); } @@ -390,7 +390,7 @@ struct is_communicator_group_impl> : std::true_type {}; class thread_block { - friend __dpct_inline__ thread_block this_thread_block(sycl::nd_item<3>&); + friend __dpct_inline__ thread_block this_thread_block(::sycl::nd_item<3>&); public: __dpct_inline__ unsigned thread_rank() const noexcept { return data_.rank; } @@ -400,7 +400,7 @@ class thread_block { __dpct_inline__ void sync() const noexcept { group_.barrier(); } private: - __dpct_inline__ thread_block(sycl::nd_item<3>& group) + __dpct_inline__ thread_block(::sycl::nd_item<3>& group) : group_{group}, data_{static_cast(group.get_local_range().size()), static_cast(group.get_local_linear_id())} @@ -410,11 +410,11 @@ class thread_block { unsigned rank; } data_; - sycl::nd_item<3>& group_; + ::sycl::nd_item<3>& group_; }; -__dpct_inline__ thread_block this_thread_block(sycl::nd_item<3>& group) +__dpct_inline__ thread_block this_thread_block(::sycl::nd_item<3>& group) { return thread_block(group); } @@ -444,7 +444,7 @@ struct is_synchronizable_group_impl : std::true_type {}; * bit block) would have to be used to represent the full space of thread ranks. */ class grid_group { - friend __dpct_inline__ grid_group this_grid(sycl::nd_item<3>&); + friend __dpct_inline__ grid_group this_grid(::sycl::nd_item<3>&); public: __dpct_inline__ unsigned size() const noexcept { return data_.size; } @@ -452,7 +452,7 @@ class grid_group { __dpct_inline__ unsigned thread_rank() const noexcept { return data_.rank; } private: - __dpct_inline__ grid_group(sycl::nd_item<3>& group) + __dpct_inline__ grid_group(::sycl::nd_item<3>& group) : data_{static_cast(group.get_global_range().size()), static_cast(group.get_global_linear_id())} {} @@ -467,7 +467,7 @@ class grid_group { // grid_group this_grid() // using cooperative_groups::this_grid; // Instead, use our limited implementation: -__dpct_inline__ grid_group this_grid(sycl::nd_item<3>& group) +__dpct_inline__ grid_group this_grid(::sycl::nd_item<3>& group) { return grid_group(group); } diff --git a/dpcpp/components/diagonal_block_manipulation.dp.hpp b/dpcpp/components/diagonal_block_manipulation.dp.hpp index 900e669ee72..bd5f4cf5670 100644 --- a/dpcpp/components/diagonal_block_manipulation.dp.hpp +++ b/dpcpp/components/diagonal_block_manipulation.dp.hpp @@ -68,7 +68,7 @@ __dpct_inline__ void extract_transposed_diag_blocks( const ValueType* __restrict__ values, const IndexType* __restrict__ block_ptrs, size_type num_blocks, ValueType* __restrict__ block_row, int increment, - ValueType* __restrict__ workspace, sycl::nd_item<3> item_ct1) + ValueType* __restrict__ workspace, ::sycl::nd_item<3> item_ct1) { const int tid = item_ct1.get_local_id(1) * item_ct1.get_local_range().get(2) + diff --git a/dpcpp/components/intrinsics.dp.hpp b/dpcpp/components/intrinsics.dp.hpp index fd2778adcee..440713526aa 100644 --- a/dpcpp/components/intrinsics.dp.hpp +++ b/dpcpp/components/intrinsics.dp.hpp @@ -52,10 +52,10 @@ namespace sycl { * @internal * 
Returns the number of set bits in the given mask. */ -__dpct_inline__ int popcnt(uint32 mask) { return sycl::popcount(mask); } +__dpct_inline__ int popcnt(uint32 mask) { return ::sycl::popcount(mask); } /** @copydoc popcnt */ -__dpct_inline__ int popcnt(uint64 mask) { return sycl::popcount(mask); } +__dpct_inline__ int popcnt(uint64 mask) { return ::sycl::popcount(mask); } /** @@ -67,13 +67,13 @@ __dpct_inline__ int popcnt(uint64 mask) { return sycl::popcount(mask); } */ __dpct_inline__ int ffs(uint32 mask) { - return (mask == 0) ? 0 : (sycl::ext::intel::ctz(mask) + 1); + return (mask == 0) ? 0 : (::sycl::ext::intel::ctz(mask) + 1); } /** @copydoc ffs */ __dpct_inline__ int ffs(uint64 mask) { - return (mask == 0) ? 0 : (sycl::ext::intel::ctz(mask) + 1); + return (mask == 0) ? 0 : (::sycl::ext::intel::ctz(mask) + 1); } @@ -82,10 +82,10 @@ __dpct_inline__ int ffs(uint64 mask) * Returns the number of zero bits before the first set bit in the given mask, * starting from the most significant bit. */ -__dpct_inline__ int clz(uint32 mask) { return sycl::clz(mask); } +__dpct_inline__ int clz(uint32 mask) { return ::sycl::clz(mask); } /** @copydoc clz */ -__dpct_inline__ int clz(uint64 mask) { return sycl::clz(mask); } +__dpct_inline__ int clz(uint64 mask) { return ::sycl::clz(mask); } } // namespace sycl diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index 7d4cf21f22d..4b5b4c7b218 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -132,7 +132,7 @@ __dpct_inline__ void subwarp_prefix_sum(ValueType element, template void start_prefix_sum( size_type num_elements, ValueType* __restrict__ elements, - ValueType* __restrict__ block_sum, sycl::nd_item<3> item_ct1, + ValueType* __restrict__ block_sum, ::sycl::nd_item<3> item_ct1, uninitialized_array& prefix_helper) { const auto tidx = thread::get_thread_id_flat(item_ct1); @@ -185,17 +185,17 @@ void start_prefix_sum( template void start_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_elements, + ::sycl::queue* queue, size_type num_elements, ValueType* elements, ValueType* block_sum) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, - 0, sycl::access::mode::read_write, - sycl::access::target::local> + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor< + uninitialized_array, 0, + ::sycl::access::mode::read_write, ::sycl::access::target::local> prefix_helper_acc_ct1(cgh); cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { + [=](::sycl::nd_item<3> item_ct1) { start_prefix_sum( num_elements, elements, block_sum, item_ct1, *prefix_helper_acc_ct1.get_pointer()); @@ -222,7 +222,7 @@ template void finalize_prefix_sum(size_type num_elements, ValueType* __restrict__ elements, const ValueType* __restrict__ block_sum, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto tidx = thread::get_thread_id_flat(item_ct1); @@ -237,12 +237,12 @@ void finalize_prefix_sum(size_type num_elements, template void finalize_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_elements, + ::sycl::queue* queue, size_type num_elements, ValueType* elements, const ValueType* block_sum) { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { + [=](::sycl::nd_item<3> item_ct1) { finalize_prefix_sum( num_elements, elements, block_sum, 
item_ct1); }); diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index a0f4d3e8d6c..54847ff986e 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -173,7 +173,7 @@ void reduce(const Group& __restrict__ group, ValueType* __restrict__ data, template void reduce_array(size_type size, const ValueType* __restrict__ source, - ValueType* __restrict__ result, sycl::nd_item<3> item_ct1, + ValueType* __restrict__ result, ::sycl::nd_item<3> item_ct1, Operator reduce_op = Operator{}) { const auto tidx = thread::get_thread_id_flat(item_ct1); @@ -201,7 +201,7 @@ void reduce_array(size_type size, const ValueType* __restrict__ source, template void reduce_add_array( size_type size, const ValueType* __restrict__ source, - ValueType* __restrict__ result, sycl::nd_item<3> item_ct1, + ValueType* __restrict__ result, ::sycl::nd_item<3> item_ct1, uninitialized_array& block_sum) { reduce_array( @@ -215,18 +215,18 @@ void reduce_add_array( template , typename ValueType> void reduce_add_array(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type size, + ::sycl::queue* queue, size_type size, const ValueType* source, ValueType* result) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, - 0, sycl::access::mode::read_write, - sycl::access::target::local> + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor< + uninitialized_array, 0, + ::sycl::access::mode::read_write, ::sycl::access::target::local> block_sum_acc_ct1(cgh); cgh.parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(DeviceConfig::subgroup_size)]] { reduce_add_array( size, source, result, item_ct1, diff --git a/dpcpp/components/sorting.dp.hpp b/dpcpp/components/sorting.dp.hpp index a7ec4f26eb6..ea5f37c11be 100644 --- a/dpcpp/components/sorting.dp.hpp +++ b/dpcpp/components/sorting.dp.hpp @@ -136,13 +136,13 @@ struct bitonic_warp { // 1. for sorting, we have to reverse the sort order in the upper half // 2. for merging, we have to determine for the XOR shuffle if we are // the "smaller" thread, as this thread gets the "smaller" element. 
- __dpct_inline__ static bool upper_half(sycl::nd_item<3> item_ct1) + __dpct_inline__ static bool upper_half(::sycl::nd_item<3> item_ct1) { return bool(item_ct1.get_local_id(2) & (num_threads / 2)); } __dpct_inline__ static void merge(ValueType* els, bool reverse, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { auto tile = group::tiled_partition( group::this_thread_block(item_ct1)); @@ -155,7 +155,7 @@ struct bitonic_warp { } __dpct_inline__ static void sort(ValueType* els, bool reverse, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { auto new_reverse = reverse != upper_half(item_ct1); half::sort(els, new_reverse, item_ct1); @@ -167,12 +167,12 @@ template struct bitonic_warp { using local = bitonic_local; __dpct_inline__ static void merge(ValueType* els, bool reverse, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { local::merge(els, reverse); } __dpct_inline__ static void sort(ValueType* els, bool reverse, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { local::sort(els, reverse); } @@ -201,7 +201,8 @@ struct bitonic_global { static_assert(32 % num_groups == 0, "num_groups must be a power of two <= 32"); - __dpct_inline__ static int shared_idx(int local, sycl::nd_item<3> item_ct1) + __dpct_inline__ static int shared_idx(int local, + ::sycl::nd_item<3> item_ct1) { auto rank = group::this_thread_block(item_ct1).thread_rank(); // use the same memory-bank to avoid bank conflicts @@ -211,7 +212,7 @@ struct bitonic_global { // check if we are in the upper half of all groups in this block // this is important as for sorting, we have to reverse the sort order in // the upper half - __dpct_inline__ static bool upper_half(sycl::nd_item<3> item_ct1) + __dpct_inline__ static bool upper_half(::sycl::nd_item<3> item_ct1) { auto rank = group::this_thread_block(item_ct1).thread_rank(); return bool(rank & (num_groups * num_threads / 2)); @@ -219,7 +220,7 @@ struct bitonic_global { __dpct_inline__ static void merge(ValueType* local_els, ValueType* shared_els, bool reverse, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { group::this_thread_block(item_ct1).sync(); auto upper_shared_els = shared_els + (num_groups * num_threads / 2); @@ -235,7 +236,7 @@ struct bitonic_global { __dpct_inline__ static void sort(ValueType* local_els, ValueType* shared_els, bool reverse, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { auto new_reverse = reverse != upper_half(item_ct1); half::sort(local_els, shared_els, new_reverse, item_ct1); @@ -248,7 +249,8 @@ template { using warp = bitonic_warp; - __dpct_inline__ static int shared_idx(int local, sycl::nd_item<3> item_ct1) + __dpct_inline__ static int shared_idx(int local, + ::sycl::nd_item<3> item_ct1) { // use the indexing from the general struct return bitonic_global { __dpct_inline__ static void merge(ValueType* local_els, ValueType* shared_els, bool reverse, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { group::this_thread_block(item_ct1).sync(); for (int i = 0; i < num_local; ++i) { @@ -271,7 +273,7 @@ struct bitonic_global { __dpct_inline__ static void sort(ValueType* local_els, ValueType* shared_els, bool reverse, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { auto rank = group::this_thread_block(item_ct1).thread_rank(); // This is the first step, so we don't need to load from shared memory @@ -316,7 +318,7 @@ struct bitonic_global { template __dpct_inline__ void bitonic_sort(ValueType* local_elements, ValueType* shared_elements, - sycl::nd_item<3> item_ct1) + 
::sycl::nd_item<3> item_ct1) { constexpr auto num_threads = num_elements / num_local; constexpr auto num_warps = num_threads / config::warp_size; diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp index a5724d2f5d0..63973013dfe 100644 --- a/dpcpp/components/thread_ids.dp.hpp +++ b/dpcpp/components/thread_ids.dp.hpp @@ -68,7 +68,7 @@ namespace thread { * @note Assumes that grid dimensions are in cuda standard format: * `(block_group_size, first_grid_dimension, second grid_dimension)` */ -__dpct_inline__ size_type get_block_group_id(sycl::nd_item<3> item_ct1) +__dpct_inline__ size_type get_block_group_id(::sycl::nd_item<3> item_ct1) { return static_cast(item_ct1.get_group(0)) * item_ct1.get_group_range(1) + @@ -85,7 +85,7 @@ __dpct_inline__ size_type get_block_group_id(sycl::nd_item<3> item_ct1) * @note Assumes that grid dimensions are in cuda standard format: * `(block_group_size, first_grid_dimension, second grid_dimension)` */ -__dpct_inline__ size_type get_block_id(sycl::nd_item<3> item_ct1) +__dpct_inline__ size_type get_block_id(::sycl::nd_item<3> item_ct1) { return get_block_group_id(item_ct1) * item_ct1.get_group_range(2) + item_ct1.get_group(2); @@ -105,7 +105,7 @@ __dpct_inline__ size_type get_block_id(sycl::nd_item<3> item_ct1) * `(subwarp_size, config::warp_size / subwarp_size, block_size / * config::warp_size)` */ -__dpct_inline__ size_type get_local_warp_id(sycl::nd_item<3> item_ct1) +__dpct_inline__ size_type get_local_warp_id(::sycl::nd_item<3> item_ct1) { return static_cast(item_ct1.get_local_id(0)); } @@ -127,7 +127,7 @@ __dpct_inline__ size_type get_local_warp_id(sycl::nd_item<3> item_ct1) * config::warp_size)` */ template -__dpct_inline__ size_type get_local_subwarp_id(sycl::nd_item<3> item_ct1) +__dpct_inline__ size_type get_local_subwarp_id(::sycl::nd_item<3> item_ct1) { // sycl does not have subwarp. constexpr auto subwarps_per_warp = subwarp_size / subwarp_size; @@ -151,7 +151,7 @@ __dpct_inline__ size_type get_local_subwarp_id(sycl::nd_item<3> item_ct1) * config::warp_size)` */ template -__dpct_inline__ size_type get_local_thread_id(sycl::nd_item<3> item_ct1) +__dpct_inline__ size_type get_local_thread_id(::sycl::nd_item<3> item_ct1) { return get_local_subwarp_id(item_ct1) * subwarp_size + item_ct1.get_local_id(2); @@ -175,7 +175,7 @@ __dpct_inline__ size_type get_local_thread_id(sycl::nd_item<3> item_ct1) * respectively. */ template -__dpct_inline__ size_type get_warp_id(sycl::nd_item<3> item_ct1) +__dpct_inline__ size_type get_warp_id(::sycl::nd_item<3> item_ct1) { return get_block_id(item_ct1) * warps_per_block + get_local_warp_id(item_ct1); @@ -199,7 +199,7 @@ __dpct_inline__ size_type get_warp_id(sycl::nd_item<3> item_ct1) * respectively. */ template -__dpct_inline__ size_type get_subwarp_id(sycl::nd_item<3> item_ct1) +__dpct_inline__ size_type get_subwarp_id(::sycl::nd_item<3> item_ct1) { // sycl does not have subwarp constexpr auto subwarps_per_warp = subwarp_size / subwarp_size; @@ -225,7 +225,7 @@ __dpct_inline__ size_type get_subwarp_id(sycl::nd_item<3> item_ct1) * respectively. 
*/ template -__dpct_inline__ size_type get_thread_id(sycl::nd_item<3> item_ct1) +__dpct_inline__ size_type get_thread_id(::sycl::nd_item<3> item_ct1) { return get_subwarp_id(item_ct1) * subwarp_size + @@ -245,7 +245,7 @@ __dpct_inline__ size_type get_thread_id(sycl::nd_item<3> item_ct1) * @tparam IndexType the index type */ template -__dpct_inline__ IndexType get_thread_id_flat(sycl::nd_item<3> item_ct1) +__dpct_inline__ IndexType get_thread_id_flat(::sycl::nd_item<3> item_ct1) { return item_ct1.get_local_id(2) + static_cast(item_ct1.get_local_range().get(2)) * @@ -265,7 +265,7 @@ __dpct_inline__ IndexType get_thread_id_flat(sycl::nd_item<3> item_ct1) * @tparam IndexType the index type */ template -__dpct_inline__ IndexType get_thread_num_flat(sycl::nd_item<3> item_ct1) +__dpct_inline__ IndexType get_thread_num_flat(::sycl::nd_item<3> item_ct1) { return item_ct1.get_local_range().get(2) * static_cast(item_ct1.get_group_range(2)); @@ -285,7 +285,7 @@ __dpct_inline__ IndexType get_thread_num_flat(sycl::nd_item<3> item_ct1) * @tparam IndexType the index type */ template -__dpct_inline__ IndexType get_subwarp_id_flat(sycl::nd_item<3> item_ct1) +__dpct_inline__ IndexType get_subwarp_id_flat(::sycl::nd_item<3> item_ct1) { static_assert(!(subwarp_size & (subwarp_size - 1)), "subwarp_size must be a power of two"); @@ -309,7 +309,7 @@ __dpct_inline__ IndexType get_subwarp_id_flat(sycl::nd_item<3> item_ct1) * @tparam IndexType the index type */ template -__dpct_inline__ IndexType get_subwarp_num_flat(sycl::nd_item<3> item_ct1) +__dpct_inline__ IndexType get_subwarp_num_flat(::sycl::nd_item<3> item_ct1) { static_assert(!(subwarp_size & (subwarp_size - 1)), "subwarp_size must be a power of two"); diff --git a/dpcpp/factorization/cholesky_kernels.dp.cpp b/dpcpp/factorization/cholesky_kernels.dp.cpp index b23673bfff2..3bc84c4b08a 100644 --- a/dpcpp/factorization/cholesky_kernels.dp.cpp +++ b/dpcpp/factorization/cholesky_kernels.dp.cpp @@ -74,8 +74,8 @@ void symbolic_count(std::shared_ptr exec, const auto postorder_parent = forest.postorder_parents.get_const_data(); auto queue = exec->get_queue(); // build sorted postorder node list for each row - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx_id) { + queue->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{num_rows}, [=](::sycl::id<1> idx_id) { const auto row = idx_id[0]; const auto row_begin = row_ptrs[row]; const auto row_end = row_ptrs[row + 1]; @@ -96,8 +96,8 @@ void symbolic_count(std::shared_ptr exec, }); }); // count nonzeros per row of L - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx_id) { + queue->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{num_rows}, [=](::sycl::id<1> idx_id) { const auto row = idx_id[0]; const auto row_begin = row_ptrs[row]; // instead of relying on the input containing a diagonal, we @@ -143,8 +143,8 @@ void symbolic_factorize( const auto postorder_parent = forest.postorder_parents.get_const_data(); const auto out_row_ptrs = l_factor->get_const_row_ptrs(); const auto out_cols = l_factor->get_col_idxs(); - exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx_id) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{num_rows}, [=](::sycl::id<1> idx_id) { const auto row = idx_id[0]; const auto row_begin = row_ptrs[row]; // instead of relying on the input 
containing a diagonal, we diff --git a/dpcpp/factorization/factorization_kernels.dp.cpp b/dpcpp/factorization/factorization_kernels.dp.cpp index c771a73cb29..264b746389c 100644 --- a/dpcpp/factorization/factorization_kernels.dp.cpp +++ b/dpcpp/factorization/factorization_kernels.dp.cpp @@ -120,7 +120,7 @@ void find_missing_diagonal_elements( const IndexType* __restrict__ col_idxs, const IndexType* __restrict__ row_ptrs, IndexType* __restrict__ elements_to_add_per_row, - bool* __restrict__ changes_required, sycl::nd_item<3> item_ct1) + bool* __restrict__ changes_required, ::sycl::nd_item<3> item_ct1) { const auto total_subwarp_count = thread::get_subwarp_num_flat(item_ct1); @@ -167,14 +167,14 @@ void find_missing_diagonal_elements( template void find_missing_diagonal_elements( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - IndexType num_rows, IndexType num_cols, const IndexType* col_idxs, - const IndexType* row_ptrs, IndexType* elements_to_add_per_row, - bool* changes_required) + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, IndexType num_rows, IndexType num_cols, + const IndexType* col_idxs, const IndexType* row_ptrs, + IndexType* elements_to_add_per_row, bool* changes_required) { queue->parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(SubwarpSize)]] { find_missing_diagonal_elements( num_rows, num_cols, col_idxs, row_ptrs, @@ -191,7 +191,8 @@ void add_missing_diagonal_elements( const IndexType* __restrict__ old_col_idxs, const IndexType* __restrict__ old_row_ptrs, ValueType* __restrict__ new_values, IndexType* __restrict__ new_col_idxs, - const IndexType* __restrict__ row_ptrs_addition, sycl::nd_item<3> item_ct1) + const IndexType* __restrict__ row_ptrs_addition, + ::sycl::nd_item<3> item_ct1) { // Precaution in case not enough threads were created const auto total_subwarp_count = @@ -267,14 +268,14 @@ void add_missing_diagonal_elements( template void add_missing_diagonal_elements( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - IndexType num_rows, const ValueType* old_values, + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, IndexType num_rows, const ValueType* old_values, const IndexType* old_col_idxs, const IndexType* old_row_ptrs, ValueType* new_values, IndexType* new_col_idxs, const IndexType* row_ptrs_addition) { queue->parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(SubwarpSize)]] { add_missing_diagonal_elements( num_rows, old_values, old_col_idxs, @@ -287,7 +288,7 @@ void add_missing_diagonal_elements( template void update_row_ptrs(IndexType num_rows, IndexType* __restrict__ row_ptrs, IndexType* __restrict__ row_ptr_addition, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto total_thread_count = thread::get_thread_num_flat(item_ct1); @@ -300,11 +301,11 @@ void update_row_ptrs(IndexType num_rows, IndexType* __restrict__ row_ptrs, template void update_row_ptrs(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, IndexType num_rows, + ::sycl::queue* queue, IndexType num_rows, IndexType* row_ptrs, IndexType* row_ptr_addition) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { update_row_ptrs(num_rows, row_ptrs, row_ptr_addition, 
item_ct1); }); } @@ -317,7 +318,7 @@ void count_nnz_per_l_u_row(size_type num_rows, const ValueType* __restrict__ values, IndexType* __restrict__ l_nnz_row, IndexType* __restrict__ u_nnz_row, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto row = thread::get_thread_id_flat(item_ct1); if (row < num_rows) { @@ -337,13 +338,14 @@ void count_nnz_per_l_u_row(size_type num_rows, template void count_nnz_per_l_u_row(dim3 grid, dim3 block, - size_type dynamic_shared_memory, sycl::queue* queue, - size_type num_rows, const IndexType* row_ptrs, - const IndexType* col_idxs, const ValueType* values, - IndexType* l_nnz_row, IndexType* u_nnz_row) + size_type dynamic_shared_memory, + ::sycl::queue* queue, size_type num_rows, + const IndexType* row_ptrs, const IndexType* col_idxs, + const ValueType* values, IndexType* l_nnz_row, + IndexType* u_nnz_row) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { count_nnz_per_l_u_row(num_rows, row_ptrs, col_idxs, values, l_nnz_row, u_nnz_row, item_ct1); }); @@ -359,7 +361,8 @@ void initialize_l_u(size_type num_rows, const IndexType* __restrict__ row_ptrs, ValueType* __restrict__ l_values, const IndexType* __restrict__ u_row_ptrs, IndexType* __restrict__ u_col_idxs, - ValueType* __restrict__ u_values, sycl::nd_item<3> item_ct1) + ValueType* __restrict__ u_values, + ::sycl::nd_item<3> item_ct1) { const auto row = thread::get_thread_id_flat(item_ct1); if (row < num_rows) { @@ -397,7 +400,7 @@ void initialize_l_u(size_type num_rows, const IndexType* __restrict__ row_ptrs, template void initialize_l_u(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, + ::sycl::queue* queue, size_type num_rows, const IndexType* row_ptrs, const IndexType* col_idxs, const ValueType* values, const IndexType* l_row_ptrs, IndexType* l_col_idxs, ValueType* l_values, @@ -405,7 +408,7 @@ void initialize_l_u(dim3 grid, dim3 block, size_type dynamic_shared_memory, ValueType* u_values) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { initialize_l_u(num_rows, row_ptrs, col_idxs, values, l_row_ptrs, l_col_idxs, l_values, u_row_ptrs, u_col_idxs, u_values, item_ct1); @@ -419,7 +422,7 @@ void count_nnz_per_l_row(size_type num_rows, const IndexType* __restrict__ col_idxs, const ValueType* __restrict__ values, IndexType* __restrict__ l_nnz_row, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto row = thread::get_thread_id_flat(item_ct1); if (row < num_rows) { @@ -436,12 +439,12 @@ void count_nnz_per_l_row(size_type num_rows, template void count_nnz_per_l_row(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, + ::sycl::queue* queue, size_type num_rows, const IndexType* row_ptrs, const IndexType* col_idxs, const ValueType* values, IndexType* l_nnz_row) { queue->parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { + [=](::sycl::nd_item<3> item_ct1) { count_nnz_per_l_row(num_rows, row_ptrs, col_idxs, values, l_nnz_row, item_ct1); }); @@ -455,7 +458,7 @@ void initialize_l(size_type num_rows, const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ l_row_ptrs, IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values, bool use_sqrt, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto row = 
thread::get_thread_id_flat(item_ct1); if (row < num_rows) { @@ -491,13 +494,13 @@ void initialize_l(size_type num_rows, const IndexType* __restrict__ row_ptrs, template void initialize_l(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, + ::sycl::queue* queue, size_type num_rows, const IndexType* row_ptrs, const IndexType* col_idxs, const ValueType* values, const IndexType* l_row_ptrs, IndexType* l_col_idxs, ValueType* l_values, bool use_sqrt) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { initialize_l(num_rows, row_ptrs, col_idxs, values, l_row_ptrs, l_col_idxs, l_values, use_sqrt, item_ct1); }); diff --git a/dpcpp/factorization/par_ic_kernels.dp.cpp b/dpcpp/factorization/par_ic_kernels.dp.cpp index bf954e3c374..d857359b7f6 100644 --- a/dpcpp/factorization/par_ic_kernels.dp.cpp +++ b/dpcpp/factorization/par_ic_kernels.dp.cpp @@ -65,7 +65,7 @@ namespace kernel { template void ic_init(const IndexType* __restrict__ l_row_ptrs, ValueType* __restrict__ l_vals, size_type num_rows, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { auto row = thread::get_thread_id_flat(item_ct1); if (row >= num_rows) { @@ -82,11 +82,11 @@ void ic_init(const IndexType* __restrict__ l_row_ptrs, template void ic_init(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const IndexType* l_row_ptrs, ValueType* l_vals, - size_type num_rows) + ::sycl::queue* queue, const IndexType* l_row_ptrs, + ValueType* l_vals, size_type num_rows) { queue->parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { + [=](::sycl::nd_item<3> item_ct1) { ic_init(l_row_ptrs, l_vals, num_rows, item_ct1); }); } @@ -99,7 +99,7 @@ void ic_sweep(const IndexType* __restrict__ a_row_idxs, const IndexType* __restrict__ l_row_ptrs, const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals, IndexType l_nnz, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto l_nz = thread::get_thread_id_flat(item_ct1); if (l_nz >= l_nnz) { @@ -133,13 +133,13 @@ void ic_sweep(const IndexType* __restrict__ a_row_idxs, template void ic_sweep(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const IndexType* a_row_idxs, + ::sycl::queue* queue, const IndexType* a_row_idxs, const IndexType* a_col_idxs, const ValueType* a_vals, const IndexType* l_row_ptrs, const IndexType* l_col_idxs, ValueType* l_vals, IndexType l_nnz) { queue->parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { + [=](::sycl::nd_item<3> item_ct1) { ic_sweep(a_row_idxs, a_col_idxs, a_vals, l_row_ptrs, l_col_idxs, l_vals, l_nnz, item_ct1); }); diff --git a/dpcpp/factorization/par_ict_kernels.dp.cpp b/dpcpp/factorization/par_ict_kernels.dp.cpp index 99f80a2c61e..704d774e0a1 100644 --- a/dpcpp/factorization/par_ict_kernels.dp.cpp +++ b/dpcpp/factorization/par_ict_kernels.dp.cpp @@ -88,7 +88,7 @@ void ict_tri_spgeam_nnz(const IndexType* __restrict__ llh_row_ptrs, const IndexType* __restrict__ a_row_ptrs, const IndexType* __restrict__ a_col_idxs, IndexType* __restrict__ l_new_row_ptrs, - IndexType num_rows, sycl::nd_item<3> item_ct1) + IndexType num_rows, ::sycl::nd_item<3> item_ct1) { auto subwarp = group::tiled_partition( group::this_thread_block(item_ct1)); @@ -120,14 +120,14 @@ void ict_tri_spgeam_nnz(const IndexType* __restrict__ llh_row_ptrs, template void ict_tri_spgeam_nnz(dim3 grid, dim3 block, size_type 
dynamic_shared_memory, - sycl::queue* queue, const IndexType* llh_row_ptrs, + ::sycl::queue* queue, const IndexType* llh_row_ptrs, const IndexType* llh_col_idxs, const IndexType* a_row_ptrs, const IndexType* a_col_idxs, IndexType* l_new_row_ptrs, IndexType num_rows) { queue->parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subgroup_size)]] { ict_tri_spgeam_nnz( llh_row_ptrs, llh_col_idxs, a_row_ptrs, @@ -150,7 +150,7 @@ void ict_tri_spgeam_init(const IndexType* __restrict__ llh_row_ptrs, const IndexType* __restrict__ l_new_row_ptrs, IndexType* __restrict__ l_new_col_idxs, ValueType* __restrict__ l_new_vals, IndexType num_rows, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { auto subwarp = group::tiled_partition( group::this_thread_block(item_ct1)); @@ -302,7 +302,7 @@ void ict_tri_spgeam_init(const IndexType* __restrict__ llh_row_ptrs, template void ict_tri_spgeam_init(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const IndexType* llh_row_ptrs, + ::sycl::queue* queue, const IndexType* llh_row_ptrs, const IndexType* llh_col_idxs, const ValueType* llh_vals, const IndexType* a_row_ptrs, const IndexType* a_col_idxs, const ValueType* a_vals, @@ -313,7 +313,7 @@ void ict_tri_spgeam_init(dim3 grid, dim3 block, size_type dynamic_shared_memory, IndexType num_rows) { queue->parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subgroup_size)]] { ict_tri_spgeam_init( llh_row_ptrs, llh_col_idxs, llh_vals, @@ -337,7 +337,7 @@ void ict_sweep(const IndexType* __restrict__ a_row_ptrs, const IndexType* __restrict__ l_row_idxs, const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals, IndexType l_nnz, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { auto l_nz = thread::get_subwarp_id_flat(item_ct1); if (l_nz >= l_nnz) { @@ -398,13 +398,13 @@ void ict_sweep(const IndexType* __restrict__ a_row_ptrs, template void ict_sweep(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const IndexType* a_row_ptrs, + ::sycl::queue* queue, const IndexType* a_row_ptrs, const IndexType* a_col_idxs, const ValueType* a_vals, const IndexType* l_row_ptrs, const IndexType* l_row_idxs, const IndexType* l_col_idxs, ValueType* l_vals, IndexType l_nnz) { queue->parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subgroup_size)]] { ict_sweep( a_row_ptrs, a_col_idxs, a_vals, l_row_ptrs, diff --git a/dpcpp/factorization/par_ilu_kernels.dp.cpp b/dpcpp/factorization/par_ilu_kernels.dp.cpp index 775b9ee8ad2..8a4ea679999 100644 --- a/dpcpp/factorization/par_ilu_kernels.dp.cpp +++ b/dpcpp/factorization/par_ilu_kernels.dp.cpp @@ -71,7 +71,7 @@ void compute_l_u_factors(size_type num_elements, const IndexType* __restrict__ u_row_ptrs, const IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_values, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto elem_id = thread::get_thread_id_flat(item_ct1); if (elem_id < num_elements) { @@ -112,7 +112,7 @@ void compute_l_u_factors(size_type num_elements, template void compute_l_u_factors(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_elements, + ::sycl::queue* queue, size_type num_elements, const IndexType* row_idxs, const IndexType* col_idxs, const ValueType* values, const 
IndexType* l_row_ptrs, const IndexType* l_col_idxs, ValueType* l_values, @@ -120,7 +120,7 @@ void compute_l_u_factors(dim3 grid, dim3 block, size_type dynamic_shared_memory, const IndexType* u_col_idxs, ValueType* u_values) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { compute_l_u_factors(num_elements, row_idxs, col_idxs, values, l_row_ptrs, l_col_idxs, l_values, u_row_ptrs, u_col_idxs, u_values, item_ct1); diff --git a/dpcpp/factorization/par_ilut_filter_kernels.hpp.inc b/dpcpp/factorization/par_ilut_filter_kernels.hpp.inc index 063292a293f..1d74e3e5e6a 100644 --- a/dpcpp/factorization/par_ilut_filter_kernels.hpp.inc +++ b/dpcpp/factorization/par_ilut_filter_kernels.hpp.inc @@ -39,7 +39,7 @@ template item_ct1) + ::sycl::nd_item<3> item_ct1) { auto subwarp = group::tiled_partition( group::this_thread_block(item_ct1)); @@ -67,7 +67,8 @@ void abstract_filter_impl(const IndexType* row_ptrs, IndexType num_rows, template void abstract_filter_nnz(const IndexType* __restrict__ row_ptrs, IndexType num_rows, Predicate pred, - IndexType* __restrict__ nnz, sycl::nd_item<3> item_ct1) + IndexType* __restrict__ nnz, + ::sycl::nd_item<3> item_ct1) { IndexType count{}; abstract_filter_impl( @@ -93,7 +94,7 @@ void abstract_filter(const IndexType* __restrict__ old_row_ptrs, IndexType* __restrict__ new_row_idxs, IndexType* __restrict__ new_col_idxs, ValueType* __restrict__ new_vals, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { IndexType count{}; IndexType new_offset{}; @@ -124,7 +125,7 @@ void threshold_filter_nnz(const IndexType* __restrict__ row_ptrs, const ValueType* vals, IndexType num_rows, remove_complex threshold, IndexType* __restrict__ nnz, bool lower, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { abstract_filter_nnz( row_ptrs, num_rows, @@ -137,18 +138,19 @@ void threshold_filter_nnz(const IndexType* __restrict__ row_ptrs, template void threshold_filter_nnz(dim3 grid, dim3 block, - size_type dynamic_shared_memory, sycl::queue* queue, + size_type dynamic_shared_memory, ::sycl::queue* queue, const IndexType* row_ptrs, const ValueType* vals, IndexType num_rows, remove_complex threshold, IndexType* nnz, bool lower) { - queue->parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subgroup_size)]] { - threshold_filter_nnz( - row_ptrs, vals, num_rows, threshold, nnz, lower, item_ct1); - }); + queue->parallel_for(sycl_nd_range(grid, block), + [=](::sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(subgroup_size)]] { + threshold_filter_nnz( + row_ptrs, vals, num_rows, threshold, nnz, + lower, item_ct1); + }); } @@ -161,7 +163,7 @@ void threshold_filter(const IndexType* __restrict__ old_row_ptrs, IndexType* __restrict__ new_row_idxs, IndexType* __restrict__ new_col_idxs, ValueType* __restrict__ new_vals, bool lower, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { abstract_filter( old_row_ptrs, old_col_idxs, old_vals, num_rows, @@ -174,20 +176,21 @@ void threshold_filter(const IndexType* __restrict__ old_row_ptrs, template void threshold_filter(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const IndexType* old_row_ptrs, + ::sycl::queue* queue, const IndexType* old_row_ptrs, const IndexType* old_col_idxs, const ValueType* old_vals, IndexType num_rows, remove_complex threshold, const IndexType* new_row_ptrs, IndexType* new_row_idxs, IndexType* new_col_idxs, ValueType* 
new_vals, bool lower) { - queue->parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subgroup_size)]] { - threshold_filter( - old_row_ptrs, old_col_idxs, old_vals, num_rows, threshold, - new_row_ptrs, new_row_idxs, new_col_idxs, new_vals, lower, - item_ct1); - }); + queue->parallel_for(sycl_nd_range(grid, block), + [=](::sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(subgroup_size)]] { + threshold_filter( + old_row_ptrs, old_col_idxs, old_vals, + num_rows, threshold, new_row_ptrs, + new_row_idxs, new_col_idxs, new_vals, lower, + item_ct1); + }); } @@ -195,7 +198,7 @@ template void bucket_filter_nnz(const IndexType* __restrict__ row_ptrs, const BucketType* buckets, IndexType num_rows, BucketType bucket, IndexType* __restrict__ nnz, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { abstract_filter_nnz( row_ptrs, num_rows, @@ -207,16 +210,17 @@ void bucket_filter_nnz(const IndexType* __restrict__ row_ptrs, template void bucket_filter_nnz(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const IndexType* row_ptrs, + ::sycl::queue* queue, const IndexType* row_ptrs, const BucketType* buckets, IndexType num_rows, BucketType bucket, IndexType* nnz) { - queue->parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subgroup_size)]] { - bucket_filter_nnz(row_ptrs, buckets, num_rows, - bucket, nnz, item_ct1); - }); + queue->parallel_for(sycl_nd_range(grid, block), + [=](::sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(subgroup_size)]] { + bucket_filter_nnz( + row_ptrs, buckets, num_rows, bucket, nnz, + item_ct1); + }); } @@ -230,7 +234,8 @@ void bucket_filter(const IndexType* __restrict__ old_row_ptrs, const IndexType* __restrict__ new_row_ptrs, IndexType* __restrict__ new_row_idxs, IndexType* __restrict__ new_col_idxs, - ValueType* __restrict__ new_vals, sycl::nd_item<3> item_ct1) + ValueType* __restrict__ new_vals, + ::sycl::nd_item<3> item_ct1) { abstract_filter( old_row_ptrs, old_col_idxs, old_vals, num_rows, @@ -243,20 +248,22 @@ void bucket_filter(const IndexType* __restrict__ old_row_ptrs, template void bucket_filter(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const IndexType* old_row_ptrs, + ::sycl::queue* queue, const IndexType* old_row_ptrs, const IndexType* old_col_idxs, const ValueType* old_vals, const BucketType* buckets, IndexType num_rows, BucketType bucket, const IndexType* new_row_ptrs, IndexType* new_row_idxs, IndexType* new_col_idxs, ValueType* new_vals) { - queue->parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subgroup_size)]] { - bucket_filter( - old_row_ptrs, old_col_idxs, old_vals, buckets, num_rows, bucket, - new_row_ptrs, new_row_idxs, new_col_idxs, new_vals, item_ct1); - }); + queue->parallel_for(sycl_nd_range(grid, block), + [=](::sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(subgroup_size)]] { + bucket_filter( + old_row_ptrs, old_col_idxs, old_vals, + buckets, num_rows, bucket, new_row_ptrs, + new_row_idxs, new_col_idxs, new_vals, + item_ct1); + }); } diff --git a/dpcpp/factorization/par_ilut_select_kernels.hpp.inc b/dpcpp/factorization/par_ilut_select_kernels.hpp.inc index 41fa99cc24e..006256ca7a1 100644 --- a/dpcpp/factorization/par_ilut_select_kernels.hpp.inc +++ b/dpcpp/factorization/par_ilut_select_kernels.hpp.inc @@ -54,7 +54,7 @@ constexpr auto basecase_block_size = basecase_size / basecase_local_size; 
template void build_searchtree(const ValueType* __restrict__ input, IndexType size, remove_complex* __restrict__ tree_output, - sycl::nd_item<3> item_ct1, + ::sycl::nd_item<3> item_ct1, remove_complex* sh_samples) { using AbsType = remove_complex; @@ -87,22 +87,22 @@ void build_searchtree(const ValueType* __restrict__ input, IndexType size, template void build_searchtree(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const ValueType* input, + ::sycl::queue* queue, const ValueType* input, IndexType size, remove_complex* tree_output) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 1, - sycl::access_mode::read_write, - sycl::access::target::local> - sh_samples_acc_ct1(sycl::range<1>(1024 /*sample_size*/), cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - build_searchtree(input, size, tree_output, item_ct1, - sh_samples_acc_ct1.get_pointer()); - }); + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor, 1, + ::sycl::access_mode::read_write, + ::sycl::access::target::local> + sh_samples_acc_ct1(::sycl::range<1>(1024 /*sample_size*/), cgh); + + cgh.parallel_for(sycl_nd_range(grid, block), + [=](::sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + build_searchtree( + input, size, tree_output, item_ct1, + sh_samples_acc_ct1.get_pointer()); + }); }); } @@ -119,7 +119,7 @@ template void count_buckets(const ValueType* __restrict__ input, IndexType size, const remove_complex* __restrict__ tree, IndexType* counter, unsigned char* oracles, - int items_per_thread, sycl::nd_item<3> item_ct1, + int items_per_thread, ::sycl::nd_item<3> item_ct1, remove_complex* sh_tree, IndexType* sh_counter) { // load tree into shared memory, initialize counters @@ -168,21 +168,22 @@ void count_buckets(const ValueType* __restrict__ input, IndexType size, template void count_buckets(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const ValueType* input, IndexType size, + ::sycl::queue* queue, const ValueType* input, IndexType size, const remove_complex* tree, IndexType* counter, unsigned char* oracles, int items_per_thread) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 1, - sycl::access_mode::read_write, - sycl::access::target::local> - sh_tree_acc_ct1(sycl::range<1>(255 /*searchtree_inner_size*/), cgh); - sycl::accessor - sh_counter_acc_ct1(sycl::range<1>(256 /*searchtree_width*/), cgh); + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor, 1, + ::sycl::access_mode::read_write, + ::sycl::access::target::local> + sh_tree_acc_ct1(::sycl::range<1>(255 /*searchtree_inner_size*/), + cgh); + ::sycl::accessor + sh_counter_acc_ct1(::sycl::range<1>(256 /*searchtree_width*/), cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { count_buckets( input, size, tree, counter, oracles, items_per_thread, item_ct1, @@ -204,7 +205,7 @@ void count_buckets(dim3 grid, dim3 block, size_type dynamic_shared_memory, template void block_prefix_sum(IndexType* __restrict__ counters, IndexType* __restrict__ totals, IndexType num_blocks, - sycl::nd_item<3> item_ct1, IndexType* warp_sums) + ::sycl::nd_item<3> item_ct1, IndexType* warp_sums) { constexpr auto num_warps = default_block_size / config::warp_size; static_assert(num_warps < config::warp_size, @@ -273,23 +274,24 @@ void block_prefix_sum(IndexType* __restrict__ 
counters, template void block_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, IndexType* counters, + ::sycl::queue* queue, IndexType* counters, IndexType* totals, IndexType num_blocks) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor - warp_sums_acc_ct1(sycl::range<1>(default_block_size / - config::warp_size /*num_warps*/), + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor + warp_sums_acc_ct1(::sycl::range<1>(default_block_size / + config::warp_size /*num_warps*/), cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - block_prefix_sum(counters, totals, num_blocks, item_ct1, - (IndexType*)warp_sums_acc_ct1.get_pointer()); - }); + sycl_nd_range(grid, block), + [=](::sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + block_prefix_sum( + counters, totals, num_blocks, item_ct1, + (IndexType*)warp_sums_acc_ct1.get_pointer()); + }); }); } @@ -305,7 +307,7 @@ void filter_bucket(const ValueType* __restrict__ input, IndexType size, unsigned char bucket, const unsigned char* oracles, const IndexType* block_offsets, remove_complex* __restrict__ output, - int items_per_thread, sycl::nd_item<3> item_ct1, + int items_per_thread, ::sycl::nd_item<3> item_ct1, IndexType* counter) { // initialize the counter with the block prefix sum. @@ -337,18 +339,18 @@ void filter_bucket(const ValueType* __restrict__ input, IndexType size, template void filter_bucket(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const ValueType* input, IndexType size, + ::sycl::queue* queue, const ValueType* input, IndexType size, unsigned char bucket, const unsigned char* oracles, const IndexType* block_offsets, remove_complex* output, int items_per_thread) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor counter_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { filter_bucket(input, size, bucket, oracles, block_offsets, output, items_per_thread, item_ct1, (IndexType*)counter_acc_ct1.get_pointer()); @@ -365,7 +367,7 @@ void filter_bucket(dim3 grid, dim3 block, size_type dynamic_shared_memory, template void basecase_select(const ValueType* __restrict__ input, IndexType size, IndexType rank, ValueType* __restrict__ out, - sycl::nd_item<3> item_ct1, ValueType* sh_local) + ::sycl::nd_item<3> item_ct1, ValueType* sh_local) { constexpr auto sentinel = std::numeric_limits::infinity(); ValueType local[basecase_local_size]; @@ -382,21 +384,21 @@ void basecase_select(const ValueType* __restrict__ input, IndexType size, template void basecase_select(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const ValueType* input, IndexType size, - IndexType rank, ValueType* out) + ::sycl::queue* queue, const ValueType* input, + IndexType size, IndexType rank, ValueType* out) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor - sh_local_acc_ct1(sycl::range<1>(1024 /*basecase_size*/), cgh); + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor + sh_local_acc_ct1(::sycl::range<1>(1024 /*basecase_size*/), cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - basecase_select(input, size, rank, out, item_ct1, - 
(ValueType*)sh_local_acc_ct1.get_pointer()); - }); + sycl_nd_range(grid, block), + [=](::sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + basecase_select(input, size, rank, out, item_ct1, + (ValueType*)sh_local_acc_ct1.get_pointer()); + }); }); } @@ -410,7 +412,7 @@ void basecase_select(dim3 grid, dim3 block, size_type dynamic_shared_memory, */ template void find_bucket(IndexType* prefix_sum, IndexType rank, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { auto warp = group::tiled_partition( group::this_thread_block(item_ct1)); @@ -429,14 +431,13 @@ void find_bucket(IndexType* prefix_sum, IndexType rank, template void find_bucket(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, IndexType* prefix_sum, IndexType rank) + ::sycl::queue* queue, IndexType* prefix_sum, IndexType rank) { - queue->parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - find_bucket(prefix_sum, rank, item_ct1); - }); + queue->parallel_for(sycl_nd_range(grid, block), + [=](::sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(config::warp_size)]] { + find_bucket(prefix_sum, rank, item_ct1); + }); } diff --git a/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp b/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp index e0fe8caaf85..e754aa52397 100644 --- a/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp +++ b/dpcpp/factorization/par_ilut_spgeam_kernel.dp.cpp @@ -88,7 +88,7 @@ void tri_spgeam_nnz(const IndexType* __restrict__ lu_row_ptrs, const IndexType* __restrict__ a_col_idxs, IndexType* __restrict__ l_new_row_ptrs, IndexType* __restrict__ u_new_row_ptrs, IndexType num_rows, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { auto subwarp = group::tiled_partition( group::this_thread_block(item_ct1)); @@ -123,13 +123,13 @@ void tri_spgeam_nnz(const IndexType* __restrict__ lu_row_ptrs, template void tri_spgeam_nnz(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const IndexType* lu_row_ptrs, + ::sycl::queue* queue, const IndexType* lu_row_ptrs, const IndexType* lu_col_idxs, const IndexType* a_row_ptrs, const IndexType* a_col_idxs, IndexType* l_new_row_ptrs, IndexType* u_new_row_ptrs, IndexType num_rows) { queue->parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subgroup_size)]] { tri_spgeam_nnz( lu_row_ptrs, lu_col_idxs, a_row_ptrs, @@ -160,7 +160,7 @@ void tri_spgeam_init(const IndexType* __restrict__ lu_row_ptrs, const IndexType* __restrict__ u_new_row_ptrs, IndexType* __restrict__ u_new_col_idxs, ValueType* __restrict__ u_new_vals, IndexType num_rows, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { auto subwarp = group::tiled_partition( group::this_thread_block(item_ct1)); @@ -342,7 +342,7 @@ void tri_spgeam_init(const IndexType* __restrict__ lu_row_ptrs, template void tri_spgeam_init(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const IndexType* lu_row_ptrs, + ::sycl::queue* queue, const IndexType* lu_row_ptrs, const IndexType* lu_col_idxs, const ValueType* lu_vals, const IndexType* a_row_ptrs, const IndexType* a_col_idxs, const ValueType* a_vals, const IndexType* l_row_ptrs, @@ -354,7 +354,7 @@ void tri_spgeam_init(dim3 grid, dim3 block, size_type dynamic_shared_memory, ValueType* u_new_vals, IndexType num_rows) { queue->parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> 
item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subgroup_size)]] { tri_spgeam_init( lu_row_ptrs, lu_col_idxs, lu_vals, diff --git a/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp b/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp index 4209e402d00..d62ca1f59ee 100644 --- a/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp +++ b/dpcpp/factorization/par_ilut_sweep_kernel.dp.cpp @@ -92,7 +92,7 @@ void sweep(const IndexType* __restrict__ a_row_ptrs, const IndexType* __restrict__ ut_col_ptrs, const IndexType* __restrict__ ut_row_idxs, ValueType* __restrict__ ut_vals, IndexType u_nnz, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { auto tidx = thread::get_subwarp_id_flat(item_ct1); if (tidx >= l_nnz + u_nnz) { @@ -167,7 +167,7 @@ void sweep(const IndexType* __restrict__ a_row_ptrs, template void sweep(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const IndexType* a_row_ptrs, + ::sycl::queue* queue, const IndexType* a_row_ptrs, const IndexType* a_col_idxs, const ValueType* a_vals, const IndexType* l_row_ptrs, const IndexType* l_row_idxs, const IndexType* l_col_idxs, ValueType* l_vals, IndexType l_nnz, @@ -176,7 +176,7 @@ void sweep(dim3 grid, dim3 block, size_type dynamic_shared_memory, const IndexType* ut_row_idxs, ValueType* ut_vals, IndexType u_nnz) { queue->parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subgroup_size)]] { sweep( a_row_ptrs, a_col_idxs, a_vals, l_row_ptrs, diff --git a/dpcpp/matrix/coo_kernels.dp.cpp b/dpcpp/matrix/coo_kernels.dp.cpp index 05fcab9627f..a98cec436b2 100644 --- a/dpcpp/matrix/coo_kernels.dp.cpp +++ b/dpcpp/matrix/coo_kernels.dp.cpp @@ -104,7 +104,7 @@ void spmv_kernel(const size_type nnz, const size_type num_lines, const IndexType* __restrict__ row, const ValueType* __restrict__ b, const size_type b_stride, ValueType* __restrict__ c, const size_type c_stride, - Closure scale, sycl::nd_item<3> item_ct1) + Closure scale, ::sycl::nd_item<3> item_ct1) { ValueType temp_val = zero(); const auto start = @@ -160,7 +160,7 @@ void abstract_spmv(const size_type nnz, const size_type num_lines, const IndexType* __restrict__ row, const ValueType* __restrict__ b, const size_type b_stride, ValueType* __restrict__ c, const size_type c_stride, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { spmv_kernel( nnz, num_lines, val, col, row, b, b_stride, c, c_stride, @@ -175,7 +175,7 @@ void abstract_spmv(const size_type nnz, const size_type num_lines, const IndexType* __restrict__ row, const ValueType* __restrict__ b, const size_type b_stride, ValueType* __restrict__ c, const size_type c_stride, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { ValueType scale_factor = alpha[0]; spmv_kernel( @@ -213,7 +213,7 @@ void spmm_kernel(const size_type nnz, const size_type num_elems, const IndexType* __restrict__ row, const size_type num_cols, const ValueType* __restrict__ b, const size_type b_stride, ValueType* __restrict__ c, const size_type c_stride, - Closure scale, sycl::nd_item<3> item_ct1) + Closure scale, ::sycl::nd_item<3> item_ct1) { ValueType temp = zero(); const auto coo_idx = @@ -251,7 +251,7 @@ void abstract_spmm(const size_type nnz, const size_type num_elems, const IndexType* __restrict__ row, const size_type num_cols, const ValueType* __restrict__ b, const size_type b_stride, ValueType* __restrict__ c, const size_type c_stride, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { 
spmm_kernel( nnz, num_elems, val, col, row, num_cols, b, b_stride, c, c_stride, @@ -266,7 +266,7 @@ void abstract_spmm(const size_type nnz, const size_type num_elems, const IndexType* __restrict__ row, const size_type num_cols, const ValueType* __restrict__ b, const size_type b_stride, ValueType* __restrict__ c, const size_type c_stride, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { ValueType scale_factor = alpha[0]; spmm_kernel( diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 95a240d710d..43f9a9a5aee 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -109,7 +109,7 @@ __dpct_inline__ T ceildivT(T nom, T denom) template __dpct_inline__ bool block_segment_scan_reverse( const IndexType* __restrict__ ind, ValueType* __restrict__ val, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { bool last = true; const auto reg_ind = ind[item_ct1.get_local_id(2)]; @@ -220,7 +220,7 @@ __dpct_inline__ void spmv_kernel( acc::range val, const IndexType* __restrict__ col_idxs, const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ srow, acc::range b, acc::range c, Closure scale, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { using arithmetic_type = typename output_accessor::arithmetic_type; const IndexType warp_idx = @@ -266,7 +266,7 @@ void abstract_spmv(const IndexType nwarps, const IndexType num_rows, const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ srow, acc::range b, acc::range c, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { using arithmetic_type = typename output_accessor::arithmetic_type; using output_type = typename output_accessor::storage_type; @@ -293,7 +293,7 @@ void abstract_spmv( acc::range val, const IndexType* __restrict__ col_idxs, const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ srow, acc::range b, acc::range c, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { using arithmetic_type = typename output_accessor::arithmetic_type; using output_type = typename output_accessor::storage_type; @@ -336,7 +336,7 @@ template c, - Alpha_op alpha_op, sycl::nd_item<3> item_ct1, + Alpha_op alpha_op, ::sycl::nd_item<3> item_ct1, uninitialized_array& tmp_ind, uninitialized_array& tmp_val) { @@ -384,7 +384,7 @@ void merge_path_spmv( acc::range b, acc::range c, IndexType* __restrict__ row_out, typename output_accessor::arithmetic_type* __restrict__ val_out, - Alpha_op alpha_op, Beta_op beta_op, sycl::nd_item<3> item_ct1, + Alpha_op alpha_op, Beta_op beta_op, ::sycl::nd_item<3> item_ct1, IndexType* shared_row_ptrs) { using arithmetic_type = typename output_accessor::arithmetic_type; @@ -462,7 +462,7 @@ void abstract_merge_path_spmv( acc::range b, acc::range c, IndexType* __restrict__ row_out, typename output_accessor::arithmetic_type* __restrict__ val_out, - sycl::nd_item<3> item_ct1, IndexType* shared_row_ptrs) + ::sycl::nd_item<3> item_ct1, IndexType* shared_row_ptrs) { using type = typename output_accessor::arithmetic_type; merge_path_spmv( @@ -474,20 +474,21 @@ void abstract_merge_path_spmv( template void abstract_merge_path_spmv( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - const IndexType num_rows, acc::range val, - const IndexType* col_idxs, const IndexType* row_ptrs, const IndexType* srow, + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, const IndexType num_rows, + acc::range val, const IndexType* col_idxs, + const IndexType* row_ptrs, const 
IndexType* srow, acc::range b, acc::range c, IndexType* row_out, typename output_accessor::arithmetic_type* val_out) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor shared_row_ptrs_acc_ct1( - sycl::range<1>(spmv_block_size * items_per_thread), cgh); + ::sycl::range<1>(spmv_block_size * items_per_thread), cgh); cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { + [=](::sycl::nd_item<3> item_ct1) { abstract_merge_path_spmv( num_rows, val, col_idxs, row_ptrs, srow, b, c, row_out, val_out, item_ct1, @@ -509,7 +510,7 @@ void abstract_merge_path_spmv( const typename output_accessor::storage_type* __restrict__ beta, acc::range c, IndexType* __restrict__ row_out, typename output_accessor::arithmetic_type* __restrict__ val_out, - sycl::nd_item<3> item_ct1, IndexType* shared_row_ptrs) + ::sycl::nd_item<3> item_ct1, IndexType* shared_row_ptrs) { using type = typename output_accessor::arithmetic_type; const type alpha_val = alpha[0]; @@ -524,8 +525,8 @@ void abstract_merge_path_spmv( template void abstract_merge_path_spmv( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - const IndexType num_rows, + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, const IndexType num_rows, const typename matrix_accessor::storage_type* alpha, acc::range val, const IndexType* col_idxs, const IndexType* row_ptrs, const IndexType* srow, @@ -534,14 +535,14 @@ void abstract_merge_path_spmv( acc::range c, IndexType* row_out, typename output_accessor::arithmetic_type* val_out) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor shared_row_ptrs_acc_ct1( - sycl::range<1>(spmv_block_size * items_per_thread), cgh); + ::sycl::range<1>(spmv_block_size * items_per_thread), cgh); cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { + [=](::sycl::nd_item<3> item_ct1) { abstract_merge_path_spmv( num_rows, alpha, val, col_idxs, row_ptrs, srow, b, beta, c, row_out, val_out, item_ct1, @@ -557,7 +558,7 @@ template c, - sycl::nd_item<3> item_ct1, + ::sycl::nd_item<3> item_ct1, uninitialized_array& tmp_ind, uninitialized_array& tmp_val) { @@ -569,23 +570,23 @@ void abstract_reduce( template void abstract_reduce(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const IndexType nwarps, + ::sycl::queue* queue, const IndexType nwarps, const arithmetic_type* __restrict__ last_val, const IndexType* __restrict__ last_row, acc::range c) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, - sycl::access::target::local> + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor, 0, + ::sycl::access_mode::read_write, + ::sycl::access::target::local> tmp_ind_acc_ct1(cgh); - sycl::accessor, 0, - sycl::access_mode::read_write, - sycl::access::target::local> + ::sycl::accessor, + 0, ::sycl::access_mode::read_write, + ::sycl::access::target::local> tmp_val_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { abstract_reduce(nwarps, last_val, last_row, c, item_ct1, *tmp_ind_acc_ct1.get_pointer(), *tmp_val_acc_ct1.get_pointer()); @@ -600,7 +601,7 @@ void abstract_reduce( const IndexType nwarps, const arithmetic_type* __restrict__ last_val, const IndexType* __restrict__ last_row, const MatrixValueType* __restrict__ 
alpha, acc::range c, - sycl::nd_item<3> item_ct1, + ::sycl::nd_item<3> item_ct1, uninitialized_array& tmp_ind, uninitialized_array& tmp_val) { @@ -614,23 +615,23 @@ void abstract_reduce( template void abstract_reduce(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const IndexType nwarps, + ::sycl::queue* queue, const IndexType nwarps, const arithmetic_type* last_val, const IndexType* last_row, const MatrixValueType* alpha, acc::range c) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, - sycl::access::target::local> + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor, 0, + ::sycl::access_mode::read_write, + ::sycl::access::target::local> tmp_ind_acc_ct1(cgh); - sycl::accessor, 0, - sycl::access_mode::read_write, - sycl::access::target::local> + ::sycl::accessor, + 0, ::sycl::access_mode::read_write, + ::sycl::access::target::local> tmp_val_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { abstract_reduce(nwarps, last_val, last_row, alpha, c, item_ct1, *tmp_ind_acc_ct1.get_pointer(), *tmp_val_acc_ct1.get_pointer()); @@ -648,7 +649,7 @@ void device_classical_spmv(const size_type num_rows, const IndexType* __restrict__ row_ptrs, acc::range b, acc::range c, Closure scale, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { using arithmetic_type = typename output_accessor::arithmetic_type; auto subgroup_tile = group::tiled_partition( @@ -686,7 +687,7 @@ void abstract_classical_spmv(const size_type num_rows, const IndexType* __restrict__ row_ptrs, acc::range b, acc::range c, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { using type = typename output_accessor::arithmetic_type; device_classical_spmv( @@ -696,16 +697,19 @@ void abstract_classical_spmv(const size_type num_rows, template -void abstract_classical_spmv( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - const size_type num_rows, acc::range val, - const IndexType* col_idxs, const IndexType* row_ptrs, - acc::range b, acc::range c) +void abstract_classical_spmv(dim3 grid, dim3 block, + size_type dynamic_shared_memory, + ::sycl::queue* queue, const size_type num_rows, + acc::range val, + const IndexType* col_idxs, + const IndexType* row_ptrs, + acc::range b, + acc::range c) { if (subgroup_size > 1) { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subgroup_size)]] { abstract_classical_spmv( num_rows, val, col_idxs, row_ptrs, b, @@ -713,9 +717,9 @@ void abstract_classical_spmv( }); }); } else { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { abstract_classical_spmv( num_rows, val, col_idxs, row_ptrs, b, c, item_ct1); }); @@ -732,7 +736,7 @@ void abstract_classical_spmv( acc::range val, const IndexType* __restrict__ col_idxs, const IndexType* __restrict__ row_ptrs, acc::range b, const typename output_accessor::storage_type* __restrict__ beta, - acc::range c, sycl::nd_item<3> item_ct1) + acc::range c, ::sycl::nd_item<3> item_ct1) { using type = typename output_accessor::arithmetic_type; const type alpha_val = alpha[0]; @@ -748,8 +752,8 @@ 
void abstract_classical_spmv( template void abstract_classical_spmv( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - const size_type num_rows, + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, const size_type num_rows, const typename matrix_accessor::storage_type* alpha, acc::range val, const IndexType* col_idxs, const IndexType* row_ptrs, acc::range b, @@ -757,9 +761,9 @@ void abstract_classical_spmv( acc::range c) { if (subgroup_size > 1) { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subgroup_size)]] { abstract_classical_spmv( num_rows, alpha, val, col_idxs, @@ -767,9 +771,9 @@ void abstract_classical_spmv( }); }); } else { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { + [=](::sycl::nd_item<3> item_ct1) { abstract_classical_spmv( num_rows, alpha, val, col_idxs, row_ptrs, b, beta, c, item_ct1); @@ -783,7 +787,7 @@ template void fill_in_dense(size_type num_rows, const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ col_idxs, const ValueType* __restrict__ values, size_type stride, - ValueType* __restrict__ result, sycl::nd_item<3> item_ct1) + ValueType* __restrict__ result, ::sycl::nd_item<3> item_ct1) { const auto tidx = thread::get_thread_id_flat(item_ct1); if (tidx < num_rows) { @@ -799,7 +803,7 @@ GKO_ENABLE_DEFAULT_HOST(fill_in_dense, fill_in_dense); template void check_unsorted(const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ col_idxs, IndexType num_rows, - bool* flag, sycl::nd_item<3> item_ct1, bool* sh_flag) + bool* flag, ::sycl::nd_item<3> item_ct1, bool* sh_flag) { auto block = group::this_thread_block(item_ct1); if (block.thread_rank() == 0) { @@ -826,16 +830,16 @@ void check_unsorted(const IndexType* __restrict__ row_ptrs, template void check_unsorted(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const IndexType* row_ptrs, + ::sycl::queue* queue, const IndexType* row_ptrs, const IndexType* col_idxs, IndexType num_rows, bool* flag) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor sh_flag_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { check_unsorted(row_ptrs, col_idxs, num_rows, flag, item_ct1, sh_flag_acc_ct1.get_pointer()); }); @@ -848,7 +852,7 @@ void extract_diagonal(size_type diag_size, size_type nnz, const ValueType* __restrict__ orig_values, const IndexType* __restrict__ orig_row_ptrs, const IndexType* __restrict__ orig_col_idxs, - ValueType* __restrict__ diag, sycl::nd_item<3> item_ct1) + ValueType* __restrict__ diag, ::sycl::nd_item<3> item_ct1) { constexpr auto warp_size = config::warp_size; const auto row = thread::get_subwarp_id_flat(item_ct1); @@ -879,7 +883,7 @@ void row_ptr_permute_kernel(size_type num_rows, const IndexType* __restrict__ permutation, const IndexType* __restrict__ in_row_ptrs, IndexType* __restrict__ out_nnz, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { auto tid = thread::get_thread_id_flat(item_ct1); if (tid >= num_rows) { @@ -898,7 +902,7 @@ void inv_row_ptr_permute_kernel(size_type num_rows, const IndexType* 
__restrict__ permutation, const IndexType* __restrict__ in_row_ptrs, IndexType* __restrict__ out_nnz, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { auto tid = thread::get_thread_id_flat(item_ct1); if (tid >= num_rows) { @@ -921,7 +925,7 @@ void row_permute_kernel(size_type num_rows, const IndexType* __restrict__ out_row_ptrs, IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { auto tid = thread::get_subwarp_id_flat(item_ct1); if (tid >= num_rows) { @@ -941,15 +945,15 @@ void row_permute_kernel(size_type num_rows, template void row_permute_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, + ::sycl::queue* queue, size_type num_rows, const IndexType* permutation, const IndexType* in_row_ptrs, const IndexType* in_cols, const ValueType* in_vals, const IndexType* out_row_ptrs, IndexType* out_cols, ValueType* out_vals) { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { row_permute_kernel( num_rows, permutation, in_row_ptrs, in_cols, in_vals, out_row_ptrs, out_cols, out_vals, item_ct1); @@ -967,7 +971,7 @@ void inv_row_permute_kernel(size_type num_rows, const IndexType* __restrict__ out_row_ptrs, IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { auto tid = thread::get_subwarp_id_flat(item_ct1); if (tid >= num_rows) { @@ -987,16 +991,17 @@ void inv_row_permute_kernel(size_type num_rows, template void inv_row_permute_kernel(dim3 grid, dim3 block, - size_type dynamic_shared_memory, sycl::queue* queue, - size_type num_rows, const IndexType* permutation, + size_type dynamic_shared_memory, + ::sycl::queue* queue, size_type num_rows, + const IndexType* permutation, const IndexType* in_row_ptrs, const IndexType* in_cols, const ValueType* in_vals, const IndexType* out_row_ptrs, IndexType* out_cols, ValueType* out_vals) { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { inv_row_permute_kernel( num_rows, permutation, in_row_ptrs, in_cols, in_vals, out_row_ptrs, out_cols, out_vals, item_ct1); @@ -1014,7 +1019,7 @@ void inv_symm_permute_kernel(size_type num_rows, const IndexType* __restrict__ out_row_ptrs, IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { auto tid = thread::get_subwarp_id_flat(item_ct1); if (tid >= num_rows) { @@ -1035,16 +1040,16 @@ void inv_symm_permute_kernel(size_type num_rows, template void inv_symm_permute_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, + ::sycl::queue* queue, size_type num_rows, const IndexType* permutation, const IndexType* in_row_ptrs, const IndexType* in_cols, const ValueType* in_vals, const IndexType* out_row_ptrs, IndexType* out_cols, ValueType* out_vals) { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { inv_symm_permute_kernel( num_rows, permutation, in_row_ptrs, 
in_cols, in_vals, out_row_ptrs, out_cols, out_vals, item_ct1); @@ -1461,7 +1466,7 @@ void calc_nnz_in_span(const span row_span, const span col_span, const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ col_idxs, IndexType* __restrict__ nnz_per_row, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto tidx = thread::get_thread_id_flat(item_ct1) + row_span.begin; if (tidx < row_span.end) { @@ -1488,7 +1493,7 @@ void compute_submatrix_idxs_and_vals(size_type num_rows, size_type num_cols, const IndexType* __restrict__ res_row_ptrs, IndexType* __restrict__ res_col_idxs, ValueType* __restrict__ res_values, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto tidx = thread::get_thread_id_flat(item_ct1); if (tidx < num_rows) { @@ -1795,8 +1800,8 @@ void spgemm(std::shared_ptr exec, reinterpret_cast*>(heap); // first sweep: count nnz for each row - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { + queue->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{num_rows}, [=](::sycl::id<1> idx) { const auto a_row = static_cast(idx[0]); c_row_ptrs[a_row] = spgemm_multiway_merge( a_row, a_row_ptrs, a_cols, a_vals, b_row_ptrs, b_cols, b_vals, @@ -1819,8 +1824,8 @@ void spgemm(std::shared_ptr exec, auto c_col_idxs = c_col_idxs_array.get_data(); auto c_vals = c_vals_array.get_data(); - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { + queue->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{num_rows}, [=](::sycl::id<1> idx) { const auto a_row = static_cast(idx[0]); spgemm_multiway_merge( a_row, a_row_ptrs, a_cols, a_vals, b_row_ptrs, b_cols, b_vals, @@ -1880,8 +1885,8 @@ void advanced_spgemm(std::shared_ptr exec, reinterpret_cast*>(heap); // first sweep: count nnz for each row - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { + queue->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{num_rows}, [=](::sycl::id<1> idx) { const auto a_row = static_cast(idx[0]); auto d_nz = d_row_ptrs[a_row]; const auto d_end = d_row_ptrs[a_row + 1]; @@ -1918,8 +1923,8 @@ void advanced_spgemm(std::shared_ptr exec, auto c_col_idxs = c_col_idxs_array.get_data(); auto c_vals = c_vals_array.get_data(); - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { + queue->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{num_rows}, [=](::sycl::id<1> idx) { const auto a_row = static_cast(idx[0]); auto d_nz = d_row_ptrs[a_row]; const auto d_end = d_row_ptrs[a_row + 1]; @@ -1997,8 +2002,8 @@ void spgeam(std::shared_ptr exec, auto queue = exec->get_queue(); // count number of non-zeros per row - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { + queue->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{num_rows}, [=](::sycl::id<1> idx) { const auto row = static_cast(idx[0]); auto a_idx = a_row_ptrs[row]; const auto a_end = a_row_ptrs[row + 1]; @@ -2034,8 +2039,8 @@ void spgeam(std::shared_ptr exec, const auto beta_vals = beta->get_const_values(); // count number of non-zeros per row - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { + queue->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{num_rows}, [=](::sycl::id<1> 
idx) { const auto row = static_cast(idx[0]); auto a_idx = a_row_ptrs[row]; const auto a_end = a_row_ptrs[row + 1]; @@ -2115,8 +2120,8 @@ void generic_transpose(std::shared_ptr exec, auto out_vals = trans->get_values(); components::fill_array(exec, tmp_counts, num_cols, IndexType{}); - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { + queue->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{num_rows}, [=](::sycl::id<1> idx) { const auto row = static_cast(idx[0]); const auto begin = row_ptrs[row]; const auto end = row_ptrs[row + 1]; @@ -2129,8 +2134,8 @@ void generic_transpose(std::shared_ptr exec, components::prefix_sum_nonnegative(exec, tmp_counts, num_cols + 1); exec->copy(num_cols + 1, tmp_counts, out_row_ptrs); - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { + queue->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{num_rows}, [=](::sycl::id<1> idx) { const auto row = static_cast(idx[0]); const auto begin = row_ptrs[row]; const auto end = row_ptrs[row + 1]; @@ -2256,8 +2261,8 @@ void sort_by_column_index(std::shared_ptr exec, const auto row_ptrs = to_sort->get_const_row_ptrs(); auto cols = to_sort->get_col_idxs(); auto vals = to_sort->get_values(); - exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{num_rows}, [=](::sycl::id<1> idx) { const auto row = static_cast(idx[0]); const auto begin = row_ptrs[row]; auto size = row_ptrs[row + 1] - begin; @@ -2316,8 +2321,8 @@ void is_sorted_by_column_index( const auto row_ptrs = to_check->get_const_row_ptrs(); const auto cols = to_check->get_const_col_idxs(); auto is_sorted_device = is_sorted_device_array.get_data(); - exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{num_rows}, [=](::sycl::id<1> idx) { const auto row = static_cast(idx[0]); const auto begin = row_ptrs[row]; const auto end = row_ptrs[row + 1]; @@ -2473,8 +2478,8 @@ void build_lookup(std::shared_ptr exec, const IndexType* storage_offsets, int64* row_desc, int32* storage) { - exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{num_rows}, [=](::sycl::id<1> idx) { const auto row = static_cast(idx[0]); const auto row_begin = row_ptrs[row]; const auto row_len = row_ptrs[row + 1] - row_begin; diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 565c25e82de..4f89dbbf9e6 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -87,7 +87,7 @@ template void transpose(const size_type nrows, const size_type ncols, const ValueType* __restrict__ in, const size_type in_stride, ValueType* __restrict__ out, const size_type out_stride, - Closure op, sycl::nd_item<3> item_ct1, + Closure op, ::sycl::nd_item<3> item_ct1, uninitialized_array& space) { auto local_x = item_ct1.get_local_id(2); @@ -98,7 +98,7 @@ void transpose(const size_type nrows, const size_type ncols, space[local_y * (sg_size + 1) + local_x] = op(in[y * in_stride + x]); } - item_ct1.barrier(sycl::access::fence_space::local_space); + 
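// ---------------------------------------------------------------------------
// Sketch (illustration, not part of the patch): spgemm, spgeam and
// generic_transpose above all follow the same two-sweep pattern: sweep 1
// counts output entries per row (per column for the transpose), a prefix sum
// turns the counts into row pointers, and sweep 2 writes indices and values at
// the reserved offsets. A sequential host sketch of that pattern (COO -> CSR,
// names illustrative only):
#include <cstddef>
#include <numeric>
#include <vector>

struct coo_entry {
    int row;
    int col;
    double val;
};

void coo_to_csr(const std::vector<coo_entry>& entries, int num_rows,
                std::vector<int>& row_ptrs, std::vector<int>& cols,
                std::vector<double>& vals)
{
    row_ptrs.assign(num_rows + 1, 0);
    for (const auto& e : entries) {
        row_ptrs[e.row + 1]++;  // sweep 1: count entries per row
    }
    // prefix sum: counts -> row pointers
    std::partial_sum(row_ptrs.begin(), row_ptrs.end(), row_ptrs.begin());
    cols.resize(entries.size());
    vals.resize(entries.size());
    std::vector<int> offset(row_ptrs.begin(), row_ptrs.end() - 1);
    for (const auto& e : entries) {  // sweep 2: fill at reserved offsets
        const auto idx = offset[e.row]++;
        cols[idx] = e.col;
        vals[idx] = e.val;
    }
}
// ---------------------------------------------------------------------------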
item_ct1.barrier(::sycl::access::fence_space::local_space); x = item_ct1.get_group(1) * sg_size + local_x; y = item_ct1.get_group(2) * sg_size + local_y; if (y < ncols && x < nrows) { @@ -111,7 +111,7 @@ void transpose( const size_type nrows, const size_type ncols, const ValueType* __restrict__ in, const size_type in_stride, ValueType* __restrict__ out, const size_type out_stride, - sycl::nd_item<3> item_ct1, + ::sycl::nd_item<3> item_ct1, uninitialized_array& space) { @@ -121,7 +121,7 @@ void transpose( } template -void transpose(sycl::queue* queue, const matrix::Dense* orig, +void transpose(::sycl::queue* queue, const matrix::Dense* orig, matrix::Dense* trans) { auto size = orig->get_size(); @@ -129,10 +129,10 @@ void transpose(sycl::queue* queue, const matrix::Dense* orig, dim3 grid(ceildiv(size[1], sg_size), ceildiv(size[0], sg_size)); dim3 block(sg_size, sg_size); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, - sycl::access::target::local> + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor, + 0, ::sycl::access_mode::read_write, + ::sycl::access::target::local> space_acc_ct1(cgh); // Can not pass the member to device function directly auto in = orig->get_const_values(); @@ -140,7 +140,7 @@ void transpose(sycl::queue* queue, const matrix::Dense* orig, auto out = trans->get_values(); auto out_stride = trans->get_stride(); cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { transpose(size[0], size[1], in, in_stride, out, out_stride, item_ct1, *space_acc_ct1.get_pointer()); @@ -157,7 +157,7 @@ void conj_transpose( const size_type nrows, const size_type ncols, const ValueType* __restrict__ in, const size_type in_stride, ValueType* __restrict__ out, const size_type out_stride, - sycl::nd_item<3> item_ct1, + ::sycl::nd_item<3> item_ct1, uninitialized_array& space) { @@ -168,21 +168,21 @@ void conj_transpose( template void conj_transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const size_type nrows, + ::sycl::queue* queue, const size_type nrows, const size_type ncols, const ValueType* in, const size_type in_stride, ValueType* out, const size_type out_stride) { constexpr auto sg_size = DeviceConfig::subgroup_size; - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, - sycl::access::target::local> + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor, + 0, ::sycl::access_mode::read_write, + ::sycl::access::target::local> space_acc_ct1(cgh); cgh.parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) __WG_BOUND__(sg_size, sg_size) { + [=](::sycl::nd_item<3> item_ct1) __WG_BOUND__(sg_size, sg_size) { conj_transpose(nrows, ncols, in, in_stride, out, out_stride, item_ct1, *space_acc_ct1.get_pointer()); @@ -308,8 +308,8 @@ void convert_to_coo(std::shared_ptr exec, auto cols = result->get_col_idxs(); auto vals = result->get_values(); - exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(num_rows, [=](sycl::item<1> item) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(num_rows, [=](::sycl::item<1> item) { const auto row = static_cast(item[0]); auto write_to = row_ptrs[row]; @@ -343,8 +343,8 @@ void convert_to_csr(std::shared_ptr exec, auto cols = result->get_col_idxs(); auto vals = result->get_values(); - exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(num_rows, 
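// ---------------------------------------------------------------------------
// Sketch (illustration, not part of the patch): the device query behind the
// transpose/conj_transpose config selection above. A tile of
// sg_size x (sg_size + 1) values must fit into work-group local memory; the
// extra column is the usual padding to reduce local-memory bank conflicts.
#include <cstddef>
#include <sycl/sycl.hpp>

bool tile_fits_in_local_memory(::sycl::queue& q, std::size_t sg_size,
                               std::size_t value_bytes)
{
    const auto local_mem =
        q.get_device().get_info<::sycl::info::device::local_mem_size>();
    return sg_size * (sg_size + 1) * value_bytes <= local_mem;
}
// ---------------------------------------------------------------------------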
[=](sycl::item<1> item) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(num_rows, [=](::sycl::item<1> item) { const auto row = static_cast(item[0]); auto write_to = row_ptrs[row]; @@ -378,8 +378,8 @@ void convert_to_ell(std::shared_ptr exec, auto vals = result->get_values(); const auto stride = result->get_stride(); - exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(num_rows, [=](sycl::item<1> item) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(num_rows, [=](::sycl::item<1> item) { const auto row = static_cast(item[0]); size_type col_idx = 0; for (size_type col = 0; col < num_cols; col++) { @@ -440,8 +440,8 @@ void convert_to_hybrid(std::shared_ptr exec, auto coo_cols = result->get_coo_col_idxs(); auto coo_vals = result->get_coo_values(); - exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(num_rows, [=](sycl::item<1> item) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(num_rows, [=](::sycl::item<1> item) { const auto row = static_cast(item[0]); size_type ell_count = 0; size_type col = 0; @@ -493,8 +493,8 @@ void convert_to_sellp(std::shared_ptr exec, auto vals = result->get_values(); auto col_idxs = result->get_col_idxs(); - exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(num_rows, [=](sycl::item<1> item) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(num_rows, [=](::sycl::item<1> item) { const auto row = static_cast(item[0]); const auto local_row = row % slice_size; const auto slice = row / slice_size; @@ -534,8 +534,8 @@ void convert_to_sparsity_csr(std::shared_ptr exec, const auto row_ptrs = result->get_const_row_ptrs(); auto cols = result->get_col_idxs(); - exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(num_rows, [=](sycl::item<1> item) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(num_rows, [=](::sycl::item<1> item) { const auto row = static_cast(item[0]); auto write_to = row_ptrs[row]; @@ -566,7 +566,7 @@ void transpose(std::shared_ptr exec, return validate(queue, cfg.block_size, sg_size) && sg_size * (sg_size + 1) * sizeof(ValueType) <= queue->get_device() - .get_info(); + .get_info<::sycl::info::device::local_mem_size>(); }, queue, orig, trans); } @@ -588,7 +588,7 @@ void conj_transpose(std::shared_ptr exec, return validate(queue, DCFG_1D::decode<0>(cfg), sg_size) && sg_size * (sg_size + 1) * sizeof(ValueType) <= queue->get_device() - .get_info(); + .get_info<::sycl::info::device::local_mem_size>(); }); const auto sg_size = DCFG_1D::decode<1>(cfg); dim3 grid(ceildiv(size[1], sg_size), ceildiv(size[0], sg_size)); diff --git a/dpcpp/matrix/diagonal_kernels.dp.cpp b/dpcpp/matrix/diagonal_kernels.dp.cpp index 60bc52c18d2..f44eede1dc7 100644 --- a/dpcpp/matrix/diagonal_kernels.dp.cpp +++ b/dpcpp/matrix/diagonal_kernels.dp.cpp @@ -68,7 +68,7 @@ template void apply_to_csr(size_type num_rows, const ValueType* __restrict__ diag, const IndexType* __restrict__ row_ptrs, ValueType* __restrict__ result_values, bool inverse, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { constexpr auto warp_size = config::warp_size; auto warp_tile = diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index 88e2bf40efa..703e92056f6 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -119,7 +119,7 @@ void spmv_kernel( acc::range val, const IndexType* __restrict__ col, const size_type stride, const 
size_type num_stored_elements_per_row, acc::range b, OutputValueType* __restrict__ c, - const size_type c_stride, Closure op, sycl::nd_item<3> item_ct1, + const size_type c_stride, Closure op, ::sycl::nd_item<3> item_ct1, uninitialized_array& storage) { @@ -153,7 +153,7 @@ void spmv_kernel( storage[item_ct1.get_local_id(2)] = 0; } - item_ct1.barrier(sycl::access::fence_space::local_space); + item_ct1.barrier(::sycl::access::fence_space::local_space); auto temp = zero(); if (runnable) { for (size_type idx = @@ -171,7 +171,7 @@ void spmv_kernel( temp); } - item_ct1.barrier(sycl::access::fence_space::local_space); + item_ct1.barrier(::sycl::access::fence_space::local_space); if (runnable && idx_in_worker == 0) { const auto c_ind = x * c_stride + column_id; if (atomic) { @@ -192,7 +192,7 @@ void spmv( acc::range val, const IndexType* __restrict__ col, const size_type stride, const size_type num_stored_elements_per_row, acc::range b, OutputValueType* __restrict__ c, - const size_type c_stride, sycl::nd_item<3> item_ct1, + const size_type c_stride, ::sycl::nd_item<3> item_ct1, uninitialized_array& storage) { @@ -206,21 +206,21 @@ void spmv( template void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const size_type num_rows, + ::sycl::queue* queue, const size_type num_rows, const int num_worker_per_row, acc::range val, const IndexType* col, const size_type stride, const size_type num_stored_elements_per_row, acc::range b, OutputValueType* c, const size_type c_stride) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor< + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor< uninitialized_array, - 0, sycl::access_mode::read_write, sycl::access::target::local> + 0, ::sycl::access_mode::read_write, ::sycl::access::target::local> storage_acc_ct1(cgh); cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { + [=](::sycl::nd_item<3> item_ct1) { spmv( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, @@ -238,7 +238,7 @@ void spmv( const IndexType* __restrict__ col, const size_type stride, const size_type num_stored_elements_per_row, acc::range b, const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c, - const size_type c_stride, sycl::nd_item<3> item_ct1, + const size_type c_stride, ::sycl::nd_item<3> item_ct1, uninitialized_array& storage) { @@ -272,22 +272,22 @@ void spmv( template void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const size_type num_rows, + ::sycl::queue* queue, const size_type num_rows, const int num_worker_per_row, acc::range alpha, acc::range val, const IndexType* col, const size_type stride, const size_type num_stored_elements_per_row, acc::range b, const OutputValueType* beta, OutputValueType* c, const size_type c_stride) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor< + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor< uninitialized_array, - 0, sycl::access_mode::read_write, sycl::access::target::local> + 0, ::sycl::access_mode::read_write, ::sycl::access::target::local> storage_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { spmv( num_rows, num_worker_per_row, alpha, val, col, stride, num_stored_elements_per_row, b, beta, c, c_stride, item_ct1, diff --git a/dpcpp/matrix/sellp_kernels.dp.cpp b/dpcpp/matrix/sellp_kernels.dp.cpp index 85300bb531a..bd16dbb4d75 100644 
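// ---------------------------------------------------------------------------
// Sketch (illustration, not part of the patch): the Dense/ELL kernels above
// stage partial results in work-group local memory through a local accessor
// and synchronize with a work-group barrier. A self-contained sketch of that
// pattern, a per-work-group sum; n is assumed to be a multiple of wg.
#include <cstddef>
#include <sycl/sycl.hpp>

void group_sums(::sycl::queue& q, const float* in, float* out, std::size_t n)
{
    constexpr std::size_t wg = 256;
    q.submit([&](::sycl::handler& cgh) {
        // one scratch array per work-group, allocated in local memory
        ::sycl::accessor<float, 1, ::sycl::access_mode::read_write,
                         ::sycl::access::target::local>
            scratch(::sycl::range<1>{wg}, cgh);
        cgh.parallel_for(
            ::sycl::nd_range<1>{{n}, {wg}}, [=](::sycl::nd_item<1> item) {
                const auto lid = item.get_local_id(0);
                scratch[lid] = in[item.get_global_id(0)];
                item.barrier(::sycl::access::fence_space::local_space);
                if (lid == 0) {
                    float sum = 0.0f;
                    for (std::size_t i = 0; i < wg; ++i) {
                        sum += scratch[i];
                    }
                    out[item.get_group(0)] = sum;
                }
            });
    });
}
// ---------------------------------------------------------------------------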
--- a/dpcpp/matrix/sellp_kernels.dp.cpp +++ b/dpcpp/matrix/sellp_kernels.dp.cpp @@ -76,7 +76,7 @@ void spmv_kernel(size_type num_rows, size_type num_right_hand_sides, const ValueType* __restrict__ a, const IndexType* __restrict__ cols, const ValueType* __restrict__ b, ValueType* __restrict__ c, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto row = thread::get_thread_id_flat(item_ct1); const auto slice_id = row / slice_size; @@ -99,16 +99,14 @@ GKO_ENABLE_DEFAULT_HOST(spmv_kernel, spmv_kernel); template -void advanced_spmv_kernel(size_type num_rows, size_type num_right_hand_sides, - size_type b_stride, size_type c_stride, - size_type slice_size, - const size_type* __restrict__ slice_sets, - const ValueType* __restrict__ alpha, - const ValueType* __restrict__ a, - const IndexType* __restrict__ cols, - const ValueType* __restrict__ b, - const ValueType* __restrict__ beta, - ValueType* __restrict__ c, sycl::nd_item<3> item_ct1) +void advanced_spmv_kernel( + size_type num_rows, size_type num_right_hand_sides, size_type b_stride, + size_type c_stride, size_type slice_size, + const size_type* __restrict__ slice_sets, + const ValueType* __restrict__ alpha, const ValueType* __restrict__ a, + const IndexType* __restrict__ cols, const ValueType* __restrict__ b, + const ValueType* __restrict__ beta, ValueType* __restrict__ c, + ::sycl::nd_item<3> item_ct1) { const auto row = thread::get_thread_id_flat(item_ct1); const auto slice_id = row / slice_size; diff --git a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp index 29ddfc7bb40..6bd3eb11b6e 100644 --- a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp +++ b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp @@ -80,7 +80,7 @@ void device_classical_spmv(const size_type num_rows, const IndexType* __restrict__ row_ptrs, acc::range b, acc::range c, Closure scale, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { using arithmetic_type = typename output_accessor::arithmetic_type; auto subgroup_tile = group::tiled_partition( @@ -119,7 +119,7 @@ void abstract_classical_spmv(const size_type num_rows, const IndexType* __restrict__ row_ptrs, acc::range b, acc::range c, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { using type = typename output_accessor::arithmetic_type; device_classical_spmv( @@ -130,14 +130,14 @@ void abstract_classical_spmv(const size_type num_rows, template void abstract_classical_spmv( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - const size_type num_rows, const MatrixValueType* val, + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, const size_type num_rows, const MatrixValueType* val, const IndexType* col_idxs, const IndexType* row_ptrs, acc::range b, acc::range c) { // only subgroup = 1, so does not need sycl::reqd_sub_group_size queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { abstract_classical_spmv(num_rows, val, col_idxs, row_ptrs, b, c, item_ct1); }); @@ -152,7 +152,7 @@ void abstract_classical_spmv( const IndexType* __restrict__ col_idxs, const IndexType* __restrict__ row_ptrs, acc::range b, const typename output_accessor::storage_type* __restrict__ beta, - acc::range c, sycl::nd_item<3> item_ct1) + acc::range c, ::sycl::nd_item<3> item_ct1) { using type = typename output_accessor::arithmetic_type; const type alpha_val = static_cast(alpha[0]); @@ -167,17 +167,20 @@ void abstract_classical_spmv( 
template -void abstract_classical_spmv( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - const size_type num_rows, const MatrixValueType* alpha, - const MatrixValueType* val, const IndexType* col_idxs, - const IndexType* row_ptrs, acc::range b, - const typename output_accessor::storage_type* beta, - acc::range c) +void abstract_classical_spmv(dim3 grid, dim3 block, + size_type dynamic_shared_memory, + ::sycl::queue* queue, const size_type num_rows, + const MatrixValueType* alpha, + const MatrixValueType* val, + const IndexType* col_idxs, + const IndexType* row_ptrs, + acc::range b, + const typename output_accessor::storage_type* beta, + acc::range c) { // only subgroup = 1, so does not need sycl::reqd_sub_group_size queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { abstract_classical_spmv( num_rows, alpha, val, col_idxs, row_ptrs, b, beta, c, item_ct1); }); @@ -309,8 +312,8 @@ void sort_by_column_index(std::shared_ptr exec, const auto cols = to_sort->get_col_idxs(); auto queue = exec->get_queue(); // build sorted postorder node list for each row - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx_id) { + queue->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{num_rows}, [=](::sycl::id<1> idx_id) { const auto row = idx_id[0]; const auto row_begin = row_ptrs[row]; const auto row_end = row_ptrs[row + 1]; @@ -337,8 +340,8 @@ void is_sorted_by_column_index( const auto row_ptrs = to_check->get_const_row_ptrs(); const auto cols = to_check->get_const_col_idxs(); auto is_sorted_device = gpu_array.get_data(); - exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{num_rows}, [=](::sycl::id<1> idx) { const auto row = static_cast(idx[0]); const auto begin = row_ptrs[row]; const auto end = row_ptrs[row + 1]; diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp index f3e1c2c58ac..fa1bb7a4582 100644 --- a/dpcpp/preconditioner/isai_kernels.dp.cpp +++ b/dpcpp/preconditioner/isai_kernels.dp.cpp @@ -91,7 +91,7 @@ __dpct_inline__ void generic_generate( const IndexType* __restrict__ i_col_idxs, ValueType* __restrict__ i_values, IndexType* __restrict__ excess_rhs_sizes, IndexType* __restrict__ excess_nnz, Callable direct_solve, - sycl::nd_item<3> item_ct1, + ::sycl::nd_item<3> item_ct1, uninitialized_array* storage) { @@ -225,7 +225,7 @@ void generate_l_inverse( const IndexType* __restrict__ i_row_ptrs, const IndexType* __restrict__ i_col_idxs, ValueType* __restrict__ i_values, IndexType* __restrict__ excess_rhs_sizes, - IndexType* __restrict__ excess_nnz, sycl::nd_item<3> item_ct1, + IndexType* __restrict__ excess_nnz, ::sycl::nd_item<3> item_ct1, uninitialized_array* storage) { @@ -259,23 +259,23 @@ void generate_l_inverse( template void generate_l_inverse(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, IndexType num_rows, + ::sycl::queue* queue, IndexType num_rows, const IndexType* m_row_ptrs, const IndexType* m_col_idxs, const ValueType* m_values, const IndexType* i_row_ptrs, const IndexType* i_col_idxs, ValueType* i_values, IndexType* excess_rhs_sizes, IndexType* excess_nnz) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor< + queue->submit([&](::sycl::handler& 
cgh) { + ::sycl::accessor< uninitialized_array, - 0, sycl::access_mode::read_write, sycl::access::target::local> + 0, ::sycl::access_mode::read_write, ::sycl::access::target::local> storage_acc_ct1(cgh); cgh.parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { generate_l_inverse( num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, @@ -295,7 +295,7 @@ void generate_u_inverse( const IndexType* __restrict__ i_row_ptrs, const IndexType* __restrict__ i_col_idxs, ValueType* __restrict__ i_values, IndexType* __restrict__ excess_rhs_sizes, - IndexType* __restrict__ excess_nnz, sycl::nd_item<3> item_ct1, + IndexType* __restrict__ excess_nnz, ::sycl::nd_item<3> item_ct1, uninitialized_array* storage) { @@ -329,23 +329,23 @@ void generate_u_inverse( template void generate_u_inverse(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, IndexType num_rows, + ::sycl::queue* queue, IndexType num_rows, const IndexType* m_row_ptrs, const IndexType* m_col_idxs, const ValueType* m_values, const IndexType* i_row_ptrs, const IndexType* i_col_idxs, ValueType* i_values, IndexType* excess_rhs_sizes, IndexType* excess_nnz) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor< + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor< uninitialized_array, - 0, sycl::access_mode::read_write, sycl::access::target::local> + 0, ::sycl::access_mode::read_write, ::sycl::access::target::local> storage_acc_ct1(cgh); cgh.parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { generate_u_inverse( num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, @@ -365,7 +365,7 @@ void generate_general_inverse( const IndexType* __restrict__ i_row_ptrs, const IndexType* __restrict__ i_col_idxs, ValueType* __restrict__ i_values, IndexType* __restrict__ excess_rhs_sizes, - IndexType* __restrict__ excess_nnz, bool spd, sycl::nd_item<3> item_ct1, + IndexType* __restrict__ excess_nnz, bool spd, ::sycl::nd_item<3> item_ct1, uninitialized_array* storage) { @@ -411,23 +411,23 @@ void generate_general_inverse( template void generate_general_inverse( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - IndexType num_rows, const IndexType* m_row_ptrs, + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, IndexType num_rows, const IndexType* m_row_ptrs, const IndexType* m_col_idxs, const ValueType* m_values, const IndexType* i_row_ptrs, const IndexType* i_col_idxs, ValueType* i_values, IndexType* excess_rhs_sizes, IndexType* excess_nnz, bool spd) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor< + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor< uninitialized_array, - 0, sycl::access_mode::read_write, sycl::access::target::local> + 0, ::sycl::access_mode::read_write, ::sycl::access::target::local> storage_acc_ct1(cgh); cgh.parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { generate_general_inverse( num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, @@ -452,7 +452,7 @@ void generate_excess_system(IndexType num_rows, ValueType* __restrict__ excess_values, ValueType* __restrict__ excess_rhs, size_type e_start, size_type e_end, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto row = 
thread::get_subwarp_id_flat(item_ct1) + @@ -517,8 +517,8 @@ void generate_excess_system(IndexType num_rows, template void generate_excess_system( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - IndexType num_rows, const IndexType* m_row_ptrs, + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, IndexType num_rows, const IndexType* m_row_ptrs, const IndexType* m_col_idxs, const ValueType* m_values, const IndexType* i_row_ptrs, const IndexType* i_col_idxs, const IndexType* excess_rhs_ptrs, const IndexType* excess_nz_ptrs, @@ -527,7 +527,7 @@ void generate_excess_system( size_type e_end) { queue->parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { generate_excess_system( num_rows, m_row_ptrs, m_col_idxs, m_values, @@ -542,7 +542,7 @@ template void scale_excess_solution(const IndexType* __restrict__ excess_block_ptrs, ValueType* __restrict__ excess_solution, size_type e_start, size_type e_end, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto warp_id = thread::get_subwarp_id_flat(item_ct1); @@ -572,13 +572,14 @@ void scale_excess_solution(const IndexType* __restrict__ excess_block_ptrs, template void scale_excess_solution(dim3 grid, dim3 block, - size_type dynamic_shared_memory, sycl::queue* queue, + size_type dynamic_shared_memory, + ::sycl::queue* queue, const IndexType* excess_block_ptrs, ValueType* excess_solution, size_type e_start, size_type e_end) { queue->parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { scale_excess_solution( excess_block_ptrs, excess_solution, e_start, @@ -592,7 +593,7 @@ void copy_excess_solution(IndexType num_rows, const IndexType* __restrict__ excess_rhs_ptrs, const ValueType* __restrict__ excess_solution, ValueType* __restrict__ i_values, size_type e_start, - size_type e_end, sycl::nd_item<3> item_ct1) + size_type e_end, ::sycl::nd_item<3> item_ct1) { const auto excess_row = thread::get_subwarp_id_flat(item_ct1); @@ -621,14 +622,14 @@ void copy_excess_solution(IndexType num_rows, template void copy_excess_solution(dim3 grid, dim3 block, - size_type dynamic_shared_memory, sycl::queue* queue, + size_type dynamic_shared_memory, ::sycl::queue* queue, IndexType num_rows, const IndexType* i_row_ptrs, const IndexType* excess_rhs_ptrs, const ValueType* excess_solution, ValueType* i_values, size_type e_start, size_type e_end) { queue->parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { copy_excess_solution( num_rows, i_row_ptrs, excess_rhs_ptrs, diff --git a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp index 2f5b4c70991..8957c93b7c5 100644 --- a/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_advanced_apply_instantiate.inc.dp.cpp @@ -71,7 +71,7 @@ void advanced_apply( const IndexType* __restrict__ block_ptrs, size_type num_blocks, const ValueType* __restrict__ alpha, const ValueType* __restrict__ b, int32 b_stride, ValueType* __restrict__ x, int32 x_stride, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto block_id = thread::get_subwarp_id(item_ct1); @@ -98,15 +98,15 @@ void advanced_apply( template void advanced_apply( - 
dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - const ValueType* blocks, + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, const ValueType* blocks, preconditioner::block_interleaved_storage_scheme storage_scheme, const IndexType* block_ptrs, size_type num_blocks, const ValueType* alpha, const ValueType* b, int32 b_stride, ValueType* x, int32 x_stride) { queue->parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { advanced_apply( blocks, storage_scheme, block_ptrs, num_blocks, alpha, b, @@ -124,7 +124,7 @@ void advanced_adaptive_apply( const IndexType* __restrict__ block_ptrs, size_type num_blocks, const ValueType* __restrict__ alpha, const ValueType* __restrict__ b, int32 b_stride, ValueType* __restrict__ x, int32 x_stride, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto block_id = thread::get_subwarp_id(item_ct1); @@ -156,8 +156,8 @@ void advanced_adaptive_apply( template void advanced_adaptive_apply( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - const ValueType* blocks, + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, const ValueType* blocks, preconditioner::block_interleaved_storage_scheme storage_scheme, const precision_reduction* block_precisions, const IndexType* block_ptrs, size_type num_blocks, const ValueType* alpha, const ValueType* b, @@ -165,7 +165,7 @@ void advanced_adaptive_apply( { queue->parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { advanced_adaptive_apply( diff --git a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp index b1109c5e0fb..1c3d412495a 100644 --- a/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_generate_instantiate.inc.dp.cpp @@ -123,7 +123,7 @@ void generate( const ValueType* __restrict__ values, ValueType* __restrict__ block_data, preconditioner::block_interleaved_storage_scheme storage_scheme, const IndexType* __restrict__ block_ptrs, size_type num_blocks, - sycl::nd_item<3> item_ct1, + ::sycl::nd_item<3> item_ct1, uninitialized_array* workspace) { const auto block_id = @@ -152,21 +152,21 @@ void generate( template void generate( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - size_type num_rows, const IndexType* row_ptrs, const IndexType* col_idxs, - const ValueType* values, ValueType* block_data, + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, size_type num_rows, const IndexType* row_ptrs, + const IndexType* col_idxs, const ValueType* values, ValueType* block_data, preconditioner::block_interleaved_storage_scheme storage_scheme, const IndexType* block_ptrs, size_type num_blocks) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor< + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor< uninitialized_array, 0, - sycl::access_mode::read_write, sycl::access::target::local> + ::sycl::access_mode::read_write, ::sycl::access::target::local> workspace_acc_ct1(cgh); cgh.parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { generate( num_rows, row_ptrs, col_idxs, values, block_data, @@ -239,7 +239,7 @@ void 
adaptive_generate( remove_complex* __restrict__ conditioning, precision_reduction* __restrict__ block_precisions, const IndexType* __restrict__ block_ptrs, size_type num_blocks, - sycl::nd_item<3> item_ct1, + ::sycl::nd_item<3> item_ct1, uninitialized_array* workspace) { // extract blocks @@ -334,23 +334,23 @@ void adaptive_generate( template void adaptive_generate( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - size_type num_rows, const IndexType* row_ptrs, const IndexType* col_idxs, - const ValueType* values, remove_complex accuracy, - ValueType* block_data, + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, size_type num_rows, const IndexType* row_ptrs, + const IndexType* col_idxs, const ValueType* values, + remove_complex accuracy, ValueType* block_data, preconditioner::block_interleaved_storage_scheme storage_scheme, remove_complex* conditioning, precision_reduction* block_precisions, const IndexType* block_ptrs, size_type num_blocks) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor< + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor< uninitialized_array, 0, - sycl::access_mode::read_write, sycl::access::target::local> + ::sycl::access_mode::read_write, ::sycl::access::target::local> workspace_acc_ct1(cgh); cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { adaptive_generate( diff --git a/dpcpp/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/preconditioner/jacobi_kernels.dp.cpp index 13bc822d036..9a3cf4141e1 100644 --- a/dpcpp/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/preconditioner/jacobi_kernels.dp.cpp @@ -72,7 +72,7 @@ constexpr int default_grid_size = 32 * 32 * 128; void duplicate_array(const precision_reduction* __restrict__ source, size_type source_size, precision_reduction* __restrict__ dest, - size_type dest_size, sycl::nd_item<3> item_ct1) + size_type dest_size, ::sycl::nd_item<3> item_ct1) { auto grid = group::this_grid(item_ct1); if (grid.thread_rank() >= dest_size) { @@ -84,12 +84,12 @@ void duplicate_array(const precision_reduction* __restrict__ source, } void duplicate_array(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, const precision_reduction* source, + ::sycl::queue* queue, const precision_reduction* source, size_type source_size, precision_reduction* dest, size_type dest_size) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { duplicate_array(source, source_size, dest, dest_size, item_ct1); }); } @@ -100,7 +100,7 @@ void compare_adjacent_rows(size_type num_rows, int32 max_block_size, const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ col_idx, bool* __restrict__ matching_next_row, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto warp = group::tiled_partition( group::this_thread_block(item_ct1)); @@ -142,13 +142,13 @@ void compare_adjacent_rows(size_type num_rows, int32 max_block_size, template void compare_adjacent_rows(dim3 grid, dim3 block, - size_type dynamic_shared_memory, sycl::queue* queue, - size_type num_rows, int32 max_block_size, - const IndexType* row_ptrs, const IndexType* col_idx, - bool* matching_next_row) + size_type dynamic_shared_memory, + ::sycl::queue* queue, size_type num_rows, + int32 max_block_size, const IndexType* row_ptrs, + const IndexType* col_idx, bool* matching_next_row) 
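// ---------------------------------------------------------------------------
// Sketch (illustration, not part of the patch): the renaming only affects
// types and functions (::sycl::queue, ::sycl::nd_item, ...). The attribute
// [[sycl::reqd_sub_group_size(N)]] keeps its spelling, because attribute
// namespaces are independent of C++ name lookup: they cannot collide with
// gko::kernels::sycl and cannot be written with a leading "::".
// subgroup_size below is an assumed value for the sketch.
#include <sycl/sycl.hpp>

constexpr int subgroup_size = 16;

void launch(::sycl::queue& q, ::sycl::nd_range<3> range)
{
    q.parallel_for(range, [=](::sycl::nd_item<3> item)
                              [[sycl::reqd_sub_group_size(subgroup_size)]] {
        // every sub-group in this kernel has exactly `subgroup_size`
        // work-items
        (void)item;
    });
}
// ---------------------------------------------------------------------------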
{ queue->parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(config::warp_size)]] { compare_adjacent_rows( num_rows, max_block_size, row_ptrs, col_idx, @@ -184,13 +184,16 @@ void generate_natural_block_pointer(size_type num_rows, int32 max_block_size, } template -void generate_natural_block_pointer( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - size_type num_rows, int32 max_block_size, const bool* matching_next_row, - IndexType* block_ptrs, size_type* num_blocks_arr) +void generate_natural_block_pointer(dim3 grid, dim3 block, + size_type dynamic_shared_memory, + ::sycl::queue* queue, size_type num_rows, + int32 max_block_size, + const bool* matching_next_row, + IndexType* block_ptrs, + size_type* num_blocks_arr) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { generate_natural_block_pointer(num_rows, max_block_size, matching_next_row, block_ptrs, num_blocks_arr); @@ -225,15 +228,13 @@ void agglomerate_supervariables_kernel(int32 max_block_size, } template -void agglomerate_supervariables_kernel(dim3 grid, dim3 block, - size_type dynamic_shared_memory, - sycl::queue* queue, int32 max_block_size, - size_type num_natural_blocks, - IndexType* block_ptrs, - size_type* num_blocks_arr) +void agglomerate_supervariables_kernel( + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, int32 max_block_size, size_type num_natural_blocks, + IndexType* block_ptrs, size_type* num_blocks_arr) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { agglomerate_supervariables_kernel( max_block_size, num_natural_blocks, block_ptrs, num_blocks_arr); }); @@ -246,7 +247,7 @@ void transpose_jacobi( const ValueType* __restrict__ blocks, preconditioner::block_interleaved_storage_scheme storage_scheme, const IndexType* __restrict__ block_ptrs, size_type num_blocks, - ValueType* __restrict__ out_blocks, sycl::nd_item<3> item_ct1) + ValueType* __restrict__ out_blocks, ::sycl::nd_item<3> item_ct1) { const auto block_id = thread::get_subwarp_id(item_ct1); @@ -272,13 +273,13 @@ void transpose_jacobi( template void transpose_jacobi( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - const ValueType* blocks, + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, const ValueType* blocks, preconditioner::block_interleaved_storage_scheme storage_scheme, const IndexType* block_ptrs, size_type num_blocks, ValueType* out_blocks) { queue->parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { transpose_jacobi( @@ -295,7 +296,7 @@ void adaptive_transpose_jacobi( preconditioner::block_interleaved_storage_scheme storage_scheme, const precision_reduction* __restrict__ block_precisions, const IndexType* __restrict__ block_ptrs, size_type num_blocks, - ValueType* __restrict__ out_blocks, sycl::nd_item<3> item_ct1) + ValueType* __restrict__ out_blocks, ::sycl::nd_item<3> item_ct1) { const auto block_id = thread::get_subwarp_id(item_ct1); @@ -330,15 +331,15 @@ void adaptive_transpose_jacobi( template void adaptive_transpose_jacobi( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - const ValueType* blocks, + dim3 grid, 
dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, const ValueType* blocks, preconditioner::block_interleaved_storage_scheme storage_scheme, const precision_reduction* block_precisions, const IndexType* block_ptrs, size_type num_blocks, ValueType* out_blocks) { queue->parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { adaptive_transpose_jacobi( diff --git a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp index 39e78f37d1c..4dde844b589 100644 --- a/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp +++ b/dpcpp/preconditioner/jacobi_simple_apply_instantiate.inc.dp.cpp @@ -72,7 +72,7 @@ void apply( preconditioner::block_interleaved_storage_scheme storage_scheme, const IndexType* __restrict__ block_ptrs, size_type num_blocks, const ValueType* __restrict__ b, int32 b_stride, ValueType* __restrict__ x, - int32 x_stride, sycl::nd_item<3> item_ct1) + int32 x_stride, ::sycl::nd_item<3> item_ct1) { const auto block_id = thread::get_subwarp_id(item_ct1); @@ -98,15 +98,15 @@ void apply( template void apply( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - const ValueType* blocks, + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, const ValueType* blocks, preconditioner::block_interleaved_storage_scheme storage_scheme, const IndexType* block_ptrs, size_type num_blocks, const ValueType* b, int32 b_stride, ValueType* x, int32 x_stride) { queue->parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { apply( blocks, storage_scheme, block_ptrs, num_blocks, b, b_stride, @@ -123,7 +123,7 @@ void adaptive_apply( const precision_reduction* __restrict__ block_precisions, const IndexType* __restrict__ block_ptrs, size_type num_blocks, const ValueType* __restrict__ b, int32 b_stride, ValueType* __restrict__ x, - int32 x_stride, sycl::nd_item<3> item_ct1) + int32 x_stride, ::sycl::nd_item<3> item_ct1) { const auto block_id = thread::get_subwarp_id(item_ct1); @@ -153,8 +153,8 @@ void adaptive_apply( template void adaptive_apply( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - const ValueType* blocks, + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, const ValueType* blocks, preconditioner::block_interleaved_storage_scheme storage_scheme, const precision_reduction* block_precisions, const IndexType* block_ptrs, size_type num_blocks, const ValueType* b, int32 b_stride, ValueType* x, @@ -162,7 +162,7 @@ void adaptive_apply( { queue->parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { adaptive_apply( blocks, storage_scheme, block_precisions, block_ptrs, diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp index fb5633999d1..3c1e5223025 100644 --- a/dpcpp/solver/cb_gmres_kernels.dp.cpp +++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp @@ -82,7 +82,7 @@ constexpr int default_dot_size = default_dot_dim * default_dot_dim; template void zero_matrix_kernel(size_type m, size_type n, size_type stride, ValueType* __restrict__ array, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto tidx = thread::get_thread_id_flat(item_ct1); if (tidx < n) 
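// ---------------------------------------------------------------------------
// Sketch (illustration, not part of the patch): a host-side view of what the
// block-Jacobi apply kernels above compute. For every diagonal block, the
// stored (already inverted) dense block is multiplied with the matching
// segment of b. The real kernels map one sub-warp per block and read the
// block through storage_scheme with per-block strides; both are simplified
// away here and all names are illustrative.
#include <cstddef>
#include <vector>

void block_jacobi_apply(const std::vector<int>& block_ptrs,  // num_blocks + 1
                        const std::vector<std::vector<double>>& blocks,
                        const std::vector<double>& b, std::vector<double>& x)
{
    for (std::size_t k = 0; k + 1 < block_ptrs.size(); ++k) {
        const auto begin = block_ptrs[k];
        const auto size = block_ptrs[k + 1] - begin;
        const auto& block = blocks[k];  // row-major size x size block
        for (int row = 0; row < size; ++row) {
            double sum = 0.0;
            for (int col = 0; col < size; ++col) {
                sum += block[row * size + col] * b[begin + col];
            }
            x[begin + row] = sum;
        }
    }
}
// ---------------------------------------------------------------------------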
{ @@ -102,7 +102,7 @@ template void restart_1_kernel(size_type num_rows, size_type num_rhs, size_type krylov_dim, Accessor3d krylov_bases, ValueType* __restrict__ residual_norm_collection, - size_type stride_residual_nc, sycl::nd_item<3> item_ct1) + size_type stride_residual_nc, ::sycl::nd_item<3> item_ct1) { const auto global_id = thread::get_thread_id_flat(item_ct1); const auto krylov_stride = @@ -130,14 +130,15 @@ void restart_1_kernel(size_type num_rows, size_type num_rhs, template void restart_1_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, size_type num_rhs, - size_type krylov_dim, Accessor3d krylov_bases, + ::sycl::queue* queue, size_type num_rows, + size_type num_rhs, size_type krylov_dim, + Accessor3d krylov_bases, ValueType* residual_norm_collection, size_type stride_residual_nc) { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { restart_1_kernel( num_rows, num_rhs, krylov_dim, krylov_bases, residual_norm_collection, stride_residual_nc, item_ct1); @@ -154,7 +155,7 @@ void restart_2_kernel( const remove_complex* __restrict__ residual_norm, ValueType* __restrict__ residual_norm_collection, Accessor3d krylov_bases, ValueType* __restrict__ next_krylov_basis, size_type stride_next_krylov, - size_type* __restrict__ final_iter_nums, sycl::nd_item<3> item_ct1) + size_type* __restrict__ final_iter_nums, ::sycl::nd_item<3> item_ct1) { const auto global_id = thread::get_thread_id_flat(item_ct1); const auto krylov_stride = @@ -178,16 +179,17 @@ void restart_2_kernel( template void restart_2_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, size_type num_rhs, - const ValueType* residual, size_type stride_residual, + ::sycl::queue* queue, size_type num_rows, + size_type num_rhs, const ValueType* residual, + size_type stride_residual, const remove_complex* residual_norm, ValueType* residual_norm_collection, Accessor3d krylov_bases, ValueType* next_krylov_basis, size_type stride_next_krylov, size_type* final_iter_nums) { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { restart_2_kernel( num_rows, num_rhs, residual, stride_residual, residual_norm, residual_norm_collection, krylov_bases, next_krylov_basis, @@ -200,7 +202,7 @@ void restart_2_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, void increase_final_iteration_numbers_kernel( size_type* __restrict__ final_iter_nums, const stopping_status* __restrict__ stop_status, size_type total_number, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto global_id = thread::get_thread_id_flat(item_ct1); if (global_id < total_number) { @@ -217,7 +219,8 @@ void multinorm2_kernel( size_type num_rows, size_type num_cols, const ValueType* __restrict__ next_krylov_basis, size_type stride_next_krylov, remove_complex* __restrict__ norms, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, + const stopping_status* __restrict__ stop_status, + ::sycl::nd_item<3> item_ct1, uninitialized_array, default_dot_dim*(default_dot_dim + 1)>* reduction_helper_array) @@ -261,22 +264,22 @@ void multinorm2_kernel( template void 
multinorm2_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, + ::sycl::queue* queue, size_type num_rows, size_type num_cols, const ValueType* next_krylov_basis, size_type stride_next_krylov, remove_complex* norms, const stopping_status* stop_status) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor< + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor< uninitialized_array, default_dot_dim*(default_dot_dim + 1)>, - 0, sycl::access_mode::read_write, sycl::access::target::local> + 0, ::sycl::access_mode::read_write, ::sycl::access::target::local> reduction_helper_array_acc_ct1(cgh); cgh.parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(default_dot_dim)]] { multinorm2_kernel( num_rows, num_cols, next_krylov_basis, @@ -292,7 +295,7 @@ void multinorminf_without_stop_kernel( size_type num_rows, size_type num_cols, const ValueType* __restrict__ next_krylov_basis, size_type stride_next_krylov, remove_complex* __restrict__ norms, - size_type stride_norms, sycl::nd_item<3> item_ct1, + size_type stride_norms, ::sycl::nd_item<3> item_ct1, uninitialized_array, default_dot_dim*(default_dot_dim + 1)>* reduction_helper_array) @@ -339,21 +342,21 @@ void multinorminf_without_stop_kernel( template void multinorminf_without_stop_kernel( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - size_type num_rows, size_type num_cols, const ValueType* next_krylov_basis, - size_type stride_next_krylov, remove_complex* norms, - size_type stride_norms) + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, size_type num_rows, size_type num_cols, + const ValueType* next_krylov_basis, size_type stride_next_krylov, + remove_complex* norms, size_type stride_norms) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor< + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor< uninitialized_array, default_dot_dim*(default_dot_dim + 1)>, - 0, sycl::access_mode::read_write, sycl::access::target::local> + 0, ::sycl::access_mode::read_write, ::sycl::access::target::local> reduction_helper_array_acc_ct1(cgh); cgh.parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(default_dot_dim)]] { multinorminf_without_stop_kernel( num_rows, num_cols, next_krylov_basis, @@ -372,7 +375,8 @@ void multinorm2_inf_kernel( size_type stride_next_krylov, remove_complex* __restrict__ norms1, remove_complex* __restrict__ norms2, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, + const stopping_status* __restrict__ stop_status, + ::sycl::nd_item<3> item_ct1, uninitialized_array, (1 + compute_inf) * default_dot_dim*(default_dot_dim + 1)>* @@ -441,22 +445,23 @@ void multinorm2_inf_kernel( template void multinorm2_inf_kernel( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - size_type num_rows, size_type num_cols, const ValueType* next_krylov_basis, - size_type stride_next_krylov, remove_complex* norms1, - remove_complex* norms2, const stopping_status* stop_status) + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, size_type num_rows, size_type num_cols, + const ValueType* next_krylov_basis, size_type stride_next_krylov, + remove_complex* norms1, remove_complex* norms2, + const stopping_status* stop_status) { - queue->submit([&](sycl::handler& cgh) { - 
sycl::accessor< + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor< uninitialized_array, (1 + compute_inf) * default_dot_dim*(default_dot_dim + 1)>, - 0, sycl::access_mode::read_write, sycl::access::target::local> + 0, ::sycl::access_mode::read_write, ::sycl::access::target::local> reduction_helper_array_acc_ct1(cgh); cgh.parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(default_dot_dim)]] { multinorm2_inf_kernel( num_rows, num_cols, next_krylov_basis, @@ -473,7 +478,8 @@ void multidot_kernel( const ValueType* __restrict__ next_krylov_basis, size_type stride_next_krylov, const Accessor3d krylov_bases, ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, + const stopping_status* __restrict__ stop_status, + ::sycl::nd_item<3> item_ct1, uninitialized_array& reduction_helper_array) { /* @@ -533,22 +539,22 @@ void multidot_kernel( template void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, size_type num_cols, - const ValueType* next_krylov_basis, + ::sycl::queue* queue, size_type num_rows, + size_type num_cols, const ValueType* next_krylov_basis, size_type stride_next_krylov, const Accessor3d krylov_bases, ValueType* hessenberg_iter, size_type stride_hessenberg, const stopping_status* stop_status) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, - sycl::access::target::local> + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor, 0, + ::sycl::access_mode::read_write, + ::sycl::access::target::local> reduction_helper_array_acc_ct1(cgh); cgh.parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(dot_dim)]] { multidot_kernel( num_rows, num_cols, next_krylov_basis, @@ -565,7 +571,8 @@ void singledot_kernel( size_type num_rows, const ValueType* __restrict__ next_krylov_basis, size_type stride_next_krylov, const Accessor3d krylov_bases, ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, + const stopping_status* __restrict__ stop_status, + ::sycl::nd_item<3> item_ct1, uninitialized_array& reduction_helper_array) { /* @@ -615,22 +622,22 @@ void singledot_kernel( template void singledot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, + ::sycl::queue* queue, size_type num_rows, const ValueType* next_krylov_basis, size_type stride_next_krylov, const Accessor3d krylov_bases, ValueType* hessenberg_iter, size_type stride_hessenberg, const stopping_status* stop_status) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, - sycl::access::target::local> + queue->submit([&](::sycl::handler& cgh) { + ::sycl::accessor, 0, + ::sycl::access_mode::read_write, + ::sycl::access::target::local> reduction_helper_array_acc_ct1(cgh); cgh.parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(config::warp_size)]] { singledot_kernel( num_rows, next_krylov_basis, stride_next_krylov, @@ -645,12 +652,15 @@ void singledot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, // Must be called with at least `num_rows * stride_next_krylov` threads 
in // total. template -void update_next_krylov_kernel( - size_type num_iters, size_type num_rows, size_type num_cols, - ValueType* __restrict__ next_krylov_basis, size_type stride_next_krylov, - const Accessor3d krylov_bases, - const ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1) +void update_next_krylov_kernel(size_type num_iters, size_type num_rows, + size_type num_cols, + ValueType* __restrict__ next_krylov_basis, + size_type stride_next_krylov, + const Accessor3d krylov_bases, + const ValueType* __restrict__ hessenberg_iter, + size_type stride_hessenberg, + const stopping_status* __restrict__ stop_status, + ::sycl::nd_item<3> item_ct1) { const auto global_id = thread::get_thread_id_flat(item_ct1); const auto row_idx = global_id / stride_next_krylov; @@ -672,15 +682,16 @@ void update_next_krylov_kernel( template void update_next_krylov_kernel( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - size_type num_iters, size_type num_rows, size_type num_cols, - ValueType* next_krylov_basis, size_type stride_next_krylov, - const Accessor3d krylov_bases, const ValueType* hessenberg_iter, - size_type stride_hessenberg, const stopping_status* stop_status) + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, size_type num_iters, size_type num_rows, + size_type num_cols, ValueType* next_krylov_basis, + size_type stride_next_krylov, const Accessor3d krylov_bases, + const ValueType* hessenberg_iter, size_type stride_hessenberg, + const stopping_status* stop_status) { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { update_next_krylov_kernel( num_iters, num_rows, num_cols, next_krylov_basis, stride_next_krylov, krylov_bases, hessenberg_iter, @@ -700,7 +711,7 @@ void update_next_krylov_and_add_kernel( size_type stride_hessenberg, const ValueType* __restrict__ buffer_iter, size_type stride_buffer, const stopping_status* __restrict__ stop_status, const stopping_status* __restrict__ reorth_status, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto global_id = thread::get_thread_id_flat(item_ct1); const auto row_idx = global_id / stride_next_krylov; @@ -726,17 +737,17 @@ void update_next_krylov_and_add_kernel( template void update_next_krylov_and_add_kernel( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - size_type num_iters, size_type num_rows, size_type num_cols, - ValueType* next_krylov_basis, size_type stride_next_krylov, - const Accessor3d krylov_bases, ValueType* hessenberg_iter, - size_type stride_hessenberg, const ValueType* buffer_iter, - size_type stride_buffer, const stopping_status* stop_status, - const stopping_status* reorth_status) + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, size_type num_iters, size_type num_rows, + size_type num_cols, ValueType* next_krylov_basis, + size_type stride_next_krylov, const Accessor3d krylov_bases, + ValueType* hessenberg_iter, size_type stride_hessenberg, + const ValueType* buffer_iter, size_type stride_buffer, + const stopping_status* stop_status, const stopping_status* reorth_status) { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), 
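// ---------------------------------------------------------------------------
// Sketch (illustration, not part of the patch): the Krylov update kernels
// above require at least num_rows * stride work-items in total. The usual way
// such a launch is sized, mirroring the ceildiv helper used elsewhere in this
// patch; names are illustrative.
#include <cstddef>

constexpr std::size_t ceildiv(std::size_t num, std::size_t den)
{
    return (num + den - 1) / den;
}

inline std::size_t num_work_groups(std::size_t num_rows, std::size_t stride,
                                   std::size_t block_size)
{
    // block_size work-items per group, enough groups to cover every entry
    return ceildiv(num_rows * stride, block_size);
}
// ---------------------------------------------------------------------------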
[=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { update_next_krylov_and_add_kernel( num_iters, num_rows, num_cols, next_krylov_basis, stride_next_krylov, krylov_bases, hessenberg_iter, @@ -755,7 +766,7 @@ void check_arnoldi_norms( size_type stride_hessenberg, size_type iter, Accessor3d krylov_bases, const stopping_status* __restrict__ stop_status, stopping_status* __restrict__ reorth_status, - size_type* __restrict__ num_reorth, sycl::nd_item<3> item_ct1) + size_type* __restrict__ num_reorth, ::sycl::nd_item<3> item_ct1) { const remove_complex eta_squared = 1.0 / 2.0; const auto col_idx = thread::get_thread_id_flat(item_ct1); @@ -783,7 +794,7 @@ void check_arnoldi_norms( template void check_arnoldi_norms(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rhs, + ::sycl::queue* queue, size_type num_rhs, remove_complex* arnoldi_norm, size_type stride_norm, ValueType* hessenberg_iter, size_type stride_hessenberg, size_type iter, @@ -791,9 +802,9 @@ void check_arnoldi_norms(dim3 grid, dim3 block, size_type dynamic_shared_memory, const stopping_status* stop_status, stopping_status* reorth_status, size_type* num_reorth) { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { check_arnoldi_norms( num_rhs, arnoldi_norm, stride_norm, hessenberg_iter, stride_hessenberg, iter, krylov_bases, stop_status, @@ -809,7 +820,7 @@ void set_scalar_kernel(size_type num_rhs, size_type num_blocks, size_type stride_residual, const RealValueType* __restrict__ arnoldi_inf, size_type stride_inf, Accessor3d krylov_bases, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { static_assert(!is_complex_s::value, "ValueType must not be complex!"); @@ -836,15 +847,15 @@ void set_scalar_kernel(size_type num_rhs, size_type num_blocks, template void set_scalar_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rhs, + ::sycl::queue* queue, size_type num_rhs, size_type num_blocks, const RealValueType* residual_norm, size_type stride_residual, const RealValueType* arnoldi_inf, size_type stride_inf, Accessor3d krylov_bases) { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { set_scalar_kernel( num_rhs, num_blocks, residual_norm, stride_residual, arnoldi_inf, stride_inf, krylov_bases, item_ct1); @@ -861,7 +872,8 @@ void update_krylov_next_krylov_kernel( ValueType* __restrict__ next_krylov_basis, size_type stride_next_krylov, Accessor3d krylov_bases, const ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1) + const stopping_status* __restrict__ stop_status, + ::sycl::nd_item<3> item_ct1) { const auto global_id = thread::get_thread_id_flat(item_ct1); const auto row_idx = global_id / stride_next_krylov; @@ -883,15 +895,16 @@ void update_krylov_next_krylov_kernel( template void update_krylov_next_krylov_kernel( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, - size_type iter, size_type num_rows, size_type num_cols, - ValueType* next_krylov_basis, size_type stride_next_krylov, - Accessor3d krylov_bases, const 
ValueType* hessenberg_iter, - size_type stride_hessenberg, const stopping_status* stop_status) + dim3 grid, dim3 block, size_type dynamic_shared_memory, + ::sycl::queue* queue, size_type iter, size_type num_rows, + size_type num_cols, ValueType* next_krylov_basis, + size_type stride_next_krylov, Accessor3d krylov_bases, + const ValueType* hessenberg_iter, size_type stride_hessenberg, + const stopping_status* stop_status) { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { update_krylov_next_krylov_kernel( iter, num_rows, num_cols, next_krylov_basis, stride_next_krylov, krylov_bases, hessenberg_iter, @@ -910,7 +923,7 @@ void calculate_Qy_kernel(size_type num_rows, size_type num_cols, ValueType* __restrict__ before_preconditioner, size_type stride_preconditioner, const size_type* __restrict__ final_iter_nums, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto global_id = thread::get_thread_id_flat(item_ct1); const auto row_id = global_id / stride_preconditioner; @@ -927,16 +940,16 @@ void calculate_Qy_kernel(size_type num_rows, size_type num_cols, template void calculate_Qy_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, + ::sycl::queue* queue, size_type num_rows, size_type num_cols, const Accessor3d krylov_bases, const ValueType* y, size_type stride_y, ValueType* before_preconditioner, size_type stride_preconditioner, const size_type* final_iter_nums) { - queue->submit([&](sycl::handler& cgh) { + queue->submit([&](::sycl::handler& cgh) { cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { + [=](::sycl::nd_item<3> item_ct1) { calculate_Qy_kernel( num_rows, num_cols, krylov_bases, y, stride_y, before_preconditioner, stride_preconditioner, diff --git a/dpcpp/solver/common_gmres_kernels.dp.inc b/dpcpp/solver/common_gmres_kernels.dp.inc index e376d5d5521..10a10cda15a 100644 --- a/dpcpp/solver/common_gmres_kernels.dp.inc +++ b/dpcpp/solver/common_gmres_kernels.dp.inc @@ -40,7 +40,7 @@ void initialize_kernel( ValueType *__restrict__ residual, size_type stride_residual, ValueType *__restrict__ givens_sin, size_type stride_sin, ValueType *__restrict__ givens_cos, size_type stride_cos, - stopping_status *__restrict__ stop_status, sycl::nd_item<3> item_ct1) + stopping_status *__restrict__ stop_status, ::sycl::nd_item<3> item_ct1) { const auto global_id = thread::get_thread_id_flat(item_ct1); @@ -67,7 +67,7 @@ void initialize_kernel( template void initialize_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue *queue, size_type num_rows, + ::sycl::queue *queue, size_type num_rows, size_type num_cols, size_type krylov_dim, const ValueType *b, size_type stride_b, ValueType *residual, size_type stride_residual, @@ -75,9 +75,9 @@ void initialize_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, ValueType *givens_cos, size_type stride_cos, stopping_status *stop_status) { - queue->submit([&](sycl::handler &cgh) { + queue->submit([&](::sycl::handler &cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { initialize_kernel( num_rows, num_cols, krylov_dim, b, stride_b, residual, stride_residual, givens_sin, stride_sin, givens_cos, @@ -146,7 +146,7 @@ void givens_rotation_kernel( remove_complex 
*__restrict__ residual_norm, ValueType *__restrict__ residual_norm_collection, size_type stride_residual_norm_collection, - const stopping_status *__restrict__ stop_status, sycl::nd_item<3> item_ct1) + const stopping_status *__restrict__ stop_status, ::sycl::nd_item<3> item_ct1) { const auto col_idx = thread::get_thread_id_flat(item_ct1); @@ -195,7 +195,7 @@ void givens_rotation_kernel( template void givens_rotation_kernel(dim3 grid, dim3 block, - size_type dynamic_shared_memory, sycl::queue *queue, + size_type dynamic_shared_memory, ::sycl::queue *queue, size_type num_rows, size_type num_cols, size_type iter, ValueType *hessenberg_iter, size_type stride_hessenberg, ValueType *givens_sin, @@ -206,9 +206,9 @@ void givens_rotation_kernel(dim3 grid, dim3 block, size_type stride_residual_norm_collection, const stopping_status *stop_status) { - queue->submit([&](sycl::handler &cgh) { + queue->submit([&](::sycl::handler &cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { givens_rotation_kernel( num_rows, num_cols, iter, hessenberg_iter, stride_hessenberg, givens_sin, stride_sin, givens_cos, @@ -227,7 +227,7 @@ void solve_upper_triangular_kernel( size_type stride_residual_norm_collection, const ValueType *__restrict__ hessenberg, size_type stride_hessenberg, ValueType *__restrict__ y, size_type stride_y, - const size_type *__restrict__ final_iter_nums, sycl::nd_item<3> item_ct1) + const size_type *__restrict__ final_iter_nums, ::sycl::nd_item<3> item_ct1) { const auto col_idx = thread::get_thread_id_flat(item_ct1); @@ -253,16 +253,16 @@ void solve_upper_triangular_kernel( template void solve_upper_triangular_kernel( - dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue *queue, + dim3 grid, dim3 block, size_type dynamic_shared_memory, ::sycl::queue *queue, size_type num_cols, size_type num_rhs, const ValueType *residual_norm_collection, size_type stride_residual_norm_collection, const ValueType *hessenberg, size_type stride_hessenberg, ValueType *y, size_type stride_y, const size_type *final_iter_nums) { - queue->submit([&](sycl::handler &cgh) { + queue->submit([&](::sycl::handler &cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { solve_upper_triangular_kernel( num_cols, num_rhs, residual_norm_collection, stride_residual_norm_collection, hessenberg, diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index 5549992f990..8ba72595234 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -75,7 +75,7 @@ template void initialize_m_kernel(size_type subspace_dim, size_type nrhs, ValueType* __restrict__ m_values, size_type m_stride, stopping_status* __restrict__ stop_status, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto global_id = thread::get_thread_id_flat(item_ct1); const auto row = global_id / m_stride; @@ -93,16 +93,16 @@ void initialize_m_kernel(size_type subspace_dim, size_type nrhs, template void initialize_m_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue* stream, size_type subspace_dim, + ::sycl::queue* stream, size_type subspace_dim, size_type nrhs, ValueType* m_values, size_type m_stride, stopping_status* stop_status) { if (nrhs == 0) { return; } - stream->submit([&](sycl::handler& cgh) { + stream->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - 
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { initialize_m_kernel(subspace_dim, nrhs, m_values, m_stride, stop_status, item_ct1); }); @@ -113,7 +113,7 @@ void initialize_m_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, template void orthonormalize_subspace_vectors_kernel( size_type num_rows, size_type num_cols, ValueType* __restrict__ values, - size_type stride, sycl::nd_item<3> item_ct1, + size_type stride, ::sycl::nd_item<3> item_ct1, uninitialized_array& reduction_helper_array) { const auto tidx = thread::get_thread_id_flat(item_ct1); @@ -131,12 +131,12 @@ void orthonormalize_subspace_vectors_kernel( } // Ensure already finish reading this shared memory - item_ct1.barrier(sycl::access::fence_space::local_space); + item_ct1.barrier(::sycl::access::fence_space::local_space); reduction_helper[tidx] = dot; ::gko::kernels::sycl::reduce( group::this_thread_block(item_ct1), reduction_helper, [](const ValueType& a, const ValueType& b) { return a + b; }); - item_ct1.barrier(sycl::access::fence_space::local_space); + item_ct1.barrier(::sycl::access::fence_space::local_space); dot = reduction_helper[0]; for (size_type j = tidx; j < num_cols; j += block_size) { @@ -150,13 +150,13 @@ void orthonormalize_subspace_vectors_kernel( } // Ensure already finish reading this shared memory - item_ct1.barrier(sycl::access::fence_space::local_space); + item_ct1.barrier(::sycl::access::fence_space::local_space); reduction_helper_real[tidx] = norm; ::gko::kernels::sycl::reduce( group::this_thread_block(item_ct1), reduction_helper_real, [](const remove_complex& a, const remove_complex& b) { return a + b; }); - item_ct1.barrier(sycl::access::fence_space::local_space); + item_ct1.barrier(::sycl::access::fence_space::local_space); norm = std::sqrt(reduction_helper_real[0]); for (size_type j = tidx; j < num_cols; j += block_size) { @@ -167,18 +167,18 @@ void orthonormalize_subspace_vectors_kernel( template void orthonormalize_subspace_vectors_kernel( - dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue* stream, + dim3 grid, dim3 block, size_t dynamic_shared_memory, ::sycl::queue* stream, size_type num_rows, size_type num_cols, ValueType* values, size_type stride) { - stream->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, - sycl::access::target::local> + stream->submit([&](::sycl::handler& cgh) { + ::sycl::accessor, 0, + ::sycl::access_mode::read_write, + ::sycl::access::target::local> reduction_helper_array_acc_ct1(cgh); cgh.parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(config::warp_size)]] { orthonormalize_subspace_vectors_kernel( num_rows, num_cols, values, stride, item_ct1, @@ -194,7 +194,8 @@ void solve_lower_triangular_kernel( const ValueType* __restrict__ m_values, size_type m_stride, const ValueType* __restrict__ f_values, size_type f_stride, ValueType* __restrict__ c_values, size_type c_stride, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1) + const stopping_status* __restrict__ stop_status, + ::sycl::nd_item<3> item_ct1) { const auto global_id = thread::get_thread_id_flat(item_ct1); @@ -217,14 +218,14 @@ void solve_lower_triangular_kernel( template void solve_lower_triangular_kernel( - dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue* stream, + dim3 grid, dim3 block, size_t dynamic_shared_memory, ::sycl::queue* stream, size_type 
subspace_dim, size_type nrhs, const ValueType* m_values, size_type m_stride, const ValueType* f_values, size_type f_stride, ValueType* c_values, size_type c_stride, const stopping_status* stop_status) { - stream->submit([&](sycl::handler& cgh) { + stream->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { solve_lower_triangular_kernel( subspace_dim, nrhs, m_values, m_stride, f_values, f_stride, c_values, c_stride, stop_status, item_ct1); @@ -242,7 +243,7 @@ void step_1_kernel(size_type k, size_type num_rows, size_type subspace_dim, const ValueType* __restrict__ g_values, size_type g_stride, ValueType* __restrict__ v_values, size_type v_stride, const stopping_status* __restrict__ stop_status, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto global_id = thread::get_thread_id_flat(item_ct1); const auto row = global_id / nrhs; @@ -264,7 +265,7 @@ void step_1_kernel(size_type k, size_type num_rows, size_type subspace_dim, template void step_1_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue* stream, size_type k, size_type num_rows, + ::sycl::queue* stream, size_type k, size_type num_rows, size_type subspace_dim, size_type nrhs, const ValueType* residual_values, size_type residual_stride, const ValueType* c_values, size_type c_stride, @@ -275,9 +276,9 @@ void step_1_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, if (nrhs == 0) { return; } - stream->submit([&](sycl::handler& cgh) { + stream->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { step_1_kernel(k, num_rows, subspace_dim, nrhs, residual_values, residual_stride, c_values, c_stride, g_values, g_stride, v_values, v_stride, stop_status, @@ -294,7 +295,7 @@ void step_2_kernel(size_type k, size_type num_rows, size_type subspace_dim, const ValueType* __restrict__ c_values, size_type c_stride, ValueType* __restrict__ u_values, size_type u_stride, const stopping_status* __restrict__ stop_status, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto global_id = thread::get_thread_id_flat(item_ct1); const auto row = global_id / nrhs; @@ -316,7 +317,7 @@ void step_2_kernel(size_type k, size_type num_rows, size_type subspace_dim, template void step_2_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue* stream, size_type k, size_type num_rows, + ::sycl::queue* stream, size_type k, size_type num_rows, size_type subspace_dim, size_type nrhs, const ValueType* omega_values, const ValueType* v_values, size_type v_stride, const ValueType* c_values, @@ -326,9 +327,9 @@ void step_2_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, if (nrhs == 0) { return; } - stream->submit([&](sycl::handler& cgh) { + stream->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { step_2_kernel(k, num_rows, subspace_dim, nrhs, omega_values, v_values, v_stride, c_values, c_stride, u_values, u_stride, stop_status, item_ct1); @@ -342,7 +343,8 @@ void multidot_kernel( size_type num_rows, size_type nrhs, const ValueType* __restrict__ p_i, const ValueType* __restrict__ g_k, size_type g_k_stride, ValueType* __restrict__ alpha, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> 
item_ct1, + const stopping_status* __restrict__ stop_status, + ::sycl::nd_item<3> item_ct1, uninitialized_array& reduction_helper_array) { @@ -367,7 +369,7 @@ void multidot_kernel( } } reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res; - item_ct1.barrier(sycl::access::fence_space::local_space); + item_ct1.barrier(::sycl::access::fence_space::local_space); local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx]; const auto tile_block = group::tiled_partition( group::this_thread_block(item_ct1)); @@ -382,21 +384,21 @@ void multidot_kernel( template void multidot_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue* stream, size_type num_rows, size_type nrhs, + ::sycl::queue* stream, size_type num_rows, size_type nrhs, const ValueType* p_i, const ValueType* g_k, size_type g_k_stride, ValueType* alpha, const stopping_status* stop_status) { - stream->submit([&](sycl::handler& cgh) { - sycl::accessor, - 0, sycl::access_mode::read_write, - sycl::access::target::local> + stream->submit([&](::sycl::handler& cgh) { + ::sycl::accessor, + 0, ::sycl::access_mode::read_write, + ::sycl::access::target::local> reduction_helper_array_acc_ct1(cgh); cgh.parallel_for( sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) + [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(default_dot_dim)]] { multidot_kernel( num_rows, nrhs, p_i, g_k, g_k_stride, alpha, @@ -414,7 +416,8 @@ void update_g_k_and_u_kernel( size_type m_stride, const ValueType* __restrict__ g_values, size_type g_stride, ValueType* __restrict__ g_k_values, size_type g_k_stride, ValueType* __restrict__ u_values, size_type u_stride, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1) + const stopping_status* __restrict__ stop_status, + ::sycl::nd_item<3> item_ct1) { const auto tidx = thread::get_thread_id_flat(item_ct1); const auto row = tidx / g_k_stride; @@ -434,22 +437,20 @@ void update_g_k_and_u_kernel( } template -void update_g_k_and_u_kernel(dim3 grid, dim3 block, - size_t dynamic_shared_memory, sycl::queue* stream, - size_type k, size_type i, size_type size, - size_type nrhs, const ValueType* alpha, - const ValueType* m_values, size_type m_stride, - const ValueType* g_values, size_type g_stride, - ValueType* g_k_values, size_type g_k_stride, - ValueType* u_values, size_type u_stride, - const stopping_status* stop_status) +void update_g_k_and_u_kernel( + dim3 grid, dim3 block, size_t dynamic_shared_memory, ::sycl::queue* stream, + size_type k, size_type i, size_type size, size_type nrhs, + const ValueType* alpha, const ValueType* m_values, size_type m_stride, + const ValueType* g_values, size_type g_stride, ValueType* g_k_values, + size_type g_k_stride, ValueType* u_values, size_type u_stride, + const stopping_status* stop_status) { if (g_k_stride == 0) { return; } - stream->submit([&](sycl::handler& cgh) { + stream->submit([&](::sycl::handler& cgh) { cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { + [=](::sycl::nd_item<3> item_ct1) { update_g_k_and_u_kernel( k, i, size, nrhs, alpha, m_values, m_stride, g_values, g_stride, g_k_values, g_k_stride, @@ -465,7 +466,7 @@ void update_g_kernel(size_type k, size_type size, size_type nrhs, size_type g_k_stride, ValueType* __restrict__ g_values, size_type g_stride, const stopping_status* __restrict__ stop_status, - sycl::nd_item<3> item_ct1) + ::sycl::nd_item<3> item_ct1) { const auto tidx = thread::get_thread_id_flat(item_ct1); const auto row = tidx / g_k_stride; @@ -483,7 +484,7 @@ 
void update_g_kernel(size_type k, size_type size, size_type nrhs, template void update_g_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue* stream, size_type k, size_type size, + ::sycl::queue* stream, size_type k, size_type size, size_type nrhs, const ValueType* g_k_values, size_type g_k_stride, ValueType* g_values, size_type g_stride, const stopping_status* stop_status) @@ -491,9 +492,9 @@ void update_g_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, if (g_k_stride == 0) { return; } - stream->submit([&](sycl::handler& cgh) { + stream->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { update_g_kernel(k, size, nrhs, g_k_values, g_k_stride, g_values, g_stride, stop_status, item_ct1); @@ -511,7 +512,8 @@ void update_x_r_and_f_kernel( ValueType* __restrict__ f_values, size_type f_stride, ValueType* __restrict__ r_values, size_type r_stride, ValueType* __restrict__ x_values, size_type x_stride, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1) + const stopping_status* __restrict__ stop_status, + ::sycl::nd_item<3> item_ct1) { const auto global_id = thread::get_thread_id_flat(item_ct1); const auto row = global_id / x_stride; @@ -538,7 +540,7 @@ void update_x_r_and_f_kernel( template void update_x_r_and_f_kernel( - dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue* stream, + dim3 grid, dim3 block, size_t dynamic_shared_memory, ::sycl::queue* stream, size_type k, size_type size, size_type subspace_dim, size_type nrhs, const ValueType* m_values, size_type m_stride, const ValueType* g_values, size_type g_stride, const ValueType* u_values, size_type u_stride, @@ -546,9 +548,9 @@ void update_x_r_and_f_kernel( size_type r_stride, ValueType* x_values, size_type x_stride, const stopping_status* stop_status) { - stream->submit([&](sycl::handler& cgh) { + stream->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { update_x_r_and_f_kernel( k, size, subspace_dim, nrhs, m_values, m_stride, g_values, g_stride, u_values, u_stride, f_values, f_stride, r_values, @@ -564,7 +566,8 @@ void compute_omega_kernel( const ValueType* __restrict__ tht, const remove_complex* __restrict__ residual_norm, ValueType* __restrict__ omega, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1) + const stopping_status* __restrict__ stop_status, + ::sycl::nd_item<3> item_ct1) { const auto global_id = thread::get_thread_id_flat(item_ct1); @@ -586,15 +589,15 @@ void compute_omega_kernel( template void compute_omega_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue* stream, size_type nrhs, + ::sycl::queue* stream, size_type nrhs, const remove_complex kappa, const ValueType* tht, const remove_complex* residual_norm, ValueType* omega, const stopping_status* stop_status) { - stream->submit([&](sycl::handler& cgh) { + stream->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) { compute_omega_kernel(nrhs, kappa, tht, residual_norm, omega, stop_status, item_ct1); }); @@ -632,8 +635,8 @@ void initialize_subspace_vectors(std::shared_ptr exec, auto n = subspace_vectors->get_size()[0] * subspace_vectors->get_stride(); n = is_complex() ? 
2 * n : n; - exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>(n), [=](sycl::item<1> idx) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>(n), [=](::sycl::item<1> idx) { std::uint64_t offset = idx.get_linear_id(); oneapi::dpl::minstd_rand engine(seed, offset); oneapi::dpl::normal_distribution> diff --git a/dpcpp/stop/criterion_kernels.dp.cpp b/dpcpp/stop/criterion_kernels.dp.cpp index 3a24092ecac..9b8c90fa93a 100644 --- a/dpcpp/stop/criterion_kernels.dp.cpp +++ b/dpcpp/stop/criterion_kernels.dp.cpp @@ -56,8 +56,8 @@ void set_all_statuses(std::shared_ptr exec, { auto size = stop_status->get_num_elems(); stopping_status* __restrict__ stop_status_ptr = stop_status->get_data(); - exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{size}, [=](sycl::id<1> idx_id) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{size}, [=](::sycl::id<1> idx_id) { const auto idx = idx_id[0]; stop_status_ptr[idx].stop(stoppingId, setFinalized); }); diff --git a/dpcpp/stop/residual_norm_kernels.dp.cpp b/dpcpp/stop/residual_norm_kernels.dp.cpp index 76295864840..34249c7cab2 100644 --- a/dpcpp/stop/residual_norm_kernels.dp.cpp +++ b/dpcpp/stop/residual_norm_kernels.dp.cpp @@ -68,8 +68,8 @@ void residual_norm(std::shared_ptr exec, static_assert(is_complex_s::value == false, "ValueType must not be complex in this function!"); auto device_storage_val = device_storage->get_data(); - exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{1}, [=](sycl::id<1>) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{1}, [=](::sycl::id<1>) { device_storage_val[0] = true; device_storage_val[1] = false; }); @@ -78,9 +78,9 @@ void residual_norm(std::shared_ptr exec, auto orig_tau_val = orig_tau->get_const_values(); auto tau_val = tau->get_const_values(); auto stop_status_val = stop_status->get_data(); - exec->get_queue()->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl::range<1>{tau->get_size()[1]}, [=](sycl::id<1> idx_id) { + ::sycl::range<1>{tau->get_size()[1]}, [=](::sycl::id<1> idx_id) { const auto tidx = idx_id[0]; if (tau_val[tidx] < rel_residual_goal * orig_tau_val[tidx]) { stop_status_val[tidx].converge(stoppingId, setFinalized); @@ -124,8 +124,8 @@ void implicit_residual_norm( array* device_storage, bool* all_converged, bool* one_changed) { auto device_storage_val = device_storage->get_data(); - exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{1}, [=](sycl::id<1>) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{1}, [=](::sycl::id<1>) { device_storage_val[0] = true; device_storage_val[1] = false; }); @@ -134,9 +134,9 @@ void implicit_residual_norm( auto orig_tau_val = orig_tau->get_const_values(); auto tau_val = tau->get_const_values(); auto stop_status_val = stop_status->get_data(); - exec->get_queue()->submit([&](sycl::handler& cgh) { + exec->get_queue()->submit([&](::sycl::handler& cgh) { cgh.parallel_for( - sycl::range<1>{tau->get_size()[1]}, [=](sycl::id<1> idx_id) { + ::sycl::range<1>{tau->get_size()[1]}, [=](::sycl::id<1> idx_id) { const auto tidx = idx_id[0]; if (std::sqrt(std::abs(tau_val[tidx])) < rel_residual_goal * orig_tau_val[tidx]) { diff --git a/dpcpp/test/base/executor.dp.cpp b/dpcpp/test/base/executor.dp.cpp index 
19f03039c73..976102a217b 100644 --- a/dpcpp/test/base/executor.dp.cpp +++ b/dpcpp/test/base/executor.dp.cpp @@ -115,7 +115,8 @@ TEST_F(SyclExecutor, CanGetExecInfo) TEST_F(SyclExecutor, KnowsNumberOfDevicesOfTypeAll) { - auto count = sycl::device::get_devices(sycl::info::device_type::all).size(); + auto count = + sycl::device::get_devices(::sycl::info::device_type::all).size(); auto num_devices = gko::SyclExecutor::get_num_devices("all"); @@ -125,7 +126,8 @@ TEST_F(SyclExecutor, KnowsNumberOfDevicesOfTypeAll) TEST_F(SyclExecutor, KnowsNumberOfDevicesOfTypeCPU) { - auto count = sycl::device::get_devices(sycl::info::device_type::cpu).size(); + auto count = + sycl::device::get_devices(::sycl::info::device_type::cpu).size(); auto num_devices = gko::SyclExecutor::get_num_devices("cpu"); @@ -135,7 +137,8 @@ TEST_F(SyclExecutor, KnowsNumberOfDevicesOfTypeCPU) TEST_F(SyclExecutor, KnowsNumberOfDevicesOfTypeGPU) { - auto count = sycl::device::get_devices(sycl::info::device_type::gpu).size(); + auto count = + sycl::device::get_devices(::sycl::info::device_type::gpu).size(); auto num_devices = gko::SyclExecutor::get_num_devices("gpu"); @@ -146,7 +149,8 @@ TEST_F(SyclExecutor, KnowsNumberOfDevicesOfTypeGPU) TEST_F(SyclExecutor, KnowsNumberOfDevicesOfTypeAccelerator) { auto count = - sycl::device::get_devices(sycl::info::device_type::accelerator).size(); + sycl::device::get_devices(::sycl::info::device_type::accelerator) + .size(); auto num_devices = gko::SyclExecutor::get_num_devices("accelerator"); @@ -197,7 +201,7 @@ TEST_F(SyclExecutor, CopiesDataToCPU) is_set.set_executor(sycl); ASSERT_NO_THROW(sycl->synchronize()); - ASSERT_NO_THROW(sycl->get_queue()->submit([&](sycl::handler& cgh) { + ASSERT_NO_THROW(sycl->get_queue()->submit([&](::sycl::handler& cgh) { auto* is_set_ptr = is_set.get_data(); cgh.single_task([=]() { check_data(copy, is_set_ptr); }); })); @@ -217,7 +221,7 @@ TEST_F(SyclExecutor, CopiesDataFromCPU) { int copy[2]; auto orig = sycl->alloc(2); - sycl->get_queue()->submit([&](sycl::handler& cgh) { + sycl->get_queue()->submit([&](::sycl::handler& cgh) { cgh.single_task([=]() { init_data(orig); }); }); @@ -238,7 +242,7 @@ TEST_F(SyclExecutor, CopiesDataFromSyclToSycl) int copy[2]; gko::array is_set(ref, 1); auto orig = sycl->alloc(2); - sycl->get_queue()->submit([&](sycl::handler& cgh) { + sycl->get_queue()->submit([&](::sycl::handler& cgh) { cgh.single_task([=]() { init_data(orig); }); }); @@ -246,7 +250,7 @@ TEST_F(SyclExecutor, CopiesDataFromSyclToSycl) sycl2->copy_from(sycl, 2, orig, copy_sycl2); // Check that the data is really on GPU is_set.set_executor(sycl2); - ASSERT_NO_THROW(sycl2->get_queue()->submit([&](sycl::handler& cgh) { + ASSERT_NO_THROW(sycl2->get_queue()->submit([&](::sycl::handler& cgh) { auto* is_set_ptr = is_set.get_data(); cgh.single_task([=]() { check_data(copy_sycl2, is_set_ptr); }); })); @@ -278,9 +282,9 @@ TEST_F(SyclExecutor, FreeAfterKernel) gko::array y(sycl, length); auto x_val = x.get_data(); auto y_val = y.get_data(); - sycl->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl::range<1>{length}, - [=](sycl::id<1> i) { y_val[i] += x_val[i]; }); + sycl->get_queue()->submit([&](::sycl::handler& cgh) { + cgh.parallel_for(::sycl::range<1>{length}, + [=](::sycl::id<1> i) { y_val[i] += x_val[i]; }); }); } // to ensure everything on queue is finished. 
diff --git a/dpcpp/test/components/cooperative_groups.dp.cpp b/dpcpp/test/components/cooperative_groups.dp.cpp
index d48d3a0d6a9..35d53eac9e1 100644
--- a/dpcpp/test/components/cooperative_groups.dp.cpp
+++ b/dpcpp/test/components/cooperative_groups.dp.cpp
@@ -113,7 +113,7 @@ class CooperativeGroups : public testing::TestWithParam {
 // kernel implementation
 template
-void cg_shuffle(bool* s, sycl::nd_item<3> item_ct1)
+void cg_shuffle(bool* s, ::sycl::nd_item<3> item_ct1)
 {
     constexpr auto sg_size = DeviceConfig::subgroup_size;
     auto group =
@@ -130,13 +130,13 @@ void cg_shuffle(bool* s, sycl::nd_item<3> item_ct1)
 // group all kernel things together
 template
 void cg_shuffle_host(dim3 grid, dim3 block,
-                     gko::size_type dynamic_shared_memory, sycl::queue* queue,
+                     gko::size_type dynamic_shared_memory, ::sycl::queue* queue,
                      bool* s)
 {
-    queue->submit([&](sycl::handler& cgh) {
+    queue->submit([&](::sycl::handler& cgh) {
         cgh.parallel_for(
             sycl_nd_range(grid, block),
-            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
+            [=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
                 DeviceConfig::subgroup_size)]] __WG_BOUND__(DeviceConfig::
                                                                 block_size) {
                 cg_shuffle(s, item_ct1);
@@ -151,7 +151,7 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION_TOTYPE(cg_shuffle_config,
 // the call
 void cg_shuffle_config_call(std::uint32_t desired_cfg, dim3 grid, dim3 block,
                             gko::size_type dynamic_shared_memory,
-                            sycl::queue* queue, bool* s)
+                            ::sycl::queue* queue, bool* s)
 {
     cg_shuffle_config(
         default_config_list,
@@ -169,7 +169,7 @@ TEST_P(CooperativeGroups, Shuffle)
 template
-void cg_all(bool* s, sycl::nd_item<3> item_ct1)
+void cg_all(bool* s, ::sycl::nd_item<3> item_ct1)
 {
     constexpr auto sg_size = DeviceConfig::subgroup_size;
     auto group =
@@ -190,7 +190,7 @@ TEST_P(CooperativeGroups, All) { test_all_subgroup(cg_all_call); }
 template
-void cg_any(bool* s, sycl::nd_item<3> item_ct1)
+void cg_any(bool* s, ::sycl::nd_item<3> item_ct1)
 {
     constexpr auto sg_size = DeviceConfig::subgroup_size;
     auto group =
@@ -210,7 +210,7 @@ TEST_P(CooperativeGroups, Any) { test_all_subgroup(cg_any_call); }
 template
-void cg_ballot(bool* s, sycl::nd_item<3> item_ct1)
+void cg_ballot(bool* s, ::sycl::nd_item<3> item_ct1)
 {
     constexpr auto sg_size = cfg::subgroup_size;
     auto group =
diff --git a/dpcpp/test_dpcpp.dp.cpp b/dpcpp/test_dpcpp.dp.cpp
index 80627b6498b..f9984698af4 100644
--- a/dpcpp/test_dpcpp.dp.cpp
+++ b/dpcpp/test_dpcpp.dp.cpp
@@ -36,6 +36,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 int main()
 {
     // Use the queue property `in_order` which is DPC++ only
-    sycl::queue myQueue{sycl::property::queue::in_order{}};
+    ::sycl::queue myQueue{sycl::property::queue::in_order{}};
     return 0;
 }
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 7f91c08db0d..53a5e01ad83 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -1969,7 +1969,7 @@ class SyclExecutor : public detail::ExecutorBase,
         return this->get_exec_info().device_id;
     }
 
-    sycl::queue* get_queue() const { return queue_.get(); }
+    ::sycl::queue* get_queue() const { return queue_.get(); }
 
     /**
      * Get the number of devices present on the system.
@@ -2088,7 +2088,7 @@ class SyclExecutor : public detail::ExecutorBase,
     template
     using queue_manager = std::unique_ptr>;
 
-    queue_manager queue_;
+    queue_manager<::sycl::queue> queue_;
 };
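
Editorial note (not part of the patch): the mechanical `sycl::` to `::sycl::` change above is needed because these kernels live inside the nested namespace `gko::kernels::sycl` (visible in the `::gko::kernels::sycl::reduce` calls), where an unqualified `sycl::` resolves to the enclosing Ginkgo namespace instead of the global SYCL runtime namespace. A minimal sketch of the ambiguity, using a hypothetical `fill_ones` kernel that is not part of Ginkgo:

```cpp
#include <cstddef>
#include <sycl/sycl.hpp>

namespace gko {
namespace kernels {
namespace sycl {

// Inside this namespace, plain `sycl::handler` would be looked up as
// `gko::kernels::sycl::handler` (which does not exist), so the global
// namespace has to be spelled out as `::sycl::`.
void fill_ones(::sycl::queue* queue, int* data, std::size_t n)
{
    queue->submit([&](::sycl::handler& cgh) {
        cgh.parallel_for(::sycl::range<1>{n},
                         [=](::sycl::id<1> idx) { data[idx[0]] = 1; });
    });
}

}  // namespace sycl
}  // namespace kernels
}  // namespace gko
```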