Skip to content

Commit

Permalink
use ::sycl for anything from sycl
Browse files Browse the repository at this point in the history
  • Loading branch information
yhmtsai committed Aug 22, 2023
1 parent c325a59 commit 17302d4
Show file tree
Hide file tree
Showing 52 changed files with 985 additions and 936 deletions.
17 changes: 8 additions & 9 deletions benchmark/utils/dpcpp_timer.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,30 +74,29 @@ class SyclTimer : public Timer {
{
exec_->synchronize();
// Currently, gko::SyclExecutor always use default stream.
start_ = exec_->get_queue()->submit([&](sycl::handler& cgh) {
cgh.parallel_for(1, [=](sycl::id<1> id) {});
start_ = exec_->get_queue()->submit([&](::sycl::handler& cgh) {
cgh.parallel_for(1, [=](::sycl::id<1> id) {});
});
}

double toc_impl() override
{
auto stop = exec_->get_queue()->submit([&](sycl::handler& cgh) {
cgh.parallel_for(1, [=](sycl::id<1> id) {});
auto stop = exec_->get_queue()->submit([&](::sycl::handler& cgh) {
cgh.parallel_for(1, [=](::sycl::id<1> id) {});
});
stop.wait_and_throw();
// get the start time of stop
auto stop_time = stop.get_profiling_info<
sycl::info::event_profiling::command_start>();
::sycl::info::event_profiling::command_start>();
// get the end time of start
auto start_time =
start_
.get_profiling_info<sycl::info::event_profiling::command_end>();
auto start_time = start_.get_profiling_info<
::sycl::info::event_profiling::command_end>();
return (stop_time - start_time) / double{1.0e9};
}

private:
std::shared_ptr<const gko::SyclExecutor> exec_;
sycl::event start_;
::sycl::event start_;
int id_;
};

Expand Down
2 changes: 1 addition & 1 deletion core/device_hooks/dpcpp_hooks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ namespace kernels {
namespace sycl {


void destroy_event(sycl::event* event) GKO_NOT_COMPILED(sycl);
void destroy_event(::sycl::event* event) GKO_NOT_COMPILED(sycl);


} // namespace sycl
Expand Down
44 changes: 22 additions & 22 deletions dpcpp/base/batch_multi_vector_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,16 +79,16 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
const auto num_batches = x_ub.num_batch_items;
auto device = exec->get_queue()->get_device();
auto group_size =
device.get_info<sycl::info::device::max_work_group_size>();
device.get_info<::sycl::info::device::max_work_group_size>();

const dim3 block(group_size);
const dim3 grid(num_batches);

// Launch a kernel that has nbatches blocks, each block has max group size
if (alpha->get_common_size()[1] == 1) {
exec->get_queue()->submit([&](sycl::handler& cgh) {
exec->get_queue()->submit([&](::sycl::handler& cgh) {
cgh.parallel_for(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) {
auto group = item_ct1.get_group();
auto group_id = group.get_group_linear_id();
const auto alpha_b =
Expand All @@ -99,9 +99,9 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
});
});
} else {
exec->get_queue()->submit([&](sycl::handler& cgh) {
exec->get_queue()->submit([&](::sycl::handler& cgh) {
cgh.parallel_for(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) {
auto group = item_ct1.get_group();
auto group_id = group.get_group_linear_id();
const auto alpha_b =
Expand Down Expand Up @@ -130,17 +130,17 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
const auto num_batches = x->get_num_batch_items();
auto device = exec->get_queue()->get_device();
auto group_size =
device.get_info<sycl::info::device::max_work_group_size>();
device.get_info<::sycl::info::device::max_work_group_size>();

const dim3 block(group_size);
const dim3 grid(num_batches);
const auto alpha_ub = get_batch_struct(alpha);
const auto x_ub = get_batch_struct(x);
const auto y_ub = get_batch_struct(y);
if (alpha->get_common_size()[1] == 1) {
exec->get_queue()->submit([&](sycl::handler& cgh) {
exec->get_queue()->submit([&](::sycl::handler& cgh) {
cgh.parallel_for(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) {
auto group = item_ct1.get_group();
auto group_id = group.get_group_linear_id();
const auto alpha_b =
Expand All @@ -152,9 +152,9 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
});
});
} else {
exec->get_queue()->submit([&](sycl::handler& cgh) {
exec->get_queue()->submit([&](::sycl::handler& cgh) {
cgh.parallel_for(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) {
auto group = item_ct1.get_group();
auto group_id = group.get_group_linear_id();
const auto alpha_b =
Expand Down Expand Up @@ -185,16 +185,16 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
const auto num_batches = x_ub.num_batch_items;
auto device = exec->get_queue()->get_device();
auto group_size =
device.get_info<sycl::info::device::max_work_group_size>();
device.get_info<::sycl::info::device::max_work_group_size>();

const dim3 block(group_size);
const dim3 grid(num_batches);

// TODO: Remove reqd_sub_group size and use sycl::reduce_over_group
exec->get_queue()->submit([&](sycl::handler& cgh) {
exec->get_queue()->submit([&](::sycl::handler& cgh) {
cgh.parallel_for(
sycl_nd_range(grid, block),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
[=](::sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
config::warp_size)]] {
auto group = item_ct1.get_group();
auto group_id = group.get_group_linear_id();
Expand Down Expand Up @@ -224,15 +224,15 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
const auto num_batches = x_ub.num_batch_items;
auto device = exec->get_queue()->get_device();
auto group_size =
device.get_info<sycl::info::device::max_work_group_size>();
device.get_info<::sycl::info::device::max_work_group_size>();

const dim3 block(group_size);
const dim3 grid(num_batches);

exec->get_queue()->submit([&](sycl::handler& cgh) {
exec->get_queue()->submit([&](::sycl::handler& cgh) {
cgh.parallel_for(
sycl_nd_range(grid, block),
[=](sycl::nd_item<3> item_ct1)
[=](::sycl::nd_item<3> item_ct1)
[[sycl::reqd_sub_group_size(config::warp_size)]] {
auto group = item_ct1.get_group();
auto group_id = group.get_group_linear_id();
Expand Down Expand Up @@ -262,14 +262,14 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
const auto num_batches = x_ub.num_batch_items;
auto device = exec->get_queue()->get_device();
auto group_size =
device.get_info<sycl::info::device::max_work_group_size>();
device.get_info<::sycl::info::device::max_work_group_size>();

const dim3 block(group_size);
const dim3 grid(num_batches);

exec->get_queue()->submit([&](sycl::handler& cgh) {
exec->get_queue()->submit([&](::sycl::handler& cgh) {
cgh.parallel_for(sycl_nd_range(grid, block),
[=](sycl::nd_item<3> item_ct1)
[=](::sycl::nd_item<3> item_ct1)
[[sycl::reqd_sub_group_size(config::warp_size)]] {
auto group = item_ct1.get_group();
auto group_id = group.get_group_linear_id();
Expand Down Expand Up @@ -297,14 +297,14 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
const auto num_batches = x_ub.num_batch_items;
auto device = exec->get_queue()->get_device();
auto group_size =
device.get_info<sycl::info::device::max_work_group_size>();
device.get_info<::sycl::info::device::max_work_group_size>();

const dim3 block(group_size);
const dim3 grid(num_batches);

exec->get_queue()->submit([&](sycl::handler& cgh) {
exec->get_queue()->submit([&](::sycl::handler& cgh) {
cgh.parallel_for(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
sycl_nd_range(grid, block), [=](::sycl::nd_item<3> item_ct1) {
auto group = item_ct1.get_group();
auto group_id = group.get_group_linear_id();
const auto x_b = batch::extract_batch_item(x_ub, group_id);
Expand Down
14 changes: 7 additions & 7 deletions dpcpp/base/batch_multi_vector_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ template <typename ValueType, typename Mapping>
__dpct_inline__ void scale_kernel(
const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
const gko::batch::multi_vector::batch_item<ValueType>& x,
sycl::nd_item<3>& item_ct1, Mapping map)
::sycl::nd_item<3>& item_ct1, Mapping map)
{
const int max_li = x.num_rows * x.num_rhs;
for (int li = item_ct1.get_local_linear_id(); li < max_li;
Expand All @@ -53,7 +53,7 @@ __dpct_inline__ void add_scaled_kernel(
const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
const gko::batch::multi_vector::batch_item<const ValueType>& x,
const gko::batch::multi_vector::batch_item<ValueType>& y,
sycl::nd_item<3>& item_ct1, Mapping map)
::sycl::nd_item<3>& item_ct1, Mapping map)
{
const int max_li = x.num_rows * x.num_rhs;
for (int li = item_ct1.get_local_id(2); li < max_li;
Expand All @@ -72,12 +72,12 @@ __dpct_inline__ void compute_gen_dot_product_kernel(
const gko::batch::multi_vector::batch_item<const ValueType>& x,
const gko::batch::multi_vector::batch_item<const ValueType>& y,
const gko::batch::multi_vector::batch_item<ValueType>& result,
sycl::nd_item<3>& item_ct1, Mapping conj_map)
::sycl::nd_item<3>& item_ct1, Mapping conj_map)
{
constexpr auto tile_size = config::warp_size;
auto subg =
group::tiled_partition<tile_size>(group::this_thread_block(item_ct1));
const auto subgroup = static_cast<sycl::sub_group>(subg);
const auto subgroup = static_cast<::sycl::sub_group>(subg);
const int subgroup_id = subgroup.get_group_id();
const int subgroup_size = subgroup.get_local_range().size();
const int num_subgroups = subgroup.get_group_range().size();
Expand Down Expand Up @@ -107,12 +107,12 @@ __dpct_inline__ void compute_norm2_kernel(
const gko::batch::multi_vector::batch_item<const ValueType>& x,
const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
result,
sycl::nd_item<3>& item_ct1)
::sycl::nd_item<3>& item_ct1)
{
constexpr auto tile_size = config::warp_size;
auto subg =
group::tiled_partition<tile_size>(group::this_thread_block(item_ct1));
const auto subgroup = static_cast<sycl::sub_group>(subg);
const auto subgroup = static_cast<::sycl::sub_group>(subg);
const int subgroup_id = subgroup.get_group_id();
const int subgroup_size = subgroup.get_local_range().size();
const int num_subgroups = subgroup.get_group_range().size();
Expand Down Expand Up @@ -140,7 +140,7 @@ template <typename ValueType>
__dpct_inline__ void copy_kernel(
const gko::batch::multi_vector::batch_item<const ValueType>& in,
const gko::batch::multi_vector::batch_item<ValueType>& out,
sycl::nd_item<3>& item_ct1)
::sycl::nd_item<3>& item_ct1)
{
for (int iz = item_ct1.get_local_linear_id(); iz < in.num_rows * in.num_rhs;
iz += item_ct1.get_local_range().size()) {
Expand Down
2 changes: 1 addition & 1 deletion dpcpp/base/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ namespace sycl {


/** calls delete on the given event. */
void destroy_event(sycl::event* event);
void destroy_event(::sycl::event* event);


} // namespace sycl
Expand Down
8 changes: 4 additions & 4 deletions dpcpp/base/dim3.dp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,9 @@ struct dim3 {
* get_range returns the range for sycl with correct ordering (reverse of
* cuda)
*
* @return sycl::range<3>
* @return ::sycl::range<3>
*/
sycl::range<3> get_range() { return sycl::range<3>(z, y, x); }
::sycl::range<3> get_range() { return ::sycl::range<3>(z, y, x); }
};


Expand All @@ -80,11 +80,11 @@ struct dim3 {
*
* @return sycl::nd_range<3>
*/
inline sycl::nd_range<3> sycl_nd_range(dim3 grid, dim3 block)
inline ::sycl::nd_range<3> sycl_nd_range(dim3 grid, dim3 block)
{
auto local_range = block.get_range();
auto global_range = grid.get_range() * local_range;
return sycl::nd_range<3>(global_range, local_range);
return ::sycl::nd_range<3>(global_range, local_range);
}


Expand Down
40 changes: 21 additions & 19 deletions dpcpp/base/executor.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,12 @@ namespace detail {

const std::vector<sycl::device> get_devices(std::string device_type)
{
std::map<std::string, sycl::info::device_type> device_type_map{
{"accelerator", sycl::info::device_type::accelerator},
{"all", sycl::info::device_type::all},
{"cpu", sycl::info::device_type::cpu},
{"host", sycl::info::device_type::host},
{"gpu", sycl::info::device_type::gpu}};
std::map<std::string, ::sycl::info::device_type> device_type_map{
{"accelerator", ::sycl::info::device_type::accelerator},
{"all", ::sycl::info::device_type::all},
{"cpu", ::sycl::info::device_type::cpu},
{"host", ::sycl::info::device_type::host},
{"gpu", ::sycl::info::device_type::gpu}};
std::for_each(device_type.begin(), device_type.end(),
[](char& c) { c = std::tolower(c); });
return sycl::device::get_devices(device_type_map.at(device_type));
Expand Down Expand Up @@ -224,7 +224,7 @@ bool SyclExecutor::verify_memory_to(const SyclExecutor* dest_exec) const
namespace detail {


void delete_queue(sycl::queue* queue)
void delete_queue(::sycl::queue* queue)
{
queue->wait();
delete queue;
Expand Down Expand Up @@ -257,7 +257,7 @@ void SyclExecutor::set_device_property(sycl_queue_property property)
if (!device.is_host()) {
try {
auto subgroup_sizes =
device.get_info<sycl::info::device::sub_group_sizes>();
device.get_info<::sycl::info::device::sub_group_sizes>();
for (auto& i : subgroup_sizes) {
this->get_exec_info().subgroup_sizes.push_back(i);
}
Expand All @@ -266,26 +266,26 @@ void SyclExecutor::set_device_property(sycl_queue_property property)
}
}
this->get_exec_info().num_computing_units = static_cast<int>(
device.get_info<sycl::info::device::max_compute_units>());
device.get_info<::sycl::info::device::max_compute_units>());
const auto subgroup_sizes = this->get_exec_info().subgroup_sizes;
if (subgroup_sizes.size()) {
this->get_exec_info().max_subgroup_size = static_cast<int>(
*std::max_element(subgroup_sizes.begin(), subgroup_sizes.end()));
}
this->get_exec_info().max_workgroup_size = static_cast<int>(
device.get_info<sycl::info::device::max_work_group_size>());
device.get_info<::sycl::info::device::max_work_group_size>());
// They change the max_work_item_size with template parameter Dimension after
// major version 6 and adding the default = 3 is not in the same release.
#if GINKGO_DPCPP_MAJOR_VERSION >= 6
auto max_workitem_sizes =
device.get_info<sycl::info::device::max_work_item_sizes<3>>();
device.get_info<::sycl::info::device::max_work_item_sizes<3>>();
#else
auto max_workitem_sizes =
device.get_info<sycl::info::device::max_work_item_sizes>();
device.get_info<::sycl::info::device::max_work_item_sizes>();
#endif
// Get the max dimension of a sycl::id object
// Get the max dimension of a ::sycl::id object
auto max_work_item_dimensions =
device.get_info<sycl::info::device::max_work_item_dimensions>();
device.get_info<::sycl::info::device::max_work_item_dimensions>();
for (uint32 i = 0; i < max_work_item_dimensions; i++) {
this->get_exec_info().max_workitem_sizes.push_back(
max_workitem_sizes[i]);
Expand All @@ -295,10 +295,10 @@ void SyclExecutor::set_device_property(sycl_queue_property property)
if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
#if GINKGO_DPCPP_MAJOR_VERSION >= 6
this->get_exec_info().num_pu_per_cu = device.get_info<
sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
::sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
#else
this->get_exec_info().num_pu_per_cu = device.get_info<
sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
::sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
#endif
} else {
// To make the usage still valid.
Expand All @@ -311,16 +311,18 @@ void SyclExecutor::set_device_property(sycl_queue_property property)
// `wait()` would be needed after every call to a DPC++ function or kernel.
// For example, without `in_order`, doing a copy, a kernel, and a copy, will
// not necessarily happen in that order by default, which we need to avoid.
auto* queue = new sycl::queue{device, detail::get_property_list(property)};
queue_ = std::move(queue_manager<sycl::queue>{queue, detail::delete_queue});
auto* queue =
new ::sycl::queue{device, detail::get_property_list(property)};
queue_ =
std::move(queue_manager<::sycl::queue>{queue, detail::delete_queue});
}


namespace kernels {
namespace sycl {


void destroy_event(sycl::event* event) { delete event; }
void destroy_event(::sycl::event* event) { delete event; }


} // namespace sycl
Expand Down
Loading

0 comments on commit 17302d4

Please sign in to comment.