Skip to content

Commit

Permalink
Add dpcpp kernels
Browse files Browse the repository at this point in the history
Co-authored-by: Phuong Nguyen <[email protected]>
  • Loading branch information
pratikvn and Phuong Nguyen committed Apr 24, 2024
1 parent 45ae7d0 commit 96c2138
Show file tree
Hide file tree
Showing 6 changed files with 509 additions and 16 deletions.
1 change: 0 additions & 1 deletion cuda/preconditioner/batch_jacobi_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
#include "core/synthesizer/implementation_selection.hpp"
#include "cuda/base/batch_struct.hpp"
#include "cuda/base/config.hpp"
#include "cuda/base/exception.cuh"
#include "cuda/base/types.hpp"
#include "cuda/components/cooperative_groups.cuh"
#include "cuda/components/intrinsics.cuh"
Expand Down
66 changes: 62 additions & 4 deletions dpcpp/preconditioner/batch_block_jacobi.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -44,25 +44,83 @@ public:
const batch::matrix::ell::batch_item<const value_type,
const index_type>&,
value_type* const, sycl::nd_item<3> item_ct1)
{}
{
common_generate_for_all_system_matrix_types(batch_id);
item_ct1.barrier(sycl::access::fence_space::local_space);
}

void generate(size_type batch_id,
const batch::matrix::csr::batch_item<const value_type,
const index_type>&,
value_type* const, sycl::nd_item<3> item_ct1)
{}
{
common_generate_for_all_system_matrix_types(batch_id);
item_ct1.barrier(sycl::access::fence_space::local_space);
}

void generate(size_type batch_id,
const batch::matrix::dense::batch_item<const value_type>&,
value_type* const, sycl::nd_item<3> item_ct1)
{}
{
common_generate_for_all_system_matrix_types(batch_id);
item_ct1.barrier(sycl::access::fence_space::local_space);
}

__dpct_inline__ void apply(const int num_rows, const value_type* const r,
value_type* const z,
sycl::nd_item<3> item_ct1) const
{}
{
// Structure-aware SpMV
const auto sg = item_ct1.get_sub_group();
const int sg_id = sg.get_group_id();
const int sg_size = sg.get_local_range().size();
const int num_sg = sg.get_group_range().size();
const int sg_tid = sg.get_local_id();

// one subwarp per row
for (int row_idx = sg_id; row_idx < num_rows; row_idx += num_sg) {
const int block_idx = row_block_map_[row_idx];
const value_type* dense_block_ptr =
blocks_arr_entry_ + gko::detail::batch_jacobi::get_block_offset(
block_idx, blocks_cumulative_offsets_);
const auto stride = gko::detail::batch_jacobi::get_stride(
block_idx, block_ptrs_arr_);

const int idx_start = block_ptrs_arr_[block_idx];
const int idx_end = block_ptrs_arr_[block_idx + 1];
const int bsize = idx_end - idx_start;

const int dense_block_row = row_idx - idx_start;
auto sum = zero<value_type>();

for (int dense_block_col = sg_tid; dense_block_col < bsize;
dense_block_col += sg_size) {
const auto block_val =
dense_block_ptr[dense_block_row * stride +
dense_block_col]; // coalesced accesses
sum += block_val * r[dense_block_col + idx_start];
}

// reduction
sum = sycl::reduce_over_group(sg, sum, sycl::plus<>());

if (sg_tid == 0) {
z[row_idx] = sum;
}
}
}

private:
    // Caches the pointer to this batch item's block storage. Shared by all
    // generate() overloads, since the block data itself was produced earlier
    // and only the per-batch offset depends on batch_id.
    __dpct_inline__ void common_generate_for_all_system_matrix_types(
        size_type batch_id)
    {
        // offset of batch item `batch_id` within the flat blocks array,
        // computed from the cumulative per-block storage sizes
        blocks_arr_entry_ =
            blocks_arr_batch_ +
            gko::detail::batch_jacobi::get_batch_offset(
                batch_id, num_blocks_, blocks_cumulative_offsets_);
    }


const size_type num_blocks_;
const int* const blocks_cumulative_offsets_;
const value_type* const blocks_arr_batch_;
Expand Down
130 changes: 124 additions & 6 deletions dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,15 @@

#include "core/base/batch_struct.hpp"
#include "core/base/utils.hpp"
#include "core/components/prefix_sum_kernels.hpp"
#include "core/matrix/batch_struct.hpp"
#include "core/synthesizer/implementation_selection.hpp"
#include "dpcpp/base/batch_struct.hpp"
#include "dpcpp/base/config.hpp"
#include "dpcpp/base/dim3.dp.hpp"
#include "dpcpp/base/dpct.hpp"
#include "dpcpp/matrix/batch_struct.hpp"
#include "dpcpp/preconditioner/jacobi_common.hpp"


namespace gko {
Expand All @@ -21,11 +27,34 @@ namespace dpcpp {
namespace batch_jacobi {


namespace {


using batch_jacobi_dpcpp_compiled_max_block_sizes =
gko::kernels::dpcpp::jacobi::compiled_kernels;

#include "dpcpp/preconditioner/batch_jacobi_kernels.hpp.inc"


} // namespace


// Computes, for each Jacobi block, the offset of its dense storage within the
// flat blocks array. On exit, blocks_cumulative_offsets[i] is the storage
// offset of block i and blocks_cumulative_offsets[num_blocks] is the total
// storage requirement.
template <typename IndexType>
void compute_cumulative_block_storage(
    std::shared_ptr<const DefaultExecutor> exec, const size_type num_blocks,
    const IndexType* const block_pointers,
    IndexType* const blocks_cumulative_offsets)
{
    // A block spanning rows [block_pointers[id], block_pointers[id + 1]) of
    // size b is stored densely and needs b * b entries.
    (exec->get_queue())->submit([&](sycl::handler& cgh) {
        cgh.parallel_for(num_blocks, [=](auto id) {
            const auto bsize = block_pointers[id + 1] - block_pointers[id];
            blocks_cumulative_offsets[id] = bsize * bsize;
        });
    });
    // The sizes must be fully written before the prefix sum reads them.
    exec->get_queue()->wait();
    // Exclusive prefix sum over num_blocks + 1 entries turns the per-block
    // sizes into offsets; the last entry receives the total storage size.
    components::prefix_sum_nonnegative(exec, blocks_cumulative_offsets,
                                       num_blocks + 1);
}

GKO_INSTANTIATE_FOR_INT32_TYPE(
GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_CUMULATIVE_BLOCK_STORAGE);
Expand All @@ -35,7 +64,15 @@ template <typename IndexType>
// Builds the inverse of block_pointers: for every row, records the index of
// the Jacobi block that owns it. One work-item handles one block.
void find_row_block_map(std::shared_ptr<const DefaultExecutor> exec,
                        const size_type num_blocks,
                        const IndexType* const block_pointers,
                        IndexType* const map_block_to_row)
{
    (exec->get_queue())->submit([&](sycl::handler& cgh) {
        cgh.parallel_for(num_blocks, [=](auto id) {
            // tag every row in [block_pointers[id], block_pointers[id + 1])
            // with its owning block index
            for (int i = block_pointers[id]; i < block_pointers[id + 1]; i++) {
                map_block_to_row[i] = id;
            }
        });
    });
    // NOTE(review): no wait() here — presumably the caller relies on
    // queue ordering before consuming the map; confirm the queue is in-order.
}

GKO_INSTANTIATE_FOR_INT32_TYPE(
GKO_DECLARE_BATCH_BLOCK_JACOBI_FIND_ROW_BLOCK_MAP);
Expand All @@ -46,21 +83,102 @@ void extract_common_blocks_pattern(
std::shared_ptr<const DefaultExecutor> exec,
const gko::matrix::Csr<ValueType, IndexType>* const first_sys_csr,
const size_type num_blocks, const IndexType* const cumulative_block_storage,
const IndexType* const block_pointers, const IndexType* const,
IndexType* const blocks_pattern) GKO_NOT_IMPLEMENTED;
const IndexType* const block_pointers, const IndexType* const map_block_row,
IndexType* const blocks_pattern)
{
const auto nrows = first_sys_csr->get_size()[0];
constexpr int subgroup_size = config::warp_size;
auto device = exec->get_queue()->get_device();
auto group_size =
device.get_info<sycl::info::device::max_work_group_size>();

const dim3 block(group_size);
const dim3 grid(ceildiv(nrows * subgroup_size, group_size));

const auto row_ptrs = first_sys_csr->get_const_row_ptrs();
const auto col_idxs = first_sys_csr->get_const_col_idxs();

(exec->get_queue())->submit([&](sycl::handler& cgh) {
cgh.parallel_for(sycl_nd_range(grid, block),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(subgroup_size)]] {
extract_common_block_pattern_kernel(
static_cast<int>(nrows), row_ptrs,
col_idxs, num_blocks,
cumulative_block_storage, block_pointers,
map_block_row, blocks_pattern, item_ct1);
});
});
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
GKO_DECLARE_BATCH_BLOCK_JACOBI_EXTRACT_PATTERN_KERNEL);


namespace {


// Launches the block-Jacobi generation kernel for one compiled max block
// size. Selected at runtime via GKO_ENABLE_IMPLEMENTATION_SELECTION below.
template <int compiled_max_block_size, typename ValueType, typename IndexType>
void compute_block_jacobi_helper(
    syn::value_list<int, compiled_max_block_size>,
    const batch::matrix::Csr<ValueType, IndexType>* const sys_csr,
    const size_type num_blocks, const IndexType* const cumulative_block_storage,
    const IndexType* const block_pointers,
    const IndexType* const blocks_pattern, ValueType* const blocks,
    std::shared_ptr<const DpcppExecutor> exec)
{
    // constexpr int subwarp_size =
    // gko::kernels::dpcpp::jacobi::get_larger_power(compiled_max_block_size);
    // TODO: Find a way to allow smaller block_sizes (<16)

    // currently every instantiation runs with a full sub-group per block
    constexpr int subgroup_size = config::warp_size;
    auto device = exec->get_queue()->get_device();
    auto group_size =
        device.get_info<sycl::info::device::max_work_group_size>();

    const auto nbatch = sys_csr->get_num_batch_items();
    // number of stored elements per batch item (all items share the pattern)
    const auto nnz = sys_csr->get_num_stored_elements() / nbatch;
    const auto sys_csr_values = sys_csr->get_const_values();

    // one sub-group per (block, batch item) pair
    dim3 block(group_size);
    dim3 grid(ceildiv(num_blocks * nbatch * subgroup_size, group_size));

    (exec->get_queue())->submit([&](sycl::handler& cgh) {
        cgh.parallel_for(sycl_nd_range(grid, block),
                         [=](sycl::nd_item<3> item_ct1)
                             [[intel::reqd_sub_group_size(subgroup_size)]] {
                                 compute_block_jacobi_kernel(
                                     nbatch, static_cast<int>(nnz),
                                     sys_csr_values, num_blocks,
                                     cumulative_block_storage, block_pointers,
                                     blocks_pattern, blocks, item_ct1);
                             });
    });
}

GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_block_jacobi_helper,
compute_block_jacobi_helper);

} // anonymous namespace


// Computes the inverted Jacobi blocks for all batch items. Dispatches to the
// compiled kernel instantiation whose max block size is the smallest one that
// can accommodate the user-requested block size.
template <typename ValueType, typename IndexType>
void compute_block_jacobi(
    std::shared_ptr<const DefaultExecutor> exec,
    const batch::matrix::Csr<ValueType, IndexType>* const sys_csr,
    const uint32 user_given_max_block_size, const size_type num_blocks,
    const IndexType* const cumulative_block_storage,
    const IndexType* const block_pointers,
    const IndexType* const blocks_pattern, ValueType* const blocks)
{
    select_compute_block_jacobi_helper(
        batch_jacobi_dpcpp_compiled_max_block_sizes(),
        // pick the first compiled size that fits the requested block size
        [&](int compiled_block_size) {
            return user_given_max_block_size <= compiled_block_size;
        },
        syn::value_list<int>(), syn::type_list<>(), sys_csr, num_blocks,
        cumulative_block_storage, block_pointers, blocks_pattern, blocks, exec);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_KERNEL);
Expand Down
Loading

0 comments on commit 96c2138

Please sign in to comment.