Skip to content

Commit

Permalink
Add dpcpp kernels
Browse files Browse the repository at this point in the history
Co-authored-by: Phuong Nguyen <[email protected]>
  • Loading branch information
pratikvn and Phuong Nguyen committed Apr 24, 2024
1 parent 45ae7d0 commit 96c2138
Show file tree
Hide file tree
Showing 6 changed files with 509 additions and 16 deletions.
1 change: 0 additions & 1 deletion cuda/preconditioner/batch_jacobi_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
#include "core/synthesizer/implementation_selection.hpp"
#include "cuda/base/batch_struct.hpp"
#include "cuda/base/config.hpp"
#include "cuda/base/exception.cuh"
#include "cuda/base/types.hpp"
#include "cuda/components/cooperative_groups.cuh"
#include "cuda/components/intrinsics.cuh"
Expand Down
66 changes: 62 additions & 4 deletions dpcpp/preconditioner/batch_block_jacobi.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -44,25 +44,83 @@ public:
const batch::matrix::ell::batch_item<const value_type,
const index_type>&,
value_type* const, sycl::nd_item<3> item_ct1)
{}
{
common_generate_for_all_system_matrix_types(batch_id);
item_ct1.barrier(sycl::access::fence_space::local_space);
}

void generate(size_type batch_id,
const batch::matrix::csr::batch_item<const value_type,
const index_type>&,
value_type* const, sycl::nd_item<3> item_ct1)
{}
{
common_generate_for_all_system_matrix_types(batch_id);
item_ct1.barrier(sycl::access::fence_space::local_space);
}

void generate(size_type batch_id,
const batch::matrix::dense::batch_item<const value_type>&,
value_type* const, sycl::nd_item<3> item_ct1)
{}
{
common_generate_for_all_system_matrix_types(batch_id);
item_ct1.barrier(sycl::access::fence_space::local_space);
}

__dpct_inline__ void apply(const int num_rows, const value_type* const r,
value_type* const z,
sycl::nd_item<3> item_ct1) const
{}
{
// Structure-aware SpMV
const auto sg = item_ct1.get_sub_group();
const int sg_id = sg.get_group_id();
const int sg_size = sg.get_local_range().size();
const int num_sg = sg.get_group_range().size();
const int sg_tid = sg.get_local_id();

// one subwarp per row
for (int row_idx = sg_id; row_idx < num_rows; row_idx += num_sg) {
const int block_idx = row_block_map_[row_idx];
const value_type* dense_block_ptr =
blocks_arr_entry_ + gko::detail::batch_jacobi::get_block_offset(
block_idx, blocks_cumulative_offsets_);
const auto stride = gko::detail::batch_jacobi::get_stride(
block_idx, block_ptrs_arr_);

const int idx_start = block_ptrs_arr_[block_idx];
const int idx_end = block_ptrs_arr_[block_idx + 1];
const int bsize = idx_end - idx_start;

const int dense_block_row = row_idx - idx_start;
auto sum = zero<value_type>();

for (int dense_block_col = sg_tid; dense_block_col < bsize;
dense_block_col += sg_size) {
const auto block_val =
dense_block_ptr[dense_block_row * stride +
dense_block_col]; // coalesced accesses
sum += block_val * r[dense_block_col + idx_start];
}

// reduction
sum = sycl::reduce_over_group(sg, sum, sycl::plus<>());

if (sg_tid == 0) {
z[row_idx] = sum;
}
}
}

private:
    // Caches the pointer to this batch item's block storage. Shared by all
    // generate() overloads, since the block data itself was produced earlier
    // and only the per-batch offset depends on batch_id.
    __dpct_inline__ void common_generate_for_all_system_matrix_types(
        size_type batch_id)
    {
        // offset of batch item `batch_id` within the flat blocks array,
        // computed from the cumulative per-block storage sizes
        blocks_arr_entry_ =
            blocks_arr_batch_ +
            gko::detail::batch_jacobi::get_batch_offset(
                batch_id, num_blocks_, blocks_cumulative_offsets_);
    }


const size_type num_blocks_;
const int* const blocks_cumulative_offsets_;
const value_type* const blocks_arr_batch_;
Expand Down
130 changes: 124 additions & 6 deletions dpcpp/preconditioner/batch_jacobi_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,15 @@

#include "core/base/batch_struct.hpp"
#include "core/base/utils.hpp"
#include "core/components/prefix_sum_kernels.hpp"
#include "core/matrix/batch_struct.hpp"
#include "core/synthesizer/implementation_selection.hpp"
#include "dpcpp/base/batch_struct.hpp"
#include "dpcpp/base/config.hpp"
#include "dpcpp/base/dim3.dp.hpp"
#include "dpcpp/base/dpct.hpp"
#include "dpcpp/matrix/batch_struct.hpp"
#include "dpcpp/preconditioner/jacobi_common.hpp"


namespace gko {
Expand All @@ -21,11 +27,34 @@ namespace dpcpp {
namespace batch_jacobi {


namespace {


using batch_jacobi_dpcpp_compiled_max_block_sizes =
gko::kernels::dpcpp::jacobi::compiled_kernels;

#include "dpcpp/preconditioner/batch_jacobi_kernels.hpp.inc"


} // namespace


// Computes, for each Jacobi block, the offset of its dense storage within the
// flat blocks array. On exit, blocks_cumulative_offsets[i] is the storage
// offset of block i and blocks_cumulative_offsets[num_blocks] is the total
// storage requirement.
template <typename IndexType>
void compute_cumulative_block_storage(
    std::shared_ptr<const DefaultExecutor> exec, const size_type num_blocks,
    const IndexType* const block_pointers,
    IndexType* const blocks_cumulative_offsets)
{
    // A block spanning rows [block_pointers[id], block_pointers[id + 1]) of
    // size b is stored densely and needs b * b entries.
    (exec->get_queue())->submit([&](sycl::handler& cgh) {
        cgh.parallel_for(num_blocks, [=](auto id) {
            const auto bsize = block_pointers[id + 1] - block_pointers[id];
            blocks_cumulative_offsets[id] = bsize * bsize;
        });
    });
    // The sizes must be fully written before the prefix sum reads them.
    exec->get_queue()->wait();
    // Exclusive prefix sum over num_blocks + 1 entries turns the per-block
    // sizes into offsets; the last entry receives the total storage size.
    components::prefix_sum_nonnegative(exec, blocks_cumulative_offsets,
                                       num_blocks + 1);
}

GKO_INSTANTIATE_FOR_INT32_TYPE(
GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_CUMULATIVE_BLOCK_STORAGE);
Expand All @@ -35,7 +64,15 @@ template <typename IndexType>
// Builds the inverse of block_pointers: for every row, records the index of
// the Jacobi block that owns it. One work-item handles one block.
void find_row_block_map(std::shared_ptr<const DefaultExecutor> exec,
                        const size_type num_blocks,
                        const IndexType* const block_pointers,
                        IndexType* const map_block_to_row)
{
    (exec->get_queue())->submit([&](sycl::handler& cgh) {
        cgh.parallel_for(num_blocks, [=](auto id) {
            // tag every row in [block_pointers[id], block_pointers[id + 1])
            // with its owning block index
            for (int i = block_pointers[id]; i < block_pointers[id + 1]; i++) {
                map_block_to_row[i] = id;
            }
        });
    });
    // NOTE(review): no wait() here — presumably the caller relies on
    // queue ordering before consuming the map; confirm the queue is in-order.
}

GKO_INSTANTIATE_FOR_INT32_TYPE(
GKO_DECLARE_BATCH_BLOCK_JACOBI_FIND_ROW_BLOCK_MAP);
Expand All @@ -46,21 +83,102 @@ void extract_common_blocks_pattern(
std::shared_ptr<const DefaultExecutor> exec,
const gko::matrix::Csr<ValueType, IndexType>* const first_sys_csr,
const size_type num_blocks, const IndexType* const cumulative_block_storage,
const IndexType* const block_pointers, const IndexType* const,
IndexType* const blocks_pattern) GKO_NOT_IMPLEMENTED;
const IndexType* const block_pointers, const IndexType* const map_block_row,
IndexType* const blocks_pattern)
{
const auto nrows = first_sys_csr->get_size()[0];
constexpr int subgroup_size = config::warp_size;
auto device = exec->get_queue()->get_device();
auto group_size =
device.get_info<sycl::info::device::max_work_group_size>();

const dim3 block(group_size);
const dim3 grid(ceildiv(nrows * subgroup_size, group_size));

const auto row_ptrs = first_sys_csr->get_const_row_ptrs();
const auto col_idxs = first_sys_csr->get_const_col_idxs();

(exec->get_queue())->submit([&](sycl::handler& cgh) {
cgh.parallel_for(sycl_nd_range(grid, block),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(subgroup_size)]] {
extract_common_block_pattern_kernel(
static_cast<int>(nrows), row_ptrs,
col_idxs, num_blocks,
cumulative_block_storage, block_pointers,
map_block_row, blocks_pattern, item_ct1);
});
});
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
GKO_DECLARE_BATCH_BLOCK_JACOBI_EXTRACT_PATTERN_KERNEL);


namespace {


// Launches the block-Jacobi generation kernel for one compiled max block
// size. Selected at runtime via GKO_ENABLE_IMPLEMENTATION_SELECTION below.
template <int compiled_max_block_size, typename ValueType, typename IndexType>
void compute_block_jacobi_helper(
    syn::value_list<int, compiled_max_block_size>,
    const batch::matrix::Csr<ValueType, IndexType>* const sys_csr,
    const size_type num_blocks, const IndexType* const cumulative_block_storage,
    const IndexType* const block_pointers,
    const IndexType* const blocks_pattern, ValueType* const blocks,
    std::shared_ptr<const DpcppExecutor> exec)
{
    // constexpr int subwarp_size =
    // gko::kernels::dpcpp::jacobi::get_larger_power(compiled_max_block_size);
    // TODO: Find a way to allow smaller block_sizes (<16)

    // currently every instantiation runs with a full sub-group per block
    constexpr int subgroup_size = config::warp_size;
    auto device = exec->get_queue()->get_device();
    auto group_size =
        device.get_info<sycl::info::device::max_work_group_size>();

    const auto nbatch = sys_csr->get_num_batch_items();
    // number of stored elements per batch item (all items share the pattern)
    const auto nnz = sys_csr->get_num_stored_elements() / nbatch;
    const auto sys_csr_values = sys_csr->get_const_values();

    // one sub-group per (block, batch item) pair
    dim3 block(group_size);
    dim3 grid(ceildiv(num_blocks * nbatch * subgroup_size, group_size));

    (exec->get_queue())->submit([&](sycl::handler& cgh) {
        cgh.parallel_for(sycl_nd_range(grid, block),
                         [=](sycl::nd_item<3> item_ct1)
                             [[intel::reqd_sub_group_size(subgroup_size)]] {
                                 compute_block_jacobi_kernel(
                                     nbatch, static_cast<int>(nnz),
                                     sys_csr_values, num_blocks,
                                     cumulative_block_storage, block_pointers,
                                     blocks_pattern, blocks, item_ct1);
                             });
    });
}

GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_block_jacobi_helper,
compute_block_jacobi_helper);

} // anonymous namespace


// Computes the inverted Jacobi blocks for all batch items. Dispatches to the
// compiled kernel instantiation whose max block size is the smallest one that
// can accommodate the user-requested block size.
template <typename ValueType, typename IndexType>
void compute_block_jacobi(
    std::shared_ptr<const DefaultExecutor> exec,
    const batch::matrix::Csr<ValueType, IndexType>* const sys_csr,
    const uint32 user_given_max_block_size, const size_type num_blocks,
    const IndexType* const cumulative_block_storage,
    const IndexType* const block_pointers,
    const IndexType* const blocks_pattern, ValueType* const blocks)
{
    select_compute_block_jacobi_helper(
        batch_jacobi_dpcpp_compiled_max_block_sizes(),
        // pick the first compiled size that fits the requested block size
        [&](int compiled_block_size) {
            return user_given_max_block_size <= compiled_block_size;
        },
        syn::value_list<int>(), syn::type_list<>(), sys_csr, num_blocks,
        cumulative_block_storage, block_pointers, blocks_pattern, blocks, exec);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
GKO_DECLARE_BATCH_BLOCK_JACOBI_COMPUTE_KERNEL);
Expand Down
Loading

0 comments on commit 96c2138

Please sign in to comment.