Merge (#1450): Add a CSR batched matrix format, CUDA, HIP and DPCPP k…

…ernels Add a CSR batched matrix format, CUDA, HIP and DPCPP kernels Related PR: #1450
ginkgo-project · Dec 12, 2023 · 830c289 · 830c289
2 parents 310bb4c + c30aae8
commit 830c289
Show file tree

Hide file tree

Showing 39 changed files with 2,407 additions and 1 deletion.
diff --git a/common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc
@@ -0,0 +1,50 @@
+// SPDX-FileCopyrightText: 2017-2023 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+template <typename ValueType, typename IndexType>
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const batch::matrix::Csr<ValueType, IndexType>* mat,
+                  const batch::MultiVector<ValueType>* b,
+                  batch::MultiVector<ValueType>* x)
+{
+    const auto num_blocks = mat->get_num_batch_items();
+    const auto b_ub = get_batch_struct(b);
+    const auto x_ub = get_batch_struct(x);
+    const auto mat_ub = get_batch_struct(mat);
+    if (b->get_common_size()[1] > 1) {
+        GKO_NOT_IMPLEMENTED;
+    }
+    simple_apply_kernel<<<num_blocks, default_block_size, 0,
+                          exec->get_stream()>>>(mat_ub, b_ub, x_ub);
+}
+
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+    GKO_DECLARE_BATCH_CSR_SIMPLE_APPLY_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
+                    const batch::MultiVector<ValueType>* alpha,
+                    const batch::matrix::Csr<ValueType, IndexType>* mat,
+                    const batch::MultiVector<ValueType>* b,
+                    const batch::MultiVector<ValueType>* beta,
+                    batch::MultiVector<ValueType>* x)
+{
+    const auto num_blocks = mat->get_num_batch_items();
+    const auto b_ub = get_batch_struct(b);
+    const auto x_ub = get_batch_struct(x);
+    const auto mat_ub = get_batch_struct(mat);
+    const auto alpha_ub = get_batch_struct(alpha);
+    const auto beta_ub = get_batch_struct(beta);
+    if (b->get_common_size()[1] > 1) {
+        GKO_NOT_IMPLEMENTED;
+    }
+    advanced_apply_kernel<<<num_blocks, default_block_size, 0,
+                            exec->get_stream()>>>(alpha_ub, mat_ub, b_ub,
+                                                  beta_ub, x_ub);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+    GKO_DECLARE_BATCH_CSR_ADVANCED_APPLY_KERNEL);
diff --git a/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc b/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc
@@ -0,0 +1,113 @@
+// SPDX-FileCopyrightText: 2017-2023 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+template <typename ValueType, typename IndexType>
+__device__ __forceinline__ void simple_apply(
+    const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& mat,
+    const ValueType* const __restrict__ b, ValueType* const __restrict__ x)
+{
+    const auto num_rows = mat.num_rows;
+    const auto val = mat.values;
+    const auto col = mat.col_idxs;
+    for (int row = threadIdx.x; row < num_rows; row += blockDim.x) {
+        auto temp = zero<ValueType>();
+        for (auto nnz = mat.row_ptrs[row]; nnz < mat.row_ptrs[row + 1]; nnz++) {
+            const auto col_idx = col[nnz];
+            temp += val[nnz] * b[col_idx];
+        }
+        x[row] = temp;
+    }
+}
+
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(
+    default_block_size,
+    sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix::
+                                                      csr::uniform_batch<
+                                                          const ValueType,
+                                                          IndexType>
+                                                          mat,
+                                                  const gko::batch::
+                                                      multi_vector::
+                                                          uniform_batch<
+                                                              const ValueType>
+                                                              b,
+                                                  const gko::batch::
+                                                      multi_vector::
+                                                          uniform_batch<
+                                                              ValueType>
+                                                              x)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto mat_b =
+            gko::batch::matrix::extract_batch_item(mat, batch_id);
+        const auto b_b = gko::batch::extract_batch_item(b, batch_id);
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        simple_apply(mat_b, b_b.values, x_b.values);
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+__device__ __forceinline__ void advanced_apply(
+    const ValueType alpha,
+    const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& mat,
+    const ValueType* const __restrict__ b, const ValueType beta,
+    ValueType* const __restrict__ x)
+{
+    const auto num_rows = mat.num_rows;
+    const auto val = mat.values;
+    const auto col = mat.col_idxs;
+    for (int row = threadIdx.x; row < num_rows; row += blockDim.x) {
+        auto temp = zero<ValueType>();
+        for (auto nnz = mat.row_ptrs[row]; nnz < mat.row_ptrs[row + 1]; nnz++) {
+            const auto col_idx = col[nnz];
+            temp += alpha * val[nnz] * b[col_idx];
+        }
+        x[row] = temp + beta * x[row];
+    }
+}
+
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(
+    default_block_size,
+    sm_oversubscription) void advanced_apply_kernel(const gko::batch::
+                                                        multi_vector::
+                                                            uniform_batch<
+                                                                const ValueType>
+                                                                alpha,
+                                                    const gko::batch::matrix::
+                                                        csr::uniform_batch<
+                                                            const ValueType,
+                                                            IndexType>
+                                                            mat,
+                                                    const gko::batch::
+                                                        multi_vector::
+                                                            uniform_batch<
+                                                                const ValueType>
+                                                                b,
+                                                    const gko::batch::
+                                                        multi_vector::
+                                                            uniform_batch<
+                                                                const ValueType>
+                                                                beta,
+                                                    const gko::batch::
+                                                        multi_vector::
+                                                            uniform_batch<
+                                                                ValueType>
+                                                                x)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto mat_b =
+            gko::batch::matrix::extract_batch_item(mat, batch_id);
+        const auto b_b = gko::batch::extract_batch_item(b, batch_id);
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
+        const auto beta_b = gko::batch::extract_batch_item(beta, batch_id);
+        advanced_apply(alpha_b.values[0], mat_b, b_b.values, beta_b.values[0],
+                       x_b.values);
+    }
+}
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
@@ -40,6 +40,7 @@ target_sources(ginkgo
     log/vtune.cpp
     log/record.cpp
     log/stream.cpp
+    matrix/batch_csr.cpp
     matrix/batch_dense.cpp
     matrix/batch_ell.cpp
     matrix/batch_identity.cpp

diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
@@ -29,6 +29,7 @@
 #include "core/factorization/par_ict_kernels.hpp"
 #include "core/factorization/par_ilu_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
+#include "core/matrix/batch_csr_kernels.hpp"
 #include "core/matrix/batch_dense_kernels.hpp"
 #include "core/matrix/batch_ell_kernels.hpp"
 #include "core/matrix/coo_kernels.hpp"
@@ -281,6 +282,16 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
 }  // namespace batch_multi_vector
 
 
+namespace batch_csr {
+
+
+GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_CSR_SIMPLE_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_CSR_ADVANCED_APPLY_KERNEL);
+
+
+}  // namespace batch_csr
+
+
 namespace batch_dense {