ginkgo-project · MarcelKoch · Jun 20, 2024 · Jun 20, 2024 · Jun 20, 2024 · Sep 17, 2024
diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp
@@ -5,6 +5,8 @@
 #ifndef GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
 #define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
 
+#include "core/solver/batch_bicgstab_kernels.hpp"
+
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -25,6 +27,11 @@
 namespace gko {
 namespace kernels {
 namespace GKO_DEVICE_NAMESPACE {
+
+
+constexpr int max_bicgstab_threads = 1024;  // __launch_bounds__ argument of apply_kernel
+
+
 namespace batch_single_kernels {
 
 
@@ -168,12 +175,14 @@ __device__ __forceinline__ void update_x_middle(
 template <typename StopType, int n_shared, bool prec_shared_bool,
           typename PrecType, typename LogType, typename BatchMatrixType,
           typename ValueType>
-__global__ void apply_kernel(
-    const gko::kernels::batch_bicgstab::storage_config sconf,
-    const int max_iter, const gko::remove_complex<ValueType> tol,
-    LogType logger, PrecType prec_shared, const BatchMatrixType mat,
-    const ValueType* const __restrict__ b, ValueType* const __restrict__ x,
-    ValueType* const __restrict__ workspace = nullptr)
+__global__ void __launch_bounds__(max_bicgstab_threads)
+    apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
+                 const int max_iter, const gko::remove_complex<ValueType> tol,
+                 LogType logger, PrecType prec_shared,
+                 const BatchMatrixType mat,
+                 const ValueType* const __restrict__ b,
+                 ValueType* const __restrict__ x,
+                 ValueType* const __restrict__ workspace = nullptr)
 {
     using real_type = typename gko::remove_complex<ValueType>;
     const auto num_batch_items = mat.num_batch_items;

diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp b/common/cuda_hip/solver/batch_cg_kernels.hpp
@@ -6,6 +6,8 @@
 #define GKO_COMMON_CUDA_HIP_SOLVER_BATCH_CG_KERNELS_HPP_
 
 
+#include "core/solver/batch_cg_kernels.hpp"
+
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -27,6 +29,11 @@
 namespace gko {
 namespace kernels {
 namespace GKO_DEVICE_NAMESPACE {
+
+
+constexpr int max_cg_threads = 1024;  // __launch_bounds__ argument of apply_kernel
+
+
 namespace batch_single_kernels {
 
 
@@ -113,14 +120,14 @@ __device__ __forceinline__ void update_x_and_r(
 template <typename StopType, const int n_shared, const bool prec_shared_bool,
           typename PrecType, typename LogType, typename BatchMatrixType,
           typename ValueType>
-__global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
-                             const int max_iter,
-                             const gko::remove_complex<ValueType> tol,
-                             LogType logger, PrecType prec_shared,
-                             const BatchMatrixType mat,
-                             const ValueType* const __restrict__ b,
-                             ValueType* const __restrict__ x,
-                             ValueType* const __restrict__ workspace = nullptr)
+__global__ void __launch_bounds__(max_cg_threads)
+    apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
+                 const int max_iter, const gko::remove_complex<ValueType> tol,
+                 LogType logger, PrecType prec_shared,
+                 const BatchMatrixType mat,
+                 const ValueType* const __restrict__ b,
+                 ValueType* const __restrict__ x,
+                 ValueType* const __restrict__ workspace = nullptr)
 {
     using real_type = typename gko::remove_complex<ValueType>;
     const auto num_batch_items = mat.num_batch_items;

diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp
@@ -22,14 +22,14 @@ namespace csr {
 /**
  * Encapsulates one matrix from a batch of csr matrices.
  */
-template <typename ValueType, typename IndexType>
+template <typename ValueType, typename IndexType = const int32>
 struct batch_item {
     using value_type = ValueType;
     using index_type = IndexType;
 
     ValueType* values;
-    const index_type* col_idxs;
-    const index_type* row_ptrs;
+    index_type* col_idxs;
+    index_type* row_ptrs;
     index_type num_rows;
     index_type num_cols;
     index_type num_nnz_per_item;
@@ -44,15 +44,15 @@ struct batch_item {
 /**
  * A 'simple' structure to store a global uniform batch of csr matrices.
  */
-template <typename ValueType, typename IndexType>
+template <typename ValueType, typename IndexType = const int32>
 struct uniform_batch {
     using value_type = ValueType;
     using index_type = IndexType;
     using entry_type = batch_item<value_type, index_type>;
 
     ValueType* values;
-    const index_type* col_idxs;
-    const index_type* row_ptrs;
+    index_type* col_idxs;
+    index_type* row_ptrs;
     size_type num_batch_items;
     index_type num_rows;
     index_type num_cols;
@@ -119,13 +119,13 @@ namespace ell {
 /**
  * Encapsulates one matrix from a batch of ell matrices.
  */
-template <typename ValueType, typename IndexType>
+template <typename ValueType, typename IndexType = const int32>
 struct batch_item {
     using value_type = ValueType;
     using index_type = IndexType;
 
     ValueType* values;
-    const index_type* col_idxs;
+    index_type* col_idxs;
     index_type stride;
     index_type num_rows;
     index_type num_cols;
@@ -141,14 +141,14 @@ struct batch_item {
 /**
  * A 'simple' structure to store a global uniform batch of ell matrices.
  */
-template <typename ValueType, typename IndexType>
+template <typename ValueType, typename IndexType = const int32>
 struct uniform_batch {
     using value_type = ValueType;
     using index_type = IndexType;
     using entry_type = batch_item<value_type, index_type>;
 
     ValueType* values;
-    const index_type* col_idxs;
+    index_type* col_idxs;
     size_type num_batch_items;
     index_type stride;
     index_type num_rows;

diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
@@ -164,6 +164,41 @@ enum class log_type { simple_convergence_completion };
 }  // namespace log
 
 
+#define GKO_BATCH_INSTANTIATE_STOP(macro, ...)                          \
+    macro(__VA_ARGS__,                                                  \
+          ::gko::batch::solver::device::batch_stop::SimpleAbsResidual); \
+    template macro(                                                     \
+        __VA_ARGS__,                                                    \
+        ::gko::batch::solver::device::batch_stop::SimpleRelResidual)  // appends the stop type as last argument: abs, then rel
+
+#define GKO_BATCH_INSTANTIATE_PRECONDITIONER(macro, ...)                   \
+    GKO_BATCH_INSTANTIATE_STOP(                                            \
+        macro, __VA_ARGS__,                                                \
+        ::gko::batch::solver::device::batch_preconditioner::Identity);     \
+    template GKO_BATCH_INSTANTIATE_STOP(                                   \
+        macro, __VA_ARGS__,                                                \
+        ::gko::batch::solver::device::batch_preconditioner::ScalarJacobi); \
+    template GKO_BATCH_INSTANTIATE_STOP(                                   \
+        macro, __VA_ARGS__,                                                \
+        ::gko::batch::solver::device::batch_preconditioner::BlockJacobi)  // STOP once per preconditioner (3)
+
+#define GKO_BATCH_INSTANTIATE_LOGGER(macro, ...) \
+    GKO_BATCH_INSTANTIATE_PRECONDITIONER(        \
+        macro, __VA_ARGS__,                      \
+        ::gko::batch::solver::device::batch_log::SimpleFinalLogger)  // PRECONDITIONER for the single logger
+
+#define GKO_BATCH_INSTANTIATE_MATRIX(macro, ...)                     \
+    GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__,                 \
+                                 batch::matrix::ell::uniform_batch); \
+    template GKO_BATCH_INSTANTIATE_LOGGER(                           \
+        macro, __VA_ARGS__, batch::matrix::dense::uniform_batch);    \
+    template GKO_BATCH_INSTANTIATE_LOGGER(macro, __VA_ARGS__,        \
+                                          batch::matrix::csr::uniform_batch)  // LOGGER once per matrix format (ell, dense, csr)
+
+#define GKO_BATCH_INSTANTIATE(macro, ...) \
+    GKO_BATCH_INSTANTIATE_MATRIX(macro, __VA_ARGS__)  // entry point: 3*1*3*2 = 18 x macro(<args>, matrix, logger, preconditioner, stop); the 1st expansion has no `template` -- NOTE(review): presumably written at the use site, confirm
+
+
 /**
  * Handles dispatching to the correct instantiation of a batched solver
  * depending on runtime parameters.

diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
@@ -3,6 +3,8 @@ add_library(ginkgo_cuda $<TARGET_OBJECTS:ginkgo_cuda_device> "")
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/csr_kernels.instantiate.cpp CSR_INSTANTIATE)
 add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE)
+add_instantiation_files(. solver/batch_bicgstab_launch.instantiate.cu BATCH_BICGSTAB_INSTANTIATE)
+add_instantiation_files(. solver/batch_cg_launch.instantiate.cu BATCH_CG_INSTANTIATE)
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 target_sources(ginkgo_cuda
@@ -21,7 +23,9 @@ target_sources(ginkgo_cuda
     matrix/fft_kernels.cu
     preconditioner/batch_jacobi_kernels.cu
     solver/batch_bicgstab_kernels.cu
+    ${BATCH_BICGSTAB_INSTANTIATE}
     solver/batch_cg_kernels.cu
+    ${BATCH_CG_INSTANTIATE}
     solver/lower_trs_kernels.cu
     solver/upper_trs_kernels.cu
     ${GKO_UNIFIED_COMMON_SOURCES}