review updates

Co-authored-by: Marcel Koch <[email protected]> Co-authored-by: Yu-Hsiang Tsai <[email protected]>
ginkgo-project · May 5, 2024 · 5992056 · 5992056
1 parent c39d311
commit 5992056
Show file tree

Hide file tree

Showing 14 changed files with 53 additions and 161 deletions.
diff --git a/core/solver/batch_cg_kernels.hpp b/core/solver/batch_cg_kernels.hpp
@@ -82,8 +82,7 @@ void set_gmem_stride_bytes(storage_config& sconf,
         gmem_stride += prec_storage_bytes;
     }
     // align global memory chunks
-    sconf.gmem_stride_bytes =
-        gmem_stride > 0 ? ceildiv(gmem_stride, align_bytes) * align_bytes : 0;
+    sconf.gmem_stride_bytes = ceildiv(gmem_stride, align_bytes) * align_bytes;
 }
 
 

diff --git a/core/test/solver/batch_cg.cpp b/core/test/solver/batch_cg.cpp
@@ -117,16 +117,16 @@ TYPED_TEST(BatchCg, CanBeMoved)
 {
     using Mtx = typename TestFixture::Mtx;
     using Solver = typename TestFixture::Solver;
-    auto copy = this->solver_factory->generate(Mtx::create(this->exec));
+    auto move = this->solver_factory->generate(Mtx::create(this->exec));
 
-    copy->move_from(this->solver);
+    move->move_from(this->solver);
 
-    ASSERT_EQ(copy->get_common_size(),
+    ASSERT_EQ(move->get_common_size(),
               gko::dim<2>(this->num_rows, this->num_rows));
-    ASSERT_EQ(copy->get_num_batch_items(), this->num_batch_items);
-    auto copy_mtx = gko::as<Solver>(copy.get())->get_system_matrix();
-    const auto copy_batch_mtx = gko::as<const Mtx>(copy_mtx.get());
-    GKO_ASSERT_BATCH_MTX_NEAR(this->mtx.get(), copy_batch_mtx, 0.0);
+    ASSERT_EQ(move->get_num_batch_items(), this->num_batch_items);
+    auto moved_mtx = gko::as<Solver>(move.get())->get_system_matrix();
+    const auto moved_batch_mtx = gko::as<const Mtx>(moved_mtx.get());
+    GKO_ASSERT_BATCH_MTX_NEAR(this->mtx.get(), moved_batch_mtx, 0.0);
     ASSERT_EQ(gko::as<Solver>(this->solver.get())->get_system_matrix(),
               nullptr);
 }

diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
@@ -157,10 +157,8 @@ public:
                 exec_, mat.num_rows);
         GKO_ASSERT(block_size >= 2 * config::warp_size);
 
-        const size_t prec_size =
-            PrecType::dynamic_work_size(padded_num_rows,
-                                        mat.get_single_item_num_nnz()) *
-            sizeof(value_type);
+        const size_t prec_size = PrecType::dynamic_work_size(
+            padded_num_rows, mat.get_single_item_num_nnz());
         const auto sconf =
             gko::kernels::batch_bicgstab::compute_shared_storage<PrecType,
                                                                  value_type>(

diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
@@ -57,47 +57,6 @@ template <typename T>
 using settings = gko::kernels::batch_cg::settings<T>;
 
 
-template <typename CuValueType>
-class kernel_caller {
-public:
-    using value_type = CuValueType;
-
-    kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
-                  const settings<remove_complex<value_type>> settings)
-        : exec_{std::move(exec)}, settings_{settings}
-    {}
-
-    template <typename StopType, const int n_shared,
-              const bool prec_shared_bool, typename PrecType, typename LogType,
-              typename BatchMatrixType>
-    void launch_apply_kernel(
-        const gko::kernels::batch_cg::storage_config& sconf, LogType& logger,
-        PrecType& prec, const BatchMatrixType& mat,
-        const value_type* const __restrict__ b_values,
-        value_type* const __restrict__ x_values,
-        value_type* const __restrict__ workspace_data, const int& block_size,
-        const size_t& shared_size) const
-    {
-        GKO_NOT_IMPLEMENTED;
-    }
-
-
-    template <typename BatchMatrixType, typename PrecType, typename StopType,
-              typename LogType>
-    void call_kernel(
-        LogType logger, const BatchMatrixType& mat, PrecType prec,
-        const gko::batch::multi_vector::uniform_batch<const value_type>& b,
-        const gko::batch::multi_vector::uniform_batch<value_type>& x) const
-    {
-        GKO_NOT_IMPLEMENTED;
-    }
-
-private:
-    std::shared_ptr<const DefaultExecutor> exec_;
-    const settings<remove_complex<value_type>> settings_;
-};
-
-
 template <typename ValueType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,

diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp
@@ -49,44 +49,6 @@ template <typename T>
 using settings = gko::kernels::batch_cg::settings<T>;
 
 
-template <typename ValueType>
-class KernelCaller {
-public:
-    KernelCaller(std::shared_ptr<const DefaultExecutor> exec,
-                 const settings<remove_complex<ValueType>> settings)
-        : exec_{std::move(exec)}, settings_{settings}
-    {}
-
-    template <typename StopType, const int subgroup_size,
-              const int n_shared_total, typename PrecType, typename LogType,
-              typename BatchMatrixType>
-    __dpct_inline__ void launch_apply_kernel(
-        const gko::kernels::batch_cg::storage_config& sconf, LogType& logger,
-        PrecType& prec, const BatchMatrixType mat,
-        const ValueType* const __restrict__ b_values,
-        ValueType* const __restrict__ x_values,
-        ValueType* const __restrict__ workspace, const int& group_size,
-        const int& shared_size) const
-    {
-        GKO_NOT_IMPLEMENTED;
-    }
-
-    template <typename BatchMatrixType, typename PrecType, typename StopType,
-              typename LogType>
-    void call_kernel(
-        LogType logger, const BatchMatrixType& mat, PrecType prec,
-        const gko::batch::multi_vector::uniform_batch<const ValueType>& b,
-        const gko::batch::multi_vector::uniform_batch<ValueType>& x) const
-    {
-        GKO_NOT_IMPLEMENTED;
-    }
-
-private:
-    std::shared_ptr<const DefaultExecutor> exec_;
-    const settings<remove_complex<ValueType>> settings_;
-};
-
-
 template <typename ValueType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,

diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -130,10 +130,9 @@ class kernel_caller {
             get_num_threads_per_block<BatchMatrixType>(exec_, mat.num_rows);
         GKO_ASSERT(block_size >= 2 * config::warp_size);
 
-        const size_t prec_size =
-            PrecType::dynamic_work_size(padded_num_rows,
-                                        mat.get_single_item_num_nnz()) *
-            sizeof(value_type);
+        // Returns amount required in bytes
+        const size_t prec_size = PrecType::dynamic_work_size(
+            padded_num_rows, mat.get_single_item_num_nnz());
         const auto sconf =
             gko::kernels::batch_bicgstab::compute_shared_storage<PrecType,
                                                                  value_type>(

diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
@@ -56,47 +56,6 @@ template <typename T>
 using settings = gko::kernels::batch_cg::settings<T>;
 
 
-template <typename HipValueType>
-class kernel_caller {
-public:
-    using value_type = HipValueType;
-
-    kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
-                  const settings<remove_complex<value_type>> settings)
-        : exec_{exec}, settings_{settings}
-    {}
-
-    template <typename StopType, const int n_shared,
-              const bool prec_shared_bool, typename PrecType, typename LogType,
-              typename BatchMatrixType>
-    void launch_apply_kernel(
-        const gko::kernels::batch_cg::storage_config& sconf, LogType& logger,
-        PrecType& prec, const BatchMatrixType& mat,
-        const value_type* const __restrict__ b_values,
-        value_type* const __restrict__ x_values,
-        value_type* const __restrict__ workspace_data, const int& block_size,
-        const size_t& shared_size) const
-    {
-        GKO_NOT_IMPLEMENTED;
-    }
-
-
-    template <typename BatchMatrixType, typename PrecType, typename StopType,
-              typename LogType>
-    void call_kernel(
-        LogType logger, const BatchMatrixType& mat, PrecType prec,
-        const gko::batch::multi_vector::uniform_batch<const value_type>& b,
-        const gko::batch::multi_vector::uniform_batch<value_type>& x) const
-    {
-        GKO_NOT_IMPLEMENTED;
-    }
-
-private:
-    std::shared_ptr<const DefaultExecutor> exec_;
-    const settings<remove_complex<value_type>> settings_;
-};
-
-
 template <typename ValueType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,

diff --git a/include/ginkgo/core/solver/batch_cg.hpp b/include/ginkgo/core/solver/batch_cg.hpp
@@ -24,7 +24,7 @@ namespace solver {
 
 
 /**
- * Cg or the Conjugate Gradient-Stabilized is a Krylov subspace solver.
+ * Cg or the Conjugate Gradient is a Krylov subspace solver.
  * It is a short recurrence solver that is generally used to solve linear
  * systems with SPD matrices.
  *

diff --git a/omp/solver/batch_bicgstab_kernels.cpp b/omp/solver/batch_bicgstab_kernels.cpp
@@ -5,6 +5,12 @@
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
 
+#include <omp.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
 #include "core/solver/batch_dispatch.hpp"
 
 
@@ -66,19 +72,21 @@ class kernel_caller {
             gko::kernels::batch_bicgstab::local_memory_requirement<ValueType>(
                 num_rows, num_rhs) +
             PrecondType::dynamic_work_size(num_rows,
-                                           mat.get_single_item_num_nnz()) *
-                sizeof(ValueType);
+                                           mat.get_single_item_num_nnz());
+        int max_threads = omp_get_max_threads();
+        auto local_space =
+            array<unsigned char>(exec_, local_size_bytes * max_threads);
 
 #pragma omp parallel for
         for (size_type batch_id = 0; batch_id < num_batch_items; batch_id++) {
-            // TODO: Align to cache line boundary
-            // TODO: Allocate and free once per thread rather than once per
-            // work-item.
-            auto local_space = array<unsigned char>(exec_, local_size_bytes);
+            auto thread_local_space = gko::make_array_view(
+                exec_, local_size_bytes,
+                local_space.get_data() +
+                    omp_get_thread_num() * local_size_bytes);
             batch_entry_bicgstab_impl<StopType, PrecondType, LogType,
                                       BatchMatrixType, ValueType>(
                 settings_, logger, precond, mat, b, x, batch_id,
-                local_space.get_data());
+                thread_local_space.get_data());
         }
     }
 

diff --git a/omp/solver/batch_cg_kernels.cpp b/omp/solver/batch_cg_kernels.cpp
@@ -5,6 +5,12 @@
 #include "core/solver/batch_cg_kernels.hpp"
 
 
+#include <omp.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
 #include "core/solver/batch_dispatch.hpp"
 
 
@@ -66,18 +72,20 @@ class kernel_caller {
             gko::kernels::batch_cg::local_memory_requirement<ValueType>(
                 num_rows, num_rhs) +
             PrecondType::dynamic_work_size(num_rows,
-                                           mat.get_single_item_num_nnz()) *
-                sizeof(ValueType);
-
+                                           mat.get_single_item_num_nnz());
+        int max_threads = omp_get_max_threads();
+        auto local_space =
+            array<unsigned char>(exec_, local_size_bytes * max_threads);
 #pragma omp parallel for
         for (size_type batch_id = 0; batch_id < num_batch_items; batch_id++) {
-            // TODO: Align to cache line boundary
-            // TODO: Allocate and free once per thread rather than once per
-            // work-item.
-            auto local_space = array<unsigned char>(exec_, local_size_bytes);
+            auto thread_local_space = gko::make_array_view(
+                exec_, local_size_bytes,
+                local_space.get_data() +
+                    omp_get_thread_num() * local_size_bytes);
             batch_entry_cg_impl<StopType, PrecondType, LogType, BatchMatrixType,
                                 ValueType>(settings_, logger, precond, mat, b,
-                                           x, batch_id, local_space.get_data());
+                                           x, batch_id,
+                                           thread_local_space.get_data());
         }
     }
 

diff --git a/reference/preconditioner/batch_identity.hpp b/reference/preconditioner/batch_identity.hpp
@@ -32,7 +32,8 @@ class Identity final {
     static constexpr int work_size = 0;
 
     /**
-     * The size of the work vector required in case of dynamic allocation.
+     * The size of the work vector required in case of dynamic allocation in
+     * bytes.
      */
     static int dynamic_work_size(int, int) { return 0; }
 

diff --git a/reference/solver/batch_bicgstab_kernels.cpp b/reference/solver/batch_bicgstab_kernels.cpp
@@ -68,8 +68,7 @@ class kernel_caller {
             gko::kernels::batch_bicgstab::local_memory_requirement<ValueType>(
                 num_rows, num_rhs) +
             PrecType::dynamic_work_size(num_rows,
-                                        mat.get_single_item_num_nnz()) *
-                sizeof(ValueType);
+                                        mat.get_single_item_num_nnz());
         array<unsigned char> local_space(exec_, local_size_bytes);
 
         for (size_type batch_id = 0; batch_id < num_batch_items; batch_id++) {

diff --git a/reference/solver/batch_cg_kernels.cpp b/reference/solver/batch_cg_kernels.cpp
@@ -67,10 +67,8 @@ class kernel_caller {
         const size_type local_size_bytes =
             gko::kernels::batch_cg::local_memory_requirement<ValueType>(
                 num_rows, num_rhs) +
-            // TODO: return bytes for dynamic work size as well
             PrecType::dynamic_work_size(num_rows,
-                                        mat.get_single_item_num_nnz()) *
-                sizeof(ValueType);
+                                        mat.get_single_item_num_nnz());
         array<unsigned char> local_space(exec_, local_size_bytes);
 
         for (size_type batch_id = 0; batch_id < num_batch_items; batch_id++) {

diff --git a/reference/test/solver/batch_cg_kernels.cpp b/reference/test/solver/batch_cg_kernels.cpp
@@ -163,9 +163,11 @@ TYPED_TEST(BatchCg, ApplyLogsResAndIters)
 
     auto iter_counts = logger->get_num_iterations();
     auto res_norm = logger->get_residual_norm();
-    GKO_ASSERT_BATCH_MTX_NEAR(res.x, linear_system.exact_sol, tol * 1000);
+    GKO_ASSERT_BATCH_MTX_NEAR(res.x, linear_system.exact_sol, tol * 2000);
     for (size_t i = 0; i < num_batch_items; i++) {
         ASSERT_LE(iter_counts.get_const_data()[i], max_iters);
+        ASSERT_NEAR(res_norm.get_const_data()[i],
+                    res.host_res_norm->get_const_values()[i], tol * 100);
     }
 }
 
@@ -197,5 +199,5 @@ TYPED_TEST(BatchCg, CanSolveHpdSystem)
     auto res =
         gko::test::solve_linear_system(this->exec, linear_system, solver);
 
-    GKO_ASSERT_BATCH_MTX_NEAR(res.x, linear_system.exact_sol, tol * 500);
+    GKO_ASSERT_BATCH_MTX_NEAR(res.x, linear_system.exact_sol, tol * 1000);
 }