Review updates.
Co-authored-by: Thomas Grützmacher <[email protected]>
Co-authored-by: Yu-Hsiang Tsai <[email protected]>
Co-authored-by: Marcel Koch <[email protected]>
4 people committed Jul 31, 2023
1 parent f0ef0fa commit 966a877
Showing 12 changed files with 176 additions and 122 deletions.
80 changes: 45 additions & 35 deletions common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
@@ -47,10 +47,15 @@ __device__ __forceinline__ void scale(
}

template <typename ValueType, typename Mapping>
__global__
__launch_bounds__(default_block_size, sm_multiplier) void scale_kernel(
const gko::batch_multi_vector::uniform_batch<const ValueType> alpha,
const gko::batch_multi_vector::uniform_batch<ValueType> x, Mapping map)
__global__ __launch_bounds__(
default_block_size,
sm_oversubscription) void scale_kernel(const gko::batch_multi_vector::
uniform_batch<const ValueType>
alpha,
const gko::batch_multi_vector::
uniform_batch<ValueType>
x,
Mapping map)
{
for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries;
batch_id += gridDim.x) {
@@ -78,11 +83,20 @@ __device__ __forceinline__ void add_scaled(
}

template <typename ValueType, typename Mapping>
__global__
__launch_bounds__(default_block_size, sm_multiplier) void add_scaled_kernel(
const gko::batch_multi_vector::uniform_batch<const ValueType> alpha,
const gko::batch_multi_vector::uniform_batch<const ValueType> x,
const gko::batch_multi_vector::uniform_batch<ValueType> y, Mapping map)
__global__ __launch_bounds__(
default_block_size,
sm_oversubscription) void add_scaled_kernel(const gko::batch_multi_vector::
uniform_batch<
const ValueType>
alpha,
const gko::batch_multi_vector::
uniform_batch<
const ValueType>
x,
const gko::batch_multi_vector::
uniform_batch<ValueType>
y,
Mapping map)
{
for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries;
batch_id += gridDim.x) {
@@ -139,24 +153,12 @@ __device__ __forceinline__ void compute_gen_dot_product(


template <typename ValueType, typename Mapping>
__global__ __launch_bounds__(
default_block_size,
sm_multiplier) void compute_gen_dot_product_kernel(const gko::
batch_multi_vector::
uniform_batch<
const ValueType>
x,
const gko::
batch_multi_vector::
uniform_batch<
const ValueType>
y,
const gko::
batch_multi_vector::
uniform_batch<
ValueType>
result,
Mapping map)
__global__
__launch_bounds__(default_block_size, sm_oversubscription) void compute_gen_dot_product_kernel(
const gko::batch_multi_vector::uniform_batch<const ValueType> x,
const gko::batch_multi_vector::uniform_batch<const ValueType> y,
const gko::batch_multi_vector::uniform_batch<ValueType> result,
Mapping map)
{
for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries;
batch_id += gridDim.x) {
@@ -218,11 +220,19 @@ __device__ __forceinline__ void compute_norm2(


template <typename ValueType>
__global__
__launch_bounds__(default_block_size, sm_multiplier) void compute_norm2_kernel(
const gko::batch_multi_vector::uniform_batch<const ValueType> x,
const gko::batch_multi_vector::uniform_batch<remove_complex<ValueType>>
result)
__global__ __launch_bounds__(
default_block_size,
sm_oversubscription) void compute_norm2_kernel(const gko::
batch_multi_vector::
uniform_batch<
const ValueType>
x,
const gko::
batch_multi_vector::
uniform_batch<
remove_complex<
ValueType>>
result)
{
for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_entries;
batch_id += gridDim.x) {
@@ -255,9 +265,9 @@ __device__ __forceinline__ void copy(

template <typename ValueType>
__global__
__launch_bounds__(default_block_size, sm_multiplier) void copy_kernel(
const gko::batch_multi_vector::uniform_batch<const ValueType> src,
const gko::batch_multi_vector::uniform_batch<ValueType> dst)
__launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel(
const gko::batch_multi_vector::uniform_batch<const ValueType> src,
const gko::batch_multi_vector::uniform_batch<ValueType> dst)
{
for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_entries;
batch_id += gridDim.x) {
80 changes: 80 additions & 0 deletions core/base/batch_multi_vector.cpp
@@ -65,6 +65,86 @@ GKO_REGISTER_OPERATION(copy, batch_multi_vector::copy);
} // namespace
} // namespace batch_multi_vector

namespace detail {


template <typename ValueType>
batch_dim<2> compute_batch_size(
const std::vector<matrix::Dense<ValueType>*>& matrices)
{
auto common_size = matrices[0]->get_size();
for (size_type i = 1; i < matrices.size(); ++i) {
GKO_ASSERT_EQUAL_DIMENSIONS(common_size, matrices[i]->get_size());
}
return batch_dim<2>{matrices.size(), common_size};
}


} // namespace detail


template <typename ValueType>
BatchMultiVector<ValueType>::BatchMultiVector(
std::shared_ptr<const Executor> exec, const batch_dim<2>& size)
: EnablePolymorphicObject<BatchMultiVector<ValueType>>(exec),
batch_size_(size),
values_(exec, compute_num_elems(size))
{}


template <typename ValueType>
BatchMultiVector<ValueType>::BatchMultiVector(
std::shared_ptr<const Executor> exec,
const std::vector<matrix::Dense<ValueType>*>& matrices)
: EnablePolymorphicObject<BatchMultiVector<ValueType>>(exec),
batch_size_{detail::compute_batch_size(matrices)},
values_(exec, compute_num_elems(batch_size_))
{
for (size_type i = 0; i < this->get_num_batch_entries(); ++i) {
auto local_exec = matrices[i]->get_executor();
exec->copy_from(
local_exec.get(), matrices[i]->get_num_stored_elements(),
matrices[i]->get_const_values(),
this->get_values() + this->get_size().get_cumulative_offset(i));
}
}


template <typename ValueType>
BatchMultiVector<ValueType>::BatchMultiVector(
std::shared_ptr<const Executor> exec, size_type num_duplications,
const matrix::Dense<value_type>* input)
: BatchMultiVector<ValueType>(
exec, gko::batch_dim<2>(num_duplications, input->get_size()))
{
size_type offset = 0;
for (size_type i = 0; i < num_duplications; ++i) {
exec->copy_from(input->get_executor().get(),
input->get_num_stored_elements(),
input->get_const_values(), this->get_values() + offset);
offset += input->get_num_stored_elements();
}
}


template <typename ValueType>
BatchMultiVector<ValueType>::BatchMultiVector(
std::shared_ptr<const Executor> exec, size_type num_duplications,
const BatchMultiVector<value_type>* input)
: BatchMultiVector<ValueType>(
exec,
gko::batch_dim<2>(input->get_num_batch_entries() * num_duplications,
input->get_common_size()))
{
size_type offset = 0;
for (size_type i = 0; i < num_duplications; ++i) {
exec->copy_from(input->get_executor().get(),
input->get_num_stored_elements(),
input->get_const_values(), this->get_values() + offset);
offset += input->get_num_stored_elements();
}
}


template <typename ValueType>
std::unique_ptr<BatchMultiVector<ValueType>>
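For orientation, the following is a standalone sketch of the value layout the two duplication constructors above produce; it uses std::vector<double> in place of the executor-allocated values array, and duplicate_values is a made-up name, not Ginkgo API.

#include <algorithm>
#include <cstddef>
#include <vector>

// Mirrors the loop in the duplication constructors above: each copy of the
// input values is written at an offset that advances by the number of stored
// elements per copy, so all duplicates end up contiguous in one flat array.
std::vector<double> duplicate_values(const std::vector<double>& input,
                                     std::size_t num_duplications)
{
    std::vector<double> values(input.size() * num_duplications);
    std::size_t offset = 0;
    for (std::size_t i = 0; i < num_duplications; ++i) {
        std::copy(input.begin(), input.end(), values.begin() + offset);
        offset += input.size();
    }
    return values;
}

Duplicating a whole BatchMultiVector works the same way, except that the input already stores all of its batch entries back to back, so each pass copies the complete batch.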
13 changes: 13 additions & 0 deletions core/test/base/batch_multi_vector.cpp
@@ -97,13 +97,15 @@ TYPED_TEST_SUITE(BatchMultiVector, gko::test::ValueTypes);
TYPED_TEST(BatchMultiVector, CanBeEmpty)
{
auto empty = gko::BatchMultiVector<TypeParam>::create(this->exec);

this->assert_empty(empty.get());
}


TYPED_TEST(BatchMultiVector, KnowsItsSizeAndValues)
{
ASSERT_NE(this->mtx->get_const_values(), nullptr);

this->assert_equal_to_original_mtx(this->mtx.get());
}

@@ -119,7 +121,9 @@ TYPED_TEST(BatchMultiVector, CanGetValuesForEntry)
TYPED_TEST(BatchMultiVector, CanBeCopied)
{
auto mtx_copy = gko::BatchMultiVector<TypeParam>::create(this->exec);

mtx_copy->copy_from(this->mtx.get());

this->assert_equal_to_original_mtx(this->mtx.get());
this->mtx->at(0, 0, 0) = 7;
this->mtx->at(0, 1) = 7;
@@ -130,14 +134,17 @@ TYPED_TEST(BatchMultiVector, CanBeMoved)
TYPED_TEST(BatchMultiVector, CanBeMoved)
{
auto mtx_copy = gko::BatchMultiVector<TypeParam>::create(this->exec);

this->mtx->move_to(mtx_copy.get());

this->assert_equal_to_original_mtx(mtx_copy.get());
}


TYPED_TEST(BatchMultiVector, CanBeCloned)
{
auto mtx_clone = this->mtx->clone();

this->assert_equal_to_original_mtx(
dynamic_cast<decltype(this->mtx.get())>(mtx_clone.get()));
}
@@ -146,13 +153,15 @@ TYPED_TEST(BatchMultiVector, CanBeCloned)
TYPED_TEST(BatchMultiVector, CanBeCleared)
{
this->mtx->clear();

this->assert_empty(this->mtx.get());
}


TYPED_TEST(BatchMultiVector, CanBeConstructedWithSize)
{
using size_type = gko::size_type;

auto m = gko::BatchMultiVector<TypeParam>::create(
this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 4)));

@@ -281,6 +290,7 @@ TYPED_TEST(BatchMultiVector, CanBeConstructedFromBatchMultiVectorMatrices)
TYPED_TEST(BatchMultiVector, CanBeListConstructed)
{
using value_type = typename TestFixture::value_type;

auto m = gko::batch_initialize<gko::BatchMultiVector<TypeParam>>(
{{1.0, 2.0}, {1.0, 3.0}}, this->exec);

@@ -296,6 +306,7 @@ TYPED_TEST(BatchMultiVector, CanBeListConstructedByCopies)
TYPED_TEST(BatchMultiVector, CanBeListConstructedByCopies)
{
using value_type = typename TestFixture::value_type;

auto m = gko::batch_initialize<gko::BatchMultiVector<TypeParam>>(
2, I<value_type>({1.0, 2.0}), this->exec);

Expand All @@ -312,6 +323,7 @@ TYPED_TEST(BatchMultiVector, CanBeDoubleListConstructed)
{
using value_type = typename TestFixture::value_type;
using T = value_type;

auto m = gko::batch_initialize<gko::BatchMultiVector<TypeParam>>(
{{I<T>{1.0, 1.0, 0.0}, I<T>{2.0, 4.0, 3.0}, I<T>{3.0, 6.0, 1.0}},
{I<T>{1.0, 2.0, -1.0}, I<T>{3.0, 4.0, -2.0}, I<T>{5.0, 6.0, -3.0}}},
@@ -401,6 +413,7 @@ TYPED_TEST(BatchMultiVector, CanBeReadFromSparseMatrixData)
{
using value_type = typename TestFixture::value_type;
auto m = gko::BatchMultiVector<TypeParam>::create(this->exec);

// clang-format off
m->read({gko::matrix_data<TypeParam>{{2, 2},
{{0, 0, 1.0},
2 changes: 1 addition & 1 deletion cuda/base/batch_multi_vector_kernels.cu
@@ -65,7 +65,7 @@ namespace batch_multi_vector {


constexpr auto default_block_size = 256;
constexpr int sm_multiplier = 4;
constexpr int sm_oversubscription = 4;

// clang-format off

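For reference, the second argument of __launch_bounds__ is (by standard CUDA semantics, not anything introduced in this commit) the minimum number of resident blocks per multiprocessor that the compiler should budget registers for, which the new name sm_oversubscription reflects better than sm_multiplier. A minimal standalone sketch with made-up constant and kernel names:

constexpr int example_block_size = 256;
constexpr int example_oversubscription = 4;

// At most example_block_size threads per block, and keep register usage low
// enough that at least example_oversubscription blocks can be resident on
// each streaming multiprocessor.
__global__ __launch_bounds__(example_block_size,
                             example_oversubscription) void scale_by_two(
    double* data, int num_elems)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num_elems) {
        data[idx] *= 2.0;
    }
}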
2 changes: 1 addition & 1 deletion cuda/base/batch_struct.hpp
@@ -51,7 +51,7 @@ namespace cuda {
/** @file batch_struct.hpp
*
* Helper functions to generate a batch struct from a batch LinOp,
* while also shallow-casting to the requried CUDA scalar type.
* while also shallow-casting to the required CUDA scalar type.
*
* A specialization is needed for every format of every kind of linear algebra
* object. These are intended to be called on the host.
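As a rough illustration of the "shallow cast" this comment describes (the function name below is made up, the reinterpret_cast assumes the usual layout compatibility of std::complex<float> and thrust::complex<float>, and the actual helpers in this file are not shown in the diff):

#include <complex>

#include <thrust/complex.h>

// Reinterpret host-facing complex storage as the CUDA-side scalar type
// without copying any data; real-valued data needs no conversion at all.
inline thrust::complex<float>* to_cuda_scalar(std::complex<float>* ptr)
{
    return reinterpret_cast<thrust::complex<float>*>(ptr);
}

inline double* to_cuda_scalar(double* ptr) { return ptr; }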
2 changes: 1 addition & 1 deletion dpcpp/base/batch_struct.hpp
@@ -50,7 +50,7 @@ namespace dpcpp {
/** @file batch_struct.hpp
*
* Helper functions to generate a batch struct from a batch LinOp,
* while also shallow-casting to the requried DPCPP scalar type.
* while also shallow-casting to the required DPCPP scalar type.
*
* A specialization is needed for every format of every kind of linear algebra
* object. These are intended to be called on the host.
2 changes: 1 addition & 1 deletion hip/base/batch_multi_vector_kernels.hip.cpp
@@ -66,7 +66,7 @@ namespace batch_multi_vector {


constexpr auto default_block_size = 256;
constexpr int sm_multiplier = 4;
constexpr int sm_oversubscription = 4;


// clang-format off
2 changes: 1 addition & 1 deletion hip/base/batch_struct.hip.hpp
@@ -51,7 +51,7 @@ namespace hip {
/** @file batch_struct.hpp
*
* Helper functions to generate a batch struct from a batch LinOp,
* while also shallow-casting to the requried Hip scalar type.
* while also shallow-casting to the required Hip scalar type.
*
* A specialization is needed for every format of every kind of linear algebra
* object. These are intended to be called on the host.
1 change: 1 addition & 0 deletions include/ginkgo/core/base/batch_lin_op_helpers.hpp
@@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <memory>
#include <type_traits>
#include <utility>
#include <vector>


#include <ginkgo/core/base/abstract_factory.hpp>