Review updates

Co-authored-by: Marcel Koch <[email protected]>
Co-authored-by: Tobias Ribizel <[email protected]>
3 people committed on Jul 21, 2023
1 parent a60ae1e commit b724eeb
Showing 14 changed files with 184 additions and 184 deletions.
26 changes: 19 additions & 7 deletions common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc
@@ -36,11 +36,16 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
const BatchMultiVector<ValueType>* const alpha,
BatchMultiVector<ValueType>* const x)
{
const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier;
const auto num_blocks = x->get_num_batch_entries();
const auto alpha_ub = get_batch_struct(alpha);
const auto x_ub = get_batch_struct(x);
scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
alpha_ub, x_ub);
if (alpha->get_common_size()[1] == 1) {
scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
alpha_ub, x_ub, [] __device__(int col) { return 0; });
} else {
scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
alpha_ub, x_ub, [] __device__(int col) { return col; });
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -53,13 +58,20 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
const BatchMultiVector<ValueType>* const x,
BatchMultiVector<ValueType>* const y)
{
const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier;
const auto num_blocks = x->get_num_batch_entries();
const size_type nrhs = x->get_common_size()[1];
const auto alpha_ub = get_batch_struct(alpha);
const auto x_ub = get_batch_struct(x);
const auto y_ub = get_batch_struct(y);
add_scaled_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(alpha_ub, x_ub, y_ub);
if (alpha->get_common_size()[1] == 1) {
add_scaled_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(
alpha_ub, x_ub, y_ub, [] __device__(int col) { return 0; });
} else {
add_scaled_kernel<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(
alpha_ub, x_ub, y_ub, [] __device__(int col) { return col; });
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -107,7 +119,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
const BatchMultiVector<ValueType>* x,
BatchMultiVector<ValueType>* result)
{
const auto num_blocks = exec->get_num_multiprocessor() * sm_multiplier;
const auto num_blocks = x->get_num_batch_entries();
const auto result_ub = get_batch_struct(result);
const auto x_ub = get_batch_struct(x);
copy_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
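The launcher changes above all follow the same pattern: the grid is sized as one block per batch entry instead of get_num_multiprocessor() * sm_multiplier, and the runtime branch on alpha's number of columns moves out of the device code — the host picks one of two __device__ lambdas (always column 0 for a single scalar, the identity map for per-column scaling) and passes it as the Mapping argument. Below is a minimal, self-contained CUDA sketch of that dispatch idiom; the names (scale_rows, raw pointers, managed memory) are hypothetical stand-ins for Ginkgo's batch structs, and the extended device lambdas require nvcc's --extended-lambda flag.

```cuda
#include <cstdio>

// Hypothetical stand-in for the real kernel: scales an n x m row-major
// matrix x, where map(col) selects which alpha entry applies to a column
// (always 0 for a single scalar alpha, col for per-column scaling).
template <typename ValueType, typename Mapping>
__global__ void scale_rows(int n, int m, const ValueType* alpha,
                           ValueType* x, Mapping map)
{
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n * m;
         i += blockDim.x * gridDim.x) {
        const int col = i % m;
        x[i] *= alpha[map(col)];
    }
}

int main()
{
    const int n = 4, m = 3;
    double* x;
    double* alpha;
    cudaMallocManaged(&x, n * m * sizeof(double));
    cudaMallocManaged(&alpha, m * sizeof(double));
    for (int i = 0; i < n * m; ++i) x[i] = 1.0;
    for (int j = 0; j < m; ++j) alpha[j] = j + 1.0;

    // Host-side dispatch, as in the launchers above: each branch
    // instantiates its own kernel with a different mapping lambda.
    const bool single_alpha = false;
    if (single_alpha) {
        scale_rows<<<2, 128>>>(n, m, alpha, x,
                               [] __device__(int) { return 0; });
    } else {
        scale_rows<<<2, 128>>>(n, m, alpha, x,
                               [] __device__(int col) { return col; });
    }
    cudaDeviceSynchronize();
    printf("x[0..2] = %g %g %g\n", x[0], x[1], x[2]);
    cudaFree(x);
    cudaFree(alpha);
}
```

Each branch compiles into its own kernel instantiation, so the per-element if (alpha.num_rhs == 1) check from the old device code disappears entirely; the single-scalar case simply reads alpha[0].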
104 changes: 49 additions & 55 deletions common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
@@ -35,85 +35,75 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Scales the vectors in global or shared memory with a factor of alpha (alpha
* is in global memory or shared memory)
*/
template <typename ValueType>
template <typename ValueType, typename Mapping>
__device__ __forceinline__ void scale(
const gko::batch_multi_vector::BatchEntry<const ValueType>& alpha,
const gko::batch_multi_vector::BatchEntry<ValueType>& x)
const gko::batch_multi_vector::batch_entry<const ValueType>& alpha,
const gko::batch_multi_vector::batch_entry<ValueType>& x, Mapping map)
{
const int max_li = x.num_rows * x.num_rhs;
for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
const int row = li / x.num_rhs;
const int col = li % x.num_rhs;

if (alpha.num_rhs == 1) {
x.values[row * x.stride + col] =
alpha.values[0] * x.values[row * x.stride + col];
} else {
x.values[row * x.stride + col] =
alpha.values[col] * x.values[row * x.stride + col];
}
x.values[row * x.stride + col] =
alpha.values[map(col)] * x.values[row * x.stride + col];
}
}

template <typename ValueType>
template <typename ValueType, typename Mapping>
__global__
__launch_bounds__(default_block_size, sm_multiplier) void scale_kernel(
const gko::batch_multi_vector::UniformBatch<const ValueType> alpha,
const gko::batch_multi_vector::UniformBatch<ValueType> x)
__launch_bounds__(default_block_size, sm_multiplier) void scale_kernel(
const gko::batch_multi_vector::uniform_batch<const ValueType> alpha,
const gko::batch_multi_vector::uniform_batch<ValueType> x, Mapping map)
{
for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries;
ibatch += gridDim.x) {
const auto alpha_b = gko::batch::batch_entry(alpha, ibatch);
const auto x_b = gko::batch::batch_entry(x, ibatch);
scale(alpha_b, x_b);
scale(alpha_b, x_b, map);
}
}


template <typename ValueType>
template <typename ValueType, typename Mapping>
__device__ __forceinline__ void add_scaled(
const gko::batch_multi_vector::BatchEntry<const ValueType>& alpha,
const gko::batch_multi_vector::BatchEntry<const ValueType>& x,
const gko::batch_multi_vector::BatchEntry<ValueType>& y)
const gko::batch_multi_vector::batch_entry<const ValueType>& alpha,
const gko::batch_multi_vector::batch_entry<const ValueType>& x,
const gko::batch_multi_vector::batch_entry<ValueType>& y, Mapping map)
{
const int max_li = x.num_rows * x.num_rhs;
for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
const int row = li / x.num_rhs;
const int col = li % x.num_rhs;

if (alpha.num_rhs == 1) {
y.values[row * y.stride + col] +=
alpha.values[0] * x.values[row * x.stride + col];
} else {
y.values[row * y.stride + col] +=
alpha.values[col] * x.values[row * x.stride + col];
}
y.values[row * y.stride + col] +=
alpha.values[map(col)] * x.values[row * x.stride + col];
}
}

template <typename ValueType>
template <typename ValueType, typename Mapping>
__global__
__launch_bounds__(default_block_size, sm_multiplier) void add_scaled_kernel(
const gko::batch_multi_vector::UniformBatch<const ValueType> alpha,
const gko::batch_multi_vector::UniformBatch<const ValueType> x,
const gko::batch_multi_vector::UniformBatch<ValueType> y)
__launch_bounds__(default_block_size, sm_multiplier) void add_scaled_kernel(
const gko::batch_multi_vector::uniform_batch<const ValueType> alpha,
const gko::batch_multi_vector::uniform_batch<const ValueType> x,
const gko::batch_multi_vector::uniform_batch<ValueType> y, Mapping map)
{
for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries;
ibatch += gridDim.x) {
const auto alpha_b = gko::batch::batch_entry(alpha, ibatch);
const auto x_b = gko::batch::batch_entry(x, ibatch);
const auto y_b = gko::batch::batch_entry(y, ibatch);
add_scaled(alpha_b, x_b, y_b);
add_scaled(alpha_b, x_b, y_b, map);
}
}


template <typename ValueType>
__device__ __forceinline__ void one_dot(
const gko::batch_multi_vector::BatchEntry<const ValueType>& x,
const gko::batch_multi_vector::BatchEntry<const ValueType>& y,
const gko::batch_multi_vector::batch_entry<const ValueType>& x,
const gko::batch_multi_vector::batch_entry<const ValueType>& y,
const int rhs_index,
const gko::batch_multi_vector::BatchEntry<ValueType>& result,
const gko::batch_multi_vector::batch_entry<ValueType>& result,
group::thread_block_tile<config::warp_size>& subwarp_grp)
{
ValueType val = zero<ValueType>();
@@ -143,9 +133,9 @@ __device__ __forceinline__ void one_dot(
*/
template <typename ValueType>
__device__ __forceinline__ void compute_dot_product(
const gko::batch_multi_vector::BatchEntry<const ValueType>& x,
const gko::batch_multi_vector::BatchEntry<const ValueType>& y,
const gko::batch_multi_vector::BatchEntry<ValueType>& result)
const gko::batch_multi_vector::batch_entry<const ValueType>& x,
const gko::batch_multi_vector::batch_entry<const ValueType>& y,
const gko::batch_multi_vector::batch_entry<ValueType>& result)
{
constexpr auto tile_size = config::warp_size;
auto thread_block = group::this_thread_block();
@@ -165,17 +155,17 @@ __global__ __launch_bounds__(
default_block_size,
sm_multiplier) void compute_dot_product_kernel(const gko::
batch_multi_vector::
UniformBatch<
uniform_batch<
const ValueType>
x,
const gko::
batch_multi_vector::
UniformBatch<
uniform_batch<
const ValueType>
y,
const gko::
batch_multi_vector::
UniformBatch<
uniform_batch<
ValueType>
result)
{
@@ -191,9 +181,9 @@

template <typename ValueType>
__device__ __forceinline__ void one_norm2(
const gko::batch_multi_vector::BatchEntry<const ValueType>& x,
const gko::batch_multi_vector::batch_entry<const ValueType>& x,
const int rhs_index,
const gko::batch_multi_vector::BatchEntry<remove_complex<ValueType>>&
const gko::batch_multi_vector::batch_entry<remove_complex<ValueType>>&
result,
group::thread_block_tile<config::warp_size>& subwarp_grp)
{
@@ -225,8 +215,8 @@ __device__ __forceinline__ void one_norm2(
*/
template <typename ValueType>
__device__ __forceinline__ void compute_norm2(
const gko::batch_multi_vector::BatchEntry<const ValueType>& x,
const gko::batch_multi_vector::BatchEntry<remove_complex<ValueType>>&
const gko::batch_multi_vector::batch_entry<const ValueType>& x,
const gko::batch_multi_vector::batch_entry<remove_complex<ValueType>>&
result)
{
constexpr auto tile_size = config::warp_size;
@@ -243,11 +233,15 @@ __device__ __forceinline__ void compute_norm2(


template <typename ValueType>
__global__
__launch_bounds__(default_block_size, sm_multiplier) void compute_norm2_kernel(
const gko::batch_multi_vector::UniformBatch<const ValueType> x,
const gko::batch_multi_vector::UniformBatch<remove_complex<ValueType>>
result)
__global__ __launch_bounds__(
default_block_size,
sm_multiplier) void compute_norm2_kernel(const gko::batch_multi_vector::
uniform_batch<const ValueType>
x,
const gko::batch_multi_vector::
uniform_batch<
remove_complex<ValueType>>
result)
{
for (size_type ibatch = blockIdx.x; ibatch < x.num_batch_entries;
ibatch += gridDim.x) {
@@ -266,8 +260,8 @@ __launch_bounds__(default_block_size, sm_multiplier) void compute_norm2_kernel(
*/
template <typename ValueType>
__device__ __forceinline__ void copy(
const gko::batch_multi_vector::BatchEntry<const ValueType>& in,
const gko::batch_multi_vector::BatchEntry<ValueType>& out)
const gko::batch_multi_vector::batch_entry<const ValueType>& in,
const gko::batch_multi_vector::batch_entry<ValueType>& out)
{
for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs;
iz += blockDim.x) {
Expand All @@ -280,9 +274,9 @@ __device__ __forceinline__ void copy(

template <typename ValueType>
__global__
__launch_bounds__(default_block_size, sm_multiplier) void copy_kernel(
const gko::batch_multi_vector::UniformBatch<const ValueType> src,
const gko::batch_multi_vector::UniformBatch<ValueType> dst)
__launch_bounds__(default_block_size, sm_multiplier) void copy_kernel(
const gko::batch_multi_vector::uniform_batch<const ValueType> src,
const gko::batch_multi_vector::uniform_batch<ValueType> dst)
{
for (size_type ibatch = blockIdx.x; ibatch < src.num_batch_entries;
ibatch += gridDim.x) {
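Throughout this file the kernels share a two-level structure that the hunks above only show in fragments: each __global__ kernel grid-strides over the batch entries via blockIdx.x, extracts a per-entry view with gko::batch::batch_entry, and forwards it to a __device__ __forceinline__ helper in which the threads of the block stride over the num_rows * num_rhs elements of that entry. A stripped-down sketch of that structure, using a hypothetical flat_batch view in place of Ginkgo's uniform_batch/batch_entry types:

```cuda
// Hypothetical flattened view of a uniform batch: num_batch dense entries
// of size num_rows x num_rhs, stored contiguously one after another.
template <typename ValueType>
struct flat_batch {
    ValueType* values;
    int num_batch;
    int num_rows;
    int num_rhs;
};

// Per-entry helper: the threads of one block cooperatively stride over
// the num_rows * num_rhs elements of a single batch entry.
template <typename ValueType>
__device__ __forceinline__ void scale_entry(ValueType* entry, int num_rows,
                                            int num_rhs, ValueType alpha)
{
    for (int li = threadIdx.x; li < num_rows * num_rhs; li += blockDim.x) {
        entry[li] *= alpha;
    }
}

// Global kernel: blocks grid-stride over the batch entries, so the kernel
// stays correct whether the grid has one block per entry or fewer.
template <typename ValueType>
__global__ void scale_batch_kernel(flat_batch<ValueType> x, ValueType alpha)
{
    const int entry_size = x.num_rows * x.num_rhs;
    for (int ibatch = blockIdx.x; ibatch < x.num_batch; ibatch += gridDim.x) {
        scale_entry(x.values + ibatch * entry_size, x.num_rows, x.num_rhs,
                    alpha);
    }
}
```

With the launcher change above (grid size equal to the number of batch entries), the outer loop runs exactly once per block; keeping it as a grid-stride loop is what makes the kernels independent of the launch configuration.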
36 changes: 12 additions & 24 deletions core/base/batch_multi_vector.cpp
@@ -72,12 +72,10 @@ void BatchMultiVector<ValueType>::scale_impl(
GKO_ASSERT_EQ(alpha->get_num_batch_entries(),
this->get_num_batch_entries());
GKO_ASSERT_EQUAL_ROWS(alpha->get_common_size(), dim<2>(1, 1));
for (size_type b = 0; b < alpha->get_num_batch_entries(); ++b) {
if (alpha->get_common_size()[1] != 1) {
// different alpha for each column
GKO_ASSERT_EQUAL_COLS(this->get_common_size(),
alpha->get_common_size());
}
if (alpha->get_common_size()[1] != 1) {
// different alpha for each column
GKO_ASSERT_EQUAL_COLS(this->get_common_size(),
alpha->get_common_size());
}
this->get_executor()->run(batch_multi_vector::make_scale(alpha, this));
}
@@ -91,12 +89,10 @@ void BatchMultiVector<ValueType>::add_scaled_impl(
GKO_ASSERT_EQ(alpha->get_num_batch_entries(),
this->get_num_batch_entries());
GKO_ASSERT_EQUAL_ROWS(alpha->get_common_size(), dim<2>(1, 1));
for (size_type b = 0; b < alpha->get_num_batch_entries(); ++b) {
if (alpha->get_common_size()[1] != 1) {
// different alpha for each column
GKO_ASSERT_EQUAL_COLS(this->get_common_size(),
alpha->get_common_size());
}
if (alpha->get_common_size()[1] != 1) {
// different alpha for each column
GKO_ASSERT_EQUAL_COLS(this->get_common_size(),
alpha->get_common_size());
}
GKO_ASSERT_EQ(b->get_num_batch_entries(), this->get_num_batch_entries());
GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size());
@@ -162,11 +158,11 @@ void BatchMultiVector<ValueType>::move_to(


template <typename MatrixType, typename MatrixData>
inline void read_impl(MatrixType* mtx, const std::vector<MatrixData>& data)
void read_impl(MatrixType* mtx, const std::vector<MatrixData>& data)
{
GKO_ASSERT(data.size() > 0);
auto common_size = data[0].size;
auto batch_size = batch_dim<2>(data.size(), common_size);
size_type ind = 0;
for (const auto& b : data) {
auto b_size = b.size;
GKO_ASSERT_EQUAL_DIMENSIONS(common_size, b_size);
Expand Down Expand Up @@ -208,17 +204,9 @@ void BatchMultiVector<ValueType>::read(const std::vector<mat_data32>& data)


template <typename MatrixType, typename MatrixData>
inline void write_impl(const MatrixType* mtx, std::vector<MatrixData>& data)
void write_impl(const MatrixType* mtx, std::vector<MatrixData>& data)
{
std::unique_ptr<const BatchMultiVector<typename MatrixData::value_type>>
op{};
const MatrixType* tmp{};
if (mtx->get_executor()->get_master() != mtx->get_executor()) {
op = mtx->clone(mtx->get_executor()->get_master());
tmp = static_cast<const MatrixType*>(op.get());
} else {
tmp = mtx;
}
auto tmp = make_temporary_clone(mtx->get_executor()->get_master(), mtx);

data = std::vector<MatrixData>(mtx->get_num_batch_entries());
for (size_type b = 0; b < mtx->get_num_batch_entries(); ++b) {
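In write_impl, the hand-written "clone to the master executor if needed" branch is replaced by a single make_temporary_clone call. Conceptually, the helper is an RAII wrapper that either aliases the object (when it already lives on the requested executor) or owns a copy created there; because write_impl only reads the matrix, nothing has to be copied back to the device. A minimal sketch of that ownership pattern — the Executor stand-in and the class name are hypothetical, not Ginkgo's actual temporary_clone implementation:

```cpp
#include <memory>

// Hypothetical stand-in for an executor handle.
struct Executor {};

// Sketch of a read-only temporary clone. T is expected to provide
// get_executor() returning a std::shared_ptr<Executor> and
// clone(std::shared_ptr<Executor>) returning a std::unique_ptr<T>.
template <typename T>
class temporary_clone_sketch {
public:
    temporary_clone_sketch(std::shared_ptr<Executor> exec, const T* obj)
    {
        if (obj->get_executor() == exec) {
            ptr_ = obj;                // already on `exec`: just alias it
        } else {
            copy_ = obj->clone(exec);  // otherwise copy to `exec` ...
            ptr_ = copy_.get();        // ... and expose the copy instead
        }
    }

    const T* operator->() const { return ptr_; }
    const T* get() const { return ptr_; }

private:
    std::unique_ptr<const T> copy_;  // owns the copy (empty when aliasing)
    const T* ptr_;                   // what callers actually dereference
};
```

This mirrors the logic the removed lines of write_impl spelled out by hand; the new version simply delegates it to the library helper.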