From ffc1059db28402020fa2813580f7e143dd6bd0eb Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Tue, 23 Jul 2024 13:34:59 +0200
Subject: [PATCH 1/6] unify cuda/hip batch_mvec

---
 common/cuda_hip/CMakeLists.txt                |   1 +
 ...hpp.inc => batch_multi_vector_kernels.cpp} |  67 +++-
 .../base/batch_multi_vector_kernels.hpp       | 326 ++++++++++++++++++
 .../base/batch_multi_vector_kernels.hpp.inc   |  43 +--
 cuda/CMakeLists.txt                           |   1 -
 cuda/base/batch_multi_vector_kernels.cu       |  56 ---
 hip/CMakeLists.txt                            |   1 -
 hip/base/batch_multi_vector_kernels.hip.cpp   |  56 ---
 8 files changed, 390 insertions(+), 161 deletions(-)
 rename common/cuda_hip/base/{batch_multi_vector_kernel_launcher.hpp.inc => batch_multi_vector_kernels.cpp} (67%)
 create mode 100644 common/cuda_hip/base/batch_multi_vector_kernels.hpp
 delete mode 100644 cuda/base/batch_multi_vector_kernels.cu
 delete mode 100644 hip/base/batch_multi_vector_kernels.hip.cpp

diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index 463abfd9284..15d3a82419e 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -1,5 +1,6 @@
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 set(CUDA_HIP_SOURCES
+    base/batch_multi_vector_kernels.cpp
     base/device_matrix_data_kernels.cpp
     base/index_set_kernels.cpp
     components/prefix_sum_kernels.cpp
diff --git a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
similarity index 67%
rename from common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc
rename to common/cuda_hip/base/batch_multi_vector_kernels.cpp
index 19b5b74a547..17f65487464 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
@@ -2,6 +2,32 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "core/base/batch_multi_vector_kernels.hpp"
+#include "core/base/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_multi_vector {
+
+
+constexpr auto default_block_size = 256;
+
+
 template <typename ValueType>
 void scale(std::shared_ptr<const DefaultExecutor> exec,
            const batch::MultiVector<ValueType>* const alpha,
@@ -11,16 +37,19 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto x_ub = get_batch_struct(x);
     if (alpha->get_common_size()[1] == 1) {
-        scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return 0; });
     } else if (alpha->get_common_size() == x->get_common_size()) {
-        scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
             alpha_ub, x_ub, [] __device__(int row, int col, int stride) {
                 return row * stride + col;
             });
     } else {
-        scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return col; });
     }
@@ -42,12 +71,12 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     if (alpha->get_common_size()[1] == 1) {
-        add_scaled_kernel<<<num_blocks, default_block_size, 0,
-                            exec->get_stream()>>>(
+        batch_single_kernels::add_scaled_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return 0; });
     } else {
-        add_scaled_kernel<<<num_blocks, default_block_size, 0,
-                            exec->get_stream()>>>(
+        batch_single_kernels::add_scaled_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return col; });
     }
 }
@@ -67,8 +96,8 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
-                                     exec->get_stream()>>>(
+    batch_single_kernels::compute_gen_dot_product_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
         x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
 }
 
@@ -87,8 +116,8 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
-                                     exec->get_stream()>>>(
+    batch_single_kernels::compute_gen_dot_product_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
         x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
 }
 
@@ -105,8 +134,9 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_rhs = x->get_common_size()[1];
     const auto x_ub = get_batch_struct(x);
     const auto res_ub = get_batch_struct(result);
-    compute_norm2_kernel<<<num_blocks, default_block_size, 0,
-                           exec->get_stream()>>>(x_ub, res_ub);
+    batch_single_kernels::compute_norm2_kernel<<<num_blocks, default_block_size,
+                                                 0, exec->get_stream()>>>(
+        x_ub, res_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -121,8 +151,15 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_blocks = x->get_num_batch_items();
     const auto result_ub = get_batch_struct(result);
     const auto x_ub = get_batch_struct(x);
-    copy_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        x_ub, result_ub);
+    batch_single_kernels::
+        copy_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+            x_ub, result_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
+
+
+}  // namespace batch_multi_vector
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
new file mode 100644
index 00000000000..36aa69d7d99
--- /dev/null
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
@@ -0,0 +1,326 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/base/batch_struct.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/batch_struct.hip.hpp"
+#else
+#error "batch struct def missing"
+#endif
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_multi_vector {
+namespace batch_single_kernels {
+
+
+constexpr auto default_block_size = 256;
+
+
+template <typename ValueType, typename Mapping>
+__device__ __forceinline__ void scale(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+    const gko::batch::multi_vector::batch_item<ValueType>& x, Mapping map)
+{
+    const int max_li = x.num_rows * x.num_rhs;
+    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
+        const int row = li / x.num_rhs;
+        const int col = li % x.num_rhs;
+
+        x.values[row * x.stride + col] =
+            alpha.values[map(row, col, alpha.stride)] *
+            x.values[row * x.stride + col];
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__global__ __launch_bounds__(default_block_size) void scale_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x, Mapping map)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        scale(alpha_b, x_b, map);
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__device__ __forceinline__ void add_scaled(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<ValueType>& y, Mapping map)
+{
+    const int max_li = x.num_rows * x.num_rhs;
+    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
+        const int row = li / x.num_rhs;
+        const int col = li % x.num_rhs;
+
+        y.values[row * y.stride + col] +=
+            alpha.values[map(col)] * x.values[row * x.stride + col];
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__global__ __launch_bounds__(default_block_size) void add_scaled_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<ValueType> y, Mapping map)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
+        add_scaled(alpha_b, x_b, y_b, map);
+    }
+}
+
+
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void single_rhs_compute_conj_dot(Group subgroup,
+                                                            const int num_rows,
+                                                            const ValueType* x,
+                                                            const ValueType* y,
+                                                            ValueType& result)
+
+{
+    ValueType val = zero<ValueType>();
+    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
+        val += conj(x[r]) * y[r];
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<ValueType>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result = val;
+    }
+}
+
+
+template <typename Group, typename ValueType, typename Mapping>
+__device__ __forceinline__ void gen_one_dot(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<const ValueType>& y,
+    const int rhs_index,
+    const gko::batch::multi_vector::batch_item<ValueType>& result,
+    Group subgroup, Mapping conj_map)
+{
+    ValueType val = zero<ValueType>();
+
+    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
+        val += conj_map(x.values[r * x.stride + rhs_index]) *
+               y.values[r * y.stride + rhs_index];
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<ValueType>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result.values[rhs_index] = val;
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__device__ __forceinline__ void compute_gen_dot_product(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<const ValueType>& y,
+    const gko::batch::multi_vector::batch_item<ValueType>& result,
+    Mapping conj_map)
+{
+    constexpr auto tile_size = config::warp_size;
+    auto thread_block = group::this_thread_block();
+    auto subgroup = group::tiled_partition<tile_size>(thread_block);
+    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
+    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
+
+    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
+         rhs_index += num_subgroups_per_block) {
+        gen_one_dot(x, y, rhs_index, result, subgroup, conj_map);
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__global__
+__launch_bounds__(default_block_size) void compute_gen_dot_product_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> y,
+    const gko::batch::multi_vector::uniform_batch<ValueType> result,
+    Mapping map)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
+        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
+        compute_gen_dot_product(x_b, y_b, r_b, map);
+    }
+}
+
+
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void single_rhs_compute_norm2(
+    Group subgroup, const int num_rows, const ValueType* x,
+    remove_complex<ValueType>& result)
+{
+    using real_type = typename gko::remove_complex<ValueType>;
+    real_type val = zero<real_type>();
+
+    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
+        val += squared_norm(x[r]);
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result = sqrt(val);
+    }
+}
+
+
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void one_norm2(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const int rhs_index,
+    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
+        result,
+    Group subgroup)
+{
+    using real_type = typename gko::remove_complex<ValueType>;
+    real_type val = zero<real_type>();
+
+    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
+        val += squared_norm(x.values[r * x.stride + rhs_index]);
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result.values[rhs_index] = sqrt(val);
+    }
+}
+
+
+/**
+ * Computes the 2-norms of some column vectors in global or shared memory.
+ *
+ * @param x  A row-major multivector with nrhs columns.
+ * @param result  Holds norm value for each vector in x.
+ */
+template <typename ValueType>
+__device__ __forceinline__ void compute_norm2(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
+        result)
+{
+    constexpr auto tile_size = config::warp_size;
+    auto thread_block = group::this_thread_block();
+    auto subgroup = group::tiled_partition<tile_size>(thread_block);
+    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
+    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
+
+    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
+         rhs_index += num_subgroups_per_block) {
+        one_norm2(x, rhs_index, result, subgroup);
+    }
+}
+
+
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void compute_norm2_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<remove_complex<ValueType>>
+        result)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
+        compute_norm2(x_b, r_b);
+    }
+}
+
+
+template <typename ValueType>
+__device__ __forceinline__ void single_rhs_copy(const int num_rows,
+                                                const ValueType* in,
+                                                ValueType* out)
+{
+    for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) {
+        out[iz] = in[iz];
+    }
+}
+
+
+/**
+ * Copies the values of one multi-vector into another.
+ *
+ * Note that the output multi-vector should already have memory allocated
+ * and stride set.
+ */
+template <typename ValueType>
+__device__ __forceinline__ void copy(
+    const gko::batch::multi_vector::batch_item<const ValueType>& in,
+    const gko::batch::multi_vector::batch_item<ValueType>& out)
+{
+    for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs;
+         iz += blockDim.x) {
+        const int i = iz / in.num_rhs;
+        const int j = iz % in.num_rhs;
+        out.values[i * out.stride + j] = in.values[i * in.stride + j];
+    }
+}
+
+
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void copy_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> src,
+    const gko::batch::multi_vector::uniform_batch<ValueType> dst)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto dst_b = gko::batch::extract_batch_item(dst, batch_id);
+        const auto src_b = gko::batch::extract_batch_item(src, batch_id);
+        copy(src_b, dst_b);
+    }
+}
+
+
+}  // namespace batch_single_kernels
+}  // namespace batch_multi_vector
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
index 9b6301674be..7af3c84303f 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
@@ -20,8 +20,7 @@ __device__ __forceinline__ void scale(
 
 
 template <typename ValueType, typename Mapping>
-__global__
-__launch_bounds__(default_block_size, sm_oversubscription) void scale_kernel(
+__global__ __launch_bounds__(default_block_size) void scale_kernel(
     const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
     const gko::batch::multi_vector::uniform_batch<ValueType> x, Mapping map)
 {
@@ -52,20 +51,10 @@ __device__ __forceinline__ void add_scaled(
 
 
 template <typename ValueType, typename Mapping>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void add_scaled_kernel(const gko::batch::multi_vector::
-                                                    uniform_batch<
-                                                        const ValueType>
-                                                        alpha,
-                                                const gko::batch::multi_vector::
-                                                    uniform_batch<
-                                                        const ValueType>
-                                                        x,
-                                                const gko::batch::multi_vector::
-                                                    uniform_batch<ValueType>
-                                                        y,
-                                                Mapping map)
+__global__ __launch_bounds__(default_block_size) void add_scaled_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<ValueType> y, Mapping map)
 {
     for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
          batch_id += gridDim.x) {
@@ -145,7 +134,7 @@ __device__ __forceinline__ void compute_gen_dot_product(
 
 template <typename ValueType, typename Mapping>
 __global__
-__launch_bounds__(default_block_size, sm_oversubscription) void compute_gen_dot_product_kernel(
+__launch_bounds__(default_block_size) void compute_gen_dot_product_kernel(
     const gko::batch::multi_vector::uniform_batch<const ValueType> x,
     const gko::batch::multi_vector::uniform_batch<const ValueType> y,
     const gko::batch::multi_vector::uniform_batch<ValueType> result,
@@ -232,19 +221,10 @@ __device__ __forceinline__ void compute_norm2(
 
 
 template <typename ValueType>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void compute_norm2_kernel(const gko::batch::
-                                                       multi_vector::
-                                                           uniform_batch<
-                                                               const ValueType>
-                                                               x,
-                                                   const gko::batch::
-                                                       multi_vector::
-                                                           uniform_batch<
-                                                               remove_complex<
-                                                                   ValueType>>
-                                                               result)
+__global__ __launch_bounds__(default_block_size) void compute_norm2_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<remove_complex<ValueType>>
+        result)
 {
     for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
          batch_id += gridDim.x) {
@@ -287,8 +267,7 @@ __device__ __forceinline__ void copy(
 
 
 template <typename ValueType>
-__global__
-__launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel(
+__global__ __launch_bounds__(default_block_size) void copy_kernel(
     const gko::batch::multi_vector::uniform_batch<const ValueType> src,
     const gko::batch::multi_vector::uniform_batch<ValueType> dst)
 {
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index d4a94eda802..3631a65f48d 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -7,7 +7,6 @@ add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kerne
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 target_sources(ginkgo_cuda
     PRIVATE
-    base/batch_multi_vector_kernels.cu
     base/device.cpp
     base/exception.cpp
     base/executor.cpp
diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu
deleted file mode 100644
index 3dad5ba94f1..00000000000
--- a/cuda/base/batch_multi_vector_kernels.cu
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/base/batch_multi_vector_kernels.hpp"
-
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/range_accessors.hpp>
-
-#include "common/cuda_hip/base/blas_bindings.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/pointer_mode_guard.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
-#include "core/base/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The MultiVector matrix format namespace.
- *
- * @ingroup batch_multi_vector
- */
-namespace batch_multi_vector {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_multi_vector
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 46b2d7bd19b..84bba295120 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -5,7 +5,6 @@ add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kerne
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 set(GINKGO_HIP_SOURCES
-    base/batch_multi_vector_kernels.hip.cpp
     base/device.hip.cpp
     base/exception.hip.cpp
     base/executor.hip.cpp
diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp
deleted file mode 100644
index 701f4655a9a..00000000000
--- a/hip/base/batch_multi_vector_kernels.hip.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/base/batch_multi_vector_kernels.hpp"
-
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/range_accessors.hpp>
-
-#include "common/cuda_hip/base/blas_bindings.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/pointer_mode_guard.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "core/base/batch_struct.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The MultiVector matrix format namespace.
- *
- * @ingroup batch_multi_vector
- */
-namespace batch_multi_vector {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_multi_vector
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko

From 9db040e5404a3d4f21da81f89061c8f6940c5c69 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 19 Aug 2024 11:03:12 +0200
Subject: [PATCH 2/6] [cuda,hip] update namespaces and includes

---
 .../base/batch_multi_vector_kernels.cpp       |  37 +--
 .../base/batch_multi_vector_kernels.hpp       |   2 -
 .../base/batch_multi_vector_kernels.hpp.inc   | 280 ------------------
 .../solver/batch_bicgstab_kernels.hpp.inc     |  41 ++-
 .../cuda_hip/solver/batch_cg_kernels.hpp.inc  |  23 +-
 cuda/solver/batch_bicgstab_kernels.cu         |   2 +-
 cuda/solver/batch_cg_kernels.cu               |   2 +-
 hip/solver/batch_bicgstab_kernels.hip.cpp     |   2 +-
 hip/solver/batch_cg_kernels.hip.cpp           |   2 +-
 9 files changed, 63 insertions(+), 328 deletions(-)
 delete mode 100644 common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc

diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.cpp b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
index 17f65487464..76565a83f80 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.cpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
@@ -37,19 +37,19 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto x_ub = get_batch_struct(x);
     if (alpha->get_common_size()[1] == 1) {
-        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
-                                             exec->get_stream()>>>(
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return 0; });
     } else if (alpha->get_common_size() == x->get_common_size()) {
-        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
-                                             exec->get_stream()>>>(
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, [] __device__(int row, int col, int stride) {
                 return row * stride + col;
             });
     } else {
-        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
-                                             exec->get_stream()>>>(
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return col; });
     }
@@ -71,11 +71,11 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     if (alpha->get_common_size()[1] == 1) {
-        batch_single_kernels::add_scaled_kernel<<<
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel<<<
             num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return 0; });
     } else {
-        batch_single_kernels::add_scaled_kernel<<<
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel<<<
             num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return col; });
     }
@@ -96,9 +96,10 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    batch_single_kernels::compute_gen_dot_product_kernel<<<
-        num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
+    GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
+                                         exec->get_stream()>>>(
+            x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -116,9 +117,10 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    batch_single_kernels::compute_gen_dot_product_kernel<<<
-        num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
+    GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
+                                         exec->get_stream()>>>(
+            x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -134,9 +136,8 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_rhs = x->get_common_size()[1];
     const auto x_ub = get_batch_struct(x);
     const auto res_ub = get_batch_struct(result);
-    batch_single_kernels::compute_norm2_kernel<<<num_blocks, default_block_size,
-                                                 0, exec->get_stream()>>>(
-        x_ub, res_ub);
+    GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(x_ub, res_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -151,7 +152,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_blocks = x->get_num_batch_items();
     const auto result_ub = get_batch_struct(result);
     const auto x_ub = get_batch_struct(x);
-    batch_single_kernels::
+    GKO_DEVICE_NAMESPACE::batch_single_kernels::
         copy_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
             x_ub, result_ub);
 }
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
index 36aa69d7d99..bb3aac67b55 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
@@ -34,7 +34,6 @@
 namespace gko {
 namespace kernels {
 namespace GKO_DEVICE_NAMESPACE {
-namespace batch_multi_vector {
 namespace batch_single_kernels {
 
 
@@ -320,7 +319,6 @@ __global__ __launch_bounds__(default_block_size) void copy_kernel(
 
 
 }  // namespace batch_single_kernels
-}  // namespace batch_multi_vector
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
deleted file mode 100644
index 7af3c84303f..00000000000
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
+++ /dev/null
@@ -1,280 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-template <typename ValueType, typename Mapping>
-__device__ __forceinline__ void scale(
-    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
-    const gko::batch::multi_vector::batch_item<ValueType>& x, Mapping map)
-{
-    const int max_li = x.num_rows * x.num_rhs;
-    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
-        const int row = li / x.num_rhs;
-        const int col = li % x.num_rhs;
-
-        x.values[row * x.stride + col] =
-            alpha.values[map(row, col, alpha.stride)] *
-            x.values[row * x.stride + col];
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__global__ __launch_bounds__(default_block_size) void scale_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
-    const gko::batch::multi_vector::uniform_batch<ValueType> x, Mapping map)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
-        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
-        scale(alpha_b, x_b, map);
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__device__ __forceinline__ void add_scaled(
-    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const gko::batch::multi_vector::batch_item<ValueType>& y, Mapping map)
-{
-    const int max_li = x.num_rows * x.num_rhs;
-    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
-        const int row = li / x.num_rhs;
-        const int col = li % x.num_rhs;
-
-        y.values[row * y.stride + col] +=
-            alpha.values[map(col)] * x.values[row * x.stride + col];
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__global__ __launch_bounds__(default_block_size) void add_scaled_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
-    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
-    const gko::batch::multi_vector::uniform_batch<ValueType> y, Mapping map)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
-        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
-        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
-        add_scaled(alpha_b, x_b, y_b, map);
-    }
-}
-
-
-template <typename Group, typename ValueType>
-__device__ __forceinline__ void single_rhs_compute_conj_dot(Group subgroup,
-                                                            const int num_rows,
-                                                            const ValueType* x,
-                                                            const ValueType* y,
-                                                            ValueType& result)
-
-{
-    ValueType val = zero<ValueType>();
-    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
-        val += conj(x[r]) * y[r];
-    }
-
-    // subgroup level reduction
-    val = reduce(subgroup, val, thrust::plus<ValueType>{});
-
-    if (subgroup.thread_rank() == 0) {
-        result = val;
-    }
-}
-
-
-template <typename Group, typename ValueType, typename Mapping>
-__device__ __forceinline__ void gen_one_dot(
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const gko::batch::multi_vector::batch_item<const ValueType>& y,
-    const int rhs_index,
-    const gko::batch::multi_vector::batch_item<ValueType>& result,
-    Group subgroup, Mapping conj_map)
-{
-    ValueType val = zero<ValueType>();
-
-    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
-        val += conj_map(x.values[r * x.stride + rhs_index]) *
-               y.values[r * y.stride + rhs_index];
-    }
-
-    // subgroup level reduction
-    val = reduce(subgroup, val, thrust::plus<ValueType>{});
-
-    if (subgroup.thread_rank() == 0) {
-        result.values[rhs_index] = val;
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__device__ __forceinline__ void compute_gen_dot_product(
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const gko::batch::multi_vector::batch_item<const ValueType>& y,
-    const gko::batch::multi_vector::batch_item<ValueType>& result,
-    Mapping conj_map)
-{
-    constexpr auto tile_size = config::warp_size;
-    auto thread_block = group::this_thread_block();
-    auto subgroup = group::tiled_partition<tile_size>(thread_block);
-    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
-    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
-
-    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
-         rhs_index += num_subgroups_per_block) {
-        gen_one_dot(x, y, rhs_index, result, subgroup, conj_map);
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__global__
-__launch_bounds__(default_block_size) void compute_gen_dot_product_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
-    const gko::batch::multi_vector::uniform_batch<const ValueType> y,
-    const gko::batch::multi_vector::uniform_batch<ValueType> result,
-    Mapping map)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
-        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
-        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
-        compute_gen_dot_product(x_b, y_b, r_b, map);
-    }
-}
-
-
-template <typename Group, typename ValueType>
-__device__ __forceinline__ void single_rhs_compute_norm2(
-    Group subgroup, const int num_rows, const ValueType* x,
-    remove_complex<ValueType>& result)
-{
-    using real_type = typename gko::remove_complex<ValueType>;
-    real_type val = zero<real_type>();
-
-    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
-        val += squared_norm(x[r]);
-    }
-
-    // subgroup level reduction
-    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
-
-    if (subgroup.thread_rank() == 0) {
-        result = sqrt(val);
-    }
-}
-
-
-template <typename Group, typename ValueType>
-__device__ __forceinline__ void one_norm2(
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const int rhs_index,
-    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
-        result,
-    Group subgroup)
-{
-    using real_type = typename gko::remove_complex<ValueType>;
-    real_type val = zero<real_type>();
-
-    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
-        val += squared_norm(x.values[r * x.stride + rhs_index]);
-    }
-
-    // subgroup level reduction
-    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
-
-    if (subgroup.thread_rank() == 0) {
-        result.values[rhs_index] = sqrt(val);
-    }
-}
-
-
-/**
- * Computes the 2-norms of some column vectors in global or shared memory.
- *
- * @param x  A row-major multivector with nrhs columns.
- * @param result  Holds norm value for each vector in x.
- */
-template <typename ValueType>
-__device__ __forceinline__ void compute_norm2(
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
-        result)
-{
-    constexpr auto tile_size = config::warp_size;
-    auto thread_block = group::this_thread_block();
-    auto subgroup = group::tiled_partition<tile_size>(thread_block);
-    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
-    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
-
-    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
-         rhs_index += num_subgroups_per_block) {
-        one_norm2(x, rhs_index, result, subgroup);
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void compute_norm2_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
-    const gko::batch::multi_vector::uniform_batch<remove_complex<ValueType>>
-        result)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
-        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
-        compute_norm2(x_b, r_b);
-    }
-}
-
-
-template <typename ValueType>
-__device__ __forceinline__ void single_rhs_copy(const int num_rows,
-                                                const ValueType* in,
-                                                ValueType* out)
-{
-    for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) {
-        out[iz] = in[iz];
-    }
-}
-
-
-/**
- * Copies the values of one multi-vector into another.
- *
- * Note that the output multi-vector should already have memory allocated
- * and stride set.
- */
-template <typename ValueType>
-__device__ __forceinline__ void copy(
-    const gko::batch::multi_vector::batch_item<const ValueType>& in,
-    const gko::batch::multi_vector::batch_item<ValueType>& out)
-{
-    for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs;
-         iz += blockDim.x) {
-        const int i = iz / in.num_rhs;
-        const int j = iz % in.num_rhs;
-        out.values[i * out.stride + j] = in.values[i * in.stride + j];
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void copy_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> src,
-    const gko::batch::multi_vector::uniform_batch<ValueType> dst)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto dst_b = gko::batch::extract_batch_item(dst, batch_id);
-        const auto src_b = gko::batch::extract_batch_item(src, batch_id);
-        copy(src_b, dst_b);
-    }
-}
diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
index f71c8c40c3e..c2a53b2e518 100644
--- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
+++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
@@ -32,10 +32,14 @@ __device__ __forceinline__ void initialize(
     __syncthreads();
 
     if (threadIdx.x / config::warp_size == 0) {
-        single_rhs_compute_norm2(subgroup, num_rows, r_shared_entry, res_norm);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_norm2(subgroup, num_rows, r_shared_entry,
+                                     res_norm);
     } else if (threadIdx.x / config::warp_size == 1) {
         // Compute norms of rhs
-        single_rhs_compute_norm2(subgroup, num_rows, b_global_entry, rhs_norm);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_norm2(subgroup, num_rows, b_global_entry,
+                                     rhs_norm);
     }
     __syncthreads();
 
@@ -70,8 +74,9 @@ __device__ __forceinline__ void compute_alpha(
     const ValueType* const v_shared_entry, ValueType& alpha)
 {
     if (threadIdx.x / config::warp_size == 0) {
-        single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_shared_entry,
-                                    v_shared_entry, alpha);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_shared_entry,
+                                        v_shared_entry, alpha);
     }
     __syncthreads();
     if (threadIdx.x == 0) {
@@ -99,11 +104,13 @@ __device__ __forceinline__ void compute_omega(
     const ValueType* const s_shared_entry, ValueType& temp, ValueType& omega)
 {
     if (threadIdx.x / config::warp_size == 0) {
-        single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
-                                    s_shared_entry, omega);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
+                                        s_shared_entry, omega);
     } else if (threadIdx.x / config::warp_size == 1) {
-        single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
-                                    t_shared_entry, temp);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
+                                        t_shared_entry, temp);
     }
 
     __syncthreads();
@@ -271,8 +278,9 @@ __global__ void apply_kernel(
 
             // rho_new =  < r_hat , r > = (r_hat)' * (r)
             if (threadIdx.x / config::warp_size == 0) {
-                single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_sh, r_sh,
-                                            rho_new_sh[0]);
+                gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                    single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_sh,
+                                                r_sh, rho_new_sh[0]);
             }
             __syncthreads();
 
@@ -301,8 +309,9 @@ __global__ void apply_kernel(
 
             // an estimate of residual norms
             if (threadIdx.x / config::warp_size == 0) {
-                single_rhs_compute_norm2(subgroup, num_rows, s_sh,
-                                         norms_res_sh[0]);
+                gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                    single_rhs_compute_norm2(subgroup, num_rows, s_sh,
+                                             norms_res_sh[0]);
             }
             __syncthreads();
 
@@ -333,8 +342,9 @@ __global__ void apply_kernel(
             __syncthreads();
 
             if (threadIdx.x / config::warp_size == 0) {
-                single_rhs_compute_norm2(subgroup, num_rows, r_sh,
-                                         norms_res_sh[0]);
+                gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                    single_rhs_compute_norm2(subgroup, num_rows, r_sh,
+                                             norms_res_sh[0]);
             }
             //__syncthreads();
 
@@ -347,7 +357,8 @@ __global__ void apply_kernel(
         logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
         // copy x back to global memory
-        single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr);
         __syncthreads();
     }
 }
diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
index ffee501b58c..c95a6b1cf05 100644
--- a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
+++ b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
@@ -32,12 +32,14 @@ __device__ __forceinline__ void initialize(
 
     if (threadIdx.x / config::warp_size == 0) {
         // Compute norms of rhs
-        single_rhs_compute_norm2(subgroup, num_rows, b_global_entry,
-                                 rhs_norms_sh);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_norm2(subgroup, num_rows, b_global_entry,
+                                     rhs_norms_sh);
     } else if (threadIdx.x / config::warp_size == 1) {
         // rho_old = r' * z
-        single_rhs_compute_conj_dot(subgroup, num_rows, r_shared_entry,
-                                    z_shared_entry, rho_old_shared_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, r_shared_entry,
+                                        z_shared_entry, rho_old_shared_entry);
     }
 
     // p = z
@@ -69,8 +71,9 @@ __device__ __forceinline__ void update_x_and_r(
     ValueType* const x_shared_entry, ValueType* const r_shared_entry)
 {
     if (threadIdx.x / config::warp_size == 0) {
-        single_rhs_compute_conj_dot(subgroup, num_rows, p_shared_entry,
-                                    Ap_shared_entry, alpha_shared_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, p_shared_entry,
+                                        Ap_shared_entry, alpha_shared_entry);
     }
     __syncthreads();
 
@@ -202,8 +205,9 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
 
             if (threadIdx.x / config::warp_size == 0) {
                 // rho_new =  (r)' * (z)
-                single_rhs_compute_conj_dot(subgroup, num_rows, r_sh, z_sh,
-                                            rho_new_sh[0]);
+                gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                    single_rhs_compute_conj_dot(subgroup, num_rows, r_sh, z_sh,
+                                                rho_new_sh[0]);
             }
             __syncthreads();
 
@@ -222,7 +226,8 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
         logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
         // copy x back to global memory
-        single_rhs_copy(num_rows, x_sh, x_global_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_copy(num_rows, x_sh, x_global_entry);
         __syncthreads();
     }
 }
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 3c7fe50709c..4d3deb742fe 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -10,6 +10,7 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/thrust.hpp"
@@ -43,7 +44,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_bicgstab {
 
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index b681bd13ce3..21c3e3d43c4 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -10,6 +10,7 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
@@ -42,7 +43,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_cg {
 
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index ca49fa5eb9c..1c1be8b21f7 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -10,6 +10,7 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
@@ -42,7 +43,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_bicgstab {
 
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index 3a1642edfea..c860286c17c 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -10,6 +10,7 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
@@ -42,7 +43,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_cg {
 
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"

From d26f79308e70cdb0506e4506a68ce6ee2bd6454f Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 19 Aug 2024 11:32:50 +0200
Subject: [PATCH 3/6] [ref, omp] move kernels to headers

---
 omp/base/batch_multi_vector_kernels.cpp       | 30 +++++++-------
 omp/solver/batch_bicgstab_kernels.cpp         |  2 +-
 omp/solver/batch_cg_kernels.cpp               |  2 +-
 reference/base/batch_multi_vector_kernels.cpp | 33 +++++++--------
 ...hpp.inc => batch_multi_vector_kernels.hpp} | 20 +++++++++
 reference/solver/batch_bicgstab_kernels.cpp   |  2 +-
 .../solver/batch_bicgstab_kernels.hpp.inc     | 41 ++++++++++++-------
 reference/solver/batch_cg_kernels.cpp         |  2 +-
 reference/solver/batch_cg_kernels.hpp.inc     | 23 +++++++----
 9 files changed, 96 insertions(+), 59 deletions(-)
 rename reference/base/{batch_multi_vector_kernels.hpp.inc => batch_multi_vector_kernels.hpp} (90%)

diff --git a/omp/base/batch_multi_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp
index 395bf96cc7a..8a947107479 100644
--- a/omp/base/batch_multi_vector_kernels.cpp
+++ b/omp/base/batch_multi_vector_kernels.cpp
@@ -10,24 +10,18 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 
+#include "common/unified/base/kernel_launch.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
+#include "reference/base/batch_multi_vector_kernels.hpp"
 #include "reference/base/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace omp {
-/**
- * @brief The batch::MultiVector matrix format namespace.
- * @ref batch::MultiVector
- * @ingroup batch_multi_vector
- */
+namespace GKO_DEVICE_NAMESPACE {
 namespace batch_multi_vector {
 
 
-#include "reference/base/batch_multi_vector_kernels.hpp.inc"
-
-
 template <typename ValueType>
 void scale(std::shared_ptr<const DefaultExecutor> exec,
            const batch::MultiVector<ValueType>* const alpha,
@@ -39,7 +33,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
         const auto alpha_b = gko::batch::extract_batch_item(alpha_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
-        scale_kernel(alpha_b, x_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel(alpha_b, x_b);
     }
 }
 
@@ -61,7 +55,8 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = gko::batch::extract_batch_item(alpha_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
         const auto y_b = gko::batch::extract_batch_item(y_ub, batch);
-        add_scaled_kernel(alpha_b, x_b, y_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel(alpha_b,
+                                                                      x_b, y_b);
     }
 }
 
@@ -83,7 +78,8 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
         const auto res_b = gko::batch::extract_batch_item(res_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
         const auto y_b = gko::batch::extract_batch_item(y_ub, batch);
-        compute_dot_product_kernel(x_b, y_b, res_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_dot_product_kernel(
+            x_b, y_b, res_b);
     }
 }
 
@@ -105,7 +101,8 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
         const auto res_b = gko::batch::extract_batch_item(res_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
         const auto y_b = gko::batch::extract_batch_item(y_ub, batch);
-        compute_conj_dot_product_kernel(x_b, y_b, res_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            compute_conj_dot_product_kernel(x_b, y_b, res_b);
     }
 }
 
@@ -124,7 +121,8 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) {
         const auto res_b = gko::batch::extract_batch_item(res_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
-        compute_norm2_kernel(x_b, res_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel(x_b,
+                                                                         res_b);
     }
 }
 
@@ -143,7 +141,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
         const auto result_b = gko::batch::extract_batch_item(result_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
-        copy_kernel(x_b, result_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(x_b, result_b);
     }
 }
 
@@ -151,6 +149,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
 
 
 }  // namespace batch_multi_vector
-}  // namespace omp
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/omp/solver/batch_bicgstab_kernels.cpp b/omp/solver/batch_bicgstab_kernels.cpp
index 81df9c45e51..c245f284106 100644
--- a/omp/solver/batch_bicgstab_kernels.cpp
+++ b/omp/solver/batch_bicgstab_kernels.cpp
@@ -9,6 +9,7 @@
 #include <ginkgo/core/base/array.hpp>
 
 #include "core/solver/batch_dispatch.hpp"
+#include "reference/base/batch_multi_vector_kernels.hpp"
 
 
 namespace gko {
@@ -28,7 +29,6 @@ namespace {
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/base/batch_multi_vector_kernels.hpp.inc"
 #include "reference/matrix/batch_csr_kernels.hpp.inc"
 #include "reference/matrix/batch_dense_kernels.hpp.inc"
 #include "reference/matrix/batch_ell_kernels.hpp.inc"
diff --git a/omp/solver/batch_cg_kernels.cpp b/omp/solver/batch_cg_kernels.cpp
index 51c794ab597..55d6ee29321 100644
--- a/omp/solver/batch_cg_kernels.cpp
+++ b/omp/solver/batch_cg_kernels.cpp
@@ -9,6 +9,7 @@
 #include <ginkgo/core/base/array.hpp>
 
 #include "core/solver/batch_dispatch.hpp"
+#include "reference/base/batch_multi_vector_kernels.hpp"
 
 
 namespace gko {
@@ -28,7 +29,6 @@ namespace {
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/base/batch_multi_vector_kernels.hpp.inc"
 #include "reference/matrix/batch_csr_kernels.hpp.inc"
 #include "reference/matrix/batch_dense_kernels.hpp.inc"
 #include "reference/matrix/batch_ell_kernels.hpp.inc"
diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp
index b0d20a6b826..c05398226f0 100644
--- a/reference/base/batch_multi_vector_kernels.cpp
+++ b/reference/base/batch_multi_vector_kernels.cpp
@@ -10,24 +10,21 @@
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 
+
+#define GKO_DEVICE_NAMESPACE reference
+
+
 #include "core/base/batch_struct.hpp"
+#include "reference/base/batch_multi_vector_kernels.hpp"
 #include "reference/base/batch_struct.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace reference {
-/**
- * @brief The batch::MultiVector matrix format namespace.
- * @ref batch::MultiVector
- * @ingroup batch_multi_vector
- */
+namespace GKO_DEVICE_NAMESPACE {
 namespace batch_multi_vector {
 
 
-#include "reference/base/batch_multi_vector_kernels.hpp.inc"
-
-
 template <typename ValueType>
 void scale(std::shared_ptr<const DefaultExecutor> exec,
            const batch::MultiVector<ValueType>* alpha,
@@ -38,7 +35,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
-        scale_kernel(alpha_b, x_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel(alpha_b, x_b);
     }
 }
 
@@ -59,7 +56,8 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
         const auto y_b = batch::extract_batch_item(y_ub, batch);
-        add_scaled_kernel(alpha_b, x_b, y_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel(alpha_b,
+                                                                      x_b, y_b);
     }
 }
 
@@ -80,7 +78,8 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
         const auto res_b = batch::extract_batch_item(res_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
         const auto y_b = batch::extract_batch_item(y_ub, batch);
-        compute_dot_product_kernel(x_b, y_b, res_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_dot_product_kernel(
+            x_b, y_b, res_b);
     }
 }
 
@@ -101,7 +100,8 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
         const auto res_b = batch::extract_batch_item(res_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
         const auto y_b = batch::extract_batch_item(y_ub, batch);
-        compute_conj_dot_product_kernel(x_b, y_b, res_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            compute_conj_dot_product_kernel(x_b, y_b, res_b);
     }
 }
 
@@ -119,7 +119,8 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) {
         const auto res_b = batch::extract_batch_item(res_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
-        compute_norm2_kernel(x_b, res_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel(x_b,
+                                                                         res_b);
     }
 }
 
@@ -137,7 +138,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
         const auto result_b = batch::extract_batch_item(result_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
-        copy_kernel(x_b, result_b);
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(x_b, result_b);
     }
 }
 
@@ -145,6 +146,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
 
 
 }  // namespace batch_multi_vector
-}  // namespace reference
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/reference/base/batch_multi_vector_kernels.hpp.inc b/reference/base/batch_multi_vector_kernels.hpp
similarity index 90%
rename from reference/base/batch_multi_vector_kernels.hpp.inc
rename to reference/base/batch_multi_vector_kernels.hpp
index 24e59664b74..88f531f29cc 100644
--- a/reference/base/batch_multi_vector_kernels.hpp.inc
+++ b/reference/base/batch_multi_vector_kernels.hpp
@@ -2,6 +2,20 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+#include "reference/base/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType>
 inline void scale_kernel(
     const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
@@ -129,3 +143,9 @@ inline void copy_kernel(
         out.values[i * out.stride + j] = in.values[i * in.stride + j];
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/solver/batch_bicgstab_kernels.cpp b/reference/solver/batch_bicgstab_kernels.cpp
index 97de157fb90..e68caffa936 100644
--- a/reference/solver/batch_bicgstab_kernels.cpp
+++ b/reference/solver/batch_bicgstab_kernels.cpp
@@ -5,6 +5,7 @@
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
 #include "core/solver/batch_dispatch.hpp"
+#include "reference/base/batch_multi_vector_kernels.hpp"
 
 
 namespace gko {
@@ -26,7 +27,6 @@ namespace {
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/base/batch_multi_vector_kernels.hpp.inc"
 #include "reference/matrix/batch_csr_kernels.hpp.inc"
 #include "reference/matrix/batch_dense_kernels.hpp.inc"
 #include "reference/matrix/batch_ell_kernels.hpp.inc"
diff --git a/reference/solver/batch_bicgstab_kernels.hpp.inc b/reference/solver/batch_bicgstab_kernels.hpp.inc
index b61db3669ef..1f8537ab66d 100644
--- a/reference/solver/batch_bicgstab_kernels.hpp.inc
+++ b/reference/solver/batch_bicgstab_kernels.hpp.inc
@@ -25,17 +25,20 @@ inline void initialize(
     alpha_entry.values[0] = one<ValueType>();
 
     // Compute norms of rhs
-    compute_norm2_kernel<ValueType>(b_entry, rhs_norms_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_norm2_kernel<ValueType>(b_entry, rhs_norms_entry);
 
     // r = b
-    copy_kernel(b_entry, r_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
+        b_entry, r_entry);
 
     // r = b - A*x
     advanced_apply_kernel(static_cast<ValueType>(-1.0), A_entry,
                           gko::batch::to_const(x_entry),
                           static_cast<ValueType>(1.0), r_entry);
-    compute_norm2_kernel<ValueType>(gko::batch::to_const(r_entry),
-                                    res_norms_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_norm2_kernel<ValueType>(gko::batch::to_const(r_entry),
+                                        res_norms_entry);
 
     for (int r = 0; r < p_entry.num_rows; r++) {
         r_hat_entry.values[r * r_hat_entry.stride] =
@@ -75,7 +78,9 @@ inline void compute_alpha(
     const gko::batch::multi_vector::batch_item<const ValueType>& v_entry,
     const gko::batch::multi_vector::batch_item<ValueType>& alpha_entry)
 {
-    compute_dot_product_kernel<ValueType>(r_hat_entry, v_entry, alpha_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_dot_product_kernel<ValueType>(r_hat_entry, v_entry,
+                                              alpha_entry);
     alpha_entry.values[0] = rho_new_entry.values[0] / alpha_entry.values[0];
 }
 
@@ -102,8 +107,10 @@ inline void compute_omega(
     const gko::batch::multi_vector::batch_item<ValueType>& temp_entry,
     const gko::batch::multi_vector::batch_item<ValueType>& omega_entry)
 {
-    compute_dot_product_kernel<ValueType>(t_entry, s_entry, omega_entry);
-    compute_dot_product_kernel<ValueType>(t_entry, t_entry, temp_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_dot_product_kernel<ValueType>(t_entry, s_entry, omega_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_dot_product_kernel<ValueType>(t_entry, t_entry, temp_entry);
     omega_entry.values[0] /= temp_entry.values[0];
 }
 
@@ -246,9 +253,10 @@ inline void batch_entry_bicgstab_impl(
         }
 
         // rho_new =  < r_hat , r > = (r_hat)' * (r)
-        compute_dot_product_kernel<ValueType>(gko::batch::to_const(r_hat_entry),
-                                              gko::batch::to_const(r_entry),
-                                              rho_new_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            compute_dot_product_kernel<ValueType>(
+                gko::batch::to_const(r_hat_entry),
+                gko::batch::to_const(r_entry), rho_new_entry);
 
         // beta = (rho_new / rho_old)*(alpha / omega)
         // p = r + beta*(p - omega * v)
@@ -277,8 +285,9 @@ inline void batch_entry_bicgstab_impl(
                  gko::batch::to_const(v_entry), s_entry);
 
         // an estimate of residual norms
-        compute_norm2_kernel<ValueType>(gko::batch::to_const(s_entry),
-                                        res_norms_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            compute_norm2_kernel<ValueType>(gko::batch::to_const(s_entry),
+                                            res_norms_entry);
 
         if (stop.check_converged(res_norms_entry.values)) {
             // update x for the systems
@@ -310,11 +319,13 @@ inline void batch_entry_bicgstab_impl(
                        gko::batch::to_const(s_entry),
                        gko::batch::to_const(t_entry), x_entry, r_entry);
 
-        compute_norm2_kernel<ValueType>(gko::batch::to_const(r_entry),
-                                        res_norms_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            compute_norm2_kernel<ValueType>(gko::batch::to_const(r_entry),
+                                            res_norms_entry);
 
         // rho_old = rho_new
-        copy_kernel(gko::batch::to_const(rho_new_entry), rho_old_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
+            gko::batch::to_const(rho_new_entry), rho_old_entry);
     }
 
     logger.log_iteration(batch_item_id, iter, res_norms_entry.values[0]);
diff --git a/reference/solver/batch_cg_kernels.cpp b/reference/solver/batch_cg_kernels.cpp
index 290fbc3718b..785a7a868a2 100644
--- a/reference/solver/batch_cg_kernels.cpp
+++ b/reference/solver/batch_cg_kernels.cpp
@@ -5,6 +5,7 @@
 #include "core/solver/batch_cg_kernels.hpp"
 
 #include "core/solver/batch_dispatch.hpp"
+#include "reference/base/batch_multi_vector_kernels.hpp"
 
 
 namespace gko {
@@ -26,7 +27,6 @@ namespace {
 constexpr int max_num_rhs = 1;
 
 
-#include "reference/base/batch_multi_vector_kernels.hpp.inc"
 #include "reference/matrix/batch_csr_kernels.hpp.inc"
 #include "reference/matrix/batch_dense_kernels.hpp.inc"
 #include "reference/matrix/batch_ell_kernels.hpp.inc"
diff --git a/reference/solver/batch_cg_kernels.hpp.inc b/reference/solver/batch_cg_kernels.hpp.inc
index b3df5ba97fd..ca88940cd69 100644
--- a/reference/solver/batch_cg_kernels.hpp.inc
+++ b/reference/solver/batch_cg_kernels.hpp.inc
@@ -26,10 +26,12 @@ inline void initialize(
     }
 
     // Compute norms of rhs
-    compute_norm2_kernel<ValueType>(b_entry, rhs_norms_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_norm2_kernel<ValueType>(b_entry, rhs_norms_entry);
 
     // r = b
-    copy_kernel(b_entry, r_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
+        b_entry, r_entry);
 
     // r = b - A*x
     advanced_apply_kernel(static_cast<ValueType>(-1.0), A_entry,
@@ -46,7 +48,8 @@ inline void update_p(
     const gko::batch::multi_vector::batch_item<ValueType>& p_entry)
 {
     if (rho_old_entry.values[0] == zero<ValueType>()) {
-        copy_kernel(z_entry, p_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
+            z_entry, p_entry);
         return;
     }
     const ValueType beta = rho_new_entry.values[0] / rho_old_entry.values[0];
@@ -67,7 +70,9 @@ inline void update_x_and_r(
     const gko::batch::multi_vector::batch_item<ValueType>& x_entry,
     const gko::batch::multi_vector::batch_item<ValueType>& r_entry)
 {
-    compute_conj_dot_product_kernel<ValueType>(p_entry, Ap_entry, alpha_entry);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_conj_dot_product_kernel<ValueType>(p_entry, Ap_entry,
+                                                   alpha_entry);
 
     const ValueType temp = rho_old_entry.values[0] / alpha_entry.values[0];
     for (int row = 0; row < r_entry.num_rows; row++) {
@@ -154,9 +159,10 @@ inline void batch_entry_cg_impl(
         prec.apply(gko::batch::to_const(r_entry), z_entry);
 
         // rho_new =  < r , z > = (r)' * (z)
-        compute_conj_dot_product_kernel<ValueType>(
-            gko::batch::to_const(r_entry), gko::batch::to_const(z_entry),
-            rho_new_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            compute_conj_dot_product_kernel<ValueType>(
+                gko::batch::to_const(r_entry), gko::batch::to_const(z_entry),
+                rho_new_entry);
         ++iter;
         // use implicit residual norms
         res_norms_entry.values[0] = sqrt(abs(rho_new_entry.values[0]));
@@ -185,7 +191,8 @@ inline void batch_entry_cg_impl(
             gko::batch::to_const(Ap_entry), alpha_entry, x_entry, r_entry);
 
         // rho_old = rho_new
-        copy_kernel(gko::batch::to_const(rho_new_entry), rho_old_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
+            gko::batch::to_const(rho_new_entry), rho_old_entry);
     }
 
     logger.log_iteration(batch_item_id, iter, res_norms_entry.values[0]);

From d518c3010f69079efd8f1116a5f51c4baabb90dd Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 19 Aug 2024 15:14:26 +0200
Subject: [PATCH 4/6] [kernels] remove GKO_DEVICE_NAMESPACE

---
 .../base/batch_multi_vector_kernels.cpp       | 37 +++++++++----------
 omp/base/batch_multi_vector_kernels.cpp       | 16 +++-----
 reference/base/batch_multi_vector_kernels.cpp | 16 +++-----
 3 files changed, 30 insertions(+), 39 deletions(-)

diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.cpp b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
index 76565a83f80..17f65487464 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.cpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
@@ -37,19 +37,19 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto x_ub = get_batch_struct(x);
     if (alpha->get_common_size()[1] == 1) {
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<<
-            num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return 0; });
     } else if (alpha->get_common_size() == x->get_common_size()) {
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<<
-            num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
             alpha_ub, x_ub, [] __device__(int row, int col, int stride) {
                 return row * stride + col;
             });
     } else {
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<<
-            num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return col; });
     }
@@ -71,11 +71,11 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     if (alpha->get_common_size()[1] == 1) {
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel<<<
+        batch_single_kernels::add_scaled_kernel<<<
             num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return 0; });
     } else {
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel<<<
+        batch_single_kernels::add_scaled_kernel<<<
             num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return col; });
     }
@@ -96,10 +96,9 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    GKO_DEVICE_NAMESPACE::batch_single_kernels::
-        compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
-                                         exec->get_stream()>>>(
-            x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
+    batch_single_kernels::compute_gen_dot_product_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -117,10 +116,9 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    GKO_DEVICE_NAMESPACE::batch_single_kernels::
-        compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
-                                         exec->get_stream()>>>(
-            x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
+    batch_single_kernels::compute_gen_dot_product_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -136,8 +134,9 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_rhs = x->get_common_size()[1];
     const auto x_ub = get_batch_struct(x);
     const auto res_ub = get_batch_struct(result);
-    GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel<<<
-        num_blocks, default_block_size, 0, exec->get_stream()>>>(x_ub, res_ub);
+    batch_single_kernels::compute_norm2_kernel<<<num_blocks, default_block_size,
+                                                 0, exec->get_stream()>>>(
+        x_ub, res_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -152,7 +151,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_blocks = x->get_num_batch_items();
     const auto result_ub = get_batch_struct(result);
     const auto x_ub = get_batch_struct(x);
-    GKO_DEVICE_NAMESPACE::batch_single_kernels::
+    batch_single_kernels::
         copy_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
             x_ub, result_ub);
 }
diff --git a/omp/base/batch_multi_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp
index 8a947107479..f740e3c32f0 100644
--- a/omp/base/batch_multi_vector_kernels.cpp
+++ b/omp/base/batch_multi_vector_kernels.cpp
@@ -33,7 +33,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
         const auto alpha_b = gko::batch::extract_batch_item(alpha_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel(alpha_b, x_b);
+        batch_single_kernels::scale_kernel(alpha_b, x_b);
     }
 }
 
@@ -55,8 +55,7 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = gko::batch::extract_batch_item(alpha_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
         const auto y_b = gko::batch::extract_batch_item(y_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel(alpha_b,
-                                                                      x_b, y_b);
+        batch_single_kernels::add_scaled_kernel(alpha_b, x_b, y_b);
     }
 }
 
@@ -78,8 +77,7 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
         const auto res_b = gko::batch::extract_batch_item(res_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
         const auto y_b = gko::batch::extract_batch_item(y_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_dot_product_kernel(
-            x_b, y_b, res_b);
+        batch_single_kernels::compute_dot_product_kernel(x_b, y_b, res_b);
     }
 }
 
@@ -101,8 +99,7 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
         const auto res_b = gko::batch::extract_batch_item(res_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
         const auto y_b = gko::batch::extract_batch_item(y_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            compute_conj_dot_product_kernel(x_b, y_b, res_b);
+        batch_single_kernels::compute_conj_dot_product_kernel(x_b, y_b, res_b);
     }
 }
 
@@ -121,8 +118,7 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) {
         const auto res_b = gko::batch::extract_batch_item(res_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel(x_b,
-                                                                         res_b);
+        batch_single_kernels::compute_norm2_kernel(x_b, res_b);
     }
 }
 
@@ -141,7 +137,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
         const auto result_b = gko::batch::extract_batch_item(result_ub, batch);
         const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(x_b, result_b);
+        batch_single_kernels::copy_kernel(x_b, result_b);
     }
 }
 
diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp
index c05398226f0..f5e1c653054 100644
--- a/reference/base/batch_multi_vector_kernels.cpp
+++ b/reference/base/batch_multi_vector_kernels.cpp
@@ -35,7 +35,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel(alpha_b, x_b);
+        batch_single_kernels::scale_kernel(alpha_b, x_b);
     }
 }
 
@@ -56,8 +56,7 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
         const auto alpha_b = batch::extract_batch_item(alpha_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
         const auto y_b = batch::extract_batch_item(y_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel(alpha_b,
-                                                                      x_b, y_b);
+        batch_single_kernels::add_scaled_kernel(alpha_b, x_b, y_b);
     }
 }
 
@@ -78,8 +77,7 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
         const auto res_b = batch::extract_batch_item(res_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
         const auto y_b = batch::extract_batch_item(y_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_dot_product_kernel(
-            x_b, y_b, res_b);
+        batch_single_kernels::compute_dot_product_kernel(x_b, y_b, res_b);
     }
 }
 
@@ -100,8 +98,7 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
         const auto res_b = batch::extract_batch_item(res_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
         const auto y_b = batch::extract_batch_item(y_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::
-            compute_conj_dot_product_kernel(x_b, y_b, res_b);
+        batch_single_kernels::compute_conj_dot_product_kernel(x_b, y_b, res_b);
     }
 }
 
@@ -119,8 +116,7 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) {
         const auto res_b = batch::extract_batch_item(res_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel(x_b,
-                                                                         res_b);
+        batch_single_kernels::compute_norm2_kernel(x_b, res_b);
     }
 }
 
@@ -138,7 +134,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
         const auto result_b = batch::extract_batch_item(result_ub, batch);
         const auto x_b = batch::extract_batch_item(x_ub, batch);
-        GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(x_b, result_b);
+        batch_single_kernels::copy_kernel(x_b, result_b);
     }
 }
 

From 1661d66c8d9f72a6622b31dab38e74a638c60467 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 19 Aug 2024 15:31:01 +0200
Subject: [PATCH 5/6] [dpcpp] move to proper headers

---
 dpcpp/base/batch_multi_vector_kernels.dp.cpp  | 64 +++++++++----------
 ...hpp.inc => batch_multi_vector_kernels.hpp} | 29 +++++++++
 dpcpp/solver/batch_bicgstab_kernels.dp.cpp    |  2 +-
 dpcpp/solver/batch_bicgstab_kernels.hpp.inc   | 43 ++++++++-----
 dpcpp/solver/batch_cg_kernels.dp.cpp          |  2 +-
 dpcpp/solver/batch_cg_kernels.hpp.inc         | 25 +++++---
 6 files changed, 102 insertions(+), 63 deletions(-)
 rename dpcpp/base/{batch_multi_vector_kernels.hpp.inc => batch_multi_vector_kernels.hpp} (92%)

diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp
index 8f607725bc8..0d2662bdccd 100644
--- a/dpcpp/base/batch_multi_vector_kernels.dp.cpp
+++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp
@@ -15,6 +15,7 @@
 
 #include "core/base/batch_struct.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
+#include "dpcpp/base/batch_multi_vector_kernels.hpp"
 #include "dpcpp/base/batch_struct.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
@@ -29,17 +30,9 @@
 namespace gko {
 namespace kernels {
 namespace dpcpp {
-/**
- * @brief The MultiVector matrix format namespace.
- * @ref MultiVector
- * @ingroup batch_multi_vector
- */
 namespace batch_multi_vector {
 
 
-#include "dpcpp/base/batch_multi_vector_kernels.hpp.inc"
-
-
 template <typename ValueType>
 void scale(std::shared_ptr<const DefaultExecutor> exec,
            const batch::MultiVector<ValueType>* const alpha,
@@ -71,7 +64,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
                     const auto alpha_b =
                         batch::extract_batch_item(alpha_ub, group_id);
                     const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                    scale_kernel(
+                    batch_single_kernels::scale_kernel(
                         alpha_b, x_b, item_ct1,
                         [](int row, int col, int stride) { return 0; });
                 });
@@ -85,10 +78,11 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
                     const auto alpha_b =
                         batch::extract_batch_item(alpha_ub, group_id);
                     const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                    scale_kernel(alpha_b, x_b, item_ct1,
-                                 [](int row, int col, int stride) {
-                                     return row * stride + col;
-                                 });
+                    batch_single_kernels::scale_kernel(
+                        alpha_b, x_b, item_ct1,
+                        [](int row, int col, int stride) {
+                            return row * stride + col;
+                        });
                 });
         });
     } else {
@@ -100,7 +94,7 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
                     const auto alpha_b =
                         batch::extract_batch_item(alpha_ub, group_id);
                     const auto x_b = batch::extract_batch_item(x_ub, group_id);
-                    scale_kernel(
+                    batch_single_kernels::scale_kernel(
                         alpha_b, x_b, item_ct1,
                         [](int row, int col, int stride) { return col; });
                 });
@@ -144,8 +138,9 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
                         batch::extract_batch_item(alpha_ub, group_id);
                     const auto x_b = batch::extract_batch_item(x_ub, group_id);
                     const auto y_b = batch::extract_batch_item(y_ub, group_id);
-                    add_scaled_kernel(alpha_b, x_b, y_b, item_ct1,
-                                      [](auto col) { return 0; });
+                    batch_single_kernels::add_scaled_kernel(
+                        alpha_b, x_b, y_b, item_ct1,
+                        [](auto col) { return 0; });
                 });
         });
     } else {
@@ -158,8 +153,9 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
                         batch::extract_batch_item(alpha_ub, group_id);
                     const auto x_b = batch::extract_batch_item(x_ub, group_id);
                     const auto y_b = batch::extract_batch_item(y_ub, group_id);
-                    add_scaled_kernel(alpha_b, x_b, y_b, item_ct1,
-                                      [](auto col) { return col; });
+                    batch_single_kernels::add_scaled_kernel(
+                        alpha_b, x_b, y_b, item_ct1,
+                        [](auto col) { return col; });
                 });
         });
     }
@@ -206,7 +202,7 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
                             batch::extract_batch_item(y_ub, group_id);
                         const auto res_b =
                             batch::extract_batch_item(res_ub, group_id);
-                        single_rhs_compute_conj_dot_sg(
+                        batch_single_kernels::single_rhs_compute_conj_dot_sg(
                             x_b.num_rows, x_b.values, y_b.values,
                             res_b.values[0], item_ct1);
                     });
@@ -226,7 +222,7 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
                             batch::extract_batch_item(y_ub, group_id);
                         const auto res_b =
                             batch::extract_batch_item(res_ub, group_id);
-                        compute_gen_dot_product_kernel(
+                        batch_single_kernels::compute_gen_dot_product_kernel(
                             x_b, y_b, res_b, item_ct1,
                             [](auto val) { return val; });
                     });
@@ -272,7 +268,7 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
                     const auto y_b = batch::extract_batch_item(y_ub, group_id);
                     const auto res_b =
                         batch::extract_batch_item(res_ub, group_id);
-                    compute_gen_dot_product_kernel(
+                    batch_single_kernels::compute_gen_dot_product_kernel(
                         x_b, y_b, res_b, item_ct1,
                         [](auto val) { return conj(val); });
                 });
@@ -308,17 +304,16 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
         exec->get_queue()->submit([&](sycl::handler& cgh) {
             cgh.parallel_for(
                 sycl_nd_range(grid, block),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[sycl::reqd_sub_group_size(max_subgroup_size)]] {
-                        auto group = item_ct1.get_group();
-                        auto group_id = group.get_group_linear_id();
-                        const auto x_b =
-                            batch::extract_batch_item(x_ub, group_id);
-                        const auto res_b =
-                            batch::extract_batch_item(res_ub, group_id);
-                        single_rhs_compute_norm2_sg(x_b.num_rows, x_b.values,
-                                                    res_b.values[0], item_ct1);
-                    });
+                [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
+                    max_subgroup_size)]] {
+                    auto group = item_ct1.get_group();
+                    auto group_id = group.get_group_linear_id();
+                    const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                    const auto res_b =
+                        batch::extract_batch_item(res_ub, group_id);
+                    batch_single_kernels::single_rhs_compute_norm2_sg(
+                        x_b.num_rows, x_b.values, res_b.values[0], item_ct1);
+                });
         });
     } else {
         exec->get_queue()->submit([&](sycl::handler& cgh) {
@@ -332,7 +327,8 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
                             batch::extract_batch_item(x_ub, group_id);
                         const auto res_b =
                             batch::extract_batch_item(res_ub, group_id);
-                        compute_norm2_kernel(x_b, res_b, item_ct1);
+                        batch_single_kernels::compute_norm2_kernel(x_b, res_b,
+                                                                   item_ct1);
                     });
         });
     }
@@ -371,7 +367,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
                 const auto x_b = batch::extract_batch_item(x_ub, group_id);
                 const auto result_b =
                     batch::extract_batch_item(result_ub, group_id);
-                copy_kernel(x_b, result_b, item_ct1);
+                batch_single_kernels::copy_kernel(x_b, result_b, item_ct1);
             });
     });
 }
diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp.inc b/dpcpp/base/batch_multi_vector_kernels.hpp
similarity index 92%
rename from dpcpp/base/batch_multi_vector_kernels.hpp.inc
rename to dpcpp/base/batch_multi_vector_kernels.hpp
index c41eafd7efd..a16df237e34 100644
--- a/dpcpp/base/batch_multi_vector_kernels.hpp.inc
+++ b/dpcpp/base/batch_multi_vector_kernels.hpp
@@ -2,6 +2,29 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+
+#include <memory>
+
+#include <CL/sycl.hpp>
+
+#include "core/base/batch_struct.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType, typename Mapping>
 __dpct_inline__ void scale_kernel(
     const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
@@ -229,3 +252,9 @@ __dpct_inline__ void copy_kernel(
         out.values[i * out.stride + j] = in.values[i * in.stride + j];
     }
 }
+
+
+}  // namespace batch_single_kernels
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
index bb84283b49f..7dc8f3ec23b 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
+++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
@@ -13,6 +13,7 @@
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
+#include "dpcpp/base/batch_multi_vector_kernels.hpp"
 #include "dpcpp/base/batch_struct.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
@@ -36,7 +37,6 @@ namespace dpcpp {
 namespace batch_bicgstab {
 
 
-#include "dpcpp/base/batch_multi_vector_kernels.hpp.inc"
 #include "dpcpp/matrix/batch_csr_kernels.hpp.inc"
 #include "dpcpp/matrix/batch_dense_kernels.hpp.inc"
 #include "dpcpp/matrix/batch_ell_kernels.hpp.inc"
diff --git a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc b/dpcpp/solver/batch_bicgstab_kernels.hpp.inc
index ad7eaeff556..f5a88e9d59d 100644
--- a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc
+++ b/dpcpp/solver/batch_bicgstab_kernels.hpp.inc
@@ -39,11 +39,13 @@ __dpct_inline__ void initialize(
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
     if (sg_id == 0) {
-        single_rhs_compute_norm2_sg(num_rows, r_shared_entry, res_norm,
-                                    item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_norm2_sg(num_rows, r_shared_entry, res_norm,
+                                        item_ct1);
     } else if (sg_id == 1) {
-        single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norm,
-                                    item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norm,
+                                        item_ct1);
     }
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
@@ -86,8 +88,9 @@ __dpct_inline__ void compute_alpha(const int num_rows, const ValueType& rho_new,
     const auto sg_id = sg.get_group_id();
     const auto tid = item_ct1.get_local_linear_id();
     if (sg_id == 0) {
-        single_rhs_compute_conj_dot_sg(num_rows, r_hat_shared_entry,
-                                       v_shared_entry, alpha, item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot_sg(num_rows, r_hat_shared_entry,
+                                           v_shared_entry, alpha, item_ct1);
     }
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
     if (tid == 0) {
@@ -123,11 +126,13 @@ __dpct_inline__ void compute_omega(const int num_rows,
     const auto sg_id = sg.get_group_id();
     const auto tid = item_ct1.get_local_linear_id();
     if (sg_id == 0) {
-        single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry, s_shared_entry,
-                                       omega, item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry,
+                                           s_shared_entry, omega, item_ct1);
     } else if (sg_id == 1) {
-        single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry, t_shared_entry,
-                                       temp, item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry,
+                                           t_shared_entry, temp, item_ct1);
     }
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
     if (tid == 0) {
@@ -308,8 +313,9 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
 
         // rho_new =  < r_hat , r > = (r_hat)' * (r)
         if (sg_id == 0) {
-            single_rhs_compute_conj_dot_sg(num_rows, r_hat_sh, r_sh,
-                                           rho_new_sh[0], item_ct1);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                single_rhs_compute_conj_dot_sg(num_rows, r_hat_sh, r_sh,
+                                               rho_new_sh[0], item_ct1);
         }
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
@@ -338,8 +344,9 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
 
         // an estimate of residual norms
         if (sg_id == 0) {
-            single_rhs_compute_norm2_sg(num_rows, s_sh, norms_res_sh[0],
-                                        item_ct1);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                single_rhs_compute_norm2_sg(num_rows, s_sh, norms_res_sh[0],
+                                            item_ct1);
         }
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
@@ -368,8 +375,9 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
         if (sg_id == 0)
-            single_rhs_compute_norm2_sg(num_rows, r_sh, norms_res_sh[0],
-                                        item_ct1);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                single_rhs_compute_norm2_sg(num_rows, r_sh, norms_res_sh[0],
+                                            item_ct1);
         if (tid == group_size - 1) {
             rho_old_sh[0] = rho_new_sh[0];
         }
@@ -379,6 +387,7 @@ void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
     logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
     // copy x back to global memory
-    copy_kernel(num_rows, x_sh, x_global_entry, item_ct1);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
+        num_rows, x_sh, x_global_entry, item_ct1);
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 }
diff --git a/dpcpp/solver/batch_cg_kernels.dp.cpp b/dpcpp/solver/batch_cg_kernels.dp.cpp
index 61591f9efb6..f25d8266803 100644
--- a/dpcpp/solver/batch_cg_kernels.dp.cpp
+++ b/dpcpp/solver/batch_cg_kernels.dp.cpp
@@ -13,6 +13,7 @@
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
+#include "dpcpp/base/batch_multi_vector_kernels.hpp"
 #include "dpcpp/base/batch_struct.hpp"
 #include "dpcpp/base/config.hpp"
 #include "dpcpp/base/dim3.dp.hpp"
@@ -36,7 +37,6 @@ namespace dpcpp {
 namespace batch_cg {
 
 
-#include "dpcpp/base/batch_multi_vector_kernels.hpp.inc"
 #include "dpcpp/matrix/batch_csr_kernels.hpp.inc"
 #include "dpcpp/matrix/batch_dense_kernels.hpp.inc"
 #include "dpcpp/matrix/batch_ell_kernels.hpp.inc"
diff --git a/dpcpp/solver/batch_cg_kernels.hpp.inc b/dpcpp/solver/batch_cg_kernels.hpp.inc
index cef6e620b64..7a91bcb2bbf 100644
--- a/dpcpp/solver/batch_cg_kernels.hpp.inc
+++ b/dpcpp/solver/batch_cg_kernels.hpp.inc
@@ -40,11 +40,13 @@ __dpct_inline__ void initialize(
     // Compute norms of rhs
     // and rho_old = r' * z
     if (sg_id == 0) {
-        single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norms,
-                                    item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norms,
+                                        item_ct1);
     } else if (sg_id == 1) {
-        single_rhs_compute_conj_dot_sg(num_rows, r_shared_entry, z_shared_entry,
-                                       rho_old, item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot_sg(num_rows, r_shared_entry,
+                                           z_shared_entry, rho_old, item_ct1);
     }
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
@@ -80,9 +82,10 @@ __dpct_inline__ void update_x_and_r(
     auto sg = item_ct1.get_sub_group();
     const auto tid = item_ct1.get_local_linear_id();
     if (sg.get_group_id() == 0) {
-        single_rhs_compute_conj_dot_sg(num_rows, p_shared_entry,
-                                       Ap_shared_entry, alpha_shared_entry,
-                                       item_ct1);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot_sg(num_rows, p_shared_entry,
+                                           Ap_shared_entry, alpha_shared_entry,
+                                           item_ct1);
     }
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
     if (tid == 0) {
@@ -221,8 +224,9 @@ __dpct_inline__ void apply_kernel(
 
         //  rho_new =  (r)' * (z)
         if (sg_id == 0) {
-            single_rhs_compute_conj_dot_sg(num_rows, r_sh, z_sh, rho_new_sh[0],
-                                           item_ct1);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                single_rhs_compute_conj_dot_sg(num_rows, r_sh, z_sh,
+                                               rho_new_sh[0], item_ct1);
         }
         item_ct1.barrier(sycl::access::fence_space::global_and_local);
 
@@ -239,6 +243,7 @@ __dpct_inline__ void apply_kernel(
     logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
     // copy x back to global memory
-    copy_kernel(num_rows, x_sh, x_global_entry, item_ct1);
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::copy_kernel(
+        num_rows, x_sh, x_global_entry, item_ct1);
     item_ct1.barrier(sycl::access::fence_space::global_and_local);
 }

From fd24ac584338e5f4a42eeb727c07796be74cc2e6 Mon Sep 17 00:00:00 2001
From: ginkgo-bot <ginkgo.library@gmail.com>
Date: Tue, 20 Aug 2024 12:04:44 +0000
Subject: [PATCH 6/6] [format] Format files

Co-authored-by: Pratik Nayak <pratikvn@pm.me>
---
 dpcpp/base/batch_multi_vector_kernels.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp b/dpcpp/base/batch_multi_vector_kernels.hpp
index a16df237e34..bbcc540ae60 100644
--- a/dpcpp/base/batch_multi_vector_kernels.hpp
+++ b/dpcpp/base/batch_multi_vector_kernels.hpp
@@ -2,7 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-
 #include <memory>
 
 #include <CL/sycl.hpp>