diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.cpp b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
index 17f65487464..76565a83f80 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.cpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
@@ -37,19 +37,19 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto x_ub = get_batch_struct(x);
     if (alpha->get_common_size()[1] == 1) {
-        batch_single_kernels::scale_kernel<<<
-            num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return 0; });
     } else if (alpha->get_common_size() == x->get_common_size()) {
-        batch_single_kernels::scale_kernel<<<
-            num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, [] __device__(int row, int col, int stride) {
                 return row * stride + col;
             });
     } else {
-        batch_single_kernels::scale_kernel<<<
-            num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return col; });
     }
@@ -71,11 +71,11 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     if (alpha->get_common_size()[1] == 1) {
-        batch_single_kernels::add_scaled_kernel<<<
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel<<<
             num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return 0; });
     } else {
-        batch_single_kernels::add_scaled_kernel<<<
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel<<<
             num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return col; });
     }
@@ -96,9 +96,10 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    batch_single_kernels::compute_gen_dot_product_kernel<<<
-        num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
+    GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
+                                         exec->get_stream()>>>(
+            x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -116,9 +117,10 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    batch_single_kernels::compute_gen_dot_product_kernel<<<
-        num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
+    GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
+                                         exec->get_stream()>>>(
+            x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -134,9 +136,8 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_rhs = x->get_common_size()[1];
     const auto x_ub = get_batch_struct(x);
     const auto res_ub = get_batch_struct(result);
-    batch_single_kernels::compute_norm2_kernel<<<
-        num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        x_ub, res_ub);
+    GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(x_ub, res_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -151,7 +152,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_blocks = x->get_num_batch_items();
     const auto result_ub = get_batch_struct(result);
     const auto x_ub = get_batch_struct(x);
-    batch_single_kernels::
+    GKO_DEVICE_NAMESPACE::batch_single_kernels::
         copy_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
             x_ub, result_ub);
 }
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
index 36aa69d7d99..bb3aac67b55 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
@@ -34,7 +34,6 @@
 namespace gko {
 namespace kernels {
 namespace GKO_DEVICE_NAMESPACE {
-namespace batch_multi_vector {
 namespace batch_single_kernels {
 
 
@@ -320,7 +319,6 @@ __global__ __launch_bounds__(default_block_size) void copy_kernel(
 
 
 }  // namespace batch_single_kernels
-}  // namespace batch_multi_vector
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
deleted file mode 100644
index 7af3c84303f..00000000000
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
+++ /dev/null
@@ -1,280 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-template <typename ValueType, typename Mapping>
-__device__ __forceinline__ void scale(
-    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
-    const gko::batch::multi_vector::batch_item<ValueType>& x, Mapping map)
-{
-    const int max_li = x.num_rows * x.num_rhs;
-    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
-        const int row = li / x.num_rhs;
-        const int col = li % x.num_rhs;
-
-        x.values[row * x.stride + col] =
-            alpha.values[map(row, col, alpha.stride)] *
-            x.values[row * x.stride + col];
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__global__ __launch_bounds__(default_block_size) void scale_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
-    const gko::batch::multi_vector::uniform_batch<ValueType> x, Mapping map)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
-        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
-        scale(alpha_b, x_b, map);
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__device__ __forceinline__ void add_scaled(
-    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const gko::batch::multi_vector::batch_item<ValueType>& y, Mapping map)
-{
-    const int max_li = x.num_rows * x.num_rhs;
-    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
-        const int row = li / x.num_rhs;
-        const int col = li % x.num_rhs;
-
-        y.values[row * y.stride + col] +=
-            alpha.values[map(col)] * x.values[row * x.stride + col];
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__global__ __launch_bounds__(default_block_size) void add_scaled_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
-    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
-    const gko::batch::multi_vector::uniform_batch<ValueType> y, Mapping map)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
-        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
-        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
-        add_scaled(alpha_b, x_b, y_b, map);
-    }
-}
-
-
-template <typename Group, typename ValueType>
-__device__ __forceinline__ void single_rhs_compute_conj_dot(Group subgroup,
-                                                            const int num_rows,
-                                                            const ValueType* x,
-                                                            const ValueType* y,
-                                                            ValueType& result)
-
-{
-    ValueType val = zero<ValueType>();
-    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
-        val += conj(x[r]) * y[r];
-    }
-
-    // subgroup level reduction
-    val = reduce(subgroup, val, thrust::plus<ValueType>{});
-
-    if (subgroup.thread_rank() == 0) {
-        result = val;
-    }
-}
-
-
-template <typename ValueType, typename Mapping, typename Group>
-__device__ __forceinline__ void gen_one_dot(
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const gko::batch::multi_vector::batch_item<const ValueType>& y,
-    const int rhs_index,
-    const gko::batch::multi_vector::batch_item<ValueType>& result,
-    Group subgroup, Mapping conj_map)
-{
-    ValueType val = zero<ValueType>();
-
-    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
-        val += conj_map(x.values[r * x.stride + rhs_index]) *
-               y.values[r * y.stride + rhs_index];
-    }
-
-    // subgroup level reduction
-    val = reduce(subgroup, val, thrust::plus<ValueType>{});
-
-    if (subgroup.thread_rank() == 0) {
-        result.values[rhs_index] = val;
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__device__ __forceinline__ void compute_gen_dot_product(
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const gko::batch::multi_vector::batch_item<const ValueType>& y,
-    const gko::batch::multi_vector::batch_item<ValueType>& result,
-    Mapping conj_map)
-{
-    constexpr auto tile_size = config::warp_size;
-    auto thread_block = group::this_thread_block();
-    auto subgroup = group::tiled_partition<tile_size>(thread_block);
-    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
-    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
-
-    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
-         rhs_index += num_subgroups_per_block) {
-        gen_one_dot(x, y, rhs_index, result, subgroup, conj_map);
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__global__
-__launch_bounds__(default_block_size) void compute_gen_dot_product_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
-    const gko::batch::multi_vector::uniform_batch<const ValueType> y,
-    const gko::batch::multi_vector::uniform_batch<ValueType> result,
-    Mapping map)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
-        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
-        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
-        compute_gen_dot_product(x_b, y_b, r_b, map);
-    }
-}
-
-
-template <typename Group, typename ValueType>
-__device__ __forceinline__ void single_rhs_compute_norm2(
-    Group subgroup, const int num_rows, const ValueType* x,
-    remove_complex<ValueType>& result)
-{
-    using real_type = typename gko::remove_complex<ValueType>;
-    real_type val = zero<real_type>();
-
-    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
-        val += squared_norm(x[r]);
-    }
-
-    // subgroup level reduction
-    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
-
-    if (subgroup.thread_rank() == 0) {
-        result = sqrt(val);
-    }
-}
-
-
-template <typename ValueType, typename Group>
-__device__ __forceinline__ void one_norm2(
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const int rhs_index,
-    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
-        result,
-    Group subgroup)
-{
-    using real_type = typename gko::remove_complex<ValueType>;
-    real_type val = zero<real_type>();
-
-    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
-        val += squared_norm(x.values[r * x.stride + rhs_index]);
-    }
-
-    // subgroup level reduction
-    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
-
-    if (subgroup.thread_rank() == 0) {
-        result.values[rhs_index] = sqrt(val);
-    }
-}
-
-
-/**
- * Computes the 2-norms of some column vectors in global or shared memory.
- *
- * @param x A row-major multivector with nrhs columns.
- * @param result Holds norm value for each vector in x.
- */
-template <typename ValueType>
-__device__ __forceinline__ void compute_norm2(
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
-        result)
-{
-    constexpr auto tile_size = config::warp_size;
-    auto thread_block = group::this_thread_block();
-    auto subgroup = group::tiled_partition<tile_size>(thread_block);
-    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
-    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
-
-    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
-         rhs_index += num_subgroups_per_block) {
-        one_norm2(x, rhs_index, result, subgroup);
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void compute_norm2_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
-    const gko::batch::multi_vector::uniform_batch<remove_complex<ValueType>>
-        result)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
-        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
-        compute_norm2(x_b, r_b);
-    }
-}
-
-
-template <typename ValueType>
-__device__ __forceinline__ void single_rhs_copy(const int num_rows,
-                                                const ValueType* in,
-                                                ValueType* out)
-{
-    for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) {
-        out[iz] = in[iz];
-    }
-}
-
-
-/**
- * Copies the values of one multi-vector into another.
- *
- * Note that the output multi-vector should already have memory allocated
- * and stride set.
- */
-template <typename ValueType>
-__device__ __forceinline__ void copy(
-    const gko::batch::multi_vector::batch_item<const ValueType>& in,
-    const gko::batch::multi_vector::batch_item<ValueType>& out)
-{
-    for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs;
-         iz += blockDim.x) {
-        const int i = iz / in.num_rhs;
-        const int j = iz % in.num_rhs;
-        out.values[i * out.stride + j] = in.values[i * in.stride + j];
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void copy_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> src,
-    const gko::batch::multi_vector::uniform_batch<ValueType> dst)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto dst_b = gko::batch::extract_batch_item(dst, batch_id);
-        const auto src_b = gko::batch::extract_batch_item(src, batch_id);
-        copy(src_b, dst_b);
-    }
-}
diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
index f71c8c40c3e..c2a53b2e518 100644
--- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
+++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
@@ -32,10 +32,14 @@ __device__ __forceinline__ void initialize(
     __syncthreads();
 
     if (threadIdx.x / config::warp_size == 0) {
-        single_rhs_compute_norm2(subgroup, num_rows, r_shared_entry, res_norm);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_norm2(subgroup, num_rows, r_shared_entry,
+                                     res_norm);
     } else if (threadIdx.x / config::warp_size == 1) {
         // Compute norms of rhs
-        single_rhs_compute_norm2(subgroup, num_rows, b_global_entry, rhs_norm);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_norm2(subgroup, num_rows, b_global_entry,
+                                     rhs_norm);
     }
     __syncthreads();
 
@@ -70,8 +74,9 @@ __device__ __forceinline__ void compute_alpha(
     const ValueType* const v_shared_entry, ValueType& alpha)
 {
     if (threadIdx.x / config::warp_size == 0) {
-        single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_shared_entry,
-                                    v_shared_entry, alpha);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_shared_entry,
+                                        v_shared_entry, alpha);
     }
     __syncthreads();
     if (threadIdx.x == 0) {
@@ -99,11 +104,13 @@ __device__ __forceinline__ void compute_omega(
     const ValueType* const s_shared_entry, ValueType& temp, ValueType& omega)
 {
     if (threadIdx.x / config::warp_size == 0) {
-        single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
-                                    s_shared_entry, omega);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
+                                        s_shared_entry, omega);
     } else if (threadIdx.x / config::warp_size == 1) {
-        single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
-                                    t_shared_entry, temp);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
+                                        t_shared_entry, temp);
     }
     __syncthreads();
 
@@ -271,8 +278,9 @@ __global__ void apply_kernel(
 
         // rho_new = < r_hat , r > = (r_hat)' * (r)
         if (threadIdx.x / config::warp_size == 0) {
-            single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_sh, r_sh,
-                                        rho_new_sh[0]);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_sh,
+                                            r_sh, rho_new_sh[0]);
         }
         __syncthreads();
 
@@ -301,8 +309,9 @@ __global__ void apply_kernel(
 
         // an estimate of residual norms
        if (threadIdx.x / config::warp_size == 0) {
-            single_rhs_compute_norm2(subgroup, num_rows, s_sh,
-                                     norms_res_sh[0]);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                single_rhs_compute_norm2(subgroup, num_rows, s_sh,
+                                         norms_res_sh[0]);
         }
         __syncthreads();
 
@@ -333,8 +342,9 @@ __global__ void apply_kernel(
         __syncthreads();
 
         if (threadIdx.x / config::warp_size == 0) {
-            single_rhs_compute_norm2(subgroup, num_rows, r_sh,
-                                     norms_res_sh[0]);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                single_rhs_compute_norm2(subgroup, num_rows, r_sh,
+                                         norms_res_sh[0]);
         }
 
         //__syncthreads();
@@ -347,7 +357,8 @@ __global__ void apply_kernel(
         logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
         // copy x back to global memory
-        single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr);
         __syncthreads();
     }
 }
diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
index ffee501b58c..c95a6b1cf05 100644
--- a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
+++ b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
@@ -32,12 +32,14 @@ __device__ __forceinline__ void initialize(
 
     if (threadIdx.x / config::warp_size == 0) {
         // Compute norms of rhs
-        single_rhs_compute_norm2(subgroup, num_rows, b_global_entry,
-                                 rhs_norms_sh);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_norm2(subgroup, num_rows, b_global_entry,
+                                     rhs_norms_sh);
     } else if (threadIdx.x / config::warp_size == 1) {
         // rho_old = r' * z
-        single_rhs_compute_conj_dot(subgroup, num_rows, r_shared_entry,
-                                    z_shared_entry, rho_old_shared_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, r_shared_entry,
+                                        z_shared_entry, rho_old_shared_entry);
     }
 
     // p = z
@@ -69,8 +71,9 @@ __device__ __forceinline__ void update_x_and_r(
     ValueType* const x_shared_entry, ValueType* const r_shared_entry)
 {
     if (threadIdx.x / config::warp_size == 0) {
-        single_rhs_compute_conj_dot(subgroup, num_rows, p_shared_entry,
-                                    Ap_shared_entry, alpha_shared_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, p_shared_entry,
+                                        Ap_shared_entry, alpha_shared_entry);
     }
     __syncthreads();
 
@@ -202,8 +205,9 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
 
         if (threadIdx.x / config::warp_size == 0) {
             // rho_new = (r)' * (z)
-            single_rhs_compute_conj_dot(subgroup, num_rows, r_sh, z_sh,
-                                        rho_new_sh[0]);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                single_rhs_compute_conj_dot(subgroup, num_rows, r_sh, z_sh,
+                                            rho_new_sh[0]);
         }
         __syncthreads();
 
@@ -222,7 +226,8 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
         logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
         // copy x back to global memory
-        single_rhs_copy(num_rows, x_sh, x_global_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_copy(num_rows, x_sh, x_global_entry);
         __syncthreads();
     }
 }
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 3c7fe50709c..4d3deb742fe 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -10,6 +10,7 @@
 #include
 #include
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/thrust.hpp"
@@ -43,7 +44,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_bicgstab {
 
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index b681bd13ce3..21c3e3d43c4 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -10,6 +10,7 @@
 #include
 #include
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
@@ -42,7 +43,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_cg {
 
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index ca49fa5eb9c..1c1be8b21f7 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
@@ -42,7 +43,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_bicgstab {
 
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index 3a1642edfea..c860286c17c 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
@@ -42,7 +43,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_cg {
 
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
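
Note on the call pattern this patch introduces (illustrative, not part of the diff): the device helpers now live in common/cuda_hip/base/batch_multi_vector_kernels.hpp inside gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels, so callers include that header and spell out the full namespace instead of relying on a .hpp.inc file being textually #include'd into the enclosing solver namespace. Below is a minimal sketch of such a caller; only the header path, the namespace, and the single_rhs_compute_norm2 call shape are taken from the patch, while the kernel name, launch setup, and variable names are hypothetical.

// Illustrative sketch only -- not part of this patch. Assumes the usual
// Ginkgo device infrastructure (config::warp_size, group::this_thread_block).
#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"

template <typename ValueType>
__global__ void example_norm_kernel(const int num_rows, const ValueType* vec,
                                    gko::remove_complex<ValueType>* norm)
{
    // One warp-sized subgroup computes the norm, mirroring the solver kernels.
    auto subgroup = group::tiled_partition<config::warp_size>(
        group::this_thread_block());
    if (threadIdx.x / config::warp_size == 0) {
        // After this patch, the helper is referenced with its full namespace:
        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
            single_rhs_compute_norm2(subgroup, num_rows, vec, *norm);
    }
}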