diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.cpp b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
index 17f65487464..76565a83f80 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.cpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
@@ -37,19 +37,19 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto x_ub = get_batch_struct(x);
     if (alpha->get_common_size()[1] == 1) {
-        batch_single_kernels::scale_kernel<<<
-            num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return 0; });
     } else if (alpha->get_common_size() == x->get_common_size()) {
-        batch_single_kernels::scale_kernel<<<
-            num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, [] __device__(int row, int col, int stride) {
                 return row * stride + col;
             });
     } else {
-        batch_single_kernels::scale_kernel<<<
-            num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::scale_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return col; });
     }
@@ -71,11 +71,11 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     if (alpha->get_common_size()[1] == 1) {
-        batch_single_kernels::add_scaled_kernel<<<
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel<<<
             num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return 0; });
     } else {
-        batch_single_kernels::add_scaled_kernel<<<
+        GKO_DEVICE_NAMESPACE::batch_single_kernels::add_scaled_kernel<<<
             num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return col; });
     }
@@ -96,9 +96,10 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    batch_single_kernels::compute_gen_dot_product_kernel<<<
-        num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
+    GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
+                                         exec->get_stream()>>>(
+            x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -116,9 +117,10 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    batch_single_kernels::compute_gen_dot_product_kernel<<<
-        num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
+    GKO_DEVICE_NAMESPACE::batch_single_kernels::
+        compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
+                                         exec->get_stream()>>>(
+            x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -134,9 +136,8 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_rhs = x->get_common_size()[1];
     const auto x_ub = get_batch_struct(x);
     const auto res_ub = get_batch_struct(result);
-    batch_single_kernels::compute_norm2_kernel<<<
-        num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        x_ub, res_ub);
+    GKO_DEVICE_NAMESPACE::batch_single_kernels::compute_norm2_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(x_ub, res_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -151,7 +152,7 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_blocks = x->get_num_batch_items();
     const auto result_ub = get_batch_struct(result);
     const auto x_ub = get_batch_struct(x);
-    batch_single_kernels::
+    GKO_DEVICE_NAMESPACE::batch_single_kernels::
         copy_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
             x_ub, result_ub);
 }
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
index 36aa69d7d99..bb3aac67b55 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
@@ -34,7 +34,6 @@
 namespace gko {
 namespace kernels {
 namespace GKO_DEVICE_NAMESPACE {
-namespace batch_multi_vector {
 namespace batch_single_kernels {
 
 
@@ -320,7 +319,6 @@ __global__ __launch_bounds__(default_block_size) void copy_kernel(
 
 
 }  // namespace batch_single_kernels
-}  // namespace batch_multi_vector
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
deleted file mode 100644
index 7af3c84303f..00000000000
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
+++ /dev/null
@@ -1,280 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-template <typename ValueType, typename Mapping>
-__device__ __forceinline__ void scale(
-    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
-    const gko::batch::multi_vector::batch_item<ValueType>& x, Mapping map)
-{
-    const int max_li = x.num_rows * x.num_rhs;
-    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
-        const int row = li / x.num_rhs;
-        const int col = li % x.num_rhs;
-
-        x.values[row * x.stride + col] =
-            alpha.values[map(row, col, alpha.stride)] *
-            x.values[row * x.stride + col];
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__global__ __launch_bounds__(default_block_size) void scale_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
-    const gko::batch::multi_vector::uniform_batch<ValueType> x, Mapping map)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
-        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
-        scale(alpha_b, x_b, map);
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__device__ __forceinline__ void add_scaled(
-    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const gko::batch::multi_vector::batch_item<ValueType>& y, Mapping map)
-{
-    const int max_li = x.num_rows * x.num_rhs;
-    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
-        const int row = li / x.num_rhs;
-        const int col = li % x.num_rhs;
-
-        y.values[row * y.stride + col] +=
-            alpha.values[map(col)] * x.values[row * x.stride + col];
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__global__ __launch_bounds__(default_block_size) void add_scaled_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
-    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
-    const gko::batch::multi_vector::uniform_batch<ValueType> y, Mapping map)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
-        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
-        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
-        add_scaled(alpha_b, x_b, y_b, map);
-    }
-}
-
-
-template <typename Group, typename ValueType>
-__device__ __forceinline__ void single_rhs_compute_conj_dot(Group subgroup,
-                                                            const int num_rows,
-                                                            const ValueType* x,
-                                                            const ValueType* y,
-                                                            ValueType& result)
-
-{
-    ValueType val = zero<ValueType>();
-    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
-        val += conj(x[r]) * y[r];
-    }
-
-    // subgroup level reduction
-    val = reduce(subgroup, val, thrust::plus<ValueType>{});
-
-    if (subgroup.thread_rank() == 0) {
-        result = val;
-    }
-}
-
-
-template <typename ValueType, typename Mapping, typename Group>
-__device__ __forceinline__ void gen_one_dot(
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const gko::batch::multi_vector::batch_item<const ValueType>& y,
-    const int rhs_index,
-    const gko::batch::multi_vector::batch_item<ValueType>& result,
-    Group subgroup, Mapping conj_map)
-{
-    ValueType val = zero<ValueType>();
-
-    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
-        val += conj_map(x.values[r * x.stride + rhs_index]) *
-               y.values[r * y.stride + rhs_index];
-    }
-
-    // subgroup level reduction
-    val = reduce(subgroup, val, thrust::plus<ValueType>{});
-
-    if (subgroup.thread_rank() == 0) {
-        result.values[rhs_index] = val;
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__device__ __forceinline__ void compute_gen_dot_product(
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const gko::batch::multi_vector::batch_item<const ValueType>& y,
-    const gko::batch::multi_vector::batch_item<ValueType>& result,
-    Mapping conj_map)
-{
-    constexpr auto tile_size = config::warp_size;
-    auto thread_block = group::this_thread_block();
-    auto subgroup = group::tiled_partition<tile_size>(thread_block);
-    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
-    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
-
-    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
-         rhs_index += num_subgroups_per_block) {
-        gen_one_dot(x, y, rhs_index, result, subgroup, conj_map);
-    }
-}
-
-
-template <typename ValueType, typename Mapping>
-__global__
-__launch_bounds__(default_block_size) void compute_gen_dot_product_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
-    const gko::batch::multi_vector::uniform_batch<const ValueType> y,
-    const gko::batch::multi_vector::uniform_batch<ValueType> result,
-    Mapping map)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
-        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
-        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
-        compute_gen_dot_product(x_b, y_b, r_b, map);
-    }
-}
-
-
-template <typename Group, typename ValueType>
-__device__ __forceinline__ void single_rhs_compute_norm2(
-    Group subgroup, const int num_rows, const ValueType* x,
-    remove_complex<ValueType>& result)
-{
-    using real_type = typename gko::remove_complex<ValueType>;
-    real_type val = zero<real_type>();
-
-    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
-        val += squared_norm(x[r]);
-    }
-
-    // subgroup level reduction
-    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
-
-    if (subgroup.thread_rank() == 0) {
-        result = sqrt(val);
-    }
-}
-
-
-template <typename ValueType, typename Group>
-__device__ __forceinline__ void one_norm2(
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const int rhs_index,
-    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
-        result,
-    Group subgroup)
-{
-    using real_type = typename gko::remove_complex<ValueType>;
-    real_type val = zero<real_type>();
-
-    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
-        val += squared_norm(x.values[r * x.stride + rhs_index]);
-    }
-
-    // subgroup level reduction
-    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
-
-    if (subgroup.thread_rank() == 0) {
-        result.values[rhs_index] = sqrt(val);
-    }
-}
-
-
-/**
- * Computes the 2-norms of some column vectors in global or shared memory.
- *
- * @param x A row-major multivector with nrhs columns.
- * @param result Holds norm value for each vector in x.
- */
-template <typename ValueType>
-__device__ __forceinline__ void compute_norm2(
-    const gko::batch::multi_vector::batch_item<const ValueType>& x,
-    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
-        result)
-{
-    constexpr auto tile_size = config::warp_size;
-    auto thread_block = group::this_thread_block();
-    auto subgroup = group::tiled_partition<tile_size>(thread_block);
-    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
-    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
-
-    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
-         rhs_index += num_subgroups_per_block) {
-        one_norm2(x, rhs_index, result, subgroup);
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void compute_norm2_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
-    const gko::batch::multi_vector::uniform_batch<remove_complex<ValueType>>
-        result)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
-        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
-        compute_norm2(x_b, r_b);
-    }
-}
-
-
-template <typename ValueType>
-__device__ __forceinline__ void single_rhs_copy(const int num_rows,
-                                                const ValueType* in,
-                                                ValueType* out)
-{
-    for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) {
-        out[iz] = in[iz];
-    }
-}
-
-
-/**
- * Copies the values of one multi-vector into another.
- *
- * Note that the output multi-vector should already have memory allocated
- * and stride set.
- */
-template <typename ValueType>
-__device__ __forceinline__ void copy(
-    const gko::batch::multi_vector::batch_item<const ValueType>& in,
-    const gko::batch::multi_vector::batch_item<ValueType>& out)
-{
-    for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs;
-         iz += blockDim.x) {
-        const int i = iz / in.num_rhs;
-        const int j = iz % in.num_rhs;
-        out.values[i * out.stride + j] = in.values[i * in.stride + j];
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void copy_kernel(
-    const gko::batch::multi_vector::uniform_batch<const ValueType> src,
-    const gko::batch::multi_vector::uniform_batch<ValueType> dst)
-{
-    for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_items;
-         batch_id += gridDim.x) {
-        const auto dst_b = gko::batch::extract_batch_item(dst, batch_id);
-        const auto src_b = gko::batch::extract_batch_item(src, batch_id);
-        copy(src_b, dst_b);
-    }
-}
diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
index f71c8c40c3e..c2a53b2e518 100644
--- a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
+++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
@@ -32,10 +32,14 @@ __device__ __forceinline__ void initialize(
     __syncthreads();
 
     if (threadIdx.x / config::warp_size == 0) {
-        single_rhs_compute_norm2(subgroup, num_rows, r_shared_entry, res_norm);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_norm2(subgroup, num_rows, r_shared_entry,
+                                     res_norm);
     } else if (threadIdx.x / config::warp_size == 1) {
         // Compute norms of rhs
-        single_rhs_compute_norm2(subgroup, num_rows, b_global_entry, rhs_norm);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_norm2(subgroup, num_rows, b_global_entry,
+                                     rhs_norm);
     }
     __syncthreads();
 
@@ -70,8 +74,9 @@ __device__ __forceinline__ void compute_alpha(
     const ValueType* const v_shared_entry, ValueType& alpha)
 {
     if (threadIdx.x / config::warp_size == 0) {
-        single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_shared_entry,
-                                    v_shared_entry, alpha);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_shared_entry,
+                                        v_shared_entry, alpha);
     }
     __syncthreads();
     if (threadIdx.x == 0) {
@@ -99,11 +104,13 @@ __device__ __forceinline__ void compute_omega(
     const ValueType* const s_shared_entry, ValueType& temp, ValueType& omega)
 {
     if (threadIdx.x / config::warp_size == 0) {
-        single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
-                                    s_shared_entry, omega);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
+                                        s_shared_entry, omega);
     } else if (threadIdx.x / config::warp_size == 1) {
-        single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
-                                    t_shared_entry, temp);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
+                                        t_shared_entry, temp);
     }
     __syncthreads();
 
@@ -271,8 +278,9 @@ __global__ void apply_kernel(
 
         // rho_new = < r_hat , r > = (r_hat)' * (r)
         if (threadIdx.x / config::warp_size == 0) {
-            single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_sh, r_sh,
-                                        rho_new_sh[0]);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_sh,
+                                            r_sh, rho_new_sh[0]);
         }
         __syncthreads();
 
@@ -301,8 +309,9 @@ __global__ void apply_kernel(
 
         // an estimate of residual norms
        if (threadIdx.x / config::warp_size == 0) {
-            single_rhs_compute_norm2(subgroup, num_rows, s_sh,
-                                     norms_res_sh[0]);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                single_rhs_compute_norm2(subgroup, num_rows, s_sh,
+                                         norms_res_sh[0]);
         }
         __syncthreads();
 
@@ -333,8 +342,9 @@ __global__ void apply_kernel(
         __syncthreads();
 
         if (threadIdx.x / config::warp_size == 0) {
-            single_rhs_compute_norm2(subgroup, num_rows, r_sh,
-                                     norms_res_sh[0]);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                single_rhs_compute_norm2(subgroup, num_rows, r_sh,
+                                         norms_res_sh[0]);
         }
 
         //__syncthreads();
@@ -347,7 +357,8 @@ __global__ void apply_kernel(
         logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
         // copy x back to global memory
-        single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr);
         __syncthreads();
     }
 }
diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
index ffee501b58c..c95a6b1cf05 100644
--- a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
+++ b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
@@ -32,12 +32,14 @@ __device__ __forceinline__ void initialize(
 
     if (threadIdx.x / config::warp_size == 0) {
         // Compute norms of rhs
-        single_rhs_compute_norm2(subgroup, num_rows, b_global_entry,
-                                 rhs_norms_sh);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_norm2(subgroup, num_rows, b_global_entry,
+                                     rhs_norms_sh);
     } else if (threadIdx.x / config::warp_size == 1) {
         // rho_old = r' * z
-        single_rhs_compute_conj_dot(subgroup, num_rows, r_shared_entry,
-                                    z_shared_entry, rho_old_shared_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, r_shared_entry,
+                                        z_shared_entry, rho_old_shared_entry);
     }
 
     // p = z
@@ -69,8 +71,9 @@ __device__ __forceinline__ void update_x_and_r(
     ValueType* const x_shared_entry, ValueType* const r_shared_entry)
 {
     if (threadIdx.x / config::warp_size == 0) {
-        single_rhs_compute_conj_dot(subgroup, num_rows, p_shared_entry,
-                                    Ap_shared_entry, alpha_shared_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_compute_conj_dot(subgroup, num_rows, p_shared_entry,
+                                        Ap_shared_entry, alpha_shared_entry);
     }
     __syncthreads();
 
@@ -202,8 +205,9 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
 
         if (threadIdx.x / config::warp_size == 0) {
             // rho_new = (r)' * (z)
-            single_rhs_compute_conj_dot(subgroup, num_rows, r_sh, z_sh,
-                                        rho_new_sh[0]);
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+                single_rhs_compute_conj_dot(subgroup, num_rows, r_sh, z_sh,
+                                            rho_new_sh[0]);
         }
         __syncthreads();
 
@@ -222,7 +226,8 @@ __global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
         logger.log_iteration(batch_id, iter, norms_res_sh[0]);
 
         // copy x back to global memory
-        single_rhs_copy(num_rows, x_sh, x_global_entry);
+        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
+            single_rhs_copy(num_rows, x_sh, x_global_entry);
         __syncthreads();
     }
 }
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 3c7fe50709c..4d3deb742fe 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -10,6 +10,7 @@
 #include
 #include
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/thrust.hpp"
@@ -43,7 +44,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_bicgstab {
 
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index b681bd13ce3..21c3e3d43c4 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -10,6 +10,7 @@
 #include
 #include
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
@@ -42,7 +43,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_cg {
 
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index ca49fa5eb9c..1c1be8b21f7 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
@@ -42,7 +43,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_bicgstab {
 
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index 3a1642edfea..c860286c17c 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
 #include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
@@ -42,7 +43,6 @@ constexpr int sm_oversubscription = 4;
 namespace batch_cg {
 
 
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
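
Note on the call pattern this patch introduces (illustrative, not part of the diff): the device helpers now live in common/cuda_hip/base/batch_multi_vector_kernels.hpp inside gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels, so callers include that header and spell out the full namespace instead of relying on a .hpp.inc file being textually #include'd into the enclosing solver namespace. Below is a minimal sketch of such a caller; only the header path, the namespace, and the single_rhs_compute_norm2 call shape are taken from the patch, while the kernel name, launch setup, and variable names are hypothetical.

// Illustrative sketch only -- not part of this patch. Assumes the usual
// Ginkgo device infrastructure (config::warp_size, group::this_thread_block).
#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"

template <typename ValueType>
__global__ void example_norm_kernel(const int num_rows, const ValueType* vec,
                                    gko::remove_complex<ValueType>* norm)
{
    // One warp-sized subgroup computes the norm, mirroring the solver kernels.
    auto subgroup = group::tiled_partition<config::warp_size>(
        group::this_thread_block());
    if (threadIdx.x / config::warp_size == 0) {
        // After this patch, the helper is referenced with its full namespace:
        gko::kernels::GKO_DEVICE_NAMESPACE::batch_single_kernels::
            single_rhs_compute_norm2(subgroup, num_rows, vec, *norm);
    }
}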