diff --git a/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp.inc b/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp.inc
index 5cda893ec4c..be619cfad48 100644
--- a/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp.inc
+++ b/common/cuda_hip/preconditioner/batch_scalar_jacobi.hpp.inc
@@ -17,7 +17,7 @@ public:
     __host__ __device__ static constexpr int dynamic_work_size(
         const int num_rows, int)
     {
-        return num_rows;
+        return num_rows * sizeof(value_type);
     }
 
     /**
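Note on the preconditioner interface change repeated throughout this patch (here and in the dpcpp/reference variants below): dynamic_work_size() now reports its requirement in bytes instead of rows, so the solver-side storage accounting no longer multiplies by sizeof(ValueType). A minimal standalone sketch of the resulting arithmetic, with illustrative names only (not Ginkgo code):

    // Sketch only: shows why a byte-valued work size simplifies the
    // shared-memory budget on the solver side. Names are hypothetical.
    #include <cstdio>

    struct ScalarJacobiLike {
        // Work size reported in bytes, as in this patch.
        static constexpr int dynamic_work_size(int num_rows, int /*num_nz*/)
        {
            return num_rows * static_cast<int>(sizeof(double));
        }
    };

    int main()
    {
        const int num_rows = 96;
        const int num_main_vecs = 5;  // r, z, p, Ap, x for CG
        const int vec_bytes = num_rows * static_cast<int>(sizeof(double));
        const int prec_bytes = ScalarJacobiLike::dynamic_work_size(num_rows, 0);
        // The caller can add vector and preconditioner requirements directly,
        // without multiplying by sizeof(ValueType) a second time.
        std::printf("shared memory needed: %d bytes\n",
                    num_main_vecs * vec_bytes + prec_bytes);
    }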
diff --git a/common/cuda_hip/solver/batch_cg_kernels.hpp.inc b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
new file mode 100644
index 00000000000..ffee501b58c
--- /dev/null
+++ b/common/cuda_hip/solver/batch_cg_kernels.hpp.inc
@@ -0,0 +1,228 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+template <typename Group, typename BatchMatrixType_entry, typename PrecType,
+          typename ValueType>
+__device__ __forceinline__ void initialize(
+    Group subgroup, const int num_rows, const BatchMatrixType_entry& mat_entry,
+    const ValueType* const b_global_entry,
+    const ValueType* const x_global_entry, ValueType* const x_shared_entry,
+    ValueType* const r_shared_entry, const PrecType& prec_shared,
+    ValueType* const z_shared_entry, ValueType& rho_old_shared_entry,
+    ValueType* const p_shared_entry,
+    typename gko::remove_complex<ValueType>& rhs_norms_sh)
+{
+    // copy x from global to shared memory
+    // r = b
+    for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) {
+        x_shared_entry[iz] = x_global_entry[iz];
+        r_shared_entry[iz] = b_global_entry[iz];
+    }
+    __syncthreads();
+
+    // r = b - A*x
+    advanced_apply(static_cast<ValueType>(-1.0), mat_entry, x_shared_entry,
+                   static_cast<ValueType>(1.0), r_shared_entry);
+    __syncthreads();
+
+    // z = precond * r
+    prec_shared.apply(num_rows, r_shared_entry, z_shared_entry);
+    __syncthreads();
+
+    if (threadIdx.x / config::warp_size == 0) {
+        // Compute norms of rhs
+        single_rhs_compute_norm2(subgroup, num_rows, b_global_entry,
+                                 rhs_norms_sh);
+    } else if (threadIdx.x / config::warp_size == 1) {
+        // rho_old = r' * z
+        single_rhs_compute_conj_dot(subgroup, num_rows, r_shared_entry,
+                                    z_shared_entry, rho_old_shared_entry);
+    }
+
+    // p = z
+    for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) {
+        p_shared_entry[iz] = z_shared_entry[iz];
+    }
+}
+
+
+template <typename ValueType>
+__device__ __forceinline__ void update_p(const int num_rows,
+                                         const ValueType& rho_new_shared_entry,
+                                         const ValueType& rho_old_shared_entry,
+                                         const ValueType* const z_shared_entry,
+                                         ValueType* const p_shared_entry)
+{
+    for (int li = threadIdx.x; li < num_rows; li += blockDim.x) {
+        const ValueType beta = rho_new_shared_entry / rho_old_shared_entry;
+        p_shared_entry[li] = z_shared_entry[li] + beta * p_shared_entry[li];
+    }
+}
+
+
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void update_x_and_r(
+    Group subgroup, const int num_rows, const ValueType& rho_old_shared_entry,
+    const ValueType* const p_shared_entry,
+    const ValueType* const Ap_shared_entry, ValueType& alpha_shared_entry,
+    ValueType* const x_shared_entry, ValueType* const r_shared_entry)
+{
+    if (threadIdx.x / config::warp_size == 0) {
+        single_rhs_compute_conj_dot(subgroup, num_rows, p_shared_entry,
+                                    Ap_shared_entry, alpha_shared_entry);
+    }
+    __syncthreads();
+
+    for (int li = threadIdx.x; li < num_rows; li += blockDim.x) {
+        const ValueType alpha = rho_old_shared_entry / alpha_shared_entry;
+        x_shared_entry[li] += alpha * p_shared_entry[li];
+        r_shared_entry[li] -= alpha * Ap_shared_entry[li];
+    }
+}
+
+
+template <typename StopType, const int n_shared, const bool prec_shared_bool,
+          typename PrecType, typename LogType, typename BatchMatrixType,
+          typename ValueType>
+__global__ void apply_kernel(const gko::kernels::batch_cg::storage_config sconf,
+                             const int max_iter,
+                             const gko::remove_complex<ValueType> tol,
+                             LogType logger, PrecType prec_shared,
+                             const BatchMatrixType mat,
+                             const ValueType* const __restrict__ b,
+                             ValueType* const __restrict__ x,
+                             ValueType* const __restrict__ workspace = nullptr)
+{
+    using real_type = typename gko::remove_complex<ValueType>;
+    const auto num_batch_items = mat.num_batch_items;
+    const auto num_rows = mat.num_rows;
+
+    constexpr auto tile_size = config::warp_size;
+    auto thread_block = group::this_thread_block();
+    auto subgroup = group::tiled_partition<tile_size>(thread_block);
+
+    for (size_type batch_id = blockIdx.x; batch_id < num_batch_items;
+         batch_id += gridDim.x) {
+        const int gmem_offset =
+            batch_id * sconf.gmem_stride_bytes / sizeof(ValueType);
+        extern __shared__ char local_mem_sh[];
+
+        ValueType* r_sh;
+        ValueType* z_sh;
+        ValueType* p_sh;
+        ValueType* Ap_sh;
+        ValueType* x_sh;
+        ValueType* prec_work_sh;
+
+        if (n_shared >= 1) {
+            r_sh = reinterpret_cast<ValueType*>(local_mem_sh);
+        } else {
+            r_sh = workspace + gmem_offset;
+        }
+        if (n_shared == 1) {
+            z_sh = workspace + gmem_offset;
+        } else {
+            z_sh = r_sh + sconf.padded_vec_len;
+        }
+        if (n_shared == 2) {
+            p_sh = workspace + gmem_offset;
+        } else {
+            p_sh = z_sh + sconf.padded_vec_len;
+        }
+        if (n_shared == 3) {
+            Ap_sh = workspace + gmem_offset;
+        } else {
+            Ap_sh = p_sh + sconf.padded_vec_len;
+        }
+        if (n_shared == 4) {
+            x_sh = workspace + gmem_offset;
+        } else {
+            x_sh = Ap_sh + sconf.padded_vec_len;
+        }
+        if (!prec_shared_bool && n_shared == 5) {
+            prec_work_sh = workspace + gmem_offset;
+        } else {
+            prec_work_sh = x_sh + sconf.padded_vec_len;
+        }
+
+        __shared__ uninitialized_array<ValueType, 1> rho_old_sh;
+        __shared__ uninitialized_array<ValueType, 1> rho_new_sh;
+        __shared__ uninitialized_array<ValueType, 1> alpha_sh;
+        __shared__ real_type norms_rhs_sh[1];
+        __shared__ real_type norms_res_sh[1];
+
+        const auto mat_entry =
+            gko::batch::matrix::extract_batch_item(mat, batch_id);
+        const ValueType* const b_global_entry =
+            gko::batch::multi_vector::batch_item_ptr(b, 1, num_rows, batch_id);
+        ValueType* const x_global_entry =
+            gko::batch::multi_vector::batch_item_ptr(x, 1, num_rows, batch_id);
+
+        // generate preconditioner
+        prec_shared.generate(batch_id, mat_entry, prec_work_sh);
+
+        // initialization
+        // compute b norms
+        // r = b - A*x
+        // z = precond*r
+        // rho_old = r' * z (' is for hermitian transpose)
+        // p = z
+        initialize(subgroup, num_rows, mat_entry, b_global_entry,
+                   x_global_entry, x_sh, r_sh, prec_shared, z_sh, rho_old_sh[0],
+                   p_sh, norms_rhs_sh[0]);
+        __syncthreads();
+
+        // stopping criterion object
+        StopType stop(tol, norms_rhs_sh);
+
+        int iter = 0;
+        for (; iter < max_iter; iter++) {
+            norms_res_sh[0] = sqrt(abs(rho_old_sh[0]));
+            __syncthreads();
+            if (stop.check_converged(norms_res_sh)) {
+                logger.log_iteration(batch_id, iter, norms_res_sh[0]);
+                break;
+            }
+
+            // Ap = A * p
+            simple_apply(mat_entry, p_sh, Ap_sh);
+            __syncthreads();
+
+            // alpha = rho_old / (p' * Ap)
+            // x = x + alpha * p
+            // r = r - alpha * Ap
+            update_x_and_r(subgroup, num_rows, rho_old_sh[0], p_sh, Ap_sh,
+                           alpha_sh[0], x_sh, r_sh);
+            __syncthreads();
+
+            // z = precond * r
+            prec_shared.apply(num_rows, r_sh, z_sh);
+            __syncthreads();
+
+            if (threadIdx.x / config::warp_size == 0) {
+                // rho_new = (r)' * (z)
+                single_rhs_compute_conj_dot(subgroup, num_rows, r_sh, z_sh,
+                                            rho_new_sh[0]);
+            }
+            __syncthreads();
+
+            // beta = rho_new / rho_old
+            // p = z + beta * p
+            update_p(num_rows, rho_new_sh[0], rho_old_sh[0], z_sh, p_sh);
+            __syncthreads();
+
+            // rho_old = rho_new
+            if (threadIdx.x == 0) {
+                rho_old_sh[0] = rho_new_sh[0];
+            }
+            __syncthreads();
+        }
+
+        logger.log_iteration(batch_id, iter, norms_res_sh[0]);
+
+        // copy x back to global memory
+        single_rhs_copy(num_rows, x_sh, x_global_entry);
+        __syncthreads();
+    }
+}
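The kernel above keeps the first n_shared work vectors in dynamic shared memory and falls back to a per-batch-item slice of a global workspace for the rest. A reduced, self-contained CUDA sketch of that allocation pattern, using two vectors and illustrative names (not the Ginkgo kernel):

    // Sketch: "first n_shared vectors live in shared memory, the rest use a
    // global workspace", reduced to two vectors r and z.
    #include <cstdio>

    template <int n_shared>
    __global__ void two_vec_kernel(int n, const double* b, double* workspace)
    {
        extern __shared__ char local_mem[];
        // Each block (batch item) owns a contiguous 2*n slice of workspace.
        double* r = (n_shared >= 1) ? reinterpret_cast<double*>(local_mem)
                                    : workspace + blockIdx.x * 2 * n;
        double* z = (n_shared >= 2) ? reinterpret_cast<double*>(local_mem) + n
                                    : workspace + blockIdx.x * 2 * n + n;
        for (int i = threadIdx.x; i < n; i += blockDim.x) {
            r[i] = b[i];
            z[i] = 2.0 * r[i];
        }
    }

    int main()
    {
        const int n = 128;
        double *b, *work;
        cudaMalloc(&b, n * sizeof(double));
        cudaMalloc(&work, 2 * n * sizeof(double));
        cudaMemset(b, 0, n * sizeof(double));
        // Both vectors in shared memory -> dynamic size = 2 * n * sizeof(double).
        two_vec_kernel<2><<<1, 64, 2 * n * sizeof(double)>>>(n, b, work);
        cudaDeviceSynchronize();
        std::printf("done: %s\n", cudaGetErrorString(cudaGetLastError()));
        cudaFree(b);
        cudaFree(work);
    }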
diff --git a/core/solver/batch_bicgstab_kernels.hpp b/core/solver/batch_bicgstab_kernels.hpp
index 0741637f3d4..43f55f1356d 100644
--- a/core/solver/batch_bicgstab_kernels.hpp
+++ b/core/solver/batch_bicgstab_kernels.hpp
@@ -88,8 +88,7 @@ void set_gmem_stride_bytes(storage_config& sconf,
         gmem_stride += prec_storage_bytes;
     }
     // align global memory chunks
-    sconf.gmem_stride_bytes =
-        gmem_stride > 0 ? ceildiv(gmem_stride, align_bytes) * align_bytes : 0;
+    sconf.gmem_stride_bytes = ceildiv(gmem_stride, align_bytes) * align_bytes;
 }
 
 
@@ -134,8 +133,7 @@ storage_config compute_shared_storage(const int available_shared_mem,
     using real_type = remove_complex<ValueType>;
     const int vec_size = num_rows * num_rhs * sizeof(ValueType);
     const int num_main_vecs = 9;
-    const int prec_storage =
-        Prectype::dynamic_work_size(num_rows, num_nz) * sizeof(ValueType);
+    const int prec_storage = Prectype::dynamic_work_size(num_rows, num_nz);
     int rem_shared = available_shared_mem;
     // Set default values. Initially all vecs are in global memory.
     // {prec_shared, n_shared, n_global, gmem_stride_bytes, padded_vec_len}
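The removed `gmem_stride > 0 ?` guard appears redundant, since ceildiv(0, align_bytes) * align_bytes already evaluates to 0. A small standalone sketch of the alignment arithmetic shared by set_gmem_stride_bytes and the padded_num_rows computation in the CUDA/HIP callers further down (illustrative values, not Ginkgo code):

    // Sketch: round row counts and per-item strides up to alignment multiples.
    #include <cstdio>

    constexpr int ceildiv(int num, int den) { return (num + den - 1) / den; }

    int main()
    {
        const int num_rows = 93;
        const int align_multiple = 8;  // rows per padding unit (assumed)
        const int padded_num_rows =
            ceildiv(num_rows, align_multiple) * align_multiple;

        // Five CG work vectors per batch item, aligned to align_bytes.
        const int gmem_stride =
            static_cast<int>(5 * padded_num_rows * sizeof(double));
        const int align_bytes = 32;  // assumed alignment granularity
        const int gmem_stride_bytes =
            ceildiv(gmem_stride, align_bytes) * align_bytes;

        std::printf("padded rows: %d, per-item stride: %d bytes\n",
                    padded_num_rows, gmem_stride_bytes);
    }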
diff --git a/core/solver/batch_cg_kernels.hpp b/core/solver/batch_cg_kernels.hpp
index 5ff78524f11..d2c64460be2 100644
--- a/core/solver/batch_cg_kernels.hpp
+++ b/core/solver/batch_cg_kernels.hpp
@@ -121,8 +121,7 @@ storage_config compute_shared_storage(const int available_shared_mem,
     using real_type = remove_complex<ValueType>;
     const int vec_bytes = num_rows * num_rhs * sizeof(ValueType);
     const int num_main_vecs = 5;
-    const int prec_storage =
-        Prectype::dynamic_work_size(num_rows, num_nz) * sizeof(ValueType);
+    const int prec_storage = Prectype::dynamic_work_size(num_rows, num_nz);
    int rem_shared = available_shared_mem;
     // Set default values. Initially all vecs are in global memory.
     // {prec_shared, n_shared, n_global, gmem_stride_bytes, padded_vec_len}
@@ -160,13 +159,13 @@ storage_config compute_shared_storage(const int available_shared_mem,
 }  // namespace batch_cg
 
 
-#define GKO_DECLARE_BATCH_CG_APPLY_KERNEL(_type)                              \
-    void apply(                                                               \
-        std::shared_ptr<const DefaultExecutor> exec,                          \
-        const gko::kernels::batch_cg::settings<remove_complex<_type>>&        \
-            options,                                                          \
-        const batch::BatchLinOp* a, const batch::BatchLinOp* preconditioner,  \
-        const batch::MultiVector<_type>* b, batch::MultiVector<_type>* x,     \
+#define GKO_DECLARE_BATCH_CG_APPLY_KERNEL(_type)                               \
+    void apply(                                                                \
+        std::shared_ptr<const DefaultExecutor> exec,                           \
+        const gko::kernels::batch_cg::settings<remove_complex<_type>>&         \
+            options,                                                           \
+        const batch::BatchLinOp* mat, const batch::BatchLinOp* preconditioner, \
+        const batch::MultiVector<_type>* b, batch::MultiVector<_type>* x,      \
         gko::batch::log::detail::log_data<remove_complex<_type>>& logdata)
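The CUDA and HIP kernel callers below turn the runtime sconf.n_shared value into a compile-time template parameter through a switch over the supported counts. A toy, self-contained sketch of that dispatch pattern (illustrative names, not Ginkgo code):

    // Sketch: runtime-to-compile-time dispatch of the shared-vector count.
    #include <cstdio>

    template <int n_shared>
    void launch(int num_rows)
    {
        std::printf("launching with %d vectors in shared memory, %d rows\n",
                    n_shared, num_rows);
    }

    void dispatch(int n_shared, int num_rows)
    {
        switch (n_shared) {
        case 0: launch<0>(num_rows); break;
        case 1: launch<1>(num_rows); break;
        case 2: launch<2>(num_rows); break;
        default: launch<3>(num_rows); break;
        }
    }

    int main() { dispatch(2, 128); }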
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index 57c3612df69..541f3f6b936 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -51,12 +51,173 @@ namespace batch_cg {
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+#include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc"
+
+
+template <typename StopType, typename PrecType, typename LogType,
+          typename BatchMatrixType, typename ValueType>
+int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
+                              const int num_rows)
+{
+    int num_warps = std::max(num_rows / 4, 2);
+    constexpr int warp_sz = static_cast<int>(config::warp_size);
+    const int min_block_size = 2 * warp_sz;
+    const int device_max_threads =
+        ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz;
+    cudaFuncAttributes funcattr;
+    cudaFuncGetAttributes(&funcattr,
+                          apply_kernel<StopType, 5, true, PrecType, LogType,
+                                       BatchMatrixType, ValueType>);
+    const int num_regs_used = funcattr.numRegs;
+    int max_regs_blk = 0;
+    cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
+                           exec->get_device_id());
+    const int max_threads_regs =
+        ((max_regs_blk / static_cast<int>(num_regs_used)) / warp_sz) * warp_sz;
+    int max_threads = std::min(max_threads_regs, device_max_threads);
+    max_threads = max_threads <= 1024 ? max_threads : 1024;
+    return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size);
+}
+
+
+template <typename StopType, typename PrecType, typename LogType,
+          typename BatchMatrixType, typename ValueType>
+int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
+{
+    int shmem_per_sm = 0;
+    cudaDeviceGetAttribute(&shmem_per_sm,
+                           cudaDevAttrMaxSharedMemoryPerMultiprocessor,
+                           exec->get_device_id());
+    GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute(
+        apply_kernel<StopType, 5, true, PrecType, LogType, BatchMatrixType,
+                     ValueType>,
+        cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/));
+    cudaFuncAttributes funcattr;
+    cudaFuncGetAttributes(&funcattr,
+                          apply_kernel<StopType, 5, true, PrecType, LogType,
+                                       BatchMatrixType, ValueType>);
+    return funcattr.maxDynamicSharedSizeBytes;
+}
 
 
 template <typename ValueType>
 using settings = gko::kernels::batch_cg::settings<ValueType>;
 
 
+template <typename CuValueType>
+class kernel_caller {
+public:
+    using value_type = CuValueType;
+
+    kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
+                  const settings<remove_complex<value_type>> settings)
+        : exec_{std::move(exec)}, settings_{settings}
+    {}
+
+    template <typename StopType, const int n_shared,
+              const bool prec_shared_bool, typename PrecType, typename LogType,
+              typename BatchMatrixType>
+    void launch_apply_kernel(
+        const gko::kernels::batch_cg::storage_config& sconf, LogType& logger,
+        PrecType& prec, const BatchMatrixType& mat,
+        const value_type* const __restrict__ b_values,
+        value_type* const __restrict__ x_values,
+        value_type* const __restrict__ workspace_data, const int& block_size,
+        const size_t& shared_size) const
+    {
+        apply_kernel<StopType, n_shared, prec_shared_bool>
+            <<<mat.num_batch_items, block_size, shared_size,
+               exec_->get_stream()>>>(sconf, settings_.max_iterations,
+                                      settings_.residual_tol, logger, prec,
+                                      mat, b_values, x_values, workspace_data);
+    }
+
+    template <typename BatchMatrixType, typename PrecType, typename StopType,
+              typename LogType>
+    void call_kernel(
+        LogType logger, const BatchMatrixType& mat, PrecType prec,
+        const gko::batch::multi_vector::uniform_batch<const value_type>& b,
+        const gko::batch::multi_vector::uniform_batch<value_type>& x) const
+    {
+        using real_type = gko::remove_complex<value_type>;
+        const size_type num_batch_items = mat.num_batch_items;
+        constexpr int align_multiple = 8;
+        const int padded_num_rows =
+            ceildiv(mat.num_rows, align_multiple) * align_multiple;
+        auto shem_guard =
+            gko::kernels::cuda::detail::shared_memory_config_guard<
+                value_type>();
+        const int shmem_per_blk =
+            get_max_dynamic_shared_memory<StopType, PrecType, LogType,
+                                          BatchMatrixType, value_type>(exec_);
+        const int block_size =
+            get_num_threads_per_block<StopType, PrecType, LogType,
+                                      BatchMatrixType, value_type>(
+                exec_, mat.num_rows);
+        GKO_ASSERT(block_size >= 2 * config::warp_size);
+
+        const size_t prec_size = PrecType::dynamic_work_size(
+            padded_num_rows, mat.get_single_item_num_nnz());
+        const auto sconf =
+            gko::kernels::batch_cg::compute_shared_storage<PrecType,
+                                                           value_type>(
+                shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(),
+                b.num_rhs);
+        const size_t shared_size =
+            sconf.n_shared * padded_num_rows * sizeof(value_type) +
+            (sconf.prec_shared ? prec_size : 0);
+        auto workspace = gko::array<value_type>(
+            exec_,
+            sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type));
+        assert(sconf.gmem_stride_bytes % sizeof(value_type) == 0);
+
+        value_type* const workspace_data = workspace.get_data();
+
+        // Template parameters launch_apply_kernel<StopType, n_shared,
+        // prec_shared_bool>
+        if (sconf.prec_shared)
+            launch_apply_kernel<StopType, 5, true>(
+                sconf, logger, prec, mat, b.values, x.values, workspace_data,
+                block_size, shared_size);
+        else {
+            switch (sconf.n_shared) {
+            case 0:
+                launch_apply_kernel<StopType, 0, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 1:
+                launch_apply_kernel<StopType, 1, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 2:
+                launch_apply_kernel<StopType, 2, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 3:
+                launch_apply_kernel<StopType, 3, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 4:
+                launch_apply_kernel<StopType, 4, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            default:
+                GKO_NOT_IMPLEMENTED;
+            }
+        }
+    }
+
+private:
+    std::shared_ptr<const DefaultExecutor> exec_;
+    const settings<remove_complex<value_type>> settings_;
+};
+
+
 template <typename ValueType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
@@ -66,7 +227,10 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
            batch::MultiVector<ValueType>* const x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
-    GKO_NOT_IMPLEMENTED;
+    using cu_value_type = cuda_type<ValueType>;
+    auto dispatcher = batch::solver::create_dispatcher<ValueType>(
+        kernel_caller<cu_value_type>(exec, settings), settings, mat, precon);
+    dispatcher.apply(b, x, logdata);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_APPLY_KERNEL);
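get_num_threads_per_block above caps the block size by the register budget reported for the kernel instantiation. A standalone CUDA sketch of the same occupancy heuristic on a toy kernel and device 0 (illustrative only, not Ginkgo code):

    // Sketch: limit threads per block by registers per thread on device 0.
    #include <algorithm>
    #include <cstdio>

    __global__ void toy_kernel(double* x) { x[threadIdx.x] = threadIdx.x; }

    int main()
    {
        cudaFuncAttributes attr;
        cudaFuncGetAttributes(&attr, toy_kernel);
        int max_regs_per_block = 0;
        cudaDeviceGetAttribute(&max_regs_per_block,
                               cudaDevAttrMaxRegistersPerBlock, 0);

        const int warp_size = 32;
        const int regs_per_thread = std::max(attr.numRegs, 1);
        // Round the register-limited thread count down to a warp multiple.
        const int max_threads_by_regs =
            ((max_regs_per_block / regs_per_thread) / warp_size) * warp_size;
        const int block_size =
            std::min(std::min(max_threads_by_regs, 1024), 256);
        std::printf("regs/thread: %d, block size chosen: %d\n", attr.numRegs,
                    block_size);
    }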
diff --git a/dpcpp/preconditioner/batch_scalar_jacobi.hpp.inc b/dpcpp/preconditioner/batch_scalar_jacobi.hpp.inc
index 7af84aa60bb..de82476519d 100644
--- a/dpcpp/preconditioner/batch_scalar_jacobi.hpp.inc
+++ b/dpcpp/preconditioner/batch_scalar_jacobi.hpp.inc
@@ -16,7 +16,7 @@ public:
      */
     static constexpr int dynamic_work_size(const int num_rows, int)
     {
-        return num_rows;
+        return num_rows * sizeof(value_type);
     }
 
     /**
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index 1288c15b4c0..a47dd594c31 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -50,12 +50,154 @@ namespace batch_cg {
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+#include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc"
+
+
+template <typename ValueType>
+int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
+                              const int num_rows)
+{
+    int num_warps = std::max(num_rows / 4, 2);
+    constexpr int warp_sz = static_cast<int>(config::warp_size);
+    const int min_block_size = 2 * warp_sz;
+    const int device_max_threads =
+        ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz;
+    // This value has been taken from ROCm docs. This is the number of
+    // registers that maximizes the occupancy on an AMD GPU (MI200). HIP does
+    // not have an API to query the number of registers a function uses.
+    const int num_regs_used_per_thread = 64;
+    int max_regs_blk = 0;
+    GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute(
+        &max_regs_blk, hipDeviceAttributeMaxRegistersPerBlock,
+        exec->get_device_id()));
+    const int max_threads_regs = (max_regs_blk / num_regs_used_per_thread);
+    int max_threads = std::min(max_threads_regs, device_max_threads);
+    max_threads = max_threads <= 1024 ? max_threads : 1024;
+    return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size);
+}
 
 
 template <typename ValueType>
 using settings = gko::kernels::batch_cg::settings<ValueType>;
 
 
+template <typename HipValueType>
+class kernel_caller {
+public:
+    using value_type = HipValueType;
+
+    kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
+                  const settings<remove_complex<value_type>> settings)
+        : exec_{exec}, settings_{settings}
+    {}
+
+    template <typename StopType, const int n_shared,
+              const bool prec_shared_bool, typename PrecType, typename LogType,
+              typename BatchMatrixType>
+    void launch_apply_kernel(
+        const gko::kernels::batch_cg::storage_config& sconf, LogType& logger,
+        PrecType& prec, const BatchMatrixType& mat,
+        const value_type* const __restrict__ b_values,
+        value_type* const __restrict__ x_values,
+        value_type* const __restrict__ workspace_data, const int& block_size,
+        const size_t& shared_size) const
+    {
+        apply_kernel<StopType, n_shared, prec_shared_bool>
+            <<<mat.num_batch_items, block_size, shared_size,
+               exec_->get_stream()>>>(sconf, settings_.max_iterations,
+                                      settings_.residual_tol, logger, prec,
+                                      mat, b_values, x_values, workspace_data);
+    }
+
+
+    template <typename BatchMatrixType, typename PrecType, typename StopType,
+              typename LogType>
+    void call_kernel(
+        LogType logger, const BatchMatrixType& mat, PrecType prec,
+        const gko::batch::multi_vector::uniform_batch<const value_type>& b,
+        const gko::batch::multi_vector::uniform_batch<value_type>& x) const
+    {
+        using real_type = gko::remove_complex<value_type>;
+        const size_type num_batch_items = mat.num_batch_items;
+        constexpr int align_multiple = 8;
+        const int padded_num_rows =
+            ceildiv(mat.num_rows, align_multiple) * align_multiple;
+        int shmem_per_blk = 0;
+        GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute(
+            &shmem_per_blk, hipDeviceAttributeMaxSharedMemoryPerBlock,
+            exec_->get_device_id()));
+        const int block_size =
+            get_num_threads_per_block<value_type>(exec_, mat.num_rows);
+        GKO_ASSERT(block_size >= 2 * config::warp_size);
+
+        // Returns amount required in bytes
+        const size_t prec_size = PrecType::dynamic_work_size(
+            padded_num_rows, mat.get_single_item_num_nnz());
+        const auto sconf =
+            gko::kernels::batch_cg::compute_shared_storage<PrecType,
+                                                           value_type>(
+                shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(),
+                b.num_rhs);
+        const size_t shared_size =
+            sconf.n_shared * padded_num_rows * sizeof(value_type) +
+            (sconf.prec_shared ? prec_size : 0);
+        auto workspace = gko::array<value_type>(
+            exec_,
+            sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type));
+        assert(sconf.gmem_stride_bytes % sizeof(value_type) == 0);
+
+        value_type* const workspace_data = workspace.get_data();
+
+        // Template parameters launch_apply_kernel<StopType, n_shared,
+        // prec_shared_bool>
+        if (sconf.prec_shared) {
+            launch_apply_kernel<StopType, 5, true>(
+                sconf, logger, prec, mat, b.values, x.values, workspace_data,
+                block_size, shared_size);
+        } else {
+            switch (sconf.n_shared) {
+            case 0:
+                launch_apply_kernel<StopType, 0, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 1:
+                launch_apply_kernel<StopType, 1, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 2:
+                launch_apply_kernel<StopType, 2, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 3:
+                launch_apply_kernel<StopType, 3, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 4:
+                launch_apply_kernel<StopType, 4, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 5:
+                launch_apply_kernel<StopType, 5, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            default:
+                GKO_NOT_IMPLEMENTED;
+            }
+        }
+    }
+
+private:
+    std::shared_ptr<const DefaultExecutor> exec_;
+    const settings<remove_complex<value_type>> settings_;
+};
+
+
 template <typename ValueType>
 void apply(std::shared_ptr<const DefaultExecutor> exec,
            const settings<remove_complex<ValueType>>& settings,
@@ -65,7 +207,10 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
            batch::MultiVector<ValueType>* const x,
            batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
 {
-    GKO_NOT_IMPLEMENTED;
+    using hip_value_type = hip_type<ValueType>;
+    auto dispatcher = batch::solver::create_dispatcher<ValueType>(
+        kernel_caller<hip_value_type>(exec, settings), settings, mat, precon);
+    dispatcher.apply(b, x, logdata);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_APPLY_KERNEL);
diff --git a/reference/preconditioner/batch_scalar_jacobi.hpp b/reference/preconditioner/batch_scalar_jacobi.hpp
index 0ee09591e88..18b6aa20b14 100644
--- a/reference/preconditioner/batch_scalar_jacobi.hpp
+++ b/reference/preconditioner/batch_scalar_jacobi.hpp
@@ -25,7 +25,10 @@ class ScalarJacobi final {
     /**
      * The size of the work vector required in case of dynamic allocation.
      */
-    static int dynamic_work_size(const int nrows, int) { return nrows; }
+    static int dynamic_work_size(const int nrows, int)
+    {
+        return nrows * sizeof(ValueType);
+    }
 
     /**
      * Sets the input and generates the preconditioner by storing the inverse
diff --git a/test/solver/CMakeLists.txt b/test/solver/CMakeLists.txt
index 9b8cfcb97fa..b66ddd7cebc 100644
--- a/test/solver/CMakeLists.txt
+++ b/test/solver/CMakeLists.txt
@@ -1,5 +1,5 @@
 ginkgo_create_common_test(batch_bicgstab_kernels)
-ginkgo_create_common_test(batch_cg_kernels DISABLE_EXECUTORS cuda hip dpcpp)
+ginkgo_create_common_test(batch_cg_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(bicg_kernels)
 ginkgo_create_common_test(bicgstab_kernels)
 ginkgo_create_common_test(cb_gmres_kernels)