From 259f2c17ec5c54c07c285074fe60c39f4e33f745 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 24 Jun 2024 10:16:23 +0200 Subject: [PATCH] [batch] split cg compilation (cuda) --- cuda/CMakeLists.txt | 2 + cuda/solver/batch_cg_kernels.cu | 209 +-------------- cuda/solver/batch_cg_kernels.cuh | 242 ++++++++++++++++++ .../batch_cg_kernels_launch.instantiate.cu | 95 +++++++ 4 files changed, 341 insertions(+), 207 deletions(-) create mode 100644 cuda/solver/batch_cg_kernels.cuh create mode 100644 cuda/solver/batch_cg_kernels_launch.instantiate.cu diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index dd3622a9c25..b5ba8dc0603 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -4,6 +4,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(. matrix/csr_kernels.instantiate.cu CSR_INSTANTIATE) add_instantiation_files(. matrix/fbcsr_kernels.instantiate.cu FBCSR_INSTANTIATE) add_instantiation_files(. solver/batch_bicgstab_kernels_launch.instantiate.cu BATCH_BICGSTAB_INSTANTIATE) +add_instantiation_files(. 
solver/batch_cg_kernels_launch.instantiate.cu BATCH_CG_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_cuda @@ -63,6 +64,7 @@ target_sources(ginkgo_cuda solver/batch_bicgstab_kernels.cu ${BATCH_BICGSTAB_INSTANTIATE} solver/batch_cg_kernels.cu + ${BATCH_CG_INSTANTIATE} solver/cb_gmres_kernels.cu solver/idr_kernels.cu solver/lower_trs_kernels.cu diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index f429e5f22f0..74720089bd6 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -5,224 +5,20 @@ #include "core/solver/batch_cg_kernels.hpp" -#include -#include - - -#include #include -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" -#include "cuda/base/batch_struct.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/kernel_config.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" -#include "cuda/matrix/batch_struct.hpp" +#include "cuda/solver/batch_cg_kernels.cuh" namespace gko { namespace kernels { namespace cuda { - - -// NOTE: this default block size is not used for the main solver kernel. -constexpr int default_block_size = 256; -constexpr int sm_oversubscription = 4; - - -/** - * @brief The batch Cg solver namespace. 
- * - * @ingroup batch_cg - */ namespace batch_cg { -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" -#include "common/cuda_hip/components/uninitialized_array.hpp.inc" -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" -#include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc" - - -template -int get_num_threads_per_block(std::shared_ptr exec, - const int num_rows) -{ - int num_warps = std::max(num_rows / 4, 2); - constexpr int warp_sz = static_cast(config::warp_size); - const int min_block_size = 2 * warp_sz; - const int device_max_threads = - (std::max(num_rows, min_block_size) / warp_sz) * warp_sz; - cudaFuncAttributes funcattr; - cudaFuncGetAttributes(&funcattr, - apply_kernel); - const int num_regs_used = funcattr.numRegs; - int max_regs_blk = 0; - cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, - exec->get_device_id()); - const int max_threads_regs = - ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz; - int max_threads = std::min(max_threads_regs, device_max_threads); - max_threads = max_threads <= 1024 ? 
max_threads : 1024; - return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); -} - - -template -int get_max_dynamic_shared_memory(std::shared_ptr exec) -{ - int shmem_per_sm = 0; - cudaDeviceGetAttribute(&shmem_per_sm, - cudaDevAttrMaxSharedMemoryPerMultiprocessor, - exec->get_device_id()); - GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( - apply_kernel, - cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); - cudaFuncAttributes funcattr; - cudaFuncGetAttributes(&funcattr, - apply_kernel); - return funcattr.maxDynamicSharedSizeBytes; -} - - -template -using settings = gko::kernels::batch_cg::settings; - - -template -class kernel_caller { -public: - using value_type = CuValueType; - - kernel_caller(std::shared_ptr exec, - const settings> settings) - : exec_{std::move(exec)}, settings_{settings} - {} - - template - void launch_apply_kernel( - const gko::kernels::batch_cg::storage_config& sconf, LogType& logger, - PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ workspace_data, const int& block_size, - const size_t& shared_size) const - { - apply_kernel - <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); - } - - template - void call_kernel( - LogType logger, const BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const - { - using real_type = gko::remove_complex; - const size_type num_batch_items = mat.num_batch_items; - constexpr int align_multiple = 8; - const int padded_num_rows = - ceildiv(mat.num_rows, align_multiple) * align_multiple; - auto shem_guard = - gko::kernels::cuda::detail::shared_memory_config_guard< - value_type>(); - const int shmem_per_blk = - get_max_dynamic_shared_memory(exec_); - const int block_size = - 
get_num_threads_per_block( - exec_, mat.num_rows); - GKO_ASSERT(block_size >= 2 * config::warp_size); - - const size_t prec_size = PrecType::dynamic_work_size( - padded_num_rows, mat.get_single_item_num_nnz()); - const auto sconf = - gko::kernels::batch_cg::compute_shared_storage( - shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), - b.num_rhs); - const size_t shared_size = - sconf.n_shared * padded_num_rows * sizeof(value_type) + - (sconf.prec_shared ? prec_size : 0); - auto workspace = gko::array( - exec_, - sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); - GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0); - - value_type* const workspace_data = workspace.get_data(); - - // Template parameters launch_apply_kernel - if (sconf.prec_shared) { - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, workspace_data, - block_size, shared_size); - } else { - switch (sconf.n_shared) { - case 0: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 1: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 2: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 3: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 4: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - case 5: - launch_apply_kernel( - sconf, logger, prec, mat, b.values, x.values, - workspace_data, block_size, shared_size); - break; - default: - GKO_NOT_IMPLEMENTED; - } - } - } - -private: - std::shared_ptr exec_; - const settings> settings_; -}; - - template void apply(std::shared_ptr exec, const settings>& settings, @@ -232,9 +28,8 @@ void apply(std::shared_ptr exec, 
batch::MultiVector* const x, batch::log::detail::log_data>& logdata) { - using cu_value_type = cuda_type; auto dispatcher = batch::solver::create_dispatcher( - kernel_caller(exec, settings), settings, mat, precon); + kernel_caller(exec, settings), settings, mat, precon); dispatcher.apply(b, x, logdata); } diff --git a/cuda/solver/batch_cg_kernels.cuh b/cuda/solver/batch_cg_kernels.cuh new file mode 100644 index 00000000000..13c9fdd0e3d --- /dev/null +++ b/cuda/solver/batch_cg_kernels.cuh @@ -0,0 +1,242 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/solver/batch_cg_kernels.hpp" + + +#include +#include + + +#include +#include + + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_dispatch.hpp" +#include "cuda/base/batch_struct.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/kernel_config.hpp" +#include "cuda/base/thrust.cuh" +#include "cuda/base/types.hpp" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" +#include "cuda/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { + + +/** + * @brief The batch Cg solver namespace. 
+ * + * @ingroup batch_cg + */ +namespace batch_cg { + + +template +int get_num_threads_per_block(std::shared_ptr exec, + const int num_rows) +{ + int num_warps = std::max(num_rows / 4, 2); + constexpr int warp_sz = static_cast(config::warp_size); + const int min_block_size = 2 * warp_sz; + const int device_max_threads = + (std::max(num_rows, min_block_size) / warp_sz) * warp_sz; + cudaFuncAttributes funcattr; + cudaFuncGetAttributes(&funcattr, + apply_kernel); + const int num_regs_used = funcattr.numRegs; + int max_regs_blk = 0; + cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, + exec->get_device_id()); + const int max_threads_regs = + ((max_regs_blk / static_cast(num_regs_used)) / warp_sz) * warp_sz; + int max_threads = std::min(max_threads_regs, device_max_threads); + max_threads = max_threads <= 1024 ? max_threads : 1024; + return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size); +} + + +template +int get_max_dynamic_shared_memory(std::shared_ptr exec) +{ + int shmem_per_sm = 0; + cudaDeviceGetAttribute(&shmem_per_sm, + cudaDevAttrMaxSharedMemoryPerMultiprocessor, + exec->get_device_id()); + GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( + apply_kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); + cudaFuncAttributes funcattr; + cudaFuncGetAttributes(&funcattr, + apply_kernel); + return funcattr.maxDynamicSharedSizeBytes; +} + + +template +using settings = gko::kernels::batch_cg::settings; + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const cuda_type* const __restrict__ b_values, + cuda_type* const __restrict__ x_values, + cuda_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size); + +#define GKO_DECLARE_BATCH_CG_LAUNCH(_vtype, _n_shared, _prec_shared, mat_t, \ + log_t, pre_t, stop_t) \ + void 
launch_apply_kernel<_vtype, _n_shared, _prec_shared, \ + stop_t>>( \ + std::shared_ptr exec, \ + const gko::kernels::batch_cg::storage_config& sconf, \ + const settings>& settings, \ + log_t>>& logger, \ + pre_t>& prec, \ + const mat_t>& mat, \ + const cuda_type<_vtype>* const __restrict__ b_values, \ + cuda_type<_vtype>* const __restrict__ x_values, \ + cuda_type<_vtype>* const __restrict__ workspace_data, \ + const int& block_size, const size_t& shared_size) + +#define GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 0, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 1, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 2, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 3, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 4, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, false) +#define GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE(_vtype) \ + GKO_BATCH_INSTANTIATE(GKO_DECLARE_BATCH_CG_LAUNCH, _vtype, 5, true) + + +template +class kernel_caller { +public: + using cuda_value_type = cuda_type; + + kernel_caller(std::shared_ptr exec, + const settings> settings) + : exec_{std::move(exec)}, settings_{settings} + {} + + template + void call_kernel( + LogType logger, const BatchMatrixType& mat, PrecType prec, + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const + { + using real_type = gko::remove_complex; + const size_type num_batch_items = mat.num_batch_items; + constexpr int align_multiple = 8; + const int padded_num_rows = + ceildiv(mat.num_rows, align_multiple) * align_multiple; + auto shem_guard 
= + gko::kernels::cuda::detail::shared_memory_config_guard< + cuda_value_type>(); + const int shmem_per_blk = + get_max_dynamic_shared_memory( + exec_); + const int block_size = + get_num_threads_per_block( + exec_, mat.num_rows); + GKO_ASSERT(block_size >= 2 * config::warp_size); + + const size_t prec_size = PrecType::dynamic_work_size( + padded_num_rows, mat.get_single_item_num_nnz()); + const auto sconf = + gko::kernels::batch_cg::compute_shared_storage( + shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), + b.num_rhs); + const size_t shared_size = + sconf.n_shared * padded_num_rows * sizeof(cuda_value_type) + + (sconf.prec_shared ? prec_size : 0); + auto workspace = gko::array( + exec_, sconf.gmem_stride_bytes * num_batch_items / + sizeof(cuda_value_type)); + GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(cuda_value_type) == 0); + + cuda_value_type* const workspace_data = workspace.get_data(); + + // Template parameters launch_apply_kernel + if (sconf.prec_shared) { + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, x.values, + workspace_data, block_size, shared_size); + } else { + switch (sconf.n_shared) { + case 0: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 1: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 2: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 3: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 4: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + case 5: + launch_apply_kernel( + exec_, sconf, settings_, logger, prec, 
mat, b.values, + x.values, workspace_data, block_size, shared_size); + break; + default: + GKO_NOT_IMPLEMENTED; + } + } + } + +private: + std::shared_ptr exec_; + const settings> settings_; +}; + + +} // namespace batch_cg +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/solver/batch_cg_kernels_launch.instantiate.cu b/cuda/solver/batch_cg_kernels_launch.instantiate.cu new file mode 100644 index 00000000000..9ddc611a52c --- /dev/null +++ b/cuda/solver/batch_cg_kernels_launch.instantiate.cu @@ -0,0 +1,95 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include +#include + + +#include +#include + + +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "core/solver/batch_cg_kernels.hpp" +#include "core/solver/batch_dispatch.hpp" +#include "cuda/base/batch_struct.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/kernel_config.hpp" +#include "cuda/base/thrust.cuh" +#include "cuda/base/types.hpp" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" +#include "cuda/matrix/batch_struct.hpp" +#include "cuda/solver/batch_cg_kernels.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { + + +// NOTE: this default block size is not used for the main solver kernel. +constexpr int default_block_size = 256; +constexpr int sm_oversubscription = 4; + + +/** + * @brief The batch Cg solver namespace. 
+ * + * @ingroup batch_cg + */ +namespace batch_cg { + + +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" +#include "common/cuda_hip/components/uninitialized_array.hpp.inc" +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" +#include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc" + + +template +void launch_apply_kernel( + std::shared_ptr exec, + const gko::kernels::batch_cg::storage_config& sconf, + const settings>& settings, LogType& logger, + PrecType& prec, const BatchMatrixType& mat, + const cuda_type* const __restrict__ b_values, + cuda_type* const __restrict__ x_values, + cuda_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size) +{ + apply_kernel + <<get_stream()>>>( + sconf, settings.max_iterations, as_cuda_type(settings.residual_tol), + logger, prec, mat, b_values, x_values, workspace_data); +} + + +// begin +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_0_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_1_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_2_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_3_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_4_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_FALSE); +// split +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CG_LAUNCH_5_TRUE); +// end + + +} // namespace batch_cg +} // namespace cuda +} // namespace kernels +} // namespace gko