[batch] split cg compilation (cuda)
MarcelKoch committed Jun 24, 2024
1 parent 1435291 commit 8c25a83
Showing 4 changed files with 375 additions and 207 deletions.
2 changes: 2 additions & 0 deletions cuda/CMakeLists.txt
@@ -4,6 +4,7 @@ include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
add_instantiation_files(. matrix/csr_kernels.instantiate.cu CSR_INSTANTIATE)
add_instantiation_files(. matrix/fbcsr_kernels.instantiate.cu FBCSR_INSTANTIATE)
add_instantiation_files(. solver/batch_bicgstab_kernels_launch.instantiate.cu BATCH_BICGSTAB_INSTANTIATE)
add_instantiation_files(. solver/batch_cg_kernels_launch.instantiate.cu BATCH_CG_INSTANTIATE)
# we don't split up the dense kernels into distinct compilations
list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
target_sources(ginkgo_cuda
@@ -63,6 +64,7 @@ target_sources(ginkgo_cuda
solver/batch_bicgstab_kernels.cu
${BATCH_BICGSTAB_INSTANTIATE}
solver/batch_cg_kernels.cu
${BATCH_CG_INSTANTIATE}
solver/cb_gmres_kernels.cu
solver/idr_kernels.cu
solver/lower_trs_kernels.cu
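Note: judging from its use here, the `add_instantiation_files` helper (from `cmake/template_instantiation.cmake`) expands the listed `.instantiate.cu` template into several generated source files, each compiling a subset of the explicit instantiations, so the slow batch CG launch kernels no longer build as one monolithic translation unit. A minimal sketch of the idea behind such a split, with a hypothetical kernel and placeholder body (not Ginkgo's actual launch code):

```cuda
// Hypothetical *_launch.instantiate.cu template: the build emits one copy of
// this file per value type, so each explicit instantiation of the kernel
// template is compiled independently.
#include <cuda_runtime.h>

// Stand-in for the real batch CG launch kernel template.
template <typename ValueType>
__global__ void batch_cg_launch(const ValueType* b, ValueType* x, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] = b[i];  // placeholder body, not the actual solver
    }
}

// Explicit instantiation for one value type; the CMake helper generates
// sibling files covering the remaining types (double, complex, ...).
template __global__ void batch_cg_launch<float>(const float*, float*, int);
```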
209 changes: 2 additions & 207 deletions cuda/solver/batch_cg_kernels.cu
@@ -5,224 +5,20 @@
#include "core/solver/batch_cg_kernels.hpp"


#include <thrust/functional.h>
#include <thrust/transform.h>


#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/math.hpp>


#include "core/base/batch_struct.hpp"
#include "core/matrix/batch_struct.hpp"
#include "core/solver/batch_dispatch.hpp"
#include "cuda/base/batch_struct.hpp"
#include "cuda/base/config.hpp"
#include "cuda/base/kernel_config.hpp"
#include "cuda/base/thrust.cuh"
#include "cuda/base/types.hpp"
#include "cuda/components/cooperative_groups.cuh"
#include "cuda/components/reduction.cuh"
#include "cuda/components/thread_ids.cuh"
#include "cuda/components/uninitialized_array.hpp"
#include "cuda/matrix/batch_struct.hpp"
#include "cuda/solver/batch_cg_kernels.cuh"


namespace gko {
namespace kernels {
namespace cuda {


// NOTE: this default block size is not used for the main solver kernel.
constexpr int default_block_size = 256;
constexpr int sm_oversubscription = 4;


/**
* @brief The batch Cg solver namespace.
*
* @ingroup batch_cg
*/
namespace batch_cg {


#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
#include "common/cuda_hip/components/uninitialized_array.hpp.inc"
#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
#include "common/cuda_hip/solver/batch_cg_kernels.hpp.inc"


template <typename StopType, typename PrecType, typename LogType,
typename BatchMatrixType, typename ValueType>
int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
const int num_rows)
{
int num_warps = std::max(num_rows / 4, 2);
constexpr int warp_sz = static_cast<int>(config::warp_size);
const int min_block_size = 2 * warp_sz;
const int device_max_threads =
(std::max(num_rows, min_block_size) / warp_sz) * warp_sz;
cudaFuncAttributes funcattr;
cudaFuncGetAttributes(&funcattr,
apply_kernel<StopType, 5, true, PrecType, LogType,
BatchMatrixType, ValueType>);
const int num_regs_used = funcattr.numRegs;
int max_regs_blk = 0;
cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
exec->get_device_id());
const int max_threads_regs =
((max_regs_blk / static_cast<int>(num_regs_used)) / warp_sz) * warp_sz;
int max_threads = std::min(max_threads_regs, device_max_threads);
max_threads = max_threads <= 1024 ? max_threads : 1024;
return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size);
}
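This heuristic sizes the thread block in whole warps: it derives a register-limited thread count from the kernel's register usage and the device's per-block register budget, takes the minimum with a row-count-based limit, clamps to at most 1024 threads, and enforces a floor of two warps. A standalone sketch of the register arithmetic with hypothetical numbers (not queried from a real device):

```cuda
#include <algorithm>
#include <cstdio>

int main()
{
    constexpr int warp_sz = 32;      // config::warp_size on CUDA
    const int max_regs_blk = 65536;  // hypothetical cudaDevAttrMaxRegistersPerBlock
    const int num_regs_used = 42;    // hypothetical funcattr.numRegs
    // Register-limited thread count, rounded down to a whole warp:
    const int max_threads_regs =
        ((max_regs_blk / num_regs_used) / warp_sz) * warp_sz;
    // 65536 / 42 = 1560 -> 48 whole warps -> 1536 threads.
    std::printf("register-limited threads per block: %d\n", max_threads_regs);
    std::printf("after 1024 clamp: %d\n", std::min(max_threads_regs, 1024));
}
```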


template <typename StopType, typename PrecType, typename LogType,
typename BatchMatrixType, typename ValueType>
int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
{
int shmem_per_sm = 0;
cudaDeviceGetAttribute(&shmem_per_sm,
cudaDevAttrMaxSharedMemoryPerMultiprocessor,
exec->get_device_id());
GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute(
apply_kernel<StopType, 5, true, PrecType, LogType, BatchMatrixType,
ValueType>,
cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/));
cudaFuncAttributes funcattr;
cudaFuncGetAttributes(&funcattr,
apply_kernel<StopType, 5, true, PrecType, LogType,
BatchMatrixType, ValueType>);
return funcattr.maxDynamicSharedSizeBytes;
}
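This helper opts the kernel into a larger shared-memory carveout (99% of the configurable L1/shared split) and then reads back how many bytes of dynamic shared memory a launch may request. The same query pattern in isolation, with a dummy kernel standing in for `apply_kernel`:

```cuda
#include <cstdio>
#include <cuda_runtime.h>

__global__ void dummy_kernel() {}

int main()
{
    // Ask the driver to prefer shared memory over L1 for this kernel.
    cudaFuncSetAttribute(dummy_kernel,
                         cudaFuncAttributePreferredSharedMemoryCarveout, 99);
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr, dummy_kernel);
    std::printf("max dynamic shared memory: %d bytes\n",
                attr.maxDynamicSharedSizeBytes);
    return 0;
}
```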


template <typename T>
using settings = gko::kernels::batch_cg::settings<T>;


template <typename CuValueType>
class kernel_caller {
public:
using value_type = CuValueType;

kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
const settings<remove_complex<value_type>> settings)
: exec_{std::move(exec)}, settings_{settings}
{}

template <typename StopType, const int n_shared,
const bool prec_shared_bool, typename PrecType, typename LogType,
typename BatchMatrixType>
void launch_apply_kernel(
const gko::kernels::batch_cg::storage_config& sconf, LogType& logger,
PrecType& prec, const BatchMatrixType& mat,
const value_type* const __restrict__ b_values,
value_type* const __restrict__ x_values,
value_type* const __restrict__ workspace_data, const int& block_size,
const size_t& shared_size) const
{
apply_kernel<StopType, n_shared, prec_shared_bool>
<<<mat.num_batch_items, block_size, shared_size,
exec_->get_stream()>>>(sconf, settings_.max_iterations,
settings_.residual_tol, logger, prec, mat,
b_values, x_values, workspace_data);
}

    template <typename BatchMatrixType, typename PrecType, typename StopType,
typename LogType>
void call_kernel(
LogType logger, const BatchMatrixType& mat, PrecType prec,
const gko::batch::multi_vector::uniform_batch<const value_type>& b,
const gko::batch::multi_vector::uniform_batch<value_type>& x) const
{
using real_type = gko::remove_complex<value_type>;
const size_type num_batch_items = mat.num_batch_items;
constexpr int align_multiple = 8;
const int padded_num_rows =
ceildiv(mat.num_rows, align_multiple) * align_multiple;
auto shmem_guard =
gko::kernels::cuda::detail::shared_memory_config_guard<
value_type>();
const int shmem_per_blk =
get_max_dynamic_shared_memory<StopType, PrecType, LogType,
BatchMatrixType, value_type>(exec_);
const int block_size =
get_num_threads_per_block<StopType, PrecType, LogType,
BatchMatrixType, value_type>(
exec_, mat.num_rows);
GKO_ASSERT(block_size >= 2 * config::warp_size);
const size_t prec_size = PrecType::dynamic_work_size(
padded_num_rows, mat.get_single_item_num_nnz());
const auto sconf =
gko::kernels::batch_cg::compute_shared_storage<PrecType,
value_type>(
shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(),
b.num_rhs);
const size_t shared_size =
sconf.n_shared * padded_num_rows * sizeof(value_type) +
(sconf.prec_shared ? prec_size : 0);
auto workspace = gko::array<value_type>(
exec_,
sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type));
GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0);
value_type* const workspace_data = workspace.get_data();
// Template parameters launch_apply_kernel<StopType, n_shared,
// prec_shared>
if (sconf.prec_shared) {
launch_apply_kernel<StopType, 5, true>(
sconf, logger, prec, mat, b.values, x.values, workspace_data,
block_size, shared_size);
} else {
switch (sconf.n_shared) {
case 0:
launch_apply_kernel<StopType, 0, false>(
sconf, logger, prec, mat, b.values, x.values,
workspace_data, block_size, shared_size);
break;
case 1:
launch_apply_kernel<StopType, 1, false>(
sconf, logger, prec, mat, b.values, x.values,
workspace_data, block_size, shared_size);
break;
case 2:
launch_apply_kernel<StopType, 2, false>(
sconf, logger, prec, mat, b.values, x.values,
workspace_data, block_size, shared_size);
break;
case 3:
launch_apply_kernel<StopType, 3, false>(
sconf, logger, prec, mat, b.values, x.values,
workspace_data, block_size, shared_size);
break;
case 4:
launch_apply_kernel<StopType, 4, false>(
sconf, logger, prec, mat, b.values, x.values,
workspace_data, block_size, shared_size);
break;
case 5:
launch_apply_kernel<StopType, 5, false>(
sconf, logger, prec, mat, b.values, x.values,
workspace_data, block_size, shared_size);
break;
default:
GKO_NOT_IMPLEMENTED;
}
}
}

private:
std::shared_ptr<const DefaultExecutor> exec_;
const settings<remove_complex<value_type>> settings_;
};
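The `switch` in `call_kernel` is the usual trick for turning the runtime value `sconf.n_shared` into the compile-time template parameter `n_shared`: each case instantiates one specialization of the launch. A generic, Ginkgo-independent sketch of that dispatch idiom:

```cuda
#include <cstdio>

// Each value of the template parameter yields a separately compiled body.
template <int n_shared>
void run() { std::printf("specialization for n_shared = %d\n", n_shared); }

// Map a runtime count onto one of the compiled specializations.
void dispatch(int n_shared)
{
    switch (n_shared) {
    case 0: run<0>(); break;
    case 1: run<1>(); break;
    case 2: run<2>(); break;
    default: std::printf("unsupported n_shared\n"); break;
    }
}

int main() { dispatch(2); }
```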


template <typename ValueType>
void apply(std::shared_ptr<const DefaultExecutor> exec,
const settings<remove_complex<ValueType>>& settings,
@@ -232,9 +28,8 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
batch::MultiVector<ValueType>* const x,
batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
{
-    using cu_value_type = cuda_type<ValueType>;
     auto dispatcher = batch::solver::create_dispatcher<ValueType>(
-        kernel_caller<cu_value_type>(exec, settings), settings, mat, precon);
+        kernel_caller<ValueType>(exec, settings), settings, mat, precon);
dispatcher.apply(b, x, logdata);
}

(diffs for the remaining two changed files were not loaded)