From 88702f5d253a69cb1d59aad703e88246c4a82727 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 19 May 2024 22:12:45 +0200 Subject: [PATCH] fix compilation issues --- common/CMakeLists.txt | 1 - common/cuda_hip/CMakeLists.txt | 60 ---- .../base/batch_multi_vector_kernels.hpp.inc | 301 ++++++++++++++++++ common/cuda_hip/base/pointer_mode_guard.hpp | 5 +- common/cuda_hip/base/randlib_bindings.hpp | 2 +- common/cuda_hip/base/sparselib_bindings.hpp | 2 +- .../base/sparselib_block_bindings.hpp | 16 + .../cuda_hip/components/format_conversion.hpp | 2 +- common/cuda_hip/components/memory.hpp | 2 +- ...ct_kernels.cpp => par_ict_kernels.hpp.inc} | 184 ----------- ...ls.cpp => par_ilut_filter_kernels.hpp.inc} | 135 -------- ...ls.cpp => par_ilut_select_kernels.hpp.inc} | 157 --------- ...ls.cpp => par_ilut_spgeam_kernels.hpp.inc} | 154 --------- .../par_ilut_sweep_kernels.hpp.inc | 94 ++++++ ...{batch_logger.hpp => batch_logger.hpp.inc} | 22 -- ..._kernels.cpp => batch_csr_kernels.hpp.inc} | 54 ---- ...ernels.cpp => batch_dense_kernels.hpp.inc} | 55 ---- ..._kernels.cpp => batch_ell_kernels.hpp.inc} | 54 ---- .../matrix/fbcsr_kernels.template.cpp | 3 +- ...obi_kernels.cpp => jacobi_kernels.hpp.inc} | 48 --- common/cuda_hip/sources.cmake | 31 ++ ...ch_criteria.hpp => batch_criteria.hpp.inc} | 21 -- common/unified/base/kernel_launch.hpp | 4 +- .../unified/base/kernel_launch_reduction.hpp | 4 +- common/unified/base/kernel_launch_solver.hpp | 4 +- cuda/CMakeLists.txt | 11 + cuda/base/batch_multi_vector_kernels.cu | 59 ++++ cuda/factorization/par_ict_kernels.cu | 189 +++++++++++ .../par_ilut_approx_filter_kernels.cu | 12 +- cuda/factorization/par_ilut_filter_kernels.cu | 140 ++++++++ cuda/factorization/par_ilut_select_common.cu | 14 +- cuda/factorization/par_ilut_select_kernels.cu | 162 ++++++++++ cuda/factorization/par_ilut_spgeam_kernels.cu | 159 +++++++++ .../factorization/par_ilut_sweep_kernels.cu | 97 +----- cuda/log/batch_logger.cuh | 27 ++ cuda/matrix/batch_csr_kernels.cu | 58 ++++ cuda/matrix/batch_dense_kernels.cu | 59 ++++ cuda/matrix/batch_ell_kernels.cu | 58 ++++ cuda/preconditioner/batch_preconditioners.cuh | 2 +- ...cobi_advanced_apply_kernels.instantiate.cu | 8 +- .../jacobi_generate_kernels.instantiate.cu | 12 +- cuda/preconditioner/jacobi_kernels.cu | 51 +++ ...jacobi_simple_apply_kernels.instantiate.cu | 8 +- cuda/solver/batch_bicgstab_kernels.cu | 7 +- cuda/solver/batch_cg_kernels.cu | 7 +- cuda/stop/batch_criteria.cuh | 26 ++ hip/CMakeLists.txt | 19 +- hip/base/batch_multi_vector_kernels.hip.cpp | 59 ++++ hip/base/config.hip.hpp | 2 +- hip/base/hipblas_bindings.hip.hpp | 2 +- hip/base/hiprand_bindings.hip.hpp | 2 +- hip/components/format_conversion.hip.hpp | 2 +- hip/factorization/par_ict_kernels.hip.cpp | 189 +++++++++++ .../par_ilut_approx_filter_kernels.hip.cpp | 12 +- .../par_ilut_filter_kernels.hip.cpp | 140 ++++++++ .../par_ilut_select_common.hip.cpp | 14 +- .../par_ilut_select_kernels.hip.cpp | 162 ++++++++++ .../par_ilut_spgeam_kernels.hip.cpp | 159 +++++++++ .../par_ilut_sweep_kernels.hip.cpp | 123 +++++++ hip/log/batch_logger.hip.hpp | 26 ++ hip/matrix/batch_csr_kernels.hip.cpp | 58 ++++ hip/matrix/batch_dense_kernels.hip.cpp | 59 ++++ hip/matrix/batch_ell_kernels.hip.cpp | 58 ++++ hip/matrix/csr_kernels.template.hip.cpp | 18 +- .../batch_preconditioners.hip.hpp | 2 +- ...obi_advanced_apply_instantiate.inc.hip.cpp | 8 +- .../jacobi_generate_instantiate.inc.hip.cpp | 12 +- .../jacobi_generate_kernel.hip.cpp | 12 +- hip/preconditioner/jacobi_kernels.hip.cpp | 51 +++ ...acobi_simple_apply_instantiate.inc.hip.cpp | 8 +- .../jacobi_simple_apply_kernel.hip.cpp | 8 +- hip/solver/batch_bicgstab_kernels.hip.cpp | 9 +- hip/solver/batch_cg_kernels.hip.cpp | 9 +- hip/solver/common_trs_kernels.hip.hpp | 2 +- hip/solver/lower_trs_kernels.hip.cpp | 2 +- hip/solver/upper_trs_kernels.hip.cpp | 2 +- hip/stop/batch_criteria.hip.hpp | 26 ++ hip/stop/criterion_kernels.hip.cpp | 4 +- hip/stop/residual_norm_kernels.hip.cpp | 4 +- hip/test/base/math.hip.cpp | 2 +- hip/test/components/merging.hip.cpp | 4 +- hip/test/components/searching.hip.cpp | 4 +- hip/test/components/sorting.hip.cpp | 4 +- 83 files changed, 2681 insertions(+), 1177 deletions(-) delete mode 100644 common/cuda_hip/CMakeLists.txt create mode 100644 common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc create mode 100644 common/cuda_hip/base/sparselib_block_bindings.hpp rename common/cuda_hip/factorization/{par_ict_kernels.cpp => par_ict_kernels.hpp.inc} (62%) rename common/cuda_hip/factorization/{par_ilut_filter_kernels.cpp => par_ilut_filter_kernels.hpp.inc} (57%) rename common/cuda_hip/factorization/{par_ilut_select_kernels.cpp => par_ilut_select_kernels.hpp.inc} (63%) rename common/cuda_hip/factorization/{par_ilut_spgeam_kernels.cpp => par_ilut_spgeam_kernels.hpp.inc} (63%) create mode 100644 common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc rename common/cuda_hip/log/{batch_logger.hpp => batch_logger.hpp.inc} (68%) rename common/cuda_hip/matrix/{batch_csr_kernels.cpp => batch_csr_kernels.hpp.inc} (87%) rename common/cuda_hip/matrix/{batch_dense_kernels.cpp => batch_dense_kernels.hpp.inc} (89%) rename common/cuda_hip/matrix/{batch_ell_kernels.cpp => batch_ell_kernels.hpp.inc} (87%) rename common/cuda_hip/preconditioner/{jacobi_kernels.cpp => jacobi_kernels.hpp.inc} (91%) create mode 100644 common/cuda_hip/sources.cmake rename common/cuda_hip/stop/{batch_criteria.hpp => batch_criteria.hpp.inc} (75%) create mode 100644 cuda/base/batch_multi_vector_kernels.cu create mode 100644 cuda/factorization/par_ict_kernels.cu create mode 100644 cuda/factorization/par_ilut_filter_kernels.cu create mode 100644 cuda/factorization/par_ilut_select_kernels.cu create mode 100644 cuda/factorization/par_ilut_spgeam_kernels.cu rename common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp => cuda/factorization/par_ilut_sweep_kernels.cu (53%) create mode 100644 cuda/log/batch_logger.cuh create mode 100644 cuda/matrix/batch_csr_kernels.cu create mode 100644 cuda/matrix/batch_dense_kernels.cu create mode 100644 cuda/matrix/batch_ell_kernels.cu create mode 100644 cuda/preconditioner/jacobi_kernels.cu create mode 100644 cuda/stop/batch_criteria.cuh create mode 100644 hip/base/batch_multi_vector_kernels.hip.cpp create mode 100644 hip/factorization/par_ict_kernels.hip.cpp create mode 100644 hip/factorization/par_ilut_filter_kernels.hip.cpp create mode 100644 hip/factorization/par_ilut_select_kernels.hip.cpp create mode 100644 hip/factorization/par_ilut_spgeam_kernels.hip.cpp create mode 100644 hip/factorization/par_ilut_sweep_kernels.hip.cpp create mode 100644 hip/log/batch_logger.hip.hpp create mode 100644 hip/matrix/batch_csr_kernels.hip.cpp create mode 100644 hip/matrix/batch_dense_kernels.hip.cpp create mode 100644 hip/matrix/batch_ell_kernels.hip.cpp create mode 100644 hip/preconditioner/jacobi_kernels.hip.cpp create mode 100644 hip/stop/batch_criteria.hip.hpp diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index e7c665640b3..77bdd7230b9 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,3 +1,2 @@ add_subdirectory(unified) set(GKO_UNIFIED_COMMON_SOURCES ${GKO_UNIFIED_COMMON_SOURCES} PARENT_SCOPE) -set(GKO_CUDA_HIP_COMMON_SOURCES ${GKO_CUDA_HIP_COMMON_SOURCES} PARENT_SCOPE) diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt deleted file mode 100644 index 8014d5dad6d..00000000000 --- a/common/cuda_hip/CMakeLists.txt +++ /dev/null @@ -1,60 +0,0 @@ -include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) -add_instantiation_files(${CMAKE_CURRENT_SOURCE_DIR} matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE) -set(CUDA_HIP_SOURCES - base/batch_multi_vector_kernels.cpp - base/device_matrix_data_kernels.cpp - base/kernel_launch.hpp - base/kernel_launch_reduction.hpp - base/kernel_launch_solver.hpp - base/math.hpp - components/atomic.hpp - components/diagonal_block_manipulation.hpp - components/intrinsics.hpp - components/merging.hpp - components/prefix_sum.hpp - components/prefix_sum_kernels.cpp - components/reduction.hpp - components/searching.hpp - components/segment_scan.hpp - components/sorting.hpp - components/syncfree.hpp - components/thread_ids.hpp - components/uninitialized_array.hpp - components/warp_blas.hpp - distributed/index_map_kernels.cpp - distributed/matrix_kernels.cpp - distributed/partition_helpers_kernels.cpp - distributed/partition_kernels.cpp - distributed/vector_kernels.cpp - factorization/cholesky_kernels.cpp - factorization/factorization_kernels.cpp - factorization/lu_kernels.cpp - factorization/par_ic_kernels.cpp - factorization/par_ict_kernels.cpp - factorization/par_ilu_kernels.cpp - factorization/par_ilut_filter_kernels.cpp - factorization/par_ilut_select_kernels.cpp - factorization/par_ilut_spgeam_kernels.cpp - factorization/par_ilut_sweep_kernels.cpp - log/batch_logger.hpp - matrix/batch_csr_kernels.cpp - matrix/batch_dense_kernels.cpp - matrix/batch_ell_kernels.cpp - matrix/coo_kernels.cpp - matrix/dense_kernels.cpp - matrix/diagonal_kernels.cpp - matrix/ell_kernels.cpp - matrix/sellp_kernels.cpp - matrix/sparsity_csr_kernels.cpp - multigrid/pgm_kernels.cpp - preconditioner/isai_kernels.cpp - preconditioner/jacobi_kernels.cpp - reorder/rcm_kernels.cpp - solver/cb_gmres_kernels.cpp - solver/idr_kernels.cpp - solver/multigrid_kernels.cpp - stop/batch_criteria.hpp - ) -list(TRANSFORM CUDA_HIP_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/) -list(APPEND CUDA_HIP_SOURCES ${FBCSR_INSTANTIATE}) -set(GKO_CUDA_HIP_COMMON_SOURCES ${CUDA_HIP_SOURCES} PARENT_SCOPE) \ No newline at end of file diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc new file mode 100644 index 00000000000..9b6301674be --- /dev/null +++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc @@ -0,0 +1,301 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +template +__device__ __forceinline__ void scale( + const gko::batch::multi_vector::batch_item& alpha, + const gko::batch::multi_vector::batch_item& x, Mapping map) +{ + const int max_li = x.num_rows * x.num_rhs; + for (int li = threadIdx.x; li < max_li; li += blockDim.x) { + const int row = li / x.num_rhs; + const int col = li % x.num_rhs; + + x.values[row * x.stride + col] = + alpha.values[map(row, col, alpha.stride)] * + x.values[row * x.stride + col]; + } +} + + +template +__global__ +__launch_bounds__(default_block_size, sm_oversubscription) void scale_kernel( + const gko::batch::multi_vector::uniform_batch alpha, + const gko::batch::multi_vector::uniform_batch x, Mapping map) +{ + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; + batch_id += gridDim.x) { + const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id); + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + scale(alpha_b, x_b, map); + } +} + + +template +__device__ __forceinline__ void add_scaled( + const gko::batch::multi_vector::batch_item& alpha, + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item& y, Mapping map) +{ + const int max_li = x.num_rows * x.num_rhs; + for (int li = threadIdx.x; li < max_li; li += blockDim.x) { + const int row = li / x.num_rhs; + const int col = li % x.num_rhs; + + y.values[row * y.stride + col] += + alpha.values[map(col)] * x.values[row * x.stride + col]; + } +} + + +template +__global__ __launch_bounds__( + default_block_size, + sm_oversubscription) void add_scaled_kernel(const gko::batch::multi_vector:: + uniform_batch< + const ValueType> + alpha, + const gko::batch::multi_vector:: + uniform_batch< + const ValueType> + x, + const gko::batch::multi_vector:: + uniform_batch + y, + Mapping map) +{ + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; + batch_id += gridDim.x) { + const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id); + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + const auto y_b = gko::batch::extract_batch_item(y, batch_id); + add_scaled(alpha_b, x_b, y_b, map); + } +} + + +template +__device__ __forceinline__ void single_rhs_compute_conj_dot(Group subgroup, + const int num_rows, + const ValueType* x, + const ValueType* y, + ValueType& result) + +{ + ValueType val = zero(); + for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) { + val += conj(x[r]) * y[r]; + } + + // subgroup level reduction + val = reduce(subgroup, val, thrust::plus{}); + + if (subgroup.thread_rank() == 0) { + result = val; + } +} + + +template +__device__ __forceinline__ void gen_one_dot( + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item& y, + const int rhs_index, + const gko::batch::multi_vector::batch_item& result, + Group subgroup, Mapping conj_map) +{ + ValueType val = zero(); + + for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) { + val += conj_map(x.values[r * x.stride + rhs_index]) * + y.values[r * y.stride + rhs_index]; + } + + // subgroup level reduction + val = reduce(subgroup, val, thrust::plus{}); + + if (subgroup.thread_rank() == 0) { + result.values[rhs_index] = val; + } +} + + +template +__device__ __forceinline__ void compute_gen_dot_product( + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item& y, + const gko::batch::multi_vector::batch_item& result, + Mapping conj_map) +{ + constexpr auto tile_size = config::warp_size; + auto thread_block = group::this_thread_block(); + auto subgroup = group::tiled_partition(thread_block); + const auto subgroup_id = static_cast(threadIdx.x / tile_size); + const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size); + + for (int rhs_index = subgroup_id; rhs_index < x.num_rhs; + rhs_index += num_subgroups_per_block) { + gen_one_dot(x, y, rhs_index, result, subgroup, conj_map); + } +} + + +template +__global__ +__launch_bounds__(default_block_size, sm_oversubscription) void compute_gen_dot_product_kernel( + const gko::batch::multi_vector::uniform_batch x, + const gko::batch::multi_vector::uniform_batch y, + const gko::batch::multi_vector::uniform_batch result, + Mapping map) +{ + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; + batch_id += gridDim.x) { + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + const auto y_b = gko::batch::extract_batch_item(y, batch_id); + const auto r_b = gko::batch::extract_batch_item(result, batch_id); + compute_gen_dot_product(x_b, y_b, r_b, map); + } +} + + +template +__device__ __forceinline__ void single_rhs_compute_norm2( + Group subgroup, const int num_rows, const ValueType* x, + remove_complex& result) +{ + using real_type = typename gko::remove_complex; + real_type val = zero(); + + for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) { + val += squared_norm(x[r]); + } + + // subgroup level reduction + val = reduce(subgroup, val, thrust::plus>{}); + + if (subgroup.thread_rank() == 0) { + result = sqrt(val); + } +} + + +template +__device__ __forceinline__ void one_norm2( + const gko::batch::multi_vector::batch_item& x, + const int rhs_index, + const gko::batch::multi_vector::batch_item>& + result, + Group subgroup) +{ + using real_type = typename gko::remove_complex; + real_type val = zero(); + + for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) { + val += squared_norm(x.values[r * x.stride + rhs_index]); + } + + // subgroup level reduction + val = reduce(subgroup, val, thrust::plus>{}); + + if (subgroup.thread_rank() == 0) { + result.values[rhs_index] = sqrt(val); + } +} + + +/** + * Computes the 2-norms of some column vectors in global or shared memory. + * + * @param x A row-major multivector with nrhs columns. + * @param result Holds norm value for each vector in x. + */ +template +__device__ __forceinline__ void compute_norm2( + const gko::batch::multi_vector::batch_item& x, + const gko::batch::multi_vector::batch_item>& + result) +{ + constexpr auto tile_size = config::warp_size; + auto thread_block = group::this_thread_block(); + auto subgroup = group::tiled_partition(thread_block); + const auto subgroup_id = static_cast(threadIdx.x / tile_size); + const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size); + + for (int rhs_index = subgroup_id; rhs_index < x.num_rhs; + rhs_index += num_subgroups_per_block) { + one_norm2(x, rhs_index, result, subgroup); + } +} + + +template +__global__ __launch_bounds__( + default_block_size, + sm_oversubscription) void compute_norm2_kernel(const gko::batch:: + multi_vector:: + uniform_batch< + const ValueType> + x, + const gko::batch:: + multi_vector:: + uniform_batch< + remove_complex< + ValueType>> + result) +{ + for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items; + batch_id += gridDim.x) { + const auto x_b = gko::batch::extract_batch_item(x, batch_id); + const auto r_b = gko::batch::extract_batch_item(result, batch_id); + compute_norm2(x_b, r_b); + } +} + + +template +__device__ __forceinline__ void single_rhs_copy(const int num_rows, + const ValueType* in, + ValueType* out) +{ + for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) { + out[iz] = in[iz]; + } +} + + +/** + * Copies the values of one multi-vector into another. + * + * Note that the output multi-vector should already have memory allocated + * and stride set. + */ +template +__device__ __forceinline__ void copy( + const gko::batch::multi_vector::batch_item& in, + const gko::batch::multi_vector::batch_item& out) +{ + for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs; + iz += blockDim.x) { + const int i = iz / in.num_rhs; + const int j = iz % in.num_rhs; + out.values[i * out.stride + j] = in.values[i * in.stride + j]; + } +} + + +template +__global__ +__launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel( + const gko::batch::multi_vector::uniform_batch src, + const gko::batch::multi_vector::uniform_batch dst) +{ + for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_items; + batch_id += gridDim.x) { + const auto dst_b = gko::batch::extract_batch_item(dst, batch_id); + const auto src_b = gko::batch::extract_batch_item(src, batch_id); + copy(src_b, dst_b); + } +} diff --git a/common/cuda_hip/base/pointer_mode_guard.hpp b/common/cuda_hip/base/pointer_mode_guard.hpp index 382fd85ca76..41ff6242e49 100644 --- a/common/cuda_hip/base/pointer_mode_guard.hpp +++ b/common/cuda_hip/base/pointer_mode_guard.hpp @@ -6,13 +6,10 @@ #define GKO_COMMON_CUDA_HIP_BASE_POINTER_MODE_GUARD_HPP_ -#include "common/unified/base/config.hpp" - - #ifdef GKO_COMPILING_HIP #include "hip/base/pointer_mode_guard.hip.hpp" #else // GKO_COMPILING_CUDA -#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "cuda/base/pointer_mode_guard.hpp" #endif diff --git a/common/cuda_hip/base/randlib_bindings.hpp b/common/cuda_hip/base/randlib_bindings.hpp index 6207723b806..05a616a40f3 100644 --- a/common/cuda_hip/base/randlib_bindings.hpp +++ b/common/cuda_hip/base/randlib_bindings.hpp @@ -11,7 +11,7 @@ #define RANDLIB_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT #else // GKO_COMPILING_CUDA -#include "common/cuda_hip/base/curand_bindings.hpp" +#include "cuda/base/curand_bindings.hpp" #define RANDLIB_RNG_PSEUDO_DEFAULT CURAND_RNG_PSEUDO_DEFAULT #endif diff --git a/common/cuda_hip/base/sparselib_bindings.hpp b/common/cuda_hip/base/sparselib_bindings.hpp index b2d6cf2d398..bc565f9190a 100644 --- a/common/cuda_hip/base/sparselib_bindings.hpp +++ b/common/cuda_hip/base/sparselib_bindings.hpp @@ -9,7 +9,7 @@ #ifdef GKO_COMPILING_HIP #include "hip/base/hipsparse_bindings.hip.hpp" #else // GKO_COMPILING_CUDA -#include "common/cuda_hip/base/cusparse_bindings.hpp" +#include "cuda/base/cusparse_bindings.hpp" #endif diff --git a/common/cuda_hip/base/sparselib_block_bindings.hpp b/common/cuda_hip/base/sparselib_block_bindings.hpp new file mode 100644 index 00000000000..3b0f03444d7 --- /dev/null +++ b/common/cuda_hip/base/sparselib_block_bindings.hpp @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BLOCK_BINDINGS_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BLOCK_BINDINGS_HPP_ + + +#ifdef GKO_COMPILING_HIP +#include "hip/base/hipsparse_block_bindings.hip.hpp" +#else // GKO_COMPILING_CUDA +#include "cuda/base/cusparse_block_bindings.hpp" +#endif + + +#endif // GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BLOCK_BINDINGS_HPP_ diff --git a/common/cuda_hip/components/format_conversion.hpp b/common/cuda_hip/components/format_conversion.hpp index f0a78a68d45..a16d09b2e3a 100644 --- a/common/cuda_hip/components/format_conversion.hpp +++ b/common/cuda_hip/components/format_conversion.hpp @@ -9,7 +9,7 @@ #ifdef GKO_COMPILING_HIP #include "hip/components/format_conversion.hip.hpp" #else // GKO_COMPILING_CUDA -#include "common/cuda_hip/components/format_conversion.hpp" +#include "cuda/components/format_conversion.cuh" #endif diff --git a/common/cuda_hip/components/memory.hpp b/common/cuda_hip/components/memory.hpp index d1d855f461d..974431e2fb8 100644 --- a/common/cuda_hip/components/memory.hpp +++ b/common/cuda_hip/components/memory.hpp @@ -9,7 +9,7 @@ #ifdef GKO_COMPILING_HIP #include "hip/components/memory.hip.hpp" #else // GKO_COMPILING_CUDA -#include "common/cuda_hip/components/memory.hpp" +#include "cuda/components/memory.cuh" #endif diff --git a/common/cuda_hip/factorization/par_ict_kernels.cpp b/common/cuda_hip/factorization/par_ict_kernels.hpp.inc similarity index 62% rename from common/cuda_hip/factorization/par_ict_kernels.cpp rename to common/cuda_hip/factorization/par_ict_kernels.hpp.inc index 523f89082af..87aa8297345 100644 --- a/common/cuda_hip/factorization/par_ict_kernels.cpp +++ b/common/cuda_hip/factorization/par_ict_kernels.hpp.inc @@ -2,51 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ict_kernels.hpp" - - -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/intrinsics.hpp" -#include "common/cuda_hip/components/memory.hpp" -#include "common/cuda_hip/components/merging.hpp" -#include "common/cuda_hip/components/prefix_sum.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/searching.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/coo_builder.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -/** - * @brief The parallel ICT factorization namespace. - * - * @ingroup factor - */ -namespace par_ict_factorization { - - -constexpr int default_block_size = 512; - - -// subwarp sizes for all warp-parallel kernels (filter, add_candidates) -using compiled_kernels = - syn::value_list; - - namespace kernel { @@ -320,142 +275,3 @@ __global__ __launch_bounds__(default_block_size) void ict_sweep( } // namespace kernel - - -namespace { - - -template -void add_candidates(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* llh, - const matrix::Csr* a, - const matrix::Csr* l, - matrix::Csr* l_new) -{ - auto num_rows = static_cast(llh->get_size()[0]); - auto subwarps_per_block = default_block_size / subwarp_size; - auto num_blocks = ceildiv(num_rows, subwarps_per_block); - matrix::CsrBuilder l_new_builder(l_new); - auto llh_row_ptrs = llh->get_const_row_ptrs(); - auto llh_col_idxs = llh->get_const_col_idxs(); - auto llh_vals = llh->get_const_values(); - auto a_row_ptrs = a->get_const_row_ptrs(); - auto a_col_idxs = a->get_const_col_idxs(); - auto a_vals = a->get_const_values(); - auto l_row_ptrs = l->get_const_row_ptrs(); - auto l_col_idxs = l->get_const_col_idxs(); - auto l_vals = l->get_const_values(); - auto l_new_row_ptrs = l_new->get_row_ptrs(); - // count non-zeros per row - if (num_blocks > 0) { - kernel::ict_tri_spgeam_nnz - <<get_stream()>>>( - llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs, - l_new_row_ptrs, num_rows); - } - - // build row ptrs - components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1); - - // resize output arrays - auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); - l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); - l_new_builder.get_value_array().resize_and_reset(l_new_nnz); - - auto l_new_col_idxs = l_new->get_col_idxs(); - auto l_new_vals = l_new->get_values(); - - // fill columns and values - if (num_blocks > 0) { - kernel::ict_tri_spgeam_init - <<get_stream()>>>( - llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals), - a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs, - l_col_idxs, as_device_type(l_vals), l_new_row_ptrs, - l_new_col_idxs, as_device_type(l_new_vals), num_rows); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); - - -template -void compute_factor(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* a, - matrix::Csr* l, - const matrix::Coo* l_coo) -{ - auto total_nnz = static_cast(l->get_num_stored_elements()); - auto block_size = default_block_size / subwarp_size; - auto num_blocks = ceildiv(total_nnz, block_size); - if (num_blocks > 0) { - kernel::ict_sweep - <<get_stream()>>>( - a->get_const_row_ptrs(), a->get_const_col_idxs(), - as_device_type(a->get_const_values()), l->get_const_row_ptrs(), - l_coo->get_const_row_idxs(), l->get_const_col_idxs(), - as_device_type(l->get_values()), - static_cast(l->get_num_stored_elements())); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor); - - -} // namespace - - -template -void add_candidates(std::shared_ptr exec, - const matrix::Csr* llh, - const matrix::Csr* a, - const matrix::Csr* l, - matrix::Csr* l_new) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = - llh->get_num_stored_elements() + a->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_add_candidates( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, llh, a, l, l_new); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); - - -template -void compute_factor(std::shared_ptr exec, - const matrix::Csr* a, - matrix::Csr* l, - const matrix::Coo* l_coo) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = 2 * l->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_compute_factor( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); - - -} // namespace par_ict_factorization -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko diff --git a/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp b/common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc similarity index 57% rename from common/cuda_hip/factorization/par_ilut_filter_kernels.cpp rename to common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc index 3622f971878..68794bfc8d1 100644 --- a/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp +++ b/common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc @@ -2,49 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ilut_kernels.hpp" - - -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/intrinsics.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/coo_builder.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -/** - * @brief The parallel ILUT factorization namespace. - * - * @ingroup factor - */ -namespace par_ilut_factorization { - - -constexpr int default_block_size = 512; - - -// subwarp sizes for filter kernels -using compiled_kernels = - syn::value_list; - - namespace kernel { @@ -205,95 +162,3 @@ __global__ __launch_bounds__(default_block_size) void bucket_filter( } // namespace kernel - - -namespace { - - -template -void threshold_filter(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* a, - remove_complex threshold, - matrix::Csr* m_out, - matrix::Coo* m_out_coo, bool lower) -{ - auto old_row_ptrs = a->get_const_row_ptrs(); - auto old_col_idxs = a->get_const_col_idxs(); - auto old_vals = a->get_const_values(); - // compute nnz for each row - auto num_rows = static_cast(a->get_size()[0]); - auto block_size = default_block_size / subwarp_size; - auto num_blocks = ceildiv(num_rows, block_size); - auto new_row_ptrs = m_out->get_row_ptrs(); - if (num_blocks > 0) { - kernel::threshold_filter_nnz - <<get_stream()>>>( - old_row_ptrs, as_device_type(old_vals), num_rows, - as_device_type(threshold), new_row_ptrs, lower); - } - - // build row pointers - components::prefix_sum_nonnegative(exec, new_row_ptrs, num_rows + 1); - - // build matrix - auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows); - // resize arrays and update aliases - matrix::CsrBuilder builder{m_out}; - builder.get_col_idx_array().resize_and_reset(new_nnz); - builder.get_value_array().resize_and_reset(new_nnz); - auto new_col_idxs = m_out->get_col_idxs(); - auto new_vals = m_out->get_values(); - IndexType* new_row_idxs{}; - if (m_out_coo) { - matrix::CooBuilder coo_builder{m_out_coo}; - coo_builder.get_row_idx_array().resize_and_reset(new_nnz); - coo_builder.get_col_idx_array() = - make_array_view(exec, new_nnz, new_col_idxs); - coo_builder.get_value_array() = - make_array_view(exec, new_nnz, new_vals); - new_row_idxs = m_out_coo->get_row_idxs(); - } - if (num_blocks > 0) { - kernel::threshold_filter - <<get_stream()>>>( - old_row_ptrs, old_col_idxs, as_device_type(old_vals), num_rows, - as_device_type(threshold), new_row_ptrs, new_row_idxs, - new_col_idxs, as_device_type(new_vals), lower); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter, threshold_filter); - - -} // namespace - -template -void threshold_filter(std::shared_ptr exec, - const matrix::Csr* a, - remove_complex threshold, - matrix::Csr* m_out, - matrix::Coo* m_out_coo, bool lower) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = a->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_threshold_filter( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, a, threshold, m_out, - m_out_coo, lower); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL); - - -} // namespace par_ilut_factorization -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko diff --git a/common/cuda_hip/factorization/par_ilut_select_kernels.cpp b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc similarity index 63% rename from common/cuda_hip/factorization/par_ilut_select_kernels.cpp rename to common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc index 5c00503923a..2ee5061d4c5 100644 --- a/common/cuda_hip/factorization/par_ilut_select_kernels.cpp +++ b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc @@ -2,40 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ilut_kernels.hpp" - - -#include -#include - - -#include -#include -#include - - -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/atomic.hpp" -#include "common/cuda_hip/components/intrinsics.hpp" -#include "common/cuda_hip/components/prefix_sum.hpp" -#include "common/cuda_hip/components/searching.hpp" -#include "common/cuda_hip/components/sorting.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "core/components/prefix_sum_kernels.hpp" - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -/** - * @brief The parallel ILUT factorization namespace. - * - * @ingroup factor - */ -namespace par_ilut_factorization { - - namespace kernel { @@ -312,126 +278,3 @@ __global__ __launch_bounds__(config::warp_size) void find_bucket( } // namespace kernel - - -template -void sampleselect_filter(std::shared_ptr exec, - const ValueType* values, IndexType size, - const unsigned char* oracles, - const IndexType* partial_counts, IndexType bucket, - remove_complex* out) -{ - auto num_threads_total = ceildiv(size, items_per_thread); - auto num_blocks = - static_cast(ceildiv(num_threads_total, default_block_size)); - if (num_blocks > 0) { - kernel::filter_bucket<<get_stream()>>>( - as_device_type(values), size, bucket, oracles, partial_counts, - as_device_type(out), items_per_thread); - } -} - - -template -void threshold_select(std::shared_ptr exec, - const matrix::Csr* m, - IndexType rank, array& tmp1, - array>& tmp2, - remove_complex& threshold) -{ - auto values = m->get_const_values(); - IndexType size = m->get_num_stored_elements(); - using AbsType = remove_complex; - constexpr auto bucket_count = kernel::searchtree_width; - auto max_num_threads = ceildiv(size, items_per_thread); - auto max_num_blocks = ceildiv(max_num_threads, default_block_size); - - size_type tmp_size_totals = - ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType)); - size_type tmp_size_partials = ceildiv( - bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType)); - size_type tmp_size_oracles = - ceildiv(size * sizeof(unsigned char), sizeof(ValueType)); - size_type tmp_size_tree = - ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType)); - size_type tmp_size_vals = - size / bucket_count * 4; // pessimistic estimate for temporary storage - size_type tmp_size = - tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree; - tmp1.resize_and_reset(tmp_size); - tmp2.resize_and_reset(tmp_size_vals); - - auto total_counts = reinterpret_cast(tmp1.get_data()); - auto partial_counts = - reinterpret_cast(tmp1.get_data() + tmp_size_totals); - auto oracles = reinterpret_cast( - tmp1.get_data() + tmp_size_totals + tmp_size_partials); - auto tree = - reinterpret_cast(tmp1.get_data() + tmp_size_totals + - tmp_size_partials + tmp_size_oracles); - - sampleselect_count(exec, values, size, tree, oracles, partial_counts, - total_counts); - - // determine bucket with correct rank, use bucket-local rank - auto bucket = sampleselect_find_bucket(exec, total_counts, rank); - rank -= bucket.begin; - - if (bucket.size * 2 > tmp_size_vals) { - // we need to reallocate tmp2 - tmp2.resize_and_reset(bucket.size * 2); - } - auto tmp21 = tmp2.get_data(); - auto tmp22 = tmp2.get_data() + bucket.size; - // extract target bucket - sampleselect_filter(exec, values, size, oracles, partial_counts, bucket.idx, - tmp22); - - // recursively select from smaller buckets - int step{}; - while (bucket.size > kernel::basecase_size) { - std::swap(tmp21, tmp22); - const auto* tmp_in = tmp21; - auto tmp_out = tmp22; - - sampleselect_count(exec, tmp_in, bucket.size, tree, oracles, - partial_counts, total_counts); - auto new_bucket = sampleselect_find_bucket(exec, total_counts, rank); - sampleselect_filter(exec, tmp_in, bucket.size, oracles, partial_counts, - bucket.idx, tmp_out); - - rank -= new_bucket.begin; - bucket.size = new_bucket.size; - // we should never need more than 5 recursion steps, this would mean - // 256^5 = 2^40. fall back to standard library algorithm in that case. - ++step; - if (step > 5) { - array cpu_out_array{ - exec->get_master(), - make_array_view(exec, bucket.size, tmp_out)}; - auto begin = cpu_out_array.get_data(); - auto end = begin + bucket.size; - auto middle = begin + rank; - std::nth_element(begin, middle, end); - threshold = *middle; - return; - } - } - - // base case - auto out_ptr = reinterpret_cast(tmp1.get_data()); - kernel::basecase_select<<<1, kernel::basecase_block_size, 0, - exec->get_stream()>>>( - as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr)); - threshold = exec->copy_val_to_host(out_ptr); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL); - - -} // namespace par_ilut_factorization -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko diff --git a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc similarity index 63% rename from common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp rename to common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc index b9658f69f70..a97f0f08937 100644 --- a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp +++ b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc @@ -2,50 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/factorization/par_ilut_kernels.hpp" - - -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/intrinsics.hpp" -#include "common/cuda_hip/components/merging.hpp" -#include "common/cuda_hip/components/prefix_sum.hpp" -#include "common/cuda_hip/components/searching.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/coo_builder.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -/** - * @brief The parallel ILUT factorization namespace. - * - * @ingroup factor - */ -namespace par_ilut_factorization { - - -constexpr int default_block_size = 512; - - -// subwarp sizes for add_candidates kernels -using compiled_kernels = - syn::value_list; - - namespace kernel { @@ -290,113 +246,3 @@ __global__ __launch_bounds__(default_block_size) void tri_spgeam_init( } // namespace kernel - - -namespace { - - -template -void add_candidates(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* lu, - const matrix::Csr* a, - const matrix::Csr* l, - const matrix::Csr* u, - matrix::Csr* l_new, - matrix::Csr* u_new) -{ - auto num_rows = static_cast(lu->get_size()[0]); - auto subwarps_per_block = default_block_size / subwarp_size; - auto num_blocks = ceildiv(num_rows, subwarps_per_block); - matrix::CsrBuilder l_new_builder(l_new); - matrix::CsrBuilder u_new_builder(u_new); - auto lu_row_ptrs = lu->get_const_row_ptrs(); - auto lu_col_idxs = lu->get_const_col_idxs(); - auto lu_vals = lu->get_const_values(); - auto a_row_ptrs = a->get_const_row_ptrs(); - auto a_col_idxs = a->get_const_col_idxs(); - auto a_vals = a->get_const_values(); - auto l_row_ptrs = l->get_const_row_ptrs(); - auto l_col_idxs = l->get_const_col_idxs(); - auto l_vals = l->get_const_values(); - auto u_row_ptrs = u->get_const_row_ptrs(); - auto u_col_idxs = u->get_const_col_idxs(); - auto u_vals = u->get_const_values(); - auto l_new_row_ptrs = l_new->get_row_ptrs(); - auto u_new_row_ptrs = u_new->get_row_ptrs(); - if (num_blocks > 0) { - // count non-zeros per row - kernel::tri_spgeam_nnz - <<get_stream()>>>( - lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs, - l_new_row_ptrs, u_new_row_ptrs, num_rows); - } - - // build row ptrs - components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1); - components::prefix_sum_nonnegative(exec, u_new_row_ptrs, num_rows + 1); - - // resize output arrays - auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); - auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows); - l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); - l_new_builder.get_value_array().resize_and_reset(l_new_nnz); - u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz); - u_new_builder.get_value_array().resize_and_reset(u_new_nnz); - - auto l_new_col_idxs = l_new->get_col_idxs(); - auto l_new_vals = l_new->get_values(); - auto u_new_col_idxs = u_new->get_col_idxs(); - auto u_new_vals = u_new->get_values(); - - if (num_blocks > 0) { - // fill columns and values - kernel::tri_spgeam_init - <<get_stream()>>>( - lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs, - a_col_idxs, as_device_type(a_vals), l_row_ptrs, l_col_idxs, - as_device_type(l_vals), u_row_ptrs, u_col_idxs, - as_device_type(u_vals), l_new_row_ptrs, l_new_col_idxs, - as_device_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs, - as_device_type(u_new_vals), num_rows); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); - - -} // namespace - - -template -void add_candidates(std::shared_ptr exec, - const matrix::Csr* lu, - const matrix::Csr* a, - const matrix::Csr* l, - const matrix::Csr* u, - matrix::Csr* l_new, - matrix::Csr* u_new) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = - lu->get_num_stored_elements() + a->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_add_candidates( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, lu, a, l, u, l_new, - u_new); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); - - -} // namespace par_ilut_factorization -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko diff --git a/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc new file mode 100644 index 00000000000..9da94a878b3 --- /dev/null +++ b/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc @@ -0,0 +1,94 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void sweep( + const IndexType* __restrict__ a_row_ptrs, + const IndexType* __restrict__ a_col_idxs, + const ValueType* __restrict__ a_vals, + const IndexType* __restrict__ l_row_ptrs, + const IndexType* __restrict__ l_row_idxs, + const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals, + IndexType l_nnz, const IndexType* __restrict__ u_row_idxs, + const IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_vals, + const IndexType* __restrict__ ut_col_ptrs, + const IndexType* __restrict__ ut_row_idxs, ValueType* __restrict__ ut_vals, + IndexType u_nnz) +{ + auto tidx = thread::get_subwarp_id_flat(); + if (tidx >= l_nnz + u_nnz) { + return; + } + // split the subwarps into two halves for lower and upper triangle + auto l_nz = tidx; + auto u_nz = l_nz - l_nnz; + auto lower = u_nz < 0; + auto row = lower ? l_row_idxs[l_nz] : u_row_idxs[u_nz]; + auto col = lower ? l_col_idxs[l_nz] : u_col_idxs[u_nz]; + if (lower && row == col) { + // don't update the diagonal twice + return; + } + auto subwarp = + group::tiled_partition(group::this_thread_block()); + // find entry of A at (row, col) + auto a_row_begin = a_row_ptrs[row]; + auto a_row_end = a_row_ptrs[row + 1]; + auto a_row_size = a_row_end - a_row_begin; + auto a_idx = + group_wide_search(a_row_begin, a_row_size, subwarp, + [&](IndexType i) { return a_col_idxs[i] >= col; }); + bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col; + auto a_val = has_a ? a_vals[a_idx] : zero(); + auto l_row_begin = l_row_ptrs[row]; + auto l_row_size = l_row_ptrs[row + 1] - l_row_begin; + auto ut_col_begin = ut_col_ptrs[col]; + auto ut_col_size = ut_col_ptrs[col + 1] - ut_col_begin; + ValueType sum{}; + IndexType ut_nz{}; + auto last_entry = min(row, col); + group_merge( + l_col_idxs + l_row_begin, l_row_size, ut_row_idxs + ut_col_begin, + ut_col_size, subwarp, + [&](IndexType l_idx, IndexType l_col, IndexType ut_idx, + IndexType ut_row, IndexType, bool) { + // we don't need to use the `bool valid` because last_entry is + // already a smaller sentinel value than the one used in group_merge + if (l_col == ut_row && l_col < last_entry) { + sum += load_relaxed(l_vals + (l_idx + l_row_begin)) * + load_relaxed(ut_vals + (ut_idx + ut_col_begin)); + } + // remember the transposed element + auto found_transp = subwarp.ballot(ut_row == row); + if (found_transp) { + ut_nz = + subwarp.shfl(ut_idx + ut_col_begin, ffs(found_transp) - 1); + } + return true; + }); + // accumulate result from all threads + sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; }); + + if (subwarp.thread_rank() == 0) { + if (lower) { + auto to_write = (a_val - sum) / + load_relaxed(ut_vals + (ut_col_ptrs[col + 1] - 1)); + if (is_finite(to_write)) { + store_relaxed(l_vals + l_nz, to_write); + } + } else { + auto to_write = a_val - sum; + if (is_finite(to_write)) { + store_relaxed(u_vals + u_nz, to_write); + store_relaxed(ut_vals + ut_nz, to_write); + } + } + } +} + + +} // namespace kernel diff --git a/common/cuda_hip/log/batch_logger.hpp b/common/cuda_hip/log/batch_logger.hpp.inc similarity index 68% rename from common/cuda_hip/log/batch_logger.hpp rename to common/cuda_hip/log/batch_logger.hpp.inc index 77ec84fb7bd..04b614b50f9 100644 --- a/common/cuda_hip/log/batch_logger.hpp +++ b/common/cuda_hip/log/batch_logger.hpp.inc @@ -2,19 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_ -#define GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_ - - -#include - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -namespace batch_log { - - /** * @see reference/log/batch_logger.hpp */ @@ -41,12 +28,3 @@ class SimpleFinalLogger final { real_type* const final_residuals_; idx_type* const final_iters_; }; - - -} // namespace batch_log -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko - - -#endif // GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_ diff --git a/common/cuda_hip/matrix/batch_csr_kernels.cpp b/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc similarity index 87% rename from common/cuda_hip/matrix/batch_csr_kernels.cpp rename to common/cuda_hip/matrix/batch_csr_kernels.hpp.inc index a07074e29e8..e041dadaa3e 100644 --- a/common/cuda_hip/matrix/batch_csr_kernels.cpp +++ b/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc @@ -2,49 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include -#include - - -#include - - -#include -#include -#include - - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_csr_kernels.hpp" -#include "core/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -/** - * @brief The Csr matrix format namespace. - * @ref Csr - * @ingroup batch_csr - */ -namespace batch_csr { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - - template __device__ __forceinline__ void simple_apply( const gko::batch::matrix::csr::batch_item& mat, @@ -239,14 +196,3 @@ __global__ void add_scaled_identity_kernel( add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b); } } - - -#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_csr -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko diff --git a/common/cuda_hip/matrix/batch_dense_kernels.cpp b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc similarity index 89% rename from common/cuda_hip/matrix/batch_dense_kernels.cpp rename to common/cuda_hip/matrix/batch_dense_kernels.hpp.inc index b5c2dbe1d5d..f8abf9131a1 100644 --- a/common/cuda_hip/matrix/batch_dense_kernels.cpp +++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc @@ -2,49 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include -#include - - -#include - - -#include -#include -#include - - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_dense_kernels.hpp" -#include "core/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -/** - * @brief The Dense matrix format namespace. - * - * @ingroup batch_dense - */ -namespace batch_dense { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - - template __device__ __forceinline__ void simple_apply( const gko::batch::matrix::dense::batch_item& mat, @@ -286,15 +243,3 @@ __global__ void add_scaled_identity_kernel( add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b); } } - - -#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" - - -// clang-format on - - -} // namespace batch_dense -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko diff --git a/common/cuda_hip/matrix/batch_ell_kernels.cpp b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc similarity index 87% rename from common/cuda_hip/matrix/batch_ell_kernels.cpp rename to common/cuda_hip/matrix/batch_ell_kernels.hpp.inc index c3bf21c7744..0a6d1927c96 100644 --- a/common/cuda_hip/matrix/batch_ell_kernels.cpp +++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc @@ -2,49 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include -#include - - -#include - - -#include -#include -#include - - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/thrust.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_ell_kernels.hpp" -#include "core/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -/** - * @brief The Ell matrix format namespace. - * @ref Ell - * @ingroup batch_ell - */ -namespace batch_ell { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - - template __device__ __forceinline__ void simple_apply( const gko::batch::matrix::ell::batch_item& mat, @@ -248,14 +205,3 @@ __global__ void add_scaled_identity_kernel( add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b); } } - - -#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_ell -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko diff --git a/common/cuda_hip/matrix/fbcsr_kernels.template.cpp b/common/cuda_hip/matrix/fbcsr_kernels.template.cpp index 9e5eed5f570..7e0b38c015b 100644 --- a/common/cuda_hip/matrix/fbcsr_kernels.template.cpp +++ b/common/cuda_hip/matrix/fbcsr_kernels.template.cpp @@ -3,7 +3,6 @@ // SPDX-License-Identifier: BSD-3-Clause #include -#include #include @@ -28,6 +27,7 @@ #include "common/cuda_hip/base/pointer_mode_guard.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/sparselib_block_bindings.hpp" #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/atomic.hpp" @@ -67,6 +67,7 @@ constexpr int default_block_size{512}; #include "common/cuda_hip/matrix/csr_common.hpp.inc" + namespace kernel { diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc similarity index 91% rename from common/cuda_hip/preconditioner/jacobi_kernels.cpp rename to common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc index 45d32493f25..e0d7cfef0e9 100644 --- a/common/cuda_hip/preconditioner/jacobi_kernels.cpp +++ b/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc @@ -2,48 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/preconditioner/jacobi_kernels.hpp" - - -#include - - -#include - - -#include "common/cuda_hip/base/config.hpp" -#include "common/cuda_hip/base/math.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "common/cuda_hip/base/types.hpp" -#include "common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "core/base/extended_float.hpp" -#include "core/preconditioner/jacobi_utils.hpp" -#include "core/synthesizer/implementation_selection.hpp" - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -/** - * @brief The Jacobi preconditioner namespace. - * @ref Jacobi - * @ingroup jacobi - */ -namespace jacobi { - - -// a total of 32/16 warps (1024 threads) -#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC -constexpr int default_num_warps = 16; -#else // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC -constexpr int default_num_warps = 32; -#endif -// with current architectures, at most 32 warps can be scheduled per SM (and -// current GPUs have at most 84 SMs) -constexpr int default_grid_size = 32 * 32 * 128; - - namespace { @@ -411,9 +369,3 @@ void convert_to_dense( GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL); - - -} // namespace jacobi -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko diff --git a/common/cuda_hip/sources.cmake b/common/cuda_hip/sources.cmake new file mode 100644 index 00000000000..aac602cbbd3 --- /dev/null +++ b/common/cuda_hip/sources.cmake @@ -0,0 +1,31 @@ +# this file needs to be include(...)'d directly because of scoping issues +include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) +add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip/ matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE) +set(GKO_CUDA_HIP_COMMON_SOURCES + base/device_matrix_data_kernels.cpp + components/prefix_sum_kernels.cpp + distributed/index_map_kernels.cpp + distributed/matrix_kernels.cpp + distributed/partition_helpers_kernels.cpp + distributed/partition_kernels.cpp + distributed/vector_kernels.cpp + factorization/cholesky_kernels.cpp + factorization/factorization_kernels.cpp + factorization/lu_kernels.cpp + factorization/par_ic_kernels.cpp + factorization/par_ilu_kernels.cpp + matrix/coo_kernels.cpp + matrix/dense_kernels.cpp + matrix/diagonal_kernels.cpp + matrix/ell_kernels.cpp + matrix/sellp_kernels.cpp + matrix/sparsity_csr_kernels.cpp + multigrid/pgm_kernels.cpp + preconditioner/isai_kernels.cpp + reorder/rcm_kernels.cpp + solver/cb_gmres_kernels.cpp + solver/idr_kernels.cpp + solver/multigrid_kernels.cpp + ) +list(TRANSFORM GKO_CUDA_HIP_COMMON_SOURCES PREPEND ${PROJECT_SOURCE_DIR}/common/cuda_hip/) +list(APPEND GKO_CUDA_HIP_COMMON_SOURCES ${FBCSR_INSTANTIATE}) \ No newline at end of file diff --git a/common/cuda_hip/stop/batch_criteria.hpp b/common/cuda_hip/stop/batch_criteria.hpp.inc similarity index 75% rename from common/cuda_hip/stop/batch_criteria.hpp rename to common/cuda_hip/stop/batch_criteria.hpp.inc index 7491a143a31..38072467765 100644 --- a/common/cuda_hip/stop/batch_criteria.hpp +++ b/common/cuda_hip/stop/batch_criteria.hpp.inc @@ -2,19 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#ifndef GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_ -#define GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_ - - -#include - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -namespace batch_stop { - - /** * @see reference/stop/batch_criteria.hpp */ @@ -62,11 +49,3 @@ class SimpleAbsResidual { private: const real_type abs_tol_; }; - - -} // namespace batch_stop -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko - -#endif // GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_ diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp index 04db86c27e5..f4ff02cee2f 100644 --- a/common/unified/base/kernel_launch.hpp +++ b/common/unified/base/kernel_launch.hpp @@ -270,10 +270,8 @@ typename to_device_type_impl::type map_to_device(T&& param) } // namespace gko -#if defined(GKO_COMPILING_CUDA) +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) #include "common/cuda_hip/base/kernel_launch.hpp" -#elif defined(GKO_COMPILING_HIP) -#include "hip/base/kernel_launch.hip.hpp" #elif defined(GKO_COMPILING_DPCPP) #include "dpcpp/base/kernel_launch.dp.hpp" #elif defined(GKO_COMPILING_OMP) diff --git a/common/unified/base/kernel_launch_reduction.hpp b/common/unified/base/kernel_launch_reduction.hpp index a9b738f2ca9..b7b3e258dd4 100644 --- a/common/unified/base/kernel_launch_reduction.hpp +++ b/common/unified/base/kernel_launch_reduction.hpp @@ -19,10 +19,8 @@ {} -#if defined(GKO_COMPILING_CUDA) +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) #include "common/cuda_hip/base/kernel_launch_reduction.hpp" -#elif defined(GKO_COMPILING_HIP) -#include "hip/base/kernel_launch_reduction.hip.hpp" #elif defined(GKO_COMPILING_DPCPP) #include "dpcpp/base/kernel_launch_reduction.dp.hpp" #elif defined(GKO_COMPILING_OMP) diff --git a/common/unified/base/kernel_launch_solver.hpp b/common/unified/base/kernel_launch_solver.hpp index 11f282e5c09..14f2cbfeacf 100644 --- a/common/unified/base/kernel_launch_solver.hpp +++ b/common/unified/base/kernel_launch_solver.hpp @@ -107,10 +107,8 @@ const device_type* row_vector(const matrix::Dense* mtx) } // namespace gko -#if defined(GKO_COMPILING_CUDA) +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) #include "common/cuda_hip/base/kernel_launch_solver.hpp" -#elif defined(GKO_COMPILING_HIP) -#include "hip/base/kernel_launch_solver.hip.hpp" #elif defined(GKO_COMPILING_DPCPP) #include "dpcpp/base/kernel_launch_solver.dp.hpp" #elif defined(GKO_COMPILING_OMP) diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 8abdd7ba3f5..8c7a5309dcc 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -1,11 +1,13 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) add_library(ginkgo_cuda $ "") +include(${PROJECT_SOURCE_DIR}/common/cuda_hip/sources.cmake) include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) add_instantiation_files(. matrix/csr_kernels.instantiate.cu CSR_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_cuda PRIVATE + base/batch_multi_vector_kernels.cu base/device.cpp base/exception.cpp base/executor.cpp @@ -18,14 +20,23 @@ target_sources(ginkgo_cuda base/version.cpp factorization/ic_kernels.cu factorization/ilu_kernels.cu + factorization/par_ict_kernels.cu factorization/par_ilut_approx_filter_kernels.cu + factorization/par_ilut_filter_kernels.cu factorization/par_ilut_select_common.cu + factorization/par_ilut_select_kernels.cu + factorization/par_ilut_spgeam_kernels.cu + factorization/par_ilut_sweep_kernels.cu + matrix/batch_csr_kernels.cu + matrix/batch_dense_kernels.cu + matrix/batch_ell_kernels.cu ${CSR_INSTANTIATE} matrix/fft_kernels.cu preconditioner/batch_jacobi_kernels.cu preconditioner/jacobi_advanced_apply_kernels.cu preconditioner/jacobi_generate_kernels.cu preconditioner/jacobi_simple_apply_kernels.cu + preconditioner/jacobi_kernels.cu solver/batch_bicgstab_kernels.cu solver/batch_cg_kernels.cu solver/lower_trs_kernels.cu diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu new file mode 100644 index 00000000000..61a7ad92e52 --- /dev/null +++ b/cuda/base/batch_multi_vector_kernels.cu @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/base/batch_multi_vector_kernels.hpp" + + +#include +#include + + +#include +#include + + +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/batch_struct.hpp" +#include "cuda/base/batch_struct.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/cublas_bindings.hpp" +#include "cuda/base/pointer_mode_guard.hpp" +#include "cuda/base/thrust.cuh" +#include "cuda/components/cooperative_groups.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The MultiVector matrix format namespace. + * + * @ingroup batch_multi_vector + */ +namespace batch_multi_vector { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" + + +#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_multi_vector +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu new file mode 100644 index 00000000000..d4bec43a844 --- /dev/null +++ b/cuda/factorization/par_ict_kernels.cu @@ -0,0 +1,189 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/par_ict_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "cuda/components/memory.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The parallel ICT factorization namespace. + * + * @ingroup factor + */ +namespace par_ict_factorization { + + +constexpr int default_block_size = 512; + + +// subwarp sizes for all warp-parallel kernels (filter, add_candidates) +using compiled_kernels = + syn::value_list; + + +#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc" + + +namespace { + + +template +void add_candidates(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* llh, + const matrix::Csr* a, + const matrix::Csr* l, + matrix::Csr* l_new) +{ + auto num_rows = static_cast(llh->get_size()[0]); + auto subwarps_per_block = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, subwarps_per_block); + matrix::CsrBuilder l_new_builder(l_new); + auto llh_row_ptrs = llh->get_const_row_ptrs(); + auto llh_col_idxs = llh->get_const_col_idxs(); + auto llh_vals = llh->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = l->get_const_values(); + auto l_new_row_ptrs = l_new->get_row_ptrs(); + // count non-zeros per row + if (num_blocks > 0) { + kernel::ict_tri_spgeam_nnz + <<get_stream()>>>( + llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs, + l_new_row_ptrs, num_rows); + } + + // build row ptrs + components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1); + + // resize output arrays + auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); + l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); + l_new_builder.get_value_array().resize_and_reset(l_new_nnz); + + auto l_new_col_idxs = l_new->get_col_idxs(); + auto l_new_vals = l_new->get_values(); + + // fill columns and values + if (num_blocks > 0) { + kernel::ict_tri_spgeam_init + <<get_stream()>>>( + llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals), + a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs, + l_col_idxs, as_device_type(l_vals), l_new_row_ptrs, + l_new_col_idxs, as_device_type(l_new_vals), num_rows); + } +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); + + +template +void compute_factor(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* a, + matrix::Csr* l, + const matrix::Coo* l_coo) +{ + auto total_nnz = static_cast(l->get_num_stored_elements()); + auto block_size = default_block_size / subwarp_size; + auto num_blocks = ceildiv(total_nnz, block_size); + if (num_blocks > 0) { + kernel::ict_sweep + <<get_stream()>>>( + a->get_const_row_ptrs(), a->get_const_col_idxs(), + as_device_type(a->get_const_values()), l->get_const_row_ptrs(), + l_coo->get_const_row_idxs(), l->get_const_col_idxs(), + as_device_type(l->get_values()), + static_cast(l->get_num_stored_elements())); + } +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor); + + +} // namespace + + +template +void add_candidates(std::shared_ptr exec, + const matrix::Csr* llh, + const matrix::Csr* a, + const matrix::Csr* l, + matrix::Csr* l_new) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = + llh->get_num_stored_elements() + a->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_add_candidates( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, llh, a, l, l_new); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); + + +template +void compute_factor(std::shared_ptr exec, + const matrix::Csr* a, + matrix::Csr* l, + const matrix::Coo* l_coo) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = 2 * l->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_compute_factor( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); + + +} // namespace par_ict_factorization +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/factorization/par_ilut_approx_filter_kernels.cu b/cuda/factorization/par_ilut_approx_filter_kernels.cu index 853519cd36b..2932647231a 100644 --- a/cuda/factorization/par_ilut_approx_filter_kernels.cu +++ b/cuda/factorization/par_ilut_approx_filter_kernels.cu @@ -15,20 +15,20 @@ #include +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/sorting.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/config.hpp" -#include "cuda/base/math.hpp" #include "cuda/base/types.hpp" -#include "cuda/components/atomic.cuh" #include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/prefix_sum.cuh" -#include "cuda/components/sorting.cuh" -#include "cuda/components/thread_ids.cuh" #include "cuda/factorization/par_ilut_select_common.cuh" diff --git a/cuda/factorization/par_ilut_filter_kernels.cu b/cuda/factorization/par_ilut_filter_kernels.cu new file mode 100644 index 00000000000..ceb471bc642 --- /dev/null +++ b/cuda/factorization/par_ilut_filter_kernels.cu @@ -0,0 +1,140 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/types.hpp" +#include "cuda/components/cooperative_groups.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The parallel ILUT factorization namespace. + * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +constexpr int default_block_size = 512; + + +// subwarp sizes for filter kernels +using compiled_kernels = + syn::value_list; + + +#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" + + +namespace { + + +template +void threshold_filter(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* a, + remove_complex threshold, + matrix::Csr* m_out, + matrix::Coo* m_out_coo, bool lower) +{ + auto old_row_ptrs = a->get_const_row_ptrs(); + auto old_col_idxs = a->get_const_col_idxs(); + auto old_vals = a->get_const_values(); + // compute nnz for each row + auto num_rows = static_cast(a->get_size()[0]); + auto block_size = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, block_size); + auto new_row_ptrs = m_out->get_row_ptrs(); + if (num_blocks > 0) { + kernel::threshold_filter_nnz + <<get_stream()>>>( + old_row_ptrs, as_device_type(old_vals), num_rows, + as_device_type(threshold), new_row_ptrs, lower); + } + + // build row pointers + components::prefix_sum_nonnegative(exec, new_row_ptrs, num_rows + 1); + + // build matrix + auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows); + // resize arrays and update aliases + matrix::CsrBuilder builder{m_out}; + builder.get_col_idx_array().resize_and_reset(new_nnz); + builder.get_value_array().resize_and_reset(new_nnz); + auto new_col_idxs = m_out->get_col_idxs(); + auto new_vals = m_out->get_values(); + IndexType* new_row_idxs{}; + if (m_out_coo) { + matrix::CooBuilder coo_builder{m_out_coo}; + coo_builder.get_row_idx_array().resize_and_reset(new_nnz); + coo_builder.get_col_idx_array() = + make_array_view(exec, new_nnz, new_col_idxs); + coo_builder.get_value_array() = + make_array_view(exec, new_nnz, new_vals); + new_row_idxs = m_out_coo->get_row_idxs(); + } + if (num_blocks > 0) { + kernel::threshold_filter + <<get_stream()>>>( + old_row_ptrs, old_col_idxs, as_device_type(old_vals), num_rows, + as_device_type(threshold), new_row_ptrs, new_row_idxs, + new_col_idxs, as_device_type(new_vals), lower); + } +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter, threshold_filter); + + +} // namespace + +template +void threshold_filter(std::shared_ptr exec, + const matrix::Csr* a, + remove_complex threshold, + matrix::Csr* m_out, + matrix::Coo* m_out_coo, bool lower) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = a->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_threshold_filter( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, a, threshold, m_out, + m_out_coo, lower); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/factorization/par_ilut_select_common.cu b/cuda/factorization/par_ilut_select_common.cu index bbba93595c8..7aa8c43259d 100644 --- a/cuda/factorization/par_ilut_select_common.cu +++ b/cuda/factorization/par_ilut_select_common.cu @@ -5,15 +5,15 @@ #include "cuda/factorization/par_ilut_select_common.cuh" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/sorting.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/atomic.cuh" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/prefix_sum.cuh" -#include "cuda/components/searching.cuh" -#include "cuda/components/sorting.cuh" -#include "cuda/components/thread_ids.cuh" namespace gko { diff --git a/cuda/factorization/par_ilut_select_kernels.cu b/cuda/factorization/par_ilut_select_kernels.cu new file mode 100644 index 00000000000..13869f90784 --- /dev/null +++ b/cuda/factorization/par_ilut_select_kernels.cu @@ -0,0 +1,162 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include + + +#include +#include +#include + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/sorting.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "cuda/factorization/par_ilut_select_common.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The parallel ILUT factorization namespace. + * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc" + + +template +void sampleselect_filter(std::shared_ptr exec, + const ValueType* values, IndexType size, + const unsigned char* oracles, + const IndexType* partial_counts, IndexType bucket, + remove_complex* out) +{ + auto num_threads_total = ceildiv(size, items_per_thread); + auto num_blocks = + static_cast(ceildiv(num_threads_total, default_block_size)); + if (num_blocks > 0) { + kernel::filter_bucket<<get_stream()>>>( + as_device_type(values), size, bucket, oracles, partial_counts, + as_device_type(out), items_per_thread); + } +} + + +template +void threshold_select(std::shared_ptr exec, + const matrix::Csr* m, + IndexType rank, array& tmp1, + array>& tmp2, + remove_complex& threshold) +{ + auto values = m->get_const_values(); + IndexType size = m->get_num_stored_elements(); + using AbsType = remove_complex; + constexpr auto bucket_count = kernel::searchtree_width; + auto max_num_threads = ceildiv(size, items_per_thread); + auto max_num_blocks = ceildiv(max_num_threads, default_block_size); + + size_type tmp_size_totals = + ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType)); + size_type tmp_size_partials = ceildiv( + bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType)); + size_type tmp_size_oracles = + ceildiv(size * sizeof(unsigned char), sizeof(ValueType)); + size_type tmp_size_tree = + ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType)); + size_type tmp_size_vals = + size / bucket_count * 4; // pessimistic estimate for temporary storage + size_type tmp_size = + tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree; + tmp1.resize_and_reset(tmp_size); + tmp2.resize_and_reset(tmp_size_vals); + + auto total_counts = reinterpret_cast(tmp1.get_data()); + auto partial_counts = + reinterpret_cast(tmp1.get_data() + tmp_size_totals); + auto oracles = reinterpret_cast( + tmp1.get_data() + tmp_size_totals + tmp_size_partials); + auto tree = + reinterpret_cast(tmp1.get_data() + tmp_size_totals + + tmp_size_partials + tmp_size_oracles); + + sampleselect_count(exec, values, size, tree, oracles, partial_counts, + total_counts); + + // determine bucket with correct rank, use bucket-local rank + auto bucket = sampleselect_find_bucket(exec, total_counts, rank); + rank -= bucket.begin; + + if (bucket.size * 2 > tmp_size_vals) { + // we need to reallocate tmp2 + tmp2.resize_and_reset(bucket.size * 2); + } + auto tmp21 = tmp2.get_data(); + auto tmp22 = tmp2.get_data() + bucket.size; + // extract target bucket + sampleselect_filter(exec, values, size, oracles, partial_counts, bucket.idx, + tmp22); + + // recursively select from smaller buckets + int step{}; + while (bucket.size > kernel::basecase_size) { + std::swap(tmp21, tmp22); + const auto* tmp_in = tmp21; + auto tmp_out = tmp22; + + sampleselect_count(exec, tmp_in, bucket.size, tree, oracles, + partial_counts, total_counts); + auto new_bucket = sampleselect_find_bucket(exec, total_counts, rank); + sampleselect_filter(exec, tmp_in, bucket.size, oracles, partial_counts, + bucket.idx, tmp_out); + + rank -= new_bucket.begin; + bucket.size = new_bucket.size; + // we should never need more than 5 recursion steps, this would mean + // 256^5 = 2^40. fall back to standard library algorithm in that case. + ++step; + if (step > 5) { + array cpu_out_array{ + exec->get_master(), + make_array_view(exec, bucket.size, tmp_out)}; + auto begin = cpu_out_array.get_data(); + auto end = begin + bucket.size; + auto middle = begin + rank; + std::nth_element(begin, middle, end); + threshold = *middle; + return; + } + } + + // base case + auto out_ptr = reinterpret_cast(tmp1.get_data()); + kernel::basecase_select<<<1, kernel::basecase_block_size, 0, + exec->get_stream()>>>( + as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr)); + threshold = exec->copy_val_to_host(out_ptr); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/factorization/par_ilut_spgeam_kernels.cu b/cuda/factorization/par_ilut_spgeam_kernels.cu new file mode 100644 index 00000000000..218e4bddee4 --- /dev/null +++ b/cuda/factorization/par_ilut_spgeam_kernels.cu @@ -0,0 +1,159 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "cuda/components/cooperative_groups.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The parallel ILUT factorization namespace. + * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +constexpr int default_block_size = 512; + + +// subwarp sizes for add_candidates kernels +using compiled_kernels = + syn::value_list; + + +#include "common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc" + + +namespace { + + +template +void add_candidates(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* lu, + const matrix::Csr* a, + const matrix::Csr* l, + const matrix::Csr* u, + matrix::Csr* l_new, + matrix::Csr* u_new) +{ + auto num_rows = static_cast(lu->get_size()[0]); + auto subwarps_per_block = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, subwarps_per_block); + matrix::CsrBuilder l_new_builder(l_new); + matrix::CsrBuilder u_new_builder(u_new); + auto lu_row_ptrs = lu->get_const_row_ptrs(); + auto lu_col_idxs = lu->get_const_col_idxs(); + auto lu_vals = lu->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = l->get_const_values(); + auto u_row_ptrs = u->get_const_row_ptrs(); + auto u_col_idxs = u->get_const_col_idxs(); + auto u_vals = u->get_const_values(); + auto l_new_row_ptrs = l_new->get_row_ptrs(); + auto u_new_row_ptrs = u_new->get_row_ptrs(); + if (num_blocks > 0) { + // count non-zeros per row + kernel::tri_spgeam_nnz + <<get_stream()>>>( + lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs, + l_new_row_ptrs, u_new_row_ptrs, num_rows); + } + + // build row ptrs + components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1); + components::prefix_sum_nonnegative(exec, u_new_row_ptrs, num_rows + 1); + + // resize output arrays + auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); + auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows); + l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); + l_new_builder.get_value_array().resize_and_reset(l_new_nnz); + u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz); + u_new_builder.get_value_array().resize_and_reset(u_new_nnz); + + auto l_new_col_idxs = l_new->get_col_idxs(); + auto l_new_vals = l_new->get_values(); + auto u_new_col_idxs = u_new->get_col_idxs(); + auto u_new_vals = u_new->get_values(); + + if (num_blocks > 0) { + // fill columns and values + kernel::tri_spgeam_init + <<get_stream()>>>( + lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs, + a_col_idxs, as_device_type(a_vals), l_row_ptrs, l_col_idxs, + as_device_type(l_vals), u_row_ptrs, u_col_idxs, + as_device_type(u_vals), l_new_row_ptrs, l_new_col_idxs, + as_device_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs, + as_device_type(u_new_vals), num_rows); + } +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); + + +} // namespace + + +template +void add_candidates(std::shared_ptr exec, + const matrix::Csr* lu, + const matrix::Csr* a, + const matrix::Csr* l, + const matrix::Csr* u, + matrix::Csr* l_new, + matrix::Csr* u_new) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = + lu->get_num_stored_elements() + a->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_add_candidates( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, lu, a, l, u, l_new, + u_new); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp b/cuda/factorization/par_ilut_sweep_kernels.cu similarity index 53% rename from common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp rename to cuda/factorization/par_ilut_sweep_kernels.cu index 6ae783133e5..416f492ead2 100644 --- a/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp +++ b/cuda/factorization/par_ilut_sweep_kernels.cu @@ -15,7 +15,6 @@ #include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/components/intrinsics.hpp" -#include "common/cuda_hip/components/memory.hpp" #include "common/cuda_hip/components/merging.hpp" #include "common/cuda_hip/components/prefix_sum.hpp" #include "common/cuda_hip/components/reduction.hpp" @@ -26,11 +25,12 @@ #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" +#include "cuda/components/memory.cuh" namespace gko { namespace kernels { -namespace GKO_DEVICE_NAMESPACE { +namespace cuda { /** * @brief The parallel ILUT factorization namespace. * @@ -47,96 +47,7 @@ using compiled_kernels = syn::value_list; -namespace kernel { - - -template -__global__ __launch_bounds__(default_block_size) void sweep( - const IndexType* __restrict__ a_row_ptrs, - const IndexType* __restrict__ a_col_idxs, - const ValueType* __restrict__ a_vals, - const IndexType* __restrict__ l_row_ptrs, - const IndexType* __restrict__ l_row_idxs, - const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals, - IndexType l_nnz, const IndexType* __restrict__ u_row_idxs, - const IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_vals, - const IndexType* __restrict__ ut_col_ptrs, - const IndexType* __restrict__ ut_row_idxs, ValueType* __restrict__ ut_vals, - IndexType u_nnz) -{ - auto tidx = thread::get_subwarp_id_flat(); - if (tidx >= l_nnz + u_nnz) { - return; - } - // split the subwarps into two halves for lower and upper triangle - auto l_nz = tidx; - auto u_nz = l_nz - l_nnz; - auto lower = u_nz < 0; - auto row = lower ? l_row_idxs[l_nz] : u_row_idxs[u_nz]; - auto col = lower ? l_col_idxs[l_nz] : u_col_idxs[u_nz]; - if (lower && row == col) { - // don't update the diagonal twice - return; - } - auto subwarp = - group::tiled_partition(group::this_thread_block()); - // find entry of A at (row, col) - auto a_row_begin = a_row_ptrs[row]; - auto a_row_end = a_row_ptrs[row + 1]; - auto a_row_size = a_row_end - a_row_begin; - auto a_idx = - group_wide_search(a_row_begin, a_row_size, subwarp, - [&](IndexType i) { return a_col_idxs[i] >= col; }); - bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col; - auto a_val = has_a ? a_vals[a_idx] : zero(); - auto l_row_begin = l_row_ptrs[row]; - auto l_row_size = l_row_ptrs[row + 1] - l_row_begin; - auto ut_col_begin = ut_col_ptrs[col]; - auto ut_col_size = ut_col_ptrs[col + 1] - ut_col_begin; - ValueType sum{}; - IndexType ut_nz{}; - auto last_entry = min(row, col); - group_merge( - l_col_idxs + l_row_begin, l_row_size, ut_row_idxs + ut_col_begin, - ut_col_size, subwarp, - [&](IndexType l_idx, IndexType l_col, IndexType ut_idx, - IndexType ut_row, IndexType, bool) { - // we don't need to use the `bool valid` because last_entry is - // already a smaller sentinel value than the one used in group_merge - if (l_col == ut_row && l_col < last_entry) { - sum += load_relaxed(l_vals + (l_idx + l_row_begin)) * - load_relaxed(ut_vals + (ut_idx + ut_col_begin)); - } - // remember the transposed element - auto found_transp = subwarp.ballot(ut_row == row); - if (found_transp) { - ut_nz = - subwarp.shfl(ut_idx + ut_col_begin, ffs(found_transp) - 1); - } - return true; - }); - // accumulate result from all threads - sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; }); - - if (subwarp.thread_rank() == 0) { - if (lower) { - auto to_write = (a_val - sum) / - load_relaxed(ut_vals + (ut_col_ptrs[col + 1] - 1)); - if (is_finite(to_write)) { - store_relaxed(l_vals + l_nz, to_write); - } - } else { - auto to_write = a_val - sum; - if (is_finite(to_write)) { - store_relaxed(u_vals + u_nz, to_write); - store_relaxed(ut_vals + ut_nz, to_write); - } - } - } -} - - -} // namespace kernel +#include "common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc" namespace { @@ -207,6 +118,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace par_ilut_factorization -} // namespace GKO_DEVICE_NAMESPACE +} // namespace cuda } // namespace kernels } // namespace gko diff --git a/cuda/log/batch_logger.cuh b/cuda/log/batch_logger.cuh new file mode 100644 index 00000000000..3e53d6ef0a6 --- /dev/null +++ b/cuda/log/batch_logger.cuh @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CUDA_LOG_BATCH_LOGGER_CUH_ +#define GKO_CUDA_LOG_BATCH_LOGGER_CUH_ + + +#include + + +namespace gko { +namespace kernels { +namespace cuda { +namespace batch_log { + + +#include "common/cuda_hip/log/batch_logger.hpp.inc" + + +} // namespace batch_log +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif // GKO_CUDA_LOG_BATCH_LOGGER_CUH_ diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu new file mode 100644 index 00000000000..29e209f2f3e --- /dev/null +++ b/cuda/matrix/batch_csr_kernels.cu @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/matrix/batch_csr_kernels.hpp" + + +#include + + +#include +#include +#include + + +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "cuda/base/batch_struct.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/thrust.cuh" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The Csr matrix format namespace. + * @ref Csr + * @ingroup batch_csr + */ +namespace batch_csr { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" + + +#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_csr +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu new file mode 100644 index 00000000000..65e5b1c93ac --- /dev/null +++ b/cuda/matrix/batch_dense_kernels.cu @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/matrix/batch_dense_kernels.hpp" + + +#include + + +#include +#include +#include + + +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "cuda/base/batch_struct.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/thrust.cuh" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup batch_dense + */ +namespace batch_dense { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" + + +#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" + + +// clang-format on + + +} // namespace batch_dense +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu new file mode 100644 index 00000000000..4847cd8a8ff --- /dev/null +++ b/cuda/matrix/batch_ell_kernels.cu @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/matrix/batch_ell_kernels.hpp" + + +#include + + +#include +#include +#include + + +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "cuda/base/batch_struct.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/thrust.cuh" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The Ell matrix format namespace. + * @ref Ell + * @ingroup batch_ell + */ +namespace batch_ell { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" + + +#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_ell +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/preconditioner/batch_preconditioners.cuh b/cuda/preconditioner/batch_preconditioners.cuh index 373f68bb376..966add291ff 100644 --- a/cuda/preconditioner/batch_preconditioners.cuh +++ b/cuda/preconditioner/batch_preconditioners.cuh @@ -9,9 +9,9 @@ #include +#include "common/cuda_hip/components/reduction.hpp" #include "core/matrix/batch_struct.hpp" #include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/reduction.cuh" namespace gko { diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu index ca7bf20372c..ae873ccaa29 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu @@ -8,16 +8,16 @@ #include +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/config.hpp" -#include "cuda/base/math.hpp" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/warp_blas.cuh" #include "cuda/preconditioner/jacobi_common.hpp" @@ -32,7 +32,7 @@ namespace cuda { namespace jacobi { -#include +#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc" // clang-format off diff --git a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu index a3ad8890042..da10af38f0d 100644 --- a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu +++ b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu @@ -9,18 +9,18 @@ #include +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/diagonal_block_manipulation.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/extended_float.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/config.hpp" -#include "cuda/base/math.hpp" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/diagonal_block_manipulation.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" -#include "cuda/components/warp_blas.cuh" #include "cuda/preconditioner/jacobi_common.hpp" @@ -35,7 +35,7 @@ namespace cuda { namespace jacobi { -#include +#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc" // clang-format off diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu new file mode 100644 index 00000000000..c35f62b7364 --- /dev/null +++ b/cuda/preconditioner/jacobi_kernels.cu @@ -0,0 +1,51 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/base/extended_float.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "cuda/preconditioner/jacobi_common.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +// a total of 32/16 warps (1024 threads) +#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC +constexpr int default_num_warps = 16; +#else // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC +constexpr int default_num_warps = 32; +#endif +// with current architectures, at most 32 warps can be scheduled per SM (and +// current GPUs have at most 84 SMs) +constexpr int default_grid_size = 32 * 32 * 128; + + +#include "common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc" + + +} // namespace jacobi +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu index a227adb701b..3997eeeb43a 100644 --- a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu +++ b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu @@ -8,16 +8,16 @@ #include +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/config.hpp" -#include "cuda/base/math.hpp" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/warp_blas.cuh" #include "cuda/preconditioner/jacobi_common.hpp" @@ -32,7 +32,7 @@ namespace cuda { namespace jacobi { -#include +#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc" // clang-format off diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index b1114f05d20..1d3e1a4c68f 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -14,6 +14,9 @@ #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" @@ -23,9 +26,6 @@ #include "cuda/base/thrust.cuh" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" #include "cuda/matrix/batch_struct.hpp" @@ -48,7 +48,6 @@ namespace batch_bicgstab { #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" -#include "common/cuda_hip/components/uninitialized_array.hpp.inc" #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index 57c3612df69..a3360d31737 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -13,6 +13,9 @@ #include +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" @@ -22,9 +25,6 @@ #include "cuda/base/thrust.cuh" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" #include "cuda/matrix/batch_struct.hpp" @@ -47,7 +47,6 @@ namespace batch_cg { #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" -#include "common/cuda_hip/components/uninitialized_array.hpp.inc" #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" diff --git a/cuda/stop/batch_criteria.cuh b/cuda/stop/batch_criteria.cuh new file mode 100644 index 00000000000..f4f434dda11 --- /dev/null +++ b/cuda/stop/batch_criteria.cuh @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_CUDA_STOP_BATCH_CRITERIA_CUH_ +#define GKO_CUDA_STOP_BATCH_CRITERIA_CUH_ + + +#include + + +namespace gko { +namespace kernels { +namespace cuda { +namespace batch_stop { + + +#include "common/cuda_hip/stop/batch_criteria.hpp.inc" + + +} // namespace batch_stop +} // namespace cuda +} // namespace kernels +} // namespace gko + +#endif // GKO_CUDA_STOP_BATCH_CRITERIA_CUH_ diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 5bcc1de1f21..54d8dfb0d09 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -1,10 +1,11 @@ cmake_minimum_required(VERSION 3.21) include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake) +include(${PROJECT_SOURCE_DIR}/common/cuda_hip/sources.cmake) add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_INSTANTIATE) -add_instantiation_files(. matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_INSTANTIATE) # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) set(GINKGO_HIP_SOURCES + base/batch_multi_vector_kernels.hip.cpp base/device.hip.cpp base/exception.hip.cpp base/executor.hip.cpp @@ -17,17 +18,21 @@ set(GINKGO_HIP_SOURCES base/version.hip.cpp factorization/ic_kernels.hip.cpp factorization/ilu_kernels.hip.cpp - factorization/par_ilut_approx_filter_kernel.hip.cpp - factorization/par_ilut_filter_kernel.hip.cpp + factorization/par_ict_kernels.hip.cpp + factorization/par_ilut_approx_filter_kernels.hip.cpp + factorization/par_ilut_filter_kernels.hip.cpp factorization/par_ilut_select_common.hip.cpp - factorization/par_ilut_select_kernel.hip.cpp - factorization/par_ilut_spgeam_kernel.hip.cpp - factorization/par_ilut_sweep_kernel.hip.cpp + factorization/par_ilut_select_kernels.hip.cpp + factorization/par_ilut_spgeam_kernels.hip.cpp + factorization/par_ilut_sweep_kernels.hip.cpp + matrix/batch_csr_kernels.hip.cpp + matrix/batch_dense_kernels.hip.cpp + matrix/batch_ell_kernels.hip.cpp ${CSR_INSTANTIATE} - ${FBCSR_INSTANTIATE} preconditioner/batch_jacobi_kernels.hip.cpp preconditioner/jacobi_advanced_apply_kernel.hip.cpp preconditioner/jacobi_generate_kernel.hip.cpp + preconditioner/jacobi_kernels.hip.cpp preconditioner/jacobi_simple_apply_kernel.hip.cpp solver/batch_bicgstab_kernels.hip.cpp solver/batch_cg_kernels.hip.cpp diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp new file mode 100644 index 00000000000..4483fe77214 --- /dev/null +++ b/hip/base/batch_multi_vector_kernels.hip.cpp @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/base/batch_multi_vector_kernels.hpp" + + +#include +#include + + +#include +#include + + +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/batch_struct.hpp" +#include "hip/base/batch_struct.hip.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/hipblas_bindings.hip.hpp" +#include "hip/base/pointer_mode_guard.hip.hpp" +#include "hip/base/thrust.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The MultiVector matrix format namespace. + * + * @ingroup batch_multi_vector + */ +namespace batch_multi_vector { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + +#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" + + +#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_multi_vector +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp index fbad841fd0f..3c4b67d52a9 100644 --- a/hip/base/config.hip.hpp +++ b/hip/base/config.hip.hpp @@ -15,7 +15,7 @@ #include -#include "hip/base/math.hip.hpp" +#include "common/cuda_hip/base/math.hpp" namespace gko { diff --git a/hip/base/hipblas_bindings.hip.hpp b/hip/base/hipblas_bindings.hip.hpp index 380f051d53b..a4b4431382b 100644 --- a/hip/base/hipblas_bindings.hip.hpp +++ b/hip/base/hipblas_bindings.hip.hpp @@ -18,7 +18,7 @@ #include -#include "hip/base/math.hip.hpp" +#include "common/cuda_hip/base/math.hpp" #include "hip/base/types.hip.hpp" diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp index b78fa2780eb..f2748eb59df 100644 --- a/hip/base/hiprand_bindings.hip.hpp +++ b/hip/base/hiprand_bindings.hip.hpp @@ -17,7 +17,7 @@ #include -#include "hip/base/math.hip.hpp" +#include "common/cuda_hip/base/math.hpp" #include "hip/base/types.hip.hpp" diff --git a/hip/components/format_conversion.hip.hpp b/hip/components/format_conversion.hip.hpp index 59c0405a874..e0199ddd374 100644 --- a/hip/components/format_conversion.hip.hpp +++ b/hip/components/format_conversion.hip.hpp @@ -13,8 +13,8 @@ #include +#include "common/cuda_hip/components/thread_ids.hpp" #include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" #ifdef GINKGO_BENCHMARK_ENABLE_TUNING diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp new file mode 100644 index 00000000000..1970ac49634 --- /dev/null +++ b/hip/factorization/par_ict_kernels.hip.cpp @@ -0,0 +1,189 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/par_ict_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/components/memory.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The parallel ICT factorization namespace. + * + * @ingroup factor + */ +namespace par_ict_factorization { + + +constexpr int default_block_size = 512; + + +// subwarp sizes for all warp-parallel kernels (filter, add_candidates) +using compiled_kernels = + syn::value_list; + + +#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc" + + +namespace { + + +template +void add_candidates(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* llh, + const matrix::Csr* a, + const matrix::Csr* l, + matrix::Csr* l_new) +{ + auto num_rows = static_cast(llh->get_size()[0]); + auto subwarps_per_block = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, subwarps_per_block); + matrix::CsrBuilder l_new_builder(l_new); + auto llh_row_ptrs = llh->get_const_row_ptrs(); + auto llh_col_idxs = llh->get_const_col_idxs(); + auto llh_vals = llh->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = l->get_const_values(); + auto l_new_row_ptrs = l_new->get_row_ptrs(); + // count non-zeros per row + if (num_blocks > 0) { + kernel::ict_tri_spgeam_nnz + <<get_stream()>>>( + llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs, + l_new_row_ptrs, num_rows); + } + + // build row ptrs + components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1); + + // resize output arrays + auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); + l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); + l_new_builder.get_value_array().resize_and_reset(l_new_nnz); + + auto l_new_col_idxs = l_new->get_col_idxs(); + auto l_new_vals = l_new->get_values(); + + // fill columns and values + if (num_blocks > 0) { + kernel::ict_tri_spgeam_init + <<get_stream()>>>( + llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals), + a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs, + l_col_idxs, as_device_type(l_vals), l_new_row_ptrs, + l_new_col_idxs, as_device_type(l_new_vals), num_rows); + } +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); + + +template +void compute_factor(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* a, + matrix::Csr* l, + const matrix::Coo* l_coo) +{ + auto total_nnz = static_cast(l->get_num_stored_elements()); + auto block_size = default_block_size / subwarp_size; + auto num_blocks = ceildiv(total_nnz, block_size); + if (num_blocks > 0) { + kernel::ict_sweep + <<get_stream()>>>( + a->get_const_row_ptrs(), a->get_const_col_idxs(), + as_device_type(a->get_const_values()), l->get_const_row_ptrs(), + l_coo->get_const_row_idxs(), l->get_const_col_idxs(), + as_device_type(l->get_values()), + static_cast(l->get_num_stored_elements())); + } +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor); + + +} // namespace + + +template +void add_candidates(std::shared_ptr exec, + const matrix::Csr* llh, + const matrix::Csr* a, + const matrix::Csr* l, + matrix::Csr* l_new) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = + llh->get_num_stored_elements() + a->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_add_candidates( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, llh, a, l, l_new); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); + + +template +void compute_factor(std::shared_ptr exec, + const matrix::Csr* a, + matrix::Csr* l, + const matrix::Coo* l_coo) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = 2 * l->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_compute_factor( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); + + +} // namespace par_ict_factorization +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp index d730e33e418..f682cb9efad 100644 --- a/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp +++ b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp @@ -18,20 +18,20 @@ #include +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/sorting.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/coo_builder.hpp" #include "core/matrix/csr_builder.hpp" #include "core/matrix/csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/config.hip.hpp" -#include "hip/base/math.hip.hpp" #include "hip/base/types.hip.hpp" -#include "hip/components/atomic.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/prefix_sum.hip.hpp" -#include "hip/components/sorting.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" #include "hip/factorization/par_ilut_select_common.hip.hpp" diff --git a/hip/factorization/par_ilut_filter_kernels.hip.cpp b/hip/factorization/par_ilut_filter_kernels.hip.cpp new file mode 100644 index 00000000000..36cd0873903 --- /dev/null +++ b/hip/factorization/par_ilut_filter_kernels.hip.cpp @@ -0,0 +1,140 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The parallel ILUT factorization namespace. + * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +constexpr int default_block_size = 512; + + +// subwarp sizes for filter kernels +using compiled_kernels = + syn::value_list; + + +#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" + + +namespace { + + +template +void threshold_filter(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* a, + remove_complex threshold, + matrix::Csr* m_out, + matrix::Coo* m_out_coo, bool lower) +{ + auto old_row_ptrs = a->get_const_row_ptrs(); + auto old_col_idxs = a->get_const_col_idxs(); + auto old_vals = a->get_const_values(); + // compute nnz for each row + auto num_rows = static_cast(a->get_size()[0]); + auto block_size = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, block_size); + auto new_row_ptrs = m_out->get_row_ptrs(); + if (num_blocks > 0) { + kernel::threshold_filter_nnz + <<get_stream()>>>( + old_row_ptrs, as_device_type(old_vals), num_rows, + as_device_type(threshold), new_row_ptrs, lower); + } + + // build row pointers + components::prefix_sum_nonnegative(exec, new_row_ptrs, num_rows + 1); + + // build matrix + auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows); + // resize arrays and update aliases + matrix::CsrBuilder builder{m_out}; + builder.get_col_idx_array().resize_and_reset(new_nnz); + builder.get_value_array().resize_and_reset(new_nnz); + auto new_col_idxs = m_out->get_col_idxs(); + auto new_vals = m_out->get_values(); + IndexType* new_row_idxs{}; + if (m_out_coo) { + matrix::CooBuilder coo_builder{m_out_coo}; + coo_builder.get_row_idx_array().resize_and_reset(new_nnz); + coo_builder.get_col_idx_array() = + make_array_view(exec, new_nnz, new_col_idxs); + coo_builder.get_value_array() = + make_array_view(exec, new_nnz, new_vals); + new_row_idxs = m_out_coo->get_row_idxs(); + } + if (num_blocks > 0) { + kernel::threshold_filter + <<get_stream()>>>( + old_row_ptrs, old_col_idxs, as_device_type(old_vals), num_rows, + as_device_type(threshold), new_row_ptrs, new_row_idxs, + new_col_idxs, as_device_type(new_vals), lower); + } +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter, threshold_filter); + + +} // namespace + +template +void threshold_filter(std::shared_ptr exec, + const matrix::Csr* a, + remove_complex threshold, + matrix::Csr* m_out, + matrix::Coo* m_out_coo, bool lower) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = a->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_threshold_filter( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, a, threshold, m_out, + m_out_coo, lower); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/hip/factorization/par_ilut_select_common.hip.cpp index 85c2eaa7036..e10f2298463 100644 --- a/hip/factorization/par_ilut_select_common.hip.cpp +++ b/hip/factorization/par_ilut_select_common.hip.cpp @@ -11,15 +11,15 @@ #include "hip/factorization/par_ilut_select_common.hip.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/sorting.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/factorization/par_ilut_kernels.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/prefix_sum.hip.hpp" -#include "hip/components/searching.hip.hpp" -#include "hip/components/sorting.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" namespace gko { diff --git a/hip/factorization/par_ilut_select_kernels.hip.cpp b/hip/factorization/par_ilut_select_kernels.hip.cpp new file mode 100644 index 00000000000..1e778c3951d --- /dev/null +++ b/hip/factorization/par_ilut_select_kernels.hip.cpp @@ -0,0 +1,162 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include + + +#include +#include +#include + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/sorting.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "hip/factorization/par_ilut_select_common.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The parallel ILUT factorization namespace. + * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc" + + +template +void sampleselect_filter(std::shared_ptr exec, + const ValueType* values, IndexType size, + const unsigned char* oracles, + const IndexType* partial_counts, IndexType bucket, + remove_complex* out) +{ + auto num_threads_total = ceildiv(size, items_per_thread); + auto num_blocks = + static_cast(ceildiv(num_threads_total, default_block_size)); + if (num_blocks > 0) { + kernel::filter_bucket<<get_stream()>>>( + as_device_type(values), size, bucket, oracles, partial_counts, + as_device_type(out), items_per_thread); + } +} + + +template +void threshold_select(std::shared_ptr exec, + const matrix::Csr* m, + IndexType rank, array& tmp1, + array>& tmp2, + remove_complex& threshold) +{ + auto values = m->get_const_values(); + IndexType size = m->get_num_stored_elements(); + using AbsType = remove_complex; + constexpr auto bucket_count = kernel::searchtree_width; + auto max_num_threads = ceildiv(size, items_per_thread); + auto max_num_blocks = ceildiv(max_num_threads, default_block_size); + + size_type tmp_size_totals = + ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType)); + size_type tmp_size_partials = ceildiv( + bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType)); + size_type tmp_size_oracles = + ceildiv(size * sizeof(unsigned char), sizeof(ValueType)); + size_type tmp_size_tree = + ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType)); + size_type tmp_size_vals = + size / bucket_count * 4; // pessimistic estimate for temporary storage + size_type tmp_size = + tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree; + tmp1.resize_and_reset(tmp_size); + tmp2.resize_and_reset(tmp_size_vals); + + auto total_counts = reinterpret_cast(tmp1.get_data()); + auto partial_counts = + reinterpret_cast(tmp1.get_data() + tmp_size_totals); + auto oracles = reinterpret_cast( + tmp1.get_data() + tmp_size_totals + tmp_size_partials); + auto tree = + reinterpret_cast(tmp1.get_data() + tmp_size_totals + + tmp_size_partials + tmp_size_oracles); + + sampleselect_count(exec, values, size, tree, oracles, partial_counts, + total_counts); + + // determine bucket with correct rank, use bucket-local rank + auto bucket = sampleselect_find_bucket(exec, total_counts, rank); + rank -= bucket.begin; + + if (bucket.size * 2 > tmp_size_vals) { + // we need to reallocate tmp2 + tmp2.resize_and_reset(bucket.size * 2); + } + auto tmp21 = tmp2.get_data(); + auto tmp22 = tmp2.get_data() + bucket.size; + // extract target bucket + sampleselect_filter(exec, values, size, oracles, partial_counts, bucket.idx, + tmp22); + + // recursively select from smaller buckets + int step{}; + while (bucket.size > kernel::basecase_size) { + std::swap(tmp21, tmp22); + const auto* tmp_in = tmp21; + auto tmp_out = tmp22; + + sampleselect_count(exec, tmp_in, bucket.size, tree, oracles, + partial_counts, total_counts); + auto new_bucket = sampleselect_find_bucket(exec, total_counts, rank); + sampleselect_filter(exec, tmp_in, bucket.size, oracles, partial_counts, + bucket.idx, tmp_out); + + rank -= new_bucket.begin; + bucket.size = new_bucket.size; + // we should never need more than 5 recursion steps, this would mean + // 256^5 = 2^40. fall back to standard library algorithm in that case. + ++step; + if (step > 5) { + array cpu_out_array{ + exec->get_master(), + make_array_view(exec, bucket.size, tmp_out)}; + auto begin = cpu_out_array.get_data(); + auto end = begin + bucket.size; + auto middle = begin + rank; + std::nth_element(begin, middle, end); + threshold = *middle; + return; + } + } + + // base case + auto out_ptr = reinterpret_cast(tmp1.get_data()); + kernel::basecase_select<<<1, kernel::basecase_block_size, 0, + exec->get_stream()>>>( + as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr)); + threshold = exec->copy_val_to_host(out_ptr); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/factorization/par_ilut_spgeam_kernels.hip.cpp b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp new file mode 100644 index 00000000000..1c8ea4bd27c --- /dev/null +++ b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp @@ -0,0 +1,159 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/components/cooperative_groups.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The parallel ILUT factorization namespace. + * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +constexpr int default_block_size = 512; + + +// subwarp sizes for add_candidates kernels +using compiled_kernels = + syn::value_list; + + +#include "common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc" + + +namespace { + + +template +void add_candidates(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* lu, + const matrix::Csr* a, + const matrix::Csr* l, + const matrix::Csr* u, + matrix::Csr* l_new, + matrix::Csr* u_new) +{ + auto num_rows = static_cast(lu->get_size()[0]); + auto subwarps_per_block = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, subwarps_per_block); + matrix::CsrBuilder l_new_builder(l_new); + matrix::CsrBuilder u_new_builder(u_new); + auto lu_row_ptrs = lu->get_const_row_ptrs(); + auto lu_col_idxs = lu->get_const_col_idxs(); + auto lu_vals = lu->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = l->get_const_values(); + auto u_row_ptrs = u->get_const_row_ptrs(); + auto u_col_idxs = u->get_const_col_idxs(); + auto u_vals = u->get_const_values(); + auto l_new_row_ptrs = l_new->get_row_ptrs(); + auto u_new_row_ptrs = u_new->get_row_ptrs(); + if (num_blocks > 0) { + // count non-zeros per row + kernel::tri_spgeam_nnz + <<get_stream()>>>( + lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs, + l_new_row_ptrs, u_new_row_ptrs, num_rows); + } + + // build row ptrs + components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1); + components::prefix_sum_nonnegative(exec, u_new_row_ptrs, num_rows + 1); + + // resize output arrays + auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); + auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows); + l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); + l_new_builder.get_value_array().resize_and_reset(l_new_nnz); + u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz); + u_new_builder.get_value_array().resize_and_reset(u_new_nnz); + + auto l_new_col_idxs = l_new->get_col_idxs(); + auto l_new_vals = l_new->get_values(); + auto u_new_col_idxs = u_new->get_col_idxs(); + auto u_new_vals = u_new->get_values(); + + if (num_blocks > 0) { + // fill columns and values + kernel::tri_spgeam_init + <<get_stream()>>>( + lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs, + a_col_idxs, as_device_type(a_vals), l_row_ptrs, l_col_idxs, + as_device_type(l_vals), u_row_ptrs, u_col_idxs, + as_device_type(u_vals), l_new_row_ptrs, l_new_col_idxs, + as_device_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs, + as_device_type(u_new_vals), num_rows); + } +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); + + +} // namespace + + +template +void add_candidates(std::shared_ptr exec, + const matrix::Csr* lu, + const matrix::Csr* a, + const matrix::Csr* l, + const matrix::Csr* u, + matrix::Csr* l_new, + matrix::Csr* u_new) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = + lu->get_num_stored_elements() + a->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_add_candidates( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, lu, a, l, u, l_new, + u_new); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/factorization/par_ilut_sweep_kernels.hip.cpp b/hip/factorization/par_ilut_sweep_kernels.hip.cpp new file mode 100644 index 00000000000..71cc1166179 --- /dev/null +++ b/hip/factorization/par_ilut_sweep_kernels.hip.cpp @@ -0,0 +1,123 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/components/memory.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The parallel ILUT factorization namespace. + * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +constexpr int default_block_size = 512; + + +// subwarp sizes for all warp-parallel kernels (filter, add_candidates) +using compiled_kernels = + syn::value_list; + + +#include "common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc" + + +namespace { + + +template +void compute_l_u_factors(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* a, + matrix::Csr* l, + const matrix::Coo* l_coo, + matrix::Csr* u, + const matrix::Coo* u_coo, + matrix::Csr* u_csc) +{ + auto total_nnz = static_cast(l->get_num_stored_elements() + + u->get_num_stored_elements()); + auto block_size = default_block_size / subwarp_size; + auto num_blocks = ceildiv(total_nnz, block_size); + if (num_blocks > 0) { + kernel::sweep + <<get_stream()>>>( + a->get_const_row_ptrs(), a->get_const_col_idxs(), + as_device_type(a->get_const_values()), l->get_const_row_ptrs(), + l_coo->get_const_row_idxs(), l->get_const_col_idxs(), + as_device_type(l->get_values()), + static_cast(l->get_num_stored_elements()), + u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(), + as_device_type(u->get_values()), u_csc->get_const_row_ptrs(), + u_csc->get_const_col_idxs(), + as_device_type(u_csc->get_values()), + static_cast(u->get_num_stored_elements())); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors, + compute_l_u_factors); + + +} // namespace + + +template +void compute_l_u_factors(std::shared_ptr exec, + const matrix::Csr* a, + matrix::Csr* l, + const matrix::Coo* l_coo, + matrix::Csr* u, + const matrix::Coo* u_coo, + matrix::Csr* u_csc) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = + l->get_num_stored_elements() + u->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_compute_l_u_factors( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo, + u_csc); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/log/batch_logger.hip.hpp b/hip/log/batch_logger.hip.hpp new file mode 100644 index 00000000000..a2540f2bd9d --- /dev/null +++ b/hip/log/batch_logger.hip.hpp @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_ +#define GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_ + + +#include + + +namespace gko { +namespace kernels { +namespace hip { +namespace batch_log { + +#include "common/cuda_hip/log/batch_logger.hpp.inc" + + +} // namespace batch_log +} // namespace hip +} // namespace kernels +} // namespace gko + + +#endif // GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_ diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp new file mode 100644 index 00000000000..8540a542138 --- /dev/null +++ b/hip/matrix/batch_csr_kernels.hip.cpp @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/matrix/batch_csr_kernels.hpp" + + +#include + + +#include +#include +#include + + +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "hip/base/batch_struct.hip.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/thrust.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/matrix/batch_struct.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Csr matrix format namespace. + * @ref Csr + * @ingroup batch_csr + */ +namespace batch_csr { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + +#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" + + +#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_csr +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp new file mode 100644 index 00000000000..981490d560a --- /dev/null +++ b/hip/matrix/batch_dense_kernels.hip.cpp @@ -0,0 +1,59 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/matrix/batch_dense_kernels.hpp" + + +#include + + +#include +#include +#include + + +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "hip/base/batch_struct.hip.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/thrust.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/matrix/batch_struct.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup batch_dense + */ +namespace batch_dense { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + +#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" + + +#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" + + +// clang-format on + + +} // namespace batch_dense +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp new file mode 100644 index 00000000000..373f87f1f81 --- /dev/null +++ b/hip/matrix/batch_ell_kernels.hip.cpp @@ -0,0 +1,58 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/matrix/batch_ell_kernels.hpp" + + +#include + + +#include +#include +#include + + +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_struct.hpp" +#include "hip/base/batch_struct.hip.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/thrust.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/matrix/batch_struct.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Ell matrix format namespace. + * @ref Ell + * @ingroup batch_ell + */ +namespace batch_ell { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + +#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" + + +#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_ell +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp index 8371bc45b52..f80d9d07285 100644 --- a/hip/matrix/csr_kernels.template.hip.cpp +++ b/hip/matrix/csr_kernels.template.hip.cpp @@ -29,6 +29,15 @@ #include "accessor/hip_helper.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/array_access.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" @@ -41,19 +50,10 @@ #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/config.hip.hpp" #include "hip/base/hipsparse_bindings.hip.hpp" -#include "hip/base/math.hip.hpp" #include "hip/base/pointer_mode_guard.hip.hpp" #include "hip/base/thrust.hip.hpp" #include "hip/base/types.hip.hpp" -#include "hip/components/atomic.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/merging.hip.hpp" -#include "hip/components/prefix_sum.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/segment_scan.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" namespace gko { diff --git a/hip/preconditioner/batch_preconditioners.hip.hpp b/hip/preconditioner/batch_preconditioners.hip.hpp index 5d2358378d2..71549260dfb 100644 --- a/hip/preconditioner/batch_preconditioners.hip.hpp +++ b/hip/preconditioner/batch_preconditioners.hip.hpp @@ -9,9 +9,9 @@ #include +#include "common/cuda_hip/components/reduction.hpp" #include "core/matrix/batch_struct.hpp" #include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/reduction.hip.hpp" namespace gko { diff --git a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp index 326b9f6b720..f5c53022564 100644 --- a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp @@ -11,16 +11,16 @@ #include +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/config.hip.hpp" -#include "hip/base/math.hip.hpp" #include "hip/base/types.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/warp_blas.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" @@ -35,7 +35,7 @@ namespace hip { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc" +#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc" // clang-format off diff --git a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp index 86a3b799590..a78e7b7cd45 100644 --- a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp @@ -9,18 +9,18 @@ #include +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/diagonal_block_manipulation.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/extended_float.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/config.hip.hpp" -#include "hip/base/math.hip.hpp" #include "hip/base/types.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/diagonal_block_manipulation.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" -#include "hip/components/warp_blas.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" @@ -35,7 +35,7 @@ namespace hip { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc" +#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc" // clang-format off diff --git a/hip/preconditioner/jacobi_generate_kernel.hip.cpp b/hip/preconditioner/jacobi_generate_kernel.hip.cpp index 713be193250..6da1b964595 100644 --- a/hip/preconditioner/jacobi_generate_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_generate_kernel.hip.cpp @@ -12,18 +12,18 @@ #include +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/diagonal_block_manipulation.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/extended_float.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/config.hip.hpp" -#include "hip/base/math.hip.hpp" #include "hip/base/types.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/diagonal_block_manipulation.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" -#include "hip/components/warp_blas.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" @@ -38,7 +38,7 @@ namespace hip { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc" +#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc" template + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/base/extended_float.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/preconditioner/jacobi_common.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +// a total of 32/16 warps (1024 threads) +#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC +constexpr int default_num_warps = 16; +#else // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC +constexpr int default_num_warps = 32; +#endif +// with current architectures, at most 32 warps can be scheduled per SM (and +// current GPUs have at most 84 SMs) +constexpr int default_grid_size = 32 * 32 * 128; + + +#include "common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc" + + +} // namespace jacobi +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp index be485af5730..b191d7dbf2e 100644 --- a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp @@ -8,16 +8,16 @@ #include +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/config.hip.hpp" -#include "hip/base/math.hip.hpp" #include "hip/base/types.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/warp_blas.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" @@ -32,7 +32,7 @@ namespace hip { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc" +#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc" // clang-format off diff --git a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp index 0763e986d41..10e3a580b3c 100644 --- a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp @@ -11,16 +11,16 @@ #include +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" #include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/config.hip.hpp" -#include "hip/base/math.hip.hpp" #include "hip/base/types.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/warp_blas.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" @@ -35,7 +35,7 @@ namespace hip { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc" +#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc" template +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "hip/base/batch_struct.hip.hpp" #include "hip/base/config.hip.hpp" -#include "hip/base/math.hip.hpp" #include "hip/base/thrust.hip.hpp" #include "hip/base/types.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" #include "hip/matrix/batch_struct.hip.hpp" @@ -47,7 +47,6 @@ namespace batch_bicgstab { #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" -#include "common/cuda_hip/components/uninitialized_array.hpp.inc" #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index 1288c15b4c0..b5232aea8b2 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -14,18 +14,18 @@ #include +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" #include "core/solver/batch_dispatch.hpp" #include "hip/base/batch_struct.hip.hpp" #include "hip/base/config.hip.hpp" -#include "hip/base/math.hip.hpp" #include "hip/base/thrust.hip.hpp" #include "hip/base/types.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" #include "hip/matrix/batch_struct.hip.hpp" @@ -46,7 +46,6 @@ namespace batch_cg { #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" -#include "common/cuda_hip/components/uninitialized_array.hpp.inc" #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" diff --git a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp index 8bba503ba0a..72683f7d8ec 100644 --- a/hip/solver/common_trs_kernels.hip.hpp +++ b/hip/solver/common_trs_kernels.hip.hpp @@ -22,10 +22,10 @@ #include +#include "common/cuda_hip/base/math.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/hipsparse_bindings.hip.hpp" -#include "hip/base/math.hip.hpp" #include "hip/base/pointer_mode_guard.hip.hpp" #include "hip/base/types.hip.hpp" diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp index 08f35d3d674..6a3ace8231a 100644 --- a/hip/solver/lower_trs_kernels.hip.cpp +++ b/hip/solver/lower_trs_kernels.hip.cpp @@ -21,8 +21,8 @@ #include +#include "common/cuda_hip/base/math.hpp" #include "hip/base/hipsparse_bindings.hip.hpp" -#include "hip/base/math.hip.hpp" #include "hip/base/types.hip.hpp" #include "hip/solver/common_trs_kernels.hip.hpp" diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp index cd6b0719844..613d2f33577 100644 --- a/hip/solver/upper_trs_kernels.hip.cpp +++ b/hip/solver/upper_trs_kernels.hip.cpp @@ -21,8 +21,8 @@ #include +#include "common/cuda_hip/base/math.hpp" #include "hip/base/hipsparse_bindings.hip.hpp" -#include "hip/base/math.hip.hpp" #include "hip/base/types.hip.hpp" #include "hip/solver/common_trs_kernels.hip.hpp" diff --git a/hip/stop/batch_criteria.hip.hpp b/hip/stop/batch_criteria.hip.hpp new file mode 100644 index 00000000000..1f721e36aaf --- /dev/null +++ b/hip/stop/batch_criteria.hip.hpp @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#ifndef GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_ +#define GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_ + + +#include + + +namespace gko { +namespace kernels { +namespace hip { +namespace batch_stop { + + +#include "common/cuda_hip/stop/batch_criteria.hpp.inc" + + +} // namespace batch_stop +} // namespace hip +} // namespace kernels +} // namespace gko + +#endif // GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_ diff --git a/hip/stop/criterion_kernels.hip.cpp b/hip/stop/criterion_kernels.hip.cpp index 8c7caeb32b8..6ce3fe81ab3 100644 --- a/hip/stop/criterion_kernels.hip.cpp +++ b/hip/stop/criterion_kernels.hip.cpp @@ -10,9 +10,9 @@ #include -#include "hip/base/math.hip.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "hip/base/types.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" namespace gko { diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp index d790dd652f0..efb6c31a7eb 100644 --- a/hip/stop/residual_norm_kernels.hip.cpp +++ b/hip/stop/residual_norm_kernels.hip.cpp @@ -13,10 +13,10 @@ #include +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/base/array_access.hpp" -#include "hip/base/math.hip.hpp" #include "hip/base/types.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" namespace gko { diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp index 2c25f5b3a7a..77a20eb09cd 100644 --- a/hip/test/base/math.hip.cpp +++ b/hip/test/base/math.hip.cpp @@ -23,7 +23,7 @@ #include -#include "hip/base/math.hip.hpp" +#include "common/cuda_hip/base/math.hpp" #include "hip/base/types.hip.hpp" #include "hip/test/utils.hip.hpp" diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp index 7bfab76f795..244d81c2cc1 100644 --- a/hip/test/components/merging.hip.cpp +++ b/hip/test/components/merging.hip.cpp @@ -8,9 +8,6 @@ // force-top: off -#include "hip/components/merging.hip.hpp" - - #include #include #include @@ -24,6 +21,7 @@ #include +#include "common/cuda_hip/components/merging.hpp" #include "hip/components/cooperative_groups.hip.hpp" #include "hip/test/utils.hip.hpp" diff --git a/hip/test/components/searching.hip.cpp b/hip/test/components/searching.hip.cpp index 1db0c6e9562..250611b509a 100644 --- a/hip/test/components/searching.hip.cpp +++ b/hip/test/components/searching.hip.cpp @@ -8,9 +8,6 @@ // force-top: off -#include "hip/components/searching.hip.hpp" - - #include #include #include @@ -23,6 +20,7 @@ #include +#include "common/cuda_hip/components/searching.hpp" #include "hip/components/cooperative_groups.hip.hpp" #include "hip/test/utils.hip.hpp" diff --git a/hip/test/components/sorting.hip.cpp b/hip/test/components/sorting.hip.cpp index 5cab0048a4b..2a7c244615d 100644 --- a/hip/test/components/sorting.hip.cpp +++ b/hip/test/components/sorting.hip.cpp @@ -2,9 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "hip/components/sorting.hip.hpp" - - #include #include @@ -16,6 +13,7 @@ #include +#include "common/cuda_hip/components/sorting.hpp" #include "hip/test/utils.hip.hpp"