From 5f4c65287420f3b1d398ab53b11a3804020f8e0f Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sun, 19 May 2024 19:23:19 +0200 Subject: [PATCH] apply script --- common/CMakeLists.txt | 1 + common/cuda_hip/CMakeLists.txt | 58 ++ ...hpp.inc => batch_multi_vector_kernels.cpp} | 55 ++ ...hpp.inc => device_matrix_data_kernels.cpp} | 26 + ...ernel_launch.hpp.inc => kernel_launch.hpp} | 54 ++ ...on.hpp.inc => kernel_launch_reduction.hpp} | 23 + ...olver.hpp.inc => kernel_launch_solver.hpp} | 19 + .../cuda_hip/base/{math.hpp.inc => math.hpp} | 19 + .../components/{atomic.hpp.inc => atomic.hpp} | 24 + ...pp.inc => diagonal_block_manipulation.hpp} | 27 + .../{intrinsics.hpp.inc => intrinsics.hpp} | 20 + .../{merging.hpp.inc => merging.hpp} | 23 + .../{prefix_sum.hpp.inc => prefix_sum.hpp} | 26 + ...kernels.hpp.inc => prefix_sum_kernels.cpp} | 27 + .../{reduction.hpp.inc => reduction.hpp} | 77 +++ .../{searching.hpp.inc => searching.hpp} | 21 + ...{segment_scan.hpp.inc => segment_scan.hpp} | 20 + .../{sorting.hpp.inc => sorting.hpp} | 21 + .../{syncfree.hpp.inc => syncfree.hpp} | 27 + .../{thread_ids.hpp.inc => thread_ids.hpp} | 22 + ..._array.hpp.inc => uninitialized_array.hpp} | 20 + .../{warp_blas.hpp.inc => warp_blas.hpp} | 28 + ..._kernels.hpp.inc => index_map_kernels.cpp} | 35 ++ ...rix_kernels.hpp.inc => matrix_kernels.cpp} | 33 ++ ....hpp.inc => partition_helpers_kernels.cpp} | 22 + ..._kernels.hpp.inc => partition_kernels.cpp} | 26 + ...tor_kernels.hpp.inc => vector_kernels.cpp} | 30 ++ ...y_kernels.hpp.inc => cholesky_kernels.cpp} | 108 ++++ ...nels.hpp.inc => factorization_kernels.cpp} | 36 ++ .../{lu_kernels.hpp.inc => lu_kernels.cpp} | 43 ++ ..._ic_kernels.hpp.inc => par_ic_kernels.cpp} | 32 ++ ...ct_kernels.hpp.inc => par_ict_kernels.cpp} | 184 +++++++ ...lu_kernels.hpp.inc => par_ilu_kernels.cpp} | 32 ++ ...ls.hpp.inc => par_ilut_filter_kernels.cpp} | 135 +++++ ...ls.hpp.inc => par_ilut_select_kernels.cpp} | 157 ++++++ ...ls.hpp.inc => 
par_ilut_spgeam_kernels.cpp} | 154 ++++++ .../factorization/par_ilut_sweep_kernels.cpp | 212 ++++++++ .../par_ilut_sweep_kernels.hpp.inc | 94 ---- ...{batch_logger.hpp.inc => batch_logger.hpp} | 22 + ..._kernels.hpp.inc => batch_csr_kernels.cpp} | 54 ++ ...ernels.hpp.inc => batch_dense_kernels.cpp} | 55 ++ ..._kernels.hpp.inc => batch_ell_kernels.cpp} | 54 ++ .../{coo_kernels.hpp.inc => coo_kernels.cpp} | 42 ++ ...ense_kernels.hpp.inc => dense_kernels.cpp} | 225 ++++++++ ...l_kernels.hpp.inc => diagonal_kernels.cpp} | 32 ++ .../cuda_hip/matrix/ell_kernels.cpp | 173 +++++- common/cuda_hip/matrix/ell_kernels.hpp.inc | 133 ----- ...bcsr_kernels.hpp.inc => fbcsr_kernels.cpp} | 297 ++++++++++ ...ellp_kernels.hpp.inc => sellp_kernels.cpp} | 37 ++ .../cuda_hip/matrix/sparsity_csr_kernels.cpp | 147 ++++- .../matrix/sparsity_csr_kernels.hpp.inc | 111 ---- .../{pgm_kernels.hpp.inc => pgm_kernels.cpp} | 36 ++ ...{isai_kernels.hpp.inc => isai_kernels.cpp} | 42 ++ ...obi_kernels.hpp.inc => jacobi_kernels.cpp} | 48 ++ .../{rcm_kernels.hpp.inc => rcm_kernels.cpp} | 47 ++ ...s_kernels.hpp.inc => cb_gmres_kernels.cpp} | 500 +++++++++++++++++ .../cuda_hip/solver/idr_kernels.cpp | 341 +++++++++++- common/cuda_hip/solver/idr_kernels.hpp.inc | 318 ----------- ..._kernels.hpp.inc => multigrid_kernels.cpp} | 34 ++ ...ch_criteria.hpp.inc => batch_criteria.hpp} | 21 + cuda/CMakeLists.txt | 33 +- cuda/base/batch_multi_vector_kernels.cu | 59 -- cuda/base/device_matrix_data_kernels.cu | 33 -- cuda/base/kernel_launch.cuh | 57 -- cuda/base/kernel_launch_reduction.cuh | 28 - cuda/base/kernel_launch_solver.cuh | 24 - cuda/base/math.hpp | 24 - cuda/components/atomic.cuh | 29 - .../diagonal_block_manipulation.cuh | 32 -- cuda/components/intrinsics.cuh | 25 - cuda/components/merging.cuh | 28 - cuda/components/prefix_sum.cuh | 31 -- cuda/components/prefix_sum_kernels.cu | 34 -- cuda/components/reduction.cuh | 82 --- cuda/components/searching.cuh | 26 - cuda/components/segment_scan.cuh | 25 - 
cuda/components/sorting.cuh | 26 - cuda/components/syncfree.cuh | 32 -- cuda/components/thread_ids.cuh | 27 - cuda/components/uninitialized_array.hpp | 25 - cuda/components/warp_blas.cuh | 33 -- cuda/distributed/index_map_kernels.cu | 42 -- cuda/distributed/matrix_kernels.cu | 40 -- cuda/distributed/partition_helpers_kernels.cu | 29 - cuda/distributed/partition_kernels.cu | 33 -- cuda/distributed/vector_kernels.cu | 37 -- cuda/factorization/cholesky_kernels.cu | 115 ---- cuda/factorization/factorization_kernels.cu | 43 -- cuda/factorization/lu_kernels.cu | 50 -- cuda/factorization/par_ic_kernels.cu | 39 -- cuda/factorization/par_ict_kernels.cu | 189 ------- cuda/factorization/par_ilu_kernels.cu | 39 -- cuda/factorization/par_ilut_filter_kernels.cu | 140 ----- cuda/factorization/par_ilut_select_kernels.cu | 162 ------ cuda/factorization/par_ilut_spgeam_kernels.cu | 159 ------ cuda/factorization/par_ilut_sweep_kernels.cu | 123 ----- cuda/log/batch_logger.cuh | 27 - cuda/matrix/batch_csr_kernels.cu | 58 -- cuda/matrix/batch_dense_kernels.cu | 59 -- cuda/matrix/batch_ell_kernels.cu | 58 -- cuda/matrix/coo_kernels.cu | 49 -- cuda/matrix/dense_kernels.cu | 232 -------- cuda/matrix/diagonal_kernels.cu | 39 -- cuda/matrix/fbcsr_kernels.template.cu | 303 ----------- cuda/matrix/sellp_kernels.cu | 44 -- cuda/matrix/sparsity_csr_kernels.cu | 226 -------- cuda/multigrid/pgm_kernels.cu | 43 -- cuda/preconditioner/isai_kernels.cu | 49 -- cuda/preconditioner/jacobi_kernels.cu | 51 -- cuda/reorder/rcm_kernels.cu | 54 -- cuda/solver/cb_gmres_kernels.cu | 507 ------------------ cuda/solver/multigrid_kernels.cu | 41 -- cuda/stop/batch_criteria.cuh | 26 - hip/CMakeLists.txt | 31 +- hip/base/batch_multi_vector_kernels.hip.cpp | 59 -- hip/base/device_matrix_data_kernels.hip.cpp | 33 -- hip/base/kernel_launch.hip.hpp | 57 -- hip/base/kernel_launch_reduction.hip.hpp | 28 - hip/base/kernel_launch_solver.hip.hpp | 24 - hip/base/math.hip.hpp | 24 - hip/components/atomic.hip.hpp | 29 - 
.../diagonal_block_manipulation.hip.hpp | 32 -- hip/components/intrinsics.hip.hpp | 25 - hip/components/merging.hip.hpp | 28 - hip/components/prefix_sum.hip.hpp | 31 -- hip/components/prefix_sum_kernels.hip.cpp | 34 -- hip/components/reduction.hip.hpp | 82 --- hip/components/searching.hip.hpp | 26 - hip/components/segment_scan.hip.hpp | 25 - hip/components/sorting.hip.hpp | 26 - hip/components/syncfree.hip.hpp | 32 -- hip/components/thread_ids.hip.hpp | 27 - hip/components/uninitialized_array.hip.hpp | 25 - hip/components/warp_blas.hip.hpp | 33 -- hip/distributed/index_map_kernels.hip.cpp | 42 -- hip/distributed/matrix_kernels.hip.cpp | 40 -- .../partition_helpers_kernels.hip.cpp | 29 - hip/distributed/partition_kernels.hip.cpp | 33 -- hip/distributed/vector_kernels.hip.cpp | 37 -- hip/factorization/cholesky_kernels.hip.cpp | 115 ---- .../factorization_kernels.hip.cpp | 43 -- hip/factorization/lu_kernels.hip.cpp | 50 -- hip/factorization/par_ic_kernels.hip.cpp | 39 -- hip/factorization/par_ict_kernels.hip.cpp | 189 ------- hip/factorization/par_ilu_kernels.hip.cpp | 39 -- .../par_ilut_filter_kernels.hip.cpp | 140 ----- .../par_ilut_select_kernels.hip.cpp | 162 ------ .../par_ilut_spgeam_kernels.hip.cpp | 159 ------ .../par_ilut_sweep_kernels.hip.cpp | 123 ----- hip/log/batch_logger.hip.hpp | 26 - hip/matrix/batch_csr_kernels.hip.cpp | 58 -- hip/matrix/batch_dense_kernels.hip.cpp | 59 -- hip/matrix/batch_ell_kernels.hip.cpp | 58 -- hip/matrix/coo_kernels.hip.cpp | 49 -- hip/matrix/dense_kernels.hip.cpp | 232 -------- hip/matrix/diagonal_kernels.hip.cpp | 39 -- hip/matrix/ell_kernels.hip.cpp | 272 ---------- hip/matrix/fbcsr_kernels.template.hip.cpp | 303 ----------- hip/matrix/sellp_kernels.hip.cpp | 44 -- hip/multigrid/pgm_kernels.hip.cpp | 43 -- hip/preconditioner/isai_kernels.hip.cpp | 49 -- hip/preconditioner/jacobi_kernels.hip.cpp | 51 -- hip/reorder/rcm_kernels.hip.cpp | 54 -- hip/solver/cb_gmres_kernels.hip.cpp | 507 ------------------ 
hip/solver/idr_kernels.hip.cpp | 343 ------------ hip/solver/multigrid_kernels.hip.cpp | 41 -- hip/stop/batch_criteria.hip.hpp | 26 - 167 files changed, 4029 insertions(+), 8735 deletions(-) create mode 100644 common/cuda_hip/CMakeLists.txt rename common/cuda_hip/base/{batch_multi_vector_kernels.hpp.inc => batch_multi_vector_kernels.cpp} (89%) rename common/cuda_hip/base/{device_matrix_data_kernels.hpp.inc => device_matrix_data_kernels.cpp} (88%) rename common/cuda_hip/base/{kernel_launch.hpp.inc => kernel_launch.hpp} (58%) rename common/cuda_hip/base/{kernel_launch_reduction.hpp.inc => kernel_launch_reduction.hpp} (97%) rename common/cuda_hip/base/{kernel_launch_solver.hpp.inc => kernel_launch_solver.hpp} (77%) rename common/cuda_hip/base/{math.hpp.inc => math.hpp} (80%) rename common/cuda_hip/components/{atomic.hpp.inc => atomic.hpp} (95%) rename common/cuda_hip/components/{diagonal_block_manipulation.hpp.inc => diagonal_block_manipulation.hpp} (81%) rename common/cuda_hip/components/{intrinsics.hpp.inc => intrinsics.hpp} (74%) rename common/cuda_hip/components/{merging.hpp.inc => merging.hpp} (96%) rename common/cuda_hip/components/{prefix_sum.hpp.inc => prefix_sum.hpp} (91%) rename common/cuda_hip/components/{prefix_sum_kernels.hpp.inc => prefix_sum_kernels.cpp} (80%) rename common/cuda_hip/components/{reduction.hpp.inc => reduction.hpp} (78%) rename common/cuda_hip/components/{searching.hpp.inc => searching.hpp} (95%) rename common/cuda_hip/components/{segment_scan.hpp.inc => segment_scan.hpp} (73%) rename common/cuda_hip/components/{sorting.hpp.inc => sorting.hpp} (96%) rename common/cuda_hip/components/{syncfree.hpp.inc => syncfree.hpp} (86%) rename common/cuda_hip/components/{thread_ids.hpp.inc => thread_ids.hpp} (94%) rename common/cuda_hip/components/{uninitialized_array.hpp.inc => uninitialized_array.hpp} (82%) rename common/cuda_hip/components/{warp_blas.hpp.inc => warp_blas.hpp} (97%) rename common/cuda_hip/distributed/{index_map_kernels.hpp.inc => 
index_map_kernels.cpp} (92%) rename common/cuda_hip/distributed/{matrix_kernels.hpp.inc => matrix_kernels.cpp} (94%) rename common/cuda_hip/distributed/{partition_helpers_kernels.hpp.inc => partition_helpers_kernels.cpp} (70%) rename common/cuda_hip/distributed/{partition_kernels.hpp.inc => partition_kernels.cpp} (89%) rename common/cuda_hip/distributed/{vector_kernels.hpp.inc => vector_kernels.cpp} (83%) rename common/cuda_hip/factorization/{cholesky_kernels.hpp.inc => cholesky_kernels.cpp} (78%) rename common/cuda_hip/factorization/{factorization_kernels.hpp.inc => factorization_kernels.cpp} (95%) rename common/cuda_hip/factorization/{lu_kernels.hpp.inc => lu_kernels.cpp} (92%) rename common/cuda_hip/factorization/{par_ic_kernels.hpp.inc => par_ic_kernels.cpp} (84%) rename common/cuda_hip/factorization/{par_ict_kernels.hpp.inc => par_ict_kernels.cpp} (62%) rename common/cuda_hip/factorization/{par_ilu_kernels.hpp.inc => par_ilu_kernels.cpp} (83%) rename common/cuda_hip/factorization/{par_ilut_filter_kernels.hpp.inc => par_ilut_filter_kernels.cpp} (57%) rename common/cuda_hip/factorization/{par_ilut_select_kernels.hpp.inc => par_ilut_select_kernels.cpp} (63%) rename common/cuda_hip/factorization/{par_ilut_spgeam_kernels.hpp.inc => par_ilut_spgeam_kernels.cpp} (63%) create mode 100644 common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp delete mode 100644 common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc rename common/cuda_hip/log/{batch_logger.hpp.inc => batch_logger.hpp} (68%) rename common/cuda_hip/matrix/{batch_csr_kernels.hpp.inc => batch_csr_kernels.cpp} (87%) rename common/cuda_hip/matrix/{batch_dense_kernels.hpp.inc => batch_dense_kernels.cpp} (89%) rename common/cuda_hip/matrix/{batch_ell_kernels.hpp.inc => batch_ell_kernels.cpp} (87%) rename common/cuda_hip/matrix/{coo_kernels.hpp.inc => coo_kernels.cpp} (91%) rename common/cuda_hip/matrix/{dense_kernels.hpp.inc => dense_kernels.cpp} (75%) rename 
common/cuda_hip/matrix/{diagonal_kernels.hpp.inc => diagonal_kernels.cpp} (73%) rename cuda/matrix/ell_kernels.cu => common/cuda_hip/matrix/ell_kernels.cpp (57%) delete mode 100644 common/cuda_hip/matrix/ell_kernels.hpp.inc rename common/cuda_hip/matrix/{fbcsr_kernels.hpp.inc => fbcsr_kernels.cpp} (57%) rename common/cuda_hip/matrix/{sellp_kernels.hpp.inc => sellp_kernels.cpp} (83%) rename hip/matrix/sparsity_csr_kernels.hip.cpp => common/cuda_hip/matrix/sparsity_csr_kernels.cpp (58%) delete mode 100644 common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc rename common/cuda_hip/multigrid/{pgm_kernels.hpp.inc => pgm_kernels.cpp} (77%) rename common/cuda_hip/preconditioner/{isai_kernels.hpp.inc => isai_kernels.cpp} (94%) rename common/cuda_hip/preconditioner/{jacobi_kernels.hpp.inc => jacobi_kernels.cpp} (91%) rename common/cuda_hip/reorder/{rcm_kernels.hpp.inc => rcm_kernels.cpp} (95%) rename common/cuda_hip/solver/{cb_gmres_kernels.hpp.inc => cb_gmres_kernels.cpp} (50%) rename cuda/solver/idr_kernels.cu => common/cuda_hip/solver/idr_kernels.cpp (51%) delete mode 100644 common/cuda_hip/solver/idr_kernels.hpp.inc rename common/cuda_hip/solver/{multigrid_kernels.hpp.inc => multigrid_kernels.cpp} (89%) rename common/cuda_hip/stop/{batch_criteria.hpp.inc => batch_criteria.hpp} (75%) delete mode 100644 cuda/base/batch_multi_vector_kernels.cu delete mode 100644 cuda/base/device_matrix_data_kernels.cu delete mode 100644 cuda/base/kernel_launch.cuh delete mode 100644 cuda/base/kernel_launch_reduction.cuh delete mode 100644 cuda/base/kernel_launch_solver.cuh delete mode 100644 cuda/base/math.hpp delete mode 100644 cuda/components/atomic.cuh delete mode 100644 cuda/components/diagonal_block_manipulation.cuh delete mode 100644 cuda/components/intrinsics.cuh delete mode 100644 cuda/components/merging.cuh delete mode 100644 cuda/components/prefix_sum.cuh delete mode 100644 cuda/components/prefix_sum_kernels.cu delete mode 100644 cuda/components/reduction.cuh delete mode 100644 
cuda/components/searching.cuh delete mode 100644 cuda/components/segment_scan.cuh delete mode 100644 cuda/components/sorting.cuh delete mode 100644 cuda/components/syncfree.cuh delete mode 100644 cuda/components/thread_ids.cuh delete mode 100644 cuda/components/uninitialized_array.hpp delete mode 100644 cuda/components/warp_blas.cuh delete mode 100644 cuda/distributed/index_map_kernels.cu delete mode 100644 cuda/distributed/matrix_kernels.cu delete mode 100644 cuda/distributed/partition_helpers_kernels.cu delete mode 100644 cuda/distributed/partition_kernels.cu delete mode 100644 cuda/distributed/vector_kernels.cu delete mode 100644 cuda/factorization/cholesky_kernels.cu delete mode 100644 cuda/factorization/factorization_kernels.cu delete mode 100644 cuda/factorization/lu_kernels.cu delete mode 100644 cuda/factorization/par_ic_kernels.cu delete mode 100644 cuda/factorization/par_ict_kernels.cu delete mode 100644 cuda/factorization/par_ilu_kernels.cu delete mode 100644 cuda/factorization/par_ilut_filter_kernels.cu delete mode 100644 cuda/factorization/par_ilut_select_kernels.cu delete mode 100644 cuda/factorization/par_ilut_spgeam_kernels.cu delete mode 100644 cuda/factorization/par_ilut_sweep_kernels.cu delete mode 100644 cuda/log/batch_logger.cuh delete mode 100644 cuda/matrix/batch_csr_kernels.cu delete mode 100644 cuda/matrix/batch_dense_kernels.cu delete mode 100644 cuda/matrix/batch_ell_kernels.cu delete mode 100644 cuda/matrix/coo_kernels.cu delete mode 100644 cuda/matrix/dense_kernels.cu delete mode 100644 cuda/matrix/diagonal_kernels.cu delete mode 100644 cuda/matrix/fbcsr_kernels.template.cu delete mode 100644 cuda/matrix/sellp_kernels.cu delete mode 100644 cuda/matrix/sparsity_csr_kernels.cu delete mode 100644 cuda/multigrid/pgm_kernels.cu delete mode 100644 cuda/preconditioner/isai_kernels.cu delete mode 100644 cuda/preconditioner/jacobi_kernels.cu delete mode 100644 cuda/reorder/rcm_kernels.cu delete mode 100644 cuda/solver/cb_gmres_kernels.cu delete 
mode 100644 cuda/solver/multigrid_kernels.cu delete mode 100644 cuda/stop/batch_criteria.cuh delete mode 100644 hip/base/batch_multi_vector_kernels.hip.cpp delete mode 100644 hip/base/device_matrix_data_kernels.hip.cpp delete mode 100644 hip/base/kernel_launch.hip.hpp delete mode 100644 hip/base/kernel_launch_reduction.hip.hpp delete mode 100644 hip/base/kernel_launch_solver.hip.hpp delete mode 100644 hip/base/math.hip.hpp delete mode 100644 hip/components/atomic.hip.hpp delete mode 100644 hip/components/diagonal_block_manipulation.hip.hpp delete mode 100644 hip/components/intrinsics.hip.hpp delete mode 100644 hip/components/merging.hip.hpp delete mode 100644 hip/components/prefix_sum.hip.hpp delete mode 100644 hip/components/prefix_sum_kernels.hip.cpp delete mode 100644 hip/components/reduction.hip.hpp delete mode 100644 hip/components/searching.hip.hpp delete mode 100644 hip/components/segment_scan.hip.hpp delete mode 100644 hip/components/sorting.hip.hpp delete mode 100644 hip/components/syncfree.hip.hpp delete mode 100644 hip/components/thread_ids.hip.hpp delete mode 100644 hip/components/uninitialized_array.hip.hpp delete mode 100644 hip/components/warp_blas.hip.hpp delete mode 100644 hip/distributed/index_map_kernels.hip.cpp delete mode 100644 hip/distributed/matrix_kernels.hip.cpp delete mode 100644 hip/distributed/partition_helpers_kernels.hip.cpp delete mode 100644 hip/distributed/partition_kernels.hip.cpp delete mode 100644 hip/distributed/vector_kernels.hip.cpp delete mode 100644 hip/factorization/cholesky_kernels.hip.cpp delete mode 100644 hip/factorization/factorization_kernels.hip.cpp delete mode 100644 hip/factorization/lu_kernels.hip.cpp delete mode 100644 hip/factorization/par_ic_kernels.hip.cpp delete mode 100644 hip/factorization/par_ict_kernels.hip.cpp delete mode 100644 hip/factorization/par_ilu_kernels.hip.cpp delete mode 100644 hip/factorization/par_ilut_filter_kernels.hip.cpp delete mode 100644 
hip/factorization/par_ilut_select_kernels.hip.cpp delete mode 100644 hip/factorization/par_ilut_spgeam_kernels.hip.cpp delete mode 100644 hip/factorization/par_ilut_sweep_kernels.hip.cpp delete mode 100644 hip/log/batch_logger.hip.hpp delete mode 100644 hip/matrix/batch_csr_kernels.hip.cpp delete mode 100644 hip/matrix/batch_dense_kernels.hip.cpp delete mode 100644 hip/matrix/batch_ell_kernels.hip.cpp delete mode 100644 hip/matrix/coo_kernels.hip.cpp delete mode 100644 hip/matrix/dense_kernels.hip.cpp delete mode 100644 hip/matrix/diagonal_kernels.hip.cpp delete mode 100644 hip/matrix/ell_kernels.hip.cpp delete mode 100644 hip/matrix/fbcsr_kernels.template.hip.cpp delete mode 100644 hip/matrix/sellp_kernels.hip.cpp delete mode 100644 hip/multigrid/pgm_kernels.hip.cpp delete mode 100644 hip/preconditioner/isai_kernels.hip.cpp delete mode 100644 hip/preconditioner/jacobi_kernels.hip.cpp delete mode 100644 hip/reorder/rcm_kernels.hip.cpp delete mode 100644 hip/solver/cb_gmres_kernels.hip.cpp delete mode 100644 hip/solver/idr_kernels.hip.cpp delete mode 100644 hip/solver/multigrid_kernels.hip.cpp delete mode 100644 hip/stop/batch_criteria.hip.hpp diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 77bdd7230b9..e7c665640b3 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(unified) set(GKO_UNIFIED_COMMON_SOURCES ${GKO_UNIFIED_COMMON_SOURCES} PARENT_SCOPE) +set(GKO_CUDA_HIP_COMMON_SOURCES ${GKO_CUDA_HIP_COMMON_SOURCES} PARENT_SCOPE) diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt new file mode 100644 index 00000000000..af919e90897 --- /dev/null +++ b/common/cuda_hip/CMakeLists.txt @@ -0,0 +1,58 @@ +set(CUDA_HIP_SOURCES + base/batch_multi_vector_kernels.cpp + base/device_matrix_data_kernels.cpp + base/kernel_launch.hpp + base/kernel_launch_reduction.hpp + base/kernel_launch_solver.hpp + base/math.hpp + components/atomic.hpp + components/diagonal_block_manipulation.hpp + 
components/intrinsics.hpp + components/merging.hpp + components/prefix_sum.hpp + components/prefix_sum_kernels.cpp + components/reduction.hpp + components/searching.hpp + components/segment_scan.hpp + components/sorting.hpp + components/syncfree.hpp + components/thread_ids.hpp + components/uninitialized_array.hpp + components/warp_blas.hpp + distributed/index_map_kernels.cpp + distributed/matrix_kernels.cpp + distributed/partition_helpers_kernels.cpp + distributed/partition_kernels.cpp + distributed/vector_kernels.cpp + factorization/cholesky_kernels.cpp + factorization/factorization_kernels.cpp + factorization/lu_kernels.cpp + factorization/par_ic_kernels.cpp + factorization/par_ict_kernels.cpp + factorization/par_ilu_kernels.cpp + factorization/par_ilut_filter_kernels.cpp + factorization/par_ilut_select_kernels.cpp + factorization/par_ilut_spgeam_kernels.cpp + factorization/par_ilut_sweep_kernels.cpp + log/batch_logger.hpp + matrix/batch_csr_kernels.cpp + matrix/batch_dense_kernels.cpp + matrix/batch_ell_kernels.cpp + matrix/coo_kernels.cpp + matrix/dense_kernels.cpp + matrix/diagonal_kernels.cpp + matrix/ell_kernels.cpp + matrix/fbcsr_kernels.cpp + matrix/sellp_kernels.cpp + matrix/sparsity_csr_kernels.cpp + multigrid/pgm_kernels.cpp + preconditioner/isai_kernels.cpp + preconditioner/jacobi_kernels.cpp + reorder/rcm_kernels.cpp + solver/cb_gmres_kernels.cpp + solver/idr_kernels.cpp + solver/multigrid_kernels.cpp + stop/batch_criteria.hpp + ) +list(TRANSFORM CUDA_HIP_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/) +set(GKO_CUDA_HIP_COMMON_SOURCES ${CUDA_HIP_SOURCES} PARENT_SCOPE) \ No newline at end of file diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.cpp similarity index 89% rename from common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc rename to common/cuda_hip/base/batch_multi_vector_kernels.cpp index 9b6301674be..2a4618f32aa 100644 --- 
a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc +++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp @@ -2,6 +2,50 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + + +#include +#include + + +#include +#include + + +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/batch_multi_vector_kernels.hpp" +#include "core/base/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The MultiVector matrix format namespace. + * + * @ingroup batch_multi_vector + */ +namespace batch_multi_vector { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + + template __device__ __forceinline__ void scale( const gko::batch::multi_vector::batch_item& alpha, @@ -299,3 +343,14 @@ __launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel( copy(src_b, dst_b); } } + + +#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_multi_vector +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc b/common/cuda_hip/base/device_matrix_data_kernels.cpp similarity index 88% rename from common/cuda_hip/base/device_matrix_data_kernels.hpp.inc rename to common/cuda_hip/base/device_matrix_data_kernels.cpp index 70cbd9e7391..192facc6950 100644 --- a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc +++ 
b/common/cuda_hip/base/device_matrix_data_kernels.cpp @@ -2,6 +2,26 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include +#include +#include +#include +#include +#include + + +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "core/base/device_matrix_data_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace components { + + template void remove_zeros(std::shared_ptr exec, array& values, array& row_idxs, @@ -99,3 +119,9 @@ void sort_row_major(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL); + + +} // namespace components +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/base/kernel_launch.hpp.inc b/common/cuda_hip/base/kernel_launch.hpp similarity index 58% rename from common/cuda_hip/base/kernel_launch.hpp.inc rename to common/cuda_hip/base/kernel_launch.hpp index c46e6c879cb..3d59e145a86 100644 --- a/common/cuda_hip/base/kernel_launch.hpp.inc +++ b/common/cuda_hip/base/kernel_launch.hpp @@ -2,6 +2,55 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_HPP_ +#error \ + "This file can only be used from inside common/unified/base/kernel_launch.hpp" +#endif + + +#include + + +#include + + +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + +template +struct to_device_type_impl&> { + using type = std::decay_t>()))>; + static type map_to_device(gko::acc::range& range) + { + return gko::acc::as_device_range(range); + } +}; + +template +struct to_device_type_impl&> { + using type = std::decay_t>()))>; + static type map_to_device(const gko::acc::range& range) + { + return gko::acc::as_device_range(range); + } +}; + + +namespace 
device_std = thrust; + + +constexpr int default_block_size = 512; + + template __global__ __launch_bounds__(default_block_size) void generic_kernel_1d( int64 size, KernelFunction fn, KernelArgs... args) @@ -52,3 +101,8 @@ void run_kernel(std::shared_ptr exec, KernelFunction fn, map_to_device(args)...); } } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/base/kernel_launch_reduction.hpp.inc b/common/cuda_hip/base/kernel_launch_reduction.hpp similarity index 97% rename from common/cuda_hip/base/kernel_launch_reduction.hpp.inc rename to common/cuda_hip/base/kernel_launch_reduction.hpp index e5caedacb1f..4c4fb366802 100644 --- a/common/cuda_hip/base/kernel_launch_reduction.hpp.inc +++ b/common/cuda_hip/base/kernel_launch_reduction.hpp @@ -2,6 +2,24 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#error \ + "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" +#endif + + +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + template __global__ __launch_bounds__( @@ -505,3 +523,8 @@ void run_kernel_col_reduction_cached( } } } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/base/kernel_launch_solver.hpp.inc b/common/cuda_hip/base/kernel_launch_solver.hpp similarity index 77% rename from common/cuda_hip/base/kernel_launch_solver.hpp.inc rename to common/cuda_hip/base/kernel_launch_solver.hpp index cef3c8a3adc..e32ba52e79a 100644 --- a/common/cuda_hip/base/kernel_launch_solver.hpp.inc +++ b/common/cuda_hip/base/kernel_launch_solver.hpp @@ -2,6 +2,20 @@ // // 
SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_SOLVER_HPP_ +#error \ + "This file can only be used from inside common/unified/base/kernel_launch_solver.hpp" +#endif + + +#include "common/cuda_hip/base/runtime.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + template __global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver( int64 rows, int64 cols, int64 default_stride, KernelFunction fn, @@ -32,3 +46,8 @@ void run_kernel_solver(std::shared_ptr exec, static_cast(default_stride), fn, map_to_device(args)...); } } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/base/math.hpp.inc b/common/cuda_hip/base/math.hpp similarity index 80% rename from common/cuda_hip/base/math.hpp.inc rename to common/cuda_hip/base/math.hpp index 430163f3791..c328b299d70 100644 --- a/common/cuda_hip/base/math.hpp.inc +++ b/common/cuda_hip/base/math.hpp @@ -2,6 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_ +#define GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_ + + +#include + + +#include + + +namespace gko { + + // We need this struct, because otherwise we would call a __host__ function in a // __device__ function (even though it is constexpr) template @@ -37,3 +50,9 @@ struct truncate_type_impl> { } // namespace detail + + +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_ diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp similarity index 95% rename from common/cuda_hip/components/atomic.hpp.inc rename to common/cuda_hip/components/atomic.hpp index 60eaf5a9dd9..b71ca1f8b5b 100644 --- a/common/cuda_hip/components/atomic.hpp.inc +++ b/common/cuda_hip/components/atomic.hpp @@ -2,6 +2,22 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_ + 
+ +#include + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/types.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + namespace detail { @@ -228,3 +244,11 @@ __forceinline__ __device__ thrust::complex atomic_add( auto imag = atomic_add(addr + 1, val.imag()); return {real, imag}; } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_ diff --git a/common/cuda_hip/components/diagonal_block_manipulation.hpp.inc b/common/cuda_hip/components/diagonal_block_manipulation.hpp similarity index 81% rename from common/cuda_hip/components/diagonal_block_manipulation.hpp.inc rename to common/cuda_hip/components/diagonal_block_manipulation.hpp index a8e7004b5aa..923b62bf5e5 100644 --- a/common/cuda_hip/components/diagonal_block_manipulation.hpp.inc +++ b/common/cuda_hip/components/diagonal_block_manipulation.hpp @@ -2,6 +2,24 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_ + + +#include + + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace csr { + + /** * @internal * @@ -63,3 +81,12 @@ __device__ __forceinline__ void extract_transposed_diag_blocks( } } } + + +} // namespace csr +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_ diff --git a/common/cuda_hip/components/intrinsics.hpp.inc b/common/cuda_hip/components/intrinsics.hpp similarity index 74% rename from common/cuda_hip/components/intrinsics.hpp.inc rename to common/cuda_hip/components/intrinsics.hpp index 3fc28cee871..df3b5ad4c7f 100644 --- 
a/common/cuda_hip/components/intrinsics.hpp.inc +++ b/common/cuda_hip/components/intrinsics.hpp @@ -2,6 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_ + + +#include + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + /** * @internal * Returns the number of set bits in the given mask. @@ -36,3 +48,11 @@ __forceinline__ __device__ int clz(uint32 mask) { return __clz(mask); } /** @copydoc clz */ __forceinline__ __device__ int clz(uint64 mask) { return __clzll(mask); } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_ diff --git a/common/cuda_hip/components/merging.hpp.inc b/common/cuda_hip/components/merging.hpp similarity index 96% rename from common/cuda_hip/components/merging.hpp.inc rename to common/cuda_hip/components/merging.hpp index d77707795a1..ab070741fbd 100644 --- a/common/cuda_hip/components/merging.hpp.inc +++ b/common/cuda_hip/components/merging.hpp @@ -2,6 +2,21 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_ + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "core/base/utils.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + namespace detail { @@ -280,3 +295,11 @@ __forceinline__ __device__ void sequential_match(const ValueType* a, return a_idx < a_size && b_idx < b_size; }); } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_ diff --git a/common/cuda_hip/components/prefix_sum.hpp.inc b/common/cuda_hip/components/prefix_sum.hpp similarity index 91% rename from 
common/cuda_hip/components/prefix_sum.hpp.inc rename to common/cuda_hip/components/prefix_sum.hpp index 474b0b88cd1..defd2be5e0e 100644 --- a/common/cuda_hip/components/prefix_sum.hpp.inc +++ b/common/cuda_hip/components/prefix_sum.hpp @@ -2,6 +2,24 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_ + + +#include + + +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + /** * @internal * Computes the prefix sum and total sum of `element` over a subwarp. @@ -158,3 +176,11 @@ __global__ __launch_bounds__(block_size) void finalize_prefix_sum( elements[tidx] += prefix_block_sum; } } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_ diff --git a/common/cuda_hip/components/prefix_sum_kernels.hpp.inc b/common/cuda_hip/components/prefix_sum_kernels.cpp similarity index 80% rename from common/cuda_hip/components/prefix_sum_kernels.hpp.inc rename to common/cuda_hip/components/prefix_sum_kernels.cpp index c232e115a22..4583e2456f9 100644 --- a/common/cuda_hip/components/prefix_sum_kernels.hpp.inc +++ b/common/cuda_hip/components/prefix_sum_kernels.cpp @@ -2,6 +2,27 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + + +#include + + +#include +#include +#include + + +#include "common/cuda_hip/base/thrust.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace components { + + template struct overflowing_sum { constexpr static IndexType max = std::numeric_limits::max(); @@ -56,3 +77,9 @@ 
GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_NONNEGATIVE_KERNEL); // instantiate for size_type as well, as this is used in the Sellp format template void prefix_sum_nonnegative( std::shared_ptr, size_type*, size_type); + + +} // namespace components +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/components/reduction.hpp.inc b/common/cuda_hip/components/reduction.hpp similarity index 78% rename from common/cuda_hip/components/reduction.hpp.inc rename to common/cuda_hip/components/reduction.hpp index 1a6a64d6fb7..e2c49836149 100644 --- a/common/cuda_hip/components/reduction.hpp.inc +++ b/common/cuda_hip/components/reduction.hpp @@ -2,6 +2,34 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_ + + +#include + + +#include +#include + + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/array_access.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + +constexpr int default_reduce_block_size = 512; + + /** * @internal * @@ -222,3 +250,52 @@ __launch_bounds__(default_reduce_block_size) void reduce_add_array_with_initial_ result[blockIdx.x] += block_sum[0]; } } + + +/** + * Compute a reduction using add operation (+). 
+ * + * @param exec Executor associated to the array + * @param size size of the array + * @param source the pointer of the array + * + * @return the reduction result + */ +template +__host__ ValueType reduce_add_array(std::shared_ptr exec, + size_type size, const ValueType* source) +{ + auto block_results_val = source; + size_type grid_dim = size; + auto block_results = array(exec); + if (size > default_reduce_block_size) { + const auto n = ceildiv(size, default_reduce_block_size); + grid_dim = + (n <= default_reduce_block_size) ? n : default_reduce_block_size; + + block_results.resize_and_reset(grid_dim); + + reduce_add_array<<get_stream()>>>( + size, as_device_type(source), + as_device_type(block_results.get_data())); + + block_results_val = block_results.get_const_data(); + } + + auto d_result = array(exec, 1); + + reduce_add_array<<<1, default_reduce_block_size, 0, exec->get_stream()>>>( + grid_dim, as_device_type(block_results_val), + as_device_type(d_result.get_data())); + auto answer = get_element(d_result, 0); + return answer; +} + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_ diff --git a/common/cuda_hip/components/searching.hpp.inc b/common/cuda_hip/components/searching.hpp similarity index 95% rename from common/cuda_hip/components/searching.hpp.inc rename to common/cuda_hip/components/searching.hpp index a0f842dca35..cb219c58b0b 100644 --- a/common/cuda_hip/components/searching.hpp.inc +++ b/common/cuda_hip/components/searching.hpp @@ -2,6 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_ + + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + /** * @internal * Generic binary search that finds the first index where a 
predicate is true. @@ -208,3 +221,11 @@ __forceinline__ __device__ IndexType group_ary_search(IndexType offset, auto pos = mask == 0 ? group.size() : ffs(mask) - 1; return offset + pos; } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_ diff --git a/common/cuda_hip/components/segment_scan.hpp.inc b/common/cuda_hip/components/segment_scan.hpp similarity index 73% rename from common/cuda_hip/components/segment_scan.hpp.inc rename to common/cuda_hip/components/segment_scan.hpp index 75cc0654531..0ab34fd093b 100644 --- a/common/cuda_hip/components/segment_scan.hpp.inc +++ b/common/cuda_hip/components/segment_scan.hpp @@ -2,6 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_ + + +#include "common/cuda_hip/components/cooperative_groups.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + /** * @internal * @@ -33,3 +45,11 @@ __device__ __forceinline__ bool segment_scan( } return head; } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_ diff --git a/common/cuda_hip/components/sorting.hpp.inc b/common/cuda_hip/components/sorting.hpp similarity index 96% rename from common/cuda_hip/components/sorting.hpp.inc rename to common/cuda_hip/components/sorting.hpp index 10db7eb6daa..7603d41a8ba 100644 --- a/common/cuda_hip/components/sorting.hpp.inc +++ b/common/cuda_hip/components/sorting.hpp @@ -2,6 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_ + + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + 
+ namespace detail { @@ -291,3 +304,11 @@ __forceinline__ __device__ void bitonic_sort(ValueType* local_elements, local_elements, false); } } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_ diff --git a/common/cuda_hip/components/syncfree.hpp.inc b/common/cuda_hip/components/syncfree.hpp similarity index 86% rename from common/cuda_hip/components/syncfree.hpp.inc rename to common/cuda_hip/components/syncfree.hpp index f0d0bbe4d22..9524c68637e 100644 --- a/common/cuda_hip/components/syncfree.hpp.inc +++ b/common/cuda_hip/components/syncfree.hpp @@ -2,6 +2,25 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_ + + +#include + + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/memory.hpp" +#include "core/components/fill_array_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + struct syncfree_storage { using status_word = int; @@ -110,3 +129,11 @@ class syncfree_scheduler { IndexType work_id; IndexType block_id; }; + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_ diff --git a/common/cuda_hip/components/thread_ids.hpp.inc b/common/cuda_hip/components/thread_ids.hpp similarity index 94% rename from common/cuda_hip/components/thread_ids.hpp.inc rename to common/cuda_hip/components/thread_ids.hpp index 1befa428f3c..e73296f92a9 100644 --- a/common/cuda_hip/components/thread_ids.hpp.inc +++ b/common/cuda_hip/components/thread_ids.hpp @@ -2,6 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_ +#define 
GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_ + + +#include "common/cuda_hip/base/config.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace thread { + + /** * @internal * @@ -242,3 +255,12 @@ __device__ __forceinline__ IndexType get_subwarp_num_flat() "subwarp_size must be a power of two"); return blockDim.x / subwarp_size * static_cast(gridDim.x); } + + +} // namespace thread +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_ diff --git a/common/cuda_hip/components/uninitialized_array.hpp.inc b/common/cuda_hip/components/uninitialized_array.hpp similarity index 82% rename from common/cuda_hip/components/uninitialized_array.hpp.inc rename to common/cuda_hip/components/uninitialized_array.hpp index 932ae8a5caa..44fcbfd0d85 100644 --- a/common/cuda_hip/components/uninitialized_array.hpp.inc +++ b/common/cuda_hip/components/uninitialized_array.hpp @@ -2,6 +2,18 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ + + +#include + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + /** * Stores an array with uninitialized contents. 
* @@ -63,3 +75,11 @@ class uninitialized_array { private: unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size]; }; + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ diff --git a/common/cuda_hip/components/warp_blas.hpp.inc b/common/cuda_hip/components/warp_blas.hpp similarity index 97% rename from common/cuda_hip/components/warp_blas.hpp.inc rename to common/cuda_hip/components/warp_blas.hpp index 61b2ae25e7f..eb98466c6b8 100644 --- a/common/cuda_hip/components/warp_blas.hpp.inc +++ b/common/cuda_hip/components/warp_blas.hpp @@ -2,6 +2,26 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_ +#define GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_ + + +#include +#include + + +#include + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/components/reduction.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + /** * @internal * @@ -409,3 +429,11 @@ __device__ __forceinline__ remove_complex compute_infinity_norm( return reduce(group, sum, [](result_type x, result_type y) { return max(x, y); }); } + + +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_ diff --git a/common/cuda_hip/distributed/index_map_kernels.hpp.inc b/common/cuda_hip/distributed/index_map_kernels.cpp similarity index 92% rename from common/cuda_hip/distributed/index_map_kernels.hpp.inc rename to common/cuda_hip/distributed/index_map_kernels.cpp index 9d312cc43aa..706ab2a2355 100644 --- a/common/cuda_hip/distributed/index_map_kernels.hpp.inc +++ b/common/cuda_hip/distributed/index_map_kernels.cpp @@ -2,6 +2,35 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include + + 
+#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "core/distributed/index_map_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace index_map { + + /** * This struct is necessary, since the `transform_output_iterator` seemingly * doesn't support non-copyable tranfsorm function (this excludes lambdas) @@ -266,3 +295,9 @@ void map_to_local( GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_INDEX_MAP_MAP_TO_LOCAL); + + +} // namespace index_map +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/distributed/matrix_kernels.hpp.inc b/common/cuda_hip/distributed/matrix_kernels.cpp similarity index 94% rename from common/cuda_hip/distributed/matrix_kernels.hpp.inc rename to common/cuda_hip/distributed/matrix_kernels.cpp index 5caf3522f62..90d7b4a09f9 100644 --- a/common/cuda_hip/distributed/matrix_kernels.hpp.inc +++ b/common/cuda_hip/distributed/matrix_kernels.cpp @@ -2,6 +2,33 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include + + +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "core/distributed/matrix_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace distributed_matrix { + + template struct input_type { GlobalIndexType row; @@ -261,3 +288,9 @@ void build_local_nonlocal( GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_BUILD_LOCAL_NONLOCAL); + + +} // namespace distributed_matrix +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc b/common/cuda_hip/distributed/partition_helpers_kernels.cpp similarity 
index 70% rename from common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc rename to common/cuda_hip/distributed/partition_helpers_kernels.cpp index 88343370d99..9081e36019f 100644 --- a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc +++ b/common/cuda_hip/distributed/partition_helpers_kernels.cpp @@ -2,6 +2,22 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include +#include +#include + + +#include "common/cuda_hip/base/thrust.hpp" +#include "core/distributed/partition_helpers_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace partition_helpers { + + template void sort_by_range_start( std::shared_ptr exec, @@ -24,3 +40,9 @@ void sort_by_range_start( GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START); + + +} // namespace partition_helpers +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/distributed/partition_kernels.hpp.inc b/common/cuda_hip/distributed/partition_kernels.cpp similarity index 89% rename from common/cuda_hip/distributed/partition_kernels.hpp.inc rename to common/cuda_hip/distributed/partition_kernels.cpp index 20f3ebd47dc..9830ba94faf 100644 --- a/common/cuda_hip/distributed/partition_kernels.hpp.inc +++ b/common/cuda_hip/distributed/partition_kernels.cpp @@ -2,6 +2,26 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include +#include +#include +#include +#include + + +#include "common/cuda_hip/base/thrust.hpp" +#include "common/unified/base/kernel_launch.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/distributed/partition_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace partition { + + namespace kernel { @@ -110,3 +130,9 @@ void build_starting_indices(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_PARTITION_BUILD_STARTING_INDICES); + + +} // namespace 
partition +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/distributed/vector_kernels.hpp.inc b/common/cuda_hip/distributed/vector_kernels.cpp similarity index 83% rename from common/cuda_hip/distributed/vector_kernels.hpp.inc rename to common/cuda_hip/distributed/vector_kernels.cpp index 6a0497db78a..4dd2d4f59c3 100644 --- a/common/cuda_hip/distributed/vector_kernels.hpp.inc +++ b/common/cuda_hip/distributed/vector_kernels.cpp @@ -2,6 +2,30 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + + +#include +#include +#include +#include +#include +#include + + +#include + + +#include "common/cuda_hip/base/thrust.hpp" +#include "core/distributed/vector_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace distributed_vector { + + template void build_local( std::shared_ptr exec, @@ -65,3 +89,9 @@ void build_local( GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); + + +} // namespace distributed_vector +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/factorization/cholesky_kernels.hpp.inc b/common/cuda_hip/factorization/cholesky_kernels.cpp similarity index 78% rename from common/cuda_hip/factorization/cholesky_kernels.hpp.inc rename to common/cuda_hip/factorization/cholesky_kernels.cpp index e6220019d22..be906086f04 100644 --- a/common/cuda_hip/factorization/cholesky_kernels.hpp.inc +++ b/common/cuda_hip/factorization/cholesky_kernels.cpp @@ -2,6 +2,51 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include + + +#include +#include +#include +#include +#include +#include + + +#include + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include 
"common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/syncfree.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/factorization/cholesky_kernels.hpp" +#include "core/factorization/elimination_forest.hpp" +#include "core/factorization/lu_kernels.hpp" +#include "core/matrix/csr_lookup.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Cholesky namespace. + * + * @ingroup factor + */ +namespace cholesky { + + +constexpr int default_block_size = 512; + + #include "core/factorization/elimination_forest.hpp" namespace kernel { @@ -330,3 +375,66 @@ void factorize(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE); + + +template +void symbolic_count(std::shared_ptr exec, + const matrix::Csr* mtx, + const factorization::elimination_forest& forest, + IndexType* row_nnz, array& tmp_storage) +{ + const auto num_rows = static_cast(mtx->get_size()[0]); + if (num_rows == 0) { + return; + } + const auto mtx_nnz = static_cast(mtx->get_num_stored_elements()); + tmp_storage.resize_and_reset(mtx_nnz + num_rows); + const auto postorder_cols = tmp_storage.get_data(); + const auto lower_ends = postorder_cols + mtx_nnz; + const auto row_ptrs = mtx->get_const_row_ptrs(); + const auto cols = mtx->get_const_col_idxs(); + const auto inv_postorder = forest.inv_postorder.get_const_data(); + const auto postorder_parent = forest.postorder_parents.get_const_data(); + // transform col indices to postorder indices + { + const auto num_blocks = ceildiv(num_rows, default_block_size); + kernel::build_postorder_cols<<get_stream()>>>( + num_rows, cols, row_ptrs, inv_postorder, postorder_cols, + lower_ends); + } + // sort postorder_cols inside rows + { + const auto handle = 
exec->get_sparselib_handle(); + auto descr = sparselib::create_mat_descr(); + array permutation_array(exec, mtx_nnz); + auto permutation = permutation_array.get_data(); + components::fill_seq_array(exec, permutation, mtx_nnz); + size_type buffer_size{}; + sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz, + row_ptrs, postorder_cols, buffer_size); + array buffer_array{exec, buffer_size}; + auto buffer = buffer_array.get_data(); + sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs, + postorder_cols, permutation, buffer); + sparselib::destroy(descr); + } + // count nonzeros per row of L + { + const auto num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + kernel::symbolic_count + <<get_stream()>>>( + num_rows, row_ptrs, lower_ends, inv_postorder, postorder_cols, + postorder_parent, row_nnz); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT); + + +} // namespace cholesky +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/factorization/factorization_kernels.hpp.inc b/common/cuda_hip/factorization/factorization_kernels.cpp similarity index 95% rename from common/cuda_hip/factorization/factorization_kernels.hpp.inc rename to common/cuda_hip/factorization/factorization_kernels.cpp index 806797e60d8..bcdf90ec969 100644 --- a/common/cuda_hip/factorization/factorization_kernels.hpp.inc +++ b/common/cuda_hip/factorization/factorization_kernels.cpp @@ -2,6 +2,36 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/base/array_access.hpp" 
+#include "core/components/prefix_sum_kernels.hpp" +#include "core/factorization/factorization_kernels.hpp" +#include "core/matrix/csr_builder.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The factorization namespace. + * + * @ingroup factor + */ +namespace factorization { + + +constexpr int default_block_size{512}; + + namespace kernel { @@ -520,3 +550,9 @@ void initialize_l(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL); + + +} // namespace factorization +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/factorization/lu_kernels.hpp.inc b/common/cuda_hip/factorization/lu_kernels.cpp similarity index 92% rename from common/cuda_hip/factorization/lu_kernels.hpp.inc rename to common/cuda_hip/factorization/lu_kernels.cpp index f8f317bc6a5..12ec5c7b10d 100644 --- a/common/cuda_hip/factorization/lu_kernels.hpp.inc +++ b/common/cuda_hip/factorization/lu_kernels.cpp @@ -2,6 +2,43 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include + + +#include +#include +#include + + +#include + + +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/syncfree.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/base/allocator.hpp" +#include "core/factorization/lu_kernels.hpp" +#include "core/matrix/csr_lookup.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The LU namespace. 
+ * + * @ingroup factor + */ +namespace lu_factorization { + + +constexpr static int default_block_size = 512; + + namespace kernel { @@ -301,3 +338,9 @@ void symbolic_factorize_simple_finalize( GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE); + + +} // namespace lu_factorization +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/factorization/par_ic_kernels.hpp.inc b/common/cuda_hip/factorization/par_ic_kernels.cpp similarity index 84% rename from common/cuda_hip/factorization/par_ic_kernels.hpp.inc rename to common/cuda_hip/factorization/par_ic_kernels.cpp index dd30eb2fc1c..785540c56fc 100644 --- a/common/cuda_hip/factorization/par_ic_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ic_kernels.cpp @@ -2,6 +2,32 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include +#include + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/memory.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/factorization/par_ic_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The parallel ic factorization namespace. 
+ * + * @ingroup factor + */ +namespace par_ic_factorization { + + +constexpr int default_block_size = 512; + + namespace kernel { @@ -111,3 +137,9 @@ void compute_factor(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL); + + +} // namespace par_ic_factorization +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/factorization/par_ict_kernels.hpp.inc b/common/cuda_hip/factorization/par_ict_kernels.cpp similarity index 62% rename from common/cuda_hip/factorization/par_ict_kernels.hpp.inc rename to common/cuda_hip/factorization/par_ict_kernels.cpp index 87aa8297345..523f89082af 100644 --- a/common/cuda_hip/factorization/par_ict_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ict_kernels.cpp @@ -2,6 +2,51 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/factorization/par_ict_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/memory.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The parallel ICT factorization namespace. 
+ * + * @ingroup factor + */ +namespace par_ict_factorization { + + +constexpr int default_block_size = 512; + + +// subwarp sizes for all warp-parallel kernels (filter, add_candidates) +using compiled_kernels = + syn::value_list; + + namespace kernel { @@ -275,3 +320,142 @@ __global__ __launch_bounds__(default_block_size) void ict_sweep( } // namespace kernel + + +namespace { + + +template +void add_candidates(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* llh, + const matrix::Csr* a, + const matrix::Csr* l, + matrix::Csr* l_new) +{ + auto num_rows = static_cast(llh->get_size()[0]); + auto subwarps_per_block = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, subwarps_per_block); + matrix::CsrBuilder l_new_builder(l_new); + auto llh_row_ptrs = llh->get_const_row_ptrs(); + auto llh_col_idxs = llh->get_const_col_idxs(); + auto llh_vals = llh->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = l->get_const_values(); + auto l_new_row_ptrs = l_new->get_row_ptrs(); + // count non-zeros per row + if (num_blocks > 0) { + kernel::ict_tri_spgeam_nnz + <<get_stream()>>>( + llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs, + l_new_row_ptrs, num_rows); + } + + // build row ptrs + components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1); + + // resize output arrays + auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); + l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); + l_new_builder.get_value_array().resize_and_reset(l_new_nnz); + + auto l_new_col_idxs = l_new->get_col_idxs(); + auto l_new_vals = l_new->get_values(); + + // fill columns and values + if (num_blocks > 0) { + kernel::ict_tri_spgeam_init + <<get_stream()>>>( + llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals), + a_row_ptrs, 
a_col_idxs, as_device_type(a_vals), l_row_ptrs, + l_col_idxs, as_device_type(l_vals), l_new_row_ptrs, + l_new_col_idxs, as_device_type(l_new_vals), num_rows); + } +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); + + +template +void compute_factor(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* a, + matrix::Csr* l, + const matrix::Coo* l_coo) +{ + auto total_nnz = static_cast(l->get_num_stored_elements()); + auto block_size = default_block_size / subwarp_size; + auto num_blocks = ceildiv(total_nnz, block_size); + if (num_blocks > 0) { + kernel::ict_sweep + <<get_stream()>>>( + a->get_const_row_ptrs(), a->get_const_col_idxs(), + as_device_type(a->get_const_values()), l->get_const_row_ptrs(), + l_coo->get_const_row_idxs(), l->get_const_col_idxs(), + as_device_type(l->get_values()), + static_cast(l->get_num_stored_elements())); + } +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor); + + +} // namespace + + +template +void add_candidates(std::shared_ptr exec, + const matrix::Csr* llh, + const matrix::Csr* a, + const matrix::Csr* l, + matrix::Csr* l_new) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = + llh->get_num_stored_elements() + a->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_add_candidates( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, llh, a, l, l_new); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); + + +template +void compute_factor(std::shared_ptr exec, + const matrix::Csr* a, + matrix::Csr* l, + const matrix::Coo* l_coo) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = 2 * l->get_num_stored_elements(); + auto total_nnz_per_row = total_nnz / num_rows; + select_compute_factor( + compiled_kernels(), + [&](int 
compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); + + +} // namespace par_ict_factorization +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilu_kernels.cpp similarity index 83% rename from common/cuda_hip/factorization/par_ilu_kernels.hpp.inc rename to common/cuda_hip/factorization/par_ilu_kernels.cpp index 1029c0d08f6..abecf288e49 100644 --- a/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ilu_kernels.cpp @@ -2,6 +2,32 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/memory.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/factorization/par_ilu_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The parallel ilu factorization namespace. 
+ * + * @ingroup factor + */ +namespace par_ilu_factorization { + + +constexpr int default_block_size{512}; + + namespace kernel { @@ -85,3 +111,9 @@ void compute_l_u_factors(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL); + + +} // namespace par_ilu_factorization +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp similarity index 57% rename from common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc rename to common/cuda_hip/factorization/par_ilut_filter_kernels.cpp index 68794bfc8d1..3622f971878 100644 --- a/common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp @@ -2,6 +2,49 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/factorization/par_ilut_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The parallel ILUT factorization namespace. 
+ * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +constexpr int default_block_size = 512; + + +// subwarp sizes for filter kernels +using compiled_kernels = + syn::value_list; + + namespace kernel { @@ -162,3 +205,96 @@ __global__ __launch_bounds__(default_block_size) void bucket_filter( } // namespace kernel + + +namespace { + + +template +void threshold_filter(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* a, + remove_complex threshold, + matrix::Csr* m_out, + matrix::Coo* m_out_coo, bool lower) +{ + auto old_row_ptrs = a->get_const_row_ptrs(); + auto old_col_idxs = a->get_const_col_idxs(); + auto old_vals = a->get_const_values(); + // compute nnz for each row + auto num_rows = static_cast(a->get_size()[0]); + auto block_size = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, block_size); + auto new_row_ptrs = m_out->get_row_ptrs(); + if (num_blocks > 0) { + kernel::threshold_filter_nnz + <<get_stream()>>>( + old_row_ptrs, as_device_type(old_vals), num_rows, + as_device_type(threshold), new_row_ptrs, lower); + } + + // build row pointers + components::prefix_sum_nonnegative(exec, new_row_ptrs, num_rows + 1); + + // build matrix + auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows); + // resize arrays and update aliases + matrix::CsrBuilder builder{m_out}; + builder.get_col_idx_array().resize_and_reset(new_nnz); + builder.get_value_array().resize_and_reset(new_nnz); + auto new_col_idxs = m_out->get_col_idxs(); + auto new_vals = m_out->get_values(); + IndexType* new_row_idxs{}; + if (m_out_coo) { + matrix::CooBuilder coo_builder{m_out_coo}; + coo_builder.get_row_idx_array().resize_and_reset(new_nnz); + coo_builder.get_col_idx_array() = + make_array_view(exec, new_nnz, new_col_idxs); + coo_builder.get_value_array() = + make_array_view(exec, new_nnz, new_vals); + new_row_idxs = m_out_coo->get_row_idxs(); + } + if (num_blocks > 0) { + kernel::threshold_filter + <<get_stream()>>>( + old_row_ptrs, 
old_col_idxs, as_device_type(old_vals), num_rows, + as_device_type(threshold), new_row_ptrs, new_row_idxs, + new_col_idxs, as_device_type(new_vals), lower); + } +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter, threshold_filter); + + +} // namespace + +template +void threshold_filter(std::shared_ptr exec, + const matrix::Csr* a, + remove_complex threshold, + matrix::Csr* m_out, + matrix::Coo* m_out_coo, bool lower) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = a->get_num_stored_elements(); + // an empty matrix must not cause a division by zero + auto total_nnz_per_row = num_rows > 0 ? total_nnz / num_rows : 0; + select_threshold_filter( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, a, threshold, m_out, + m_out_coo, lower); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_select_kernels.cpp similarity index 63% rename from common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc rename to common/cuda_hip/factorization/par_ilut_select_kernels.cpp index 2ee5061d4c5..5c00503923a 100644 --- a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ilut_select_kernels.cpp @@ -2,6 +2,40 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/factorization/par_ilut_kernels.hpp" + + +#include +#include + + +#include +#include +#include + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include 
"common/cuda_hip/components/prefix_sum.hpp" +#include 
"common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/sorting.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The parallel ILUT factorization namespace. + * + * @ingroup factor + */ +namespace par_ilut_factorization { + + namespace kernel { @@ -278,3 +312,127 @@ __global__ __launch_bounds__(config::warp_size) void find_bucket( } // namespace kernel + + +template +void sampleselect_filter(std::shared_ptr exec, + const ValueType* values, IndexType size, + const unsigned char* oracles, + const IndexType* partial_counts, IndexType bucket, + remove_complex* out) +{ + auto num_threads_total = ceildiv(size, items_per_thread); + auto num_blocks = + static_cast(ceildiv(num_threads_total, default_block_size)); + if (num_blocks > 0) { + kernel::filter_bucket<<get_stream()>>>( + as_device_type(values), size, bucket, oracles, partial_counts, + as_device_type(out), items_per_thread); + } +} + + +template +void threshold_select(std::shared_ptr exec, + const matrix::Csr* m, + IndexType rank, array& tmp1, + array>& tmp2, + remove_complex& threshold) +{ + auto values = m->get_const_values(); + IndexType size = m->get_num_stored_elements(); + using AbsType = remove_complex; + constexpr auto bucket_count = kernel::searchtree_width; + auto max_num_threads = ceildiv(size, items_per_thread); + auto max_num_blocks = ceildiv(max_num_threads, default_block_size); + + size_type tmp_size_totals = + ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType)); + size_type tmp_size_partials = ceildiv( + bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType)); + size_type tmp_size_oracles = + ceildiv(size * sizeof(unsigned char), sizeof(ValueType)); + size_type tmp_size_tree = + ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType)); + size_type tmp_size_vals = + size / bucket_count 
* 4; // pessimistic estimate for temporary storage + size_type tmp_size = + tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree; + tmp1.resize_and_reset(tmp_size); + tmp2.resize_and_reset(tmp_size_vals); + + auto total_counts = reinterpret_cast(tmp1.get_data()); + auto partial_counts = + reinterpret_cast(tmp1.get_data() + tmp_size_totals); + auto oracles = reinterpret_cast( + tmp1.get_data() + tmp_size_totals + tmp_size_partials); + auto tree = + reinterpret_cast(tmp1.get_data() + tmp_size_totals + + tmp_size_partials + tmp_size_oracles); + + sampleselect_count(exec, values, size, tree, oracles, partial_counts, + total_counts); + + // determine bucket with correct rank, use bucket-local rank + auto bucket = sampleselect_find_bucket(exec, total_counts, rank); + rank -= bucket.begin; + + if (bucket.size * 2 > tmp_size_vals) { + // we need to reallocate tmp2 + tmp2.resize_and_reset(bucket.size * 2); + } + auto tmp21 = tmp2.get_data(); + auto tmp22 = tmp2.get_data() + bucket.size; + // extract target bucket + sampleselect_filter(exec, values, size, oracles, partial_counts, bucket.idx, + tmp22); + + // recursively select from smaller buckets + int step{}; + while (bucket.size > kernel::basecase_size) { + std::swap(tmp21, tmp22); + const auto* tmp_in = tmp21; + auto tmp_out = tmp22; + + sampleselect_count(exec, tmp_in, bucket.size, tree, oracles, + partial_counts, total_counts); + auto new_bucket = sampleselect_find_bucket(exec, total_counts, rank); + // extract the bucket found in this round, not the initial one + sampleselect_filter(exec, tmp_in, bucket.size, oracles, partial_counts, + new_bucket.idx, tmp_out); + + rank -= new_bucket.begin; + bucket.size = new_bucket.size; + // we should never need more than 5 recursion steps, this would mean + // 256^5 = 2^40. fall back to standard library algorithm in that case. 
+ ++step; + if (step > 5) { + array cpu_out_array{ + exec->get_master(), + make_array_view(exec, bucket.size, tmp_out)}; + auto begin = cpu_out_array.get_data(); + auto end = begin + bucket.size; + auto middle = begin + rank; + std::nth_element(begin, middle, end); + threshold = *middle; + return; + } + } + + // base case + auto out_ptr = reinterpret_cast(tmp1.get_data()); + kernel::basecase_select<<<1, kernel::basecase_block_size, 0, + exec->get_stream()>>>( + as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr)); + threshold = exec->copy_val_to_host(out_ptr); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp similarity index 63% rename from common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc rename to common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp index a97f0f08937..b9658f69f70 100644 --- a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp @@ -2,6 +2,50 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/factorization/par_ilut_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" 
+#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The parallel ILUT factorization namespace. + * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +constexpr int default_block_size = 512; + + +// subwarp sizes for add_candidates kernels +using compiled_kernels = + syn::value_list; + + namespace kernel { @@ -246,3 +290,114 @@ __global__ __launch_bounds__(default_block_size) void tri_spgeam_init( } // namespace kernel + + +namespace { + + +template +void add_candidates(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* lu, + const matrix::Csr* a, + const matrix::Csr* l, + const matrix::Csr* u, + matrix::Csr* l_new, + matrix::Csr* u_new) +{ + auto num_rows = static_cast(lu->get_size()[0]); + auto subwarps_per_block = default_block_size / subwarp_size; + auto num_blocks = ceildiv(num_rows, subwarps_per_block); + matrix::CsrBuilder l_new_builder(l_new); + matrix::CsrBuilder u_new_builder(u_new); + auto lu_row_ptrs = lu->get_const_row_ptrs(); + auto lu_col_idxs = lu->get_const_col_idxs(); + auto lu_vals = lu->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto a_vals = a->get_const_values(); + auto l_row_ptrs = l->get_const_row_ptrs(); + auto l_col_idxs = l->get_const_col_idxs(); + auto l_vals = l->get_const_values(); + auto u_row_ptrs = u->get_const_row_ptrs(); + auto u_col_idxs = u->get_const_col_idxs(); + auto u_vals = u->get_const_values(); + auto l_new_row_ptrs = l_new->get_row_ptrs(); + auto u_new_row_ptrs = u_new->get_row_ptrs(); + if (num_blocks > 0) { + // count non-zeros per row + kernel::tri_spgeam_nnz + <<get_stream()>>>( + lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs, + l_new_row_ptrs, u_new_row_ptrs, num_rows); + } + + // build row ptrs + components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1); + 
components::prefix_sum_nonnegative(exec, u_new_row_ptrs, num_rows + 1); + + // resize output arrays + auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); + auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows); + l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); + l_new_builder.get_value_array().resize_and_reset(l_new_nnz); + u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz); + u_new_builder.get_value_array().resize_and_reset(u_new_nnz); + + auto l_new_col_idxs = l_new->get_col_idxs(); + auto l_new_vals = l_new->get_values(); + auto u_new_col_idxs = u_new->get_col_idxs(); + auto u_new_vals = u_new->get_values(); + + if (num_blocks > 0) { + // fill columns and values + kernel::tri_spgeam_init + <<get_stream()>>>( + lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs, + a_col_idxs, as_device_type(a_vals), l_row_ptrs, l_col_idxs, + as_device_type(l_vals), u_row_ptrs, u_col_idxs, + as_device_type(u_vals), l_new_row_ptrs, l_new_col_idxs, + as_device_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs, + as_device_type(u_new_vals), num_rows); + } +} + + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); + + +} // namespace + + +template +void add_candidates(std::shared_ptr exec, + const matrix::Csr* lu, + const matrix::Csr* a, + const matrix::Csr* l, + const matrix::Csr* u, + matrix::Csr* l_new, + matrix::Csr* u_new) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = + lu->get_num_stored_elements() + a->get_num_stored_elements(); + // an empty matrix must not cause a division by zero + auto total_nnz_per_row = num_rows > 0 ? total_nnz / num_rows : 0; + select_add_candidates( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, lu, a, l, u, l_new, + u_new); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + 
GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); + + +} // namespace par_ilut_factorization 
+} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp b/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp new file mode 100644 index 00000000000..6ae783133e5 --- /dev/null +++ b/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp @@ -0,0 +1,213 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/par_ilut_kernels.hpp" + + +#include +#include +#include +#include +#include + + +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/memory.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/searching.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/coo_builder.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The parallel ILUT factorization namespace. 
+ * + * @ingroup factor + */ +namespace par_ilut_factorization { + + +constexpr int default_block_size = 512; + + +// subwarp sizes for all warp-parallel kernels (filter, add_candidates) +using compiled_kernels = + syn::value_list; + + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void sweep( + const IndexType* __restrict__ a_row_ptrs, + const IndexType* __restrict__ a_col_idxs, + const ValueType* __restrict__ a_vals, + const IndexType* __restrict__ l_row_ptrs, + const IndexType* __restrict__ l_row_idxs, + const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals, + IndexType l_nnz, const IndexType* __restrict__ u_row_idxs, + const IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_vals, + const IndexType* __restrict__ ut_col_ptrs, + const IndexType* __restrict__ ut_row_idxs, ValueType* __restrict__ ut_vals, + IndexType u_nnz) +{ + auto tidx = thread::get_subwarp_id_flat(); + if (tidx >= l_nnz + u_nnz) { + return; + } + // split the subwarps into two halves for lower and upper triangle + auto l_nz = tidx; + auto u_nz = l_nz - l_nnz; + auto lower = u_nz < 0; + auto row = lower ? l_row_idxs[l_nz] : u_row_idxs[u_nz]; + auto col = lower ? l_col_idxs[l_nz] : u_col_idxs[u_nz]; + if (lower && row == col) { + // don't update the diagonal twice + return; + } + auto subwarp = + group::tiled_partition(group::this_thread_block()); + // find entry of A at (row, col) + auto a_row_begin = a_row_ptrs[row]; + auto a_row_end = a_row_ptrs[row + 1]; + auto a_row_size = a_row_end - a_row_begin; + auto a_idx = + group_wide_search(a_row_begin, a_row_size, subwarp, + [&](IndexType i) { return a_col_idxs[i] >= col; }); + bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col; + auto a_val = has_a ? 
a_vals[a_idx] : zero(); + auto l_row_begin = l_row_ptrs[row]; + auto l_row_size = l_row_ptrs[row + 1] - l_row_begin; + auto ut_col_begin = ut_col_ptrs[col]; + auto ut_col_size = ut_col_ptrs[col + 1] - ut_col_begin; + ValueType sum{}; + IndexType ut_nz{}; + auto last_entry = min(row, col); + group_merge( + l_col_idxs + l_row_begin, l_row_size, ut_row_idxs + ut_col_begin, + ut_col_size, subwarp, + [&](IndexType l_idx, IndexType l_col, IndexType ut_idx, + IndexType ut_row, IndexType, bool) { + // we don't need to use the `bool valid` because last_entry is + // already a smaller sentinel value than the one used in group_merge + if (l_col == ut_row && l_col < last_entry) { + sum += load_relaxed(l_vals + (l_idx + l_row_begin)) * + load_relaxed(ut_vals + (ut_idx + ut_col_begin)); + } + // remember the transposed element + auto found_transp = subwarp.ballot(ut_row == row); + if (found_transp) { + ut_nz = + subwarp.shfl(ut_idx + ut_col_begin, ffs(found_transp) - 1); + } + return true; + }); + // accumulate result from all threads + sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; }); + + if (subwarp.thread_rank() == 0) { + if (lower) { + auto to_write = (a_val - sum) / + load_relaxed(ut_vals + (ut_col_ptrs[col + 1] - 1)); + if (is_finite(to_write)) { + store_relaxed(l_vals + l_nz, to_write); + } + } else { + auto to_write = a_val - sum; + if (is_finite(to_write)) { + store_relaxed(u_vals + u_nz, to_write); + store_relaxed(ut_vals + ut_nz, to_write); + } + } + } +} + + +} // namespace kernel + + +namespace { + + +template +void compute_l_u_factors(syn::value_list, + std::shared_ptr exec, + const matrix::Csr* a, + matrix::Csr* l, + const matrix::Coo* l_coo, + matrix::Csr* u, + const matrix::Coo* u_coo, + matrix::Csr* u_csc) +{ + auto total_nnz = static_cast(l->get_num_stored_elements() + + u->get_num_stored_elements()); + auto block_size = default_block_size / subwarp_size; + auto num_blocks = ceildiv(total_nnz, block_size); + if (num_blocks > 0) { + 
kernel::sweep + <<get_stream()>>>( + a->get_const_row_ptrs(), a->get_const_col_idxs(), + as_device_type(a->get_const_values()), l->get_const_row_ptrs(), + l_coo->get_const_row_idxs(), l->get_const_col_idxs(), + as_device_type(l->get_values()), + static_cast(l->get_num_stored_elements()), + u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(), + as_device_type(u->get_values()), u_csc->get_const_row_ptrs(), + u_csc->get_const_col_idxs(), + as_device_type(u_csc->get_values()), + static_cast(u->get_num_stored_elements())); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors, + compute_l_u_factors); + + +} // namespace + + +template +void compute_l_u_factors(std::shared_ptr exec, + const matrix::Csr* a, + matrix::Csr* l, + const matrix::Coo* l_coo, + matrix::Csr* u, + const matrix::Coo* u_coo, + matrix::Csr* u_csc) +{ + auto num_rows = a->get_size()[0]; + auto total_nnz = + l->get_num_stored_elements() + u->get_num_stored_elements(); + // an empty matrix must not cause a division by zero + auto total_nnz_per_row = num_rows > 0 ? total_nnz / num_rows : 0; + select_compute_l_u_factors( + compiled_kernels(), + [&](int compiled_subwarp_size) { + return total_nnz_per_row <= compiled_subwarp_size || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo, + u_csc); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL); + + +} // namespace par_ilut_factorization +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc deleted file mode 100644 index 9da94a878b3..00000000000 --- a/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc +++ /dev/null @@ -1,94 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -namespace kernel { 
- - -template -__global__ __launch_bounds__(default_block_size) 
void sweep( - const IndexType* __restrict__ a_row_ptrs, - const IndexType* __restrict__ a_col_idxs, - const ValueType* __restrict__ a_vals, - const IndexType* __restrict__ l_row_ptrs, - const IndexType* __restrict__ l_row_idxs, - const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals, - IndexType l_nnz, const IndexType* __restrict__ u_row_idxs, - const IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_vals, - const IndexType* __restrict__ ut_col_ptrs, - const IndexType* __restrict__ ut_row_idxs, ValueType* __restrict__ ut_vals, - IndexType u_nnz) -{ - auto tidx = thread::get_subwarp_id_flat(); - if (tidx >= l_nnz + u_nnz) { - return; - } - // split the subwarps into two halves for lower and upper triangle - auto l_nz = tidx; - auto u_nz = l_nz - l_nnz; - auto lower = u_nz < 0; - auto row = lower ? l_row_idxs[l_nz] : u_row_idxs[u_nz]; - auto col = lower ? l_col_idxs[l_nz] : u_col_idxs[u_nz]; - if (lower && row == col) { - // don't update the diagonal twice - return; - } - auto subwarp = - group::tiled_partition(group::this_thread_block()); - // find entry of A at (row, col) - auto a_row_begin = a_row_ptrs[row]; - auto a_row_end = a_row_ptrs[row + 1]; - auto a_row_size = a_row_end - a_row_begin; - auto a_idx = - group_wide_search(a_row_begin, a_row_size, subwarp, - [&](IndexType i) { return a_col_idxs[i] >= col; }); - bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col; - auto a_val = has_a ? 
a_vals[a_idx] : zero(); - auto l_row_begin = l_row_ptrs[row]; - auto l_row_size = l_row_ptrs[row + 1] - l_row_begin; - auto ut_col_begin = ut_col_ptrs[col]; - auto ut_col_size = ut_col_ptrs[col + 1] - ut_col_begin; - ValueType sum{}; - IndexType ut_nz{}; - auto last_entry = min(row, col); - group_merge( - l_col_idxs + l_row_begin, l_row_size, ut_row_idxs + ut_col_begin, - ut_col_size, subwarp, - [&](IndexType l_idx, IndexType l_col, IndexType ut_idx, - IndexType ut_row, IndexType, bool) { - // we don't need to use the `bool valid` because last_entry is - // already a smaller sentinel value than the one used in group_merge - if (l_col == ut_row && l_col < last_entry) { - sum += load_relaxed(l_vals + (l_idx + l_row_begin)) * - load_relaxed(ut_vals + (ut_idx + ut_col_begin)); - } - // remember the transposed element - auto found_transp = subwarp.ballot(ut_row == row); - if (found_transp) { - ut_nz = - subwarp.shfl(ut_idx + ut_col_begin, ffs(found_transp) - 1); - } - return true; - }); - // accumulate result from all threads - sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; }); - - if (subwarp.thread_rank() == 0) { - if (lower) { - auto to_write = (a_val - sum) / - load_relaxed(ut_vals + (ut_col_ptrs[col + 1] - 1)); - if (is_finite(to_write)) { - store_relaxed(l_vals + l_nz, to_write); - } - } else { - auto to_write = a_val - sum; - if (is_finite(to_write)) { - store_relaxed(u_vals + u_nz, to_write); - store_relaxed(ut_vals + ut_nz, to_write); - } - } - } -} - - -} // namespace kernel diff --git a/common/cuda_hip/log/batch_logger.hpp.inc b/common/cuda_hip/log/batch_logger.hpp similarity index 68% rename from common/cuda_hip/log/batch_logger.hpp.inc rename to common/cuda_hip/log/batch_logger.hpp index 04b614b50f9..77ec84fb7bd 100644 --- a/common/cuda_hip/log/batch_logger.hpp.inc +++ b/common/cuda_hip/log/batch_logger.hpp @@ -2,6 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_ +#define 
GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_ + + +#include + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_log { + + /** * @see reference/log/batch_logger.hpp */ @@ -28,3 +41,12 @@ class SimpleFinalLogger final { real_type* const final_residuals_; idx_type* const final_iters_; }; + + +} // namespace batch_log +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + + +#endif // GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_ diff --git a/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc b/common/cuda_hip/matrix/batch_csr_kernels.cpp similarity index 87% rename from common/cuda_hip/matrix/batch_csr_kernels.hpp.inc rename to common/cuda_hip/matrix/batch_csr_kernels.cpp index e041dadaa3e..a07074e29e8 100644 --- a/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/batch_csr_kernels.cpp @@ -2,6 +2,49 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include + + +#include + + +#include +#include +#include + + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_csr_kernels.hpp" +#include "core/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Csr matrix format namespace. 
+ * @ref Csr + * @ingroup batch_csr + */ +namespace batch_csr { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + + template __device__ __forceinline__ void simple_apply( const gko::batch::matrix::csr::batch_item& mat, @@ -196,3 +239,14 @@ __global__ void add_scaled_identity_kernel( add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b); } } + + +#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_csr +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernels.cpp similarity index 89% rename from common/cuda_hip/matrix/batch_dense_kernels.hpp.inc rename to common/cuda_hip/matrix/batch_dense_kernels.cpp index f8abf9131a1..b5c2dbe1d5d 100644 --- a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/batch_dense_kernels.cpp @@ -2,6 +2,49 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include + + +#include + + +#include +#include +#include + + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_dense_kernels.hpp" +#include "core/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. 
+ * + * @ingroup batch_dense + */ +namespace batch_dense { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + + template __device__ __forceinline__ void simple_apply( const gko::batch::matrix::dense::batch_item& mat, @@ -243,3 +286,15 @@ __global__ void add_scaled_identity_kernel( add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b); } } + + +#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" + + +// clang-format on + + +} // namespace batch_dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.cpp similarity index 87% rename from common/cuda_hip/matrix/batch_ell_kernels.hpp.inc rename to common/cuda_hip/matrix/batch_ell_kernels.cpp index 0a6d1927c96..c3bf21c7744 100644 --- a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc +++ b/common/cuda_hip/matrix/batch_ell_kernels.cpp @@ -2,6 +2,49 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include + + +#include + + +#include +#include +#include + + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/batch_struct.hpp" +#include "core/matrix/batch_ell_kernels.hpp" +#include "core/matrix/batch_struct.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Ell matrix format namespace. 
+ * @ref Ell + * @ingroup batch_ell + */ +namespace batch_ell { + + +constexpr auto default_block_size = 256; +constexpr int sm_oversubscription = 4; + +// clang-format off + +// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES + + template __device__ __forceinline__ void simple_apply( const gko::batch::matrix::ell::batch_item& mat, @@ -205,3 +248,14 @@ __global__ void add_scaled_identity_kernel( add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b); } } + + +#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" + +// clang-format on + + +} // namespace batch_ell +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/matrix/coo_kernels.hpp.inc b/common/cuda_hip/matrix/coo_kernels.cpp similarity index 91% rename from common/cuda_hip/matrix/coo_kernels.hpp.inc rename to common/cuda_hip/matrix/coo_kernels.cpp index 98332f6cd7b..103926124a2 100644 --- a/common/cuda_hip/matrix/coo_kernels.hpp.inc +++ b/common/cuda_hip/matrix/coo_kernels.cpp @@ -2,6 +2,42 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include +#include +#include +#include + + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/segment_scan.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/matrix/coo_kernels.hpp" +#include "core/matrix/dense_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Coordinate matrix format namespace. 
+ * + * @ingroup coo + */ +namespace coo { + + +constexpr int warps_in_block = 4; +constexpr int spmv_block_size = warps_in_block * config::warp_size; + + namespace { @@ -304,3 +340,9 @@ void advanced_spmv2(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL); + + +} // namespace coo +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.cpp similarity index 75% rename from common/cuda_hip/matrix/dense_kernels.hpp.inc rename to common/cuda_hip/matrix/dense_kernels.cpp index b48d2c4ff4f..1524e0a93b0 100644 --- a/common/cuda_hip/matrix/dense_kernels.hpp.inc +++ b/common/cuda_hip/matrix/dense_kernels.cpp @@ -2,6 +2,46 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/intrinsics.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/utils.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/dense_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. 
+ * + * @ingroup dense + */ +namespace dense { + + +constexpr int default_block_size = 512; + + namespace kernel { @@ -619,3 +659,188 @@ void convert_to_sparsity_csr(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); + + +template +void compute_dot_dispatch(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result, array& tmp) +{ + if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + blas::dot(handle, x->get_size()[0], x->get_const_values(), + x->get_stride(), y->get_const_values(), y->get_stride(), + result->get_values()); + } else { + compute_dot(exec, x, y, result, tmp); + } + } else { + compute_dot(exec, x, y, result, tmp); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); + + +template +void compute_conj_dot_dispatch(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result, + array& tmp) +{ + if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + blas::conj_dot(handle, x->get_size()[0], x->get_const_values(), + x->get_stride(), y->get_const_values(), + y->get_stride(), result->get_values()); + } else { + compute_conj_dot(exec, x, y, result, tmp); + } + } else { + compute_conj_dot(exec, x, y, result, tmp); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); + + +template +void compute_norm2_dispatch(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result, + array& tmp) +{ + if (x->get_size()[1] == 1) { + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + blas::norm2(handle, x->get_size()[0], x->get_const_values(), + x->get_stride(), result->get_values()); + } else { + compute_norm2(exec, x, result, tmp); + } + } else { + 
compute_norm2(exec, x, result, tmp); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); + + +template +void simple_apply(std::shared_ptr exec, + const matrix::Dense* a, + const matrix::Dense* b, + matrix::Dense* c) +{ + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { + if (a->get_size()[1] > 0) { + blas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1], + c->get_size()[0], a->get_size()[1], &alpha, + b->get_const_values(), b->get_stride(), + a->get_const_values(), a->get_stride(), &beta, + c->get_values(), c->get_stride()); + } else { + dense::fill(exec, c, zero()); + } + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); + + +template +void apply(std::shared_ptr exec, + const matrix::Dense* alpha, + const matrix::Dense* a, const matrix::Dense* b, + const matrix::Dense* beta, matrix::Dense* c) +{ + if (blas::is_supported::value) { + if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { + if (a->get_size()[1] > 0) { + blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N, + c->get_size()[1], c->get_size()[0], a->get_size()[1], + alpha->get_const_values(), b->get_const_values(), + b->get_stride(), a->get_const_values(), + a->get_stride(), beta->get_const_values(), + c->get_values(), c->get_stride()); + } else { + dense::scale(exec, beta, c); + } + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); + + +template +void transpose(std::shared_ptr exec, + const matrix::Dense* orig, + matrix::Dense* trans) +{ + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { + blas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = 
zero(); + blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0], + orig->get_size()[1], &alpha, orig->get_const_values(), + orig->get_stride(), &beta, trans->get_const_values(), + trans->get_stride(), trans->get_values(), + trans->get_stride()); + } + } else { + GKO_NOT_IMPLEMENTED; + } +}; + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); + + +template +void conj_transpose(std::shared_ptr exec, + const matrix::Dense* orig, + matrix::Dense* trans) +{ + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { + blas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0], + orig->get_size()[1], &alpha, orig->get_const_values(), + orig->get_stride(), &beta, trans->get_const_values(), + trans->get_stride(), trans->get_values(), + trans->get_stride()); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/matrix/diagonal_kernels.hpp.inc b/common/cuda_hip/matrix/diagonal_kernels.cpp similarity index 73% rename from common/cuda_hip/matrix/diagonal_kernels.hpp.inc rename to common/cuda_hip/matrix/diagonal_kernels.cpp index c3919fda079..75f07d7373e 100644 --- a/common/cuda_hip/matrix/diagonal_kernels.hpp.inc +++ b/common/cuda_hip/matrix/diagonal_kernels.cpp @@ -2,6 +2,32 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include + + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/matrix/diagonal_kernels.hpp" + + +namespace gko { +namespace kernels { 
+namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Diagonal matrix format namespace. + * + * @ingroup diagonal + */ +namespace diagonal { + + +constexpr int default_block_size = 512; + + namespace kernel { @@ -57,3 +83,9 @@ void apply_to_csr(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL); + + +} // namespace diagonal +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/cuda/matrix/ell_kernels.cu b/common/cuda_hip/matrix/ell_kernels.cpp similarity index 57% rename from cuda/matrix/ell_kernels.cu rename to common/cuda_hip/matrix/ell_kernels.cpp index e91b03c816d..96e9dac9d78 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/common/cuda_hip/matrix/ell_kernels.cpp @@ -2,9 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/matrix/ell_kernels.hpp" - - +#include #include @@ -15,27 +13,27 @@ #include -#include "accessor/cuda_helper.hpp" #include "accessor/reduced_row_major.hpp" +#include "common/cuda_hip/base/config.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/format_conversion.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/dense_kernels.hpp" +#include "core/matrix/ell_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cusparse_bindings.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include 
"cuda/components/format_conversion.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" namespace gko { namespace kernels { -namespace cuda { +namespace GKO_DEVICE_NAMESPACE { /** * @brief The ELL matrix format namespace. * @@ -78,7 +76,135 @@ constexpr int max_thread_per_worker = 32; using compiled_kernels = syn::value_list; -#include "common/cuda_hip/matrix/ell_kernels.hpp.inc" +namespace kernel { + + +template +__device__ void spmv_kernel( + const size_type num_rows, const int num_worker_per_row, + acc::range val, const IndexType* __restrict__ col, + const size_type stride, const size_type num_stored_elements_per_row, + acc::range b, OutputValueType* __restrict__ c, + const size_type c_stride, Closure op) +{ + using arithmetic_type = typename a_accessor::arithmetic_type; + const auto tidx = thread::get_thread_id_flat(); + const decltype(tidx) column_id = blockIdx.y; + if (num_thread_per_worker == 1) { + // Specialize the num_thread_per_worker = 1. 
It doesn't need the shared + // memory, __syncthreads, and atomic_add + if (tidx < num_rows) { + auto temp = zero(); + for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { + const auto ind = tidx + idx * stride; + const auto col_idx = col[ind]; + if (col_idx == invalid_index()) { + break; + } else { + temp += val(ind) * b(col_idx, column_id); + } + } + const auto c_ind = tidx * c_stride + column_id; + c[c_ind] = op(temp, c[c_ind]); + } + } else { + if (tidx < num_worker_per_row * num_rows) { + const auto idx_in_worker = threadIdx.y; + const auto x = tidx % num_rows; + const auto worker_id = tidx / num_rows; + const auto step_size = num_worker_per_row * num_thread_per_worker; + __shared__ uninitialized_array< + arithmetic_type, default_block_size / num_thread_per_worker> + storage; + if (idx_in_worker == 0) { + storage[threadIdx.x] = 0; + } + __syncthreads(); + auto temp = zero(); + for (size_type idx = + worker_id * num_thread_per_worker + idx_in_worker; + idx < num_stored_elements_per_row; idx += step_size) { + const auto ind = x + idx * stride; + const auto col_idx = col[ind]; + if (col_idx == invalid_index()) { + break; + } else { + temp += val(ind) * b(col_idx, column_id); + } + } + atomic_add(&storage[threadIdx.x], temp); + __syncthreads(); + if (idx_in_worker == 0) { + const auto c_ind = x * c_stride + column_id; + if (atomic) { + atomic_add(&(c[c_ind]), op(storage[threadIdx.x], c[c_ind])); + } else { + c[c_ind] = op(storage[threadIdx.x], c[c_ind]); + } + } + } + } +} + + +template +__global__ __launch_bounds__(default_block_size) void spmv( + const size_type num_rows, const int num_worker_per_row, + acc::range val, const IndexType* __restrict__ col, + const size_type stride, const size_type num_stored_elements_per_row, + acc::range b, OutputValueType* __restrict__ c, + const size_type c_stride) +{ + spmv_kernel( + num_rows, num_worker_per_row, val, col, stride, + num_stored_elements_per_row, b, c, c_stride, + [](const auto& x, const 
OutputValueType& y) { + return static_cast(x); + }); +} + + +template +__global__ __launch_bounds__(default_block_size) void spmv( + const size_type num_rows, const int num_worker_per_row, + acc::range alpha, acc::range val, + const IndexType* __restrict__ col, const size_type stride, + const size_type num_stored_elements_per_row, acc::range b, + const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c, + const size_type c_stride) +{ + using arithmetic_type = typename a_accessor::arithmetic_type; + const auto alpha_val = alpha(0); + const OutputValueType beta_val = beta[0]; + if (atomic) { + // Because the atomic operation changes the values of c during + // computation, it can not directly do alpha * a * b + beta * c + // operation. The beta * c needs to be done before calling this kernel. + // Then, this kernel only adds alpha * a * b when it uses atomic + // operation. + spmv_kernel( + num_rows, num_worker_per_row, val, col, stride, + num_stored_elements_per_row, b, c, c_stride, + [&alpha_val](const auto& x, const OutputValueType& y) { + return static_cast(alpha_val * x); + }); + } else { + spmv_kernel( + num_rows, num_worker_per_row, val, col, stride, + num_stored_elements_per_row, b, c, c_stride, + [&alpha_val, &beta_val](const auto& x, const OutputValueType& y) { + return static_cast( + alpha_val * x + static_cast(beta_val * y)); + }); + } +} + + +} // namespace kernel namespace { @@ -131,9 +257,9 @@ void abstract_spmv(syn::value_list, if (grid_size.x > 0 && grid_size.y > 0) { kernel::spmv <<get_stream()>>>( - nrows, num_worker_per_row, acc::as_cuda_range(a_vals), + nrows, num_worker_per_row, acc::as_device_range(a_vals), a->get_const_col_idxs(), stride, - num_stored_elements_per_row, acc::as_cuda_range(b_vals), + num_stored_elements_per_row, acc::as_device_range(b_vals), as_device_type(c->get_values()), c->get_stride()); } } else if (alpha != nullptr && beta != nullptr) { @@ -142,9 +268,10 @@ void abstract_spmv(syn::value_list, if 
(grid_size.x > 0 && grid_size.y > 0) { kernel::spmv <<get_stream()>>>( - nrows, num_worker_per_row, acc::as_cuda_range(alpha_val), - acc::as_cuda_range(a_vals), a->get_const_col_idxs(), stride, - num_stored_elements_per_row, acc::as_cuda_range(b_vals), + nrows, num_worker_per_row, acc::as_device_range(alpha_val), + acc::as_device_range(a_vals), a->get_const_col_idxs(), + stride, num_stored_elements_per_row, + acc::as_device_range(b_vals), as_device_type(beta->get_const_values()), as_device_type(c->get_values()), c->get_stride()); } @@ -158,7 +285,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_abstract_spmv, abstract_spmv); template std::array compute_thread_worker_and_atomicity( - std::shared_ptr exec, + std::shared_ptr exec, const matrix::Ell* a) { int num_thread_per_worker = 1; @@ -202,7 +329,7 @@ std::array compute_thread_worker_and_atomicity( template -void spmv(std::shared_ptr exec, +void spmv(std::shared_ptr exec, const matrix::Ell* a, const matrix::Dense* b, matrix::Dense* c) @@ -234,7 +361,7 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( template -void advanced_spmv(std::shared_ptr exec, +void advanced_spmv(std::shared_ptr exec, const matrix::Dense* alpha, const matrix::Ell* a, const matrix::Dense* b, @@ -267,6 +394,6 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( } // namespace ell -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/common/cuda_hip/matrix/ell_kernels.hpp.inc b/common/cuda_hip/matrix/ell_kernels.hpp.inc deleted file mode 100644 index a5fd37c1d05..00000000000 --- a/common/cuda_hip/matrix/ell_kernels.hpp.inc +++ /dev/null @@ -1,133 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -namespace kernel { - - -template -__device__ void spmv_kernel( - const size_type num_rows, const int num_worker_per_row, - acc::range val, const IndexType* __restrict__ col, - const size_type stride, const size_type 
num_stored_elements_per_row, - acc::range b, OutputValueType* __restrict__ c, - const size_type c_stride, Closure op) -{ - using arithmetic_type = typename a_accessor::arithmetic_type; - const auto tidx = thread::get_thread_id_flat(); - const decltype(tidx) column_id = blockIdx.y; - if (num_thread_per_worker == 1) { - // Specialize the num_thread_per_worker = 1. It doesn't need the shared - // memory, __syncthreads, and atomic_add - if (tidx < num_rows) { - auto temp = zero(); - for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { - const auto ind = tidx + idx * stride; - const auto col_idx = col[ind]; - if (col_idx == invalid_index()) { - break; - } else { - temp += val(ind) * b(col_idx, column_id); - } - } - const auto c_ind = tidx * c_stride + column_id; - c[c_ind] = op(temp, c[c_ind]); - } - } else { - if (tidx < num_worker_per_row * num_rows) { - const auto idx_in_worker = threadIdx.y; - const auto x = tidx % num_rows; - const auto worker_id = tidx / num_rows; - const auto step_size = num_worker_per_row * num_thread_per_worker; - __shared__ uninitialized_array< - arithmetic_type, default_block_size / num_thread_per_worker> - storage; - if (idx_in_worker == 0) { - storage[threadIdx.x] = 0; - } - __syncthreads(); - auto temp = zero(); - for (size_type idx = - worker_id * num_thread_per_worker + idx_in_worker; - idx < num_stored_elements_per_row; idx += step_size) { - const auto ind = x + idx * stride; - const auto col_idx = col[ind]; - if (col_idx == invalid_index()) { - break; - } else { - temp += val(ind) * b(col_idx, column_id); - } - } - atomic_add(&storage[threadIdx.x], temp); - __syncthreads(); - if (idx_in_worker == 0) { - const auto c_ind = x * c_stride + column_id; - if (atomic) { - atomic_add(&(c[c_ind]), op(storage[threadIdx.x], c[c_ind])); - } else { - c[c_ind] = op(storage[threadIdx.x], c[c_ind]); - } - } - } - } -} - - -template -__global__ __launch_bounds__(default_block_size) void spmv( - const size_type num_rows, const int 
num_worker_per_row, - acc::range val, const IndexType* __restrict__ col, - const size_type stride, const size_type num_stored_elements_per_row, - acc::range b, OutputValueType* __restrict__ c, - const size_type c_stride) -{ - spmv_kernel( - num_rows, num_worker_per_row, val, col, stride, - num_stored_elements_per_row, b, c, c_stride, - [](const auto& x, const OutputValueType& y) { - return static_cast(x); - }); -} - - -template -__global__ __launch_bounds__(default_block_size) void spmv( - const size_type num_rows, const int num_worker_per_row, - acc::range alpha, acc::range val, - const IndexType* __restrict__ col, const size_type stride, - const size_type num_stored_elements_per_row, acc::range b, - const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c, - const size_type c_stride) -{ - using arithmetic_type = typename a_accessor::arithmetic_type; - const auto alpha_val = alpha(0); - const OutputValueType beta_val = beta[0]; - if (atomic) { - // Because the atomic operation changes the values of c during - // computation, it can not directly do alpha * a * b + beta * c - // operation. The beta * c needs to be done before calling this kernel. - // Then, this kernel only adds alpha * a * b when it uses atomic - // operation. 
- spmv_kernel( - num_rows, num_worker_per_row, val, col, stride, - num_stored_elements_per_row, b, c, c_stride, - [&alpha_val](const auto& x, const OutputValueType& y) { - return static_cast(alpha_val * x); - }); - } else { - spmv_kernel( - num_rows, num_worker_per_row, val, col, stride, - num_stored_elements_per_row, b, c, c_stride, - [&alpha_val, &beta_val](const auto& x, const OutputValueType& y) { - return static_cast( - alpha_val * x + static_cast(beta_val * y)); - }); - } -} - - -} // namespace kernel diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.cpp similarity index 57% rename from common/cuda_hip/matrix/fbcsr_kernels.hpp.inc rename to common/cuda_hip/matrix/fbcsr_kernels.cpp index d801876adbc..9e5eed5f570 100644 --- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/fbcsr_kernels.cpp @@ -2,6 +2,71 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include + + +#include +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include + + +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/pointer_mode_guard.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/prefix_sum.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/unified/base/kernel_launch.hpp" +#include "core/base/array_access.hpp" +#include "core/base/block_sizes.hpp" +#include 
"core/base/device_matrix_data_kernels.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/matrix/csr_lookup.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/matrix/fbcsr_kernels.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + +/** + * @brief The fixed-size block compressed sparse row matrix format namespace. + * + * @ingroup fbcsr + */ +namespace fbcsr { + + +constexpr int default_block_size{512}; + + +#include "common/cuda_hip/matrix/csr_common.hpp.inc" + namespace kernel { @@ -341,3 +406,235 @@ template void extract_diagonal(std::shared_ptr exec, const matrix::Fbcsr* orig, matrix::Diagonal* diag) GKO_NOT_IMPLEMENTED; + + +namespace { + + +template +void dense_transpose(std::shared_ptr exec, + const size_type nrows, const size_type ncols, + const size_type orig_stride, const ValueType* const orig, + const size_type trans_stride, ValueType* const trans) +{ + if (nrows == 0) { + return; + } + if (blas::is_supported::value) { + auto handle = exec->get_blas_handle(); + { + blas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig, + orig_stride, &beta, trans, trans_stride, trans, + trans_stride); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} + + +} // namespace + + +template +void spmv(std::shared_ptr exec, + const matrix::Fbcsr* const a, + const matrix::Dense* const b, + matrix::Dense* const c) +{ + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { + // empty input: fill output with zero + dense::fill(exec, c, zero()); + return; + } + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + sparselib::pointer_mode_guard pm_guard(handle); + 
const auto alpha = one(); + const auto beta = zero(); + auto descr = sparselib::create_mat_descr(); + const auto row_ptrs = a->get_const_row_ptrs(); + const auto col_idxs = a->get_const_col_idxs(); + const auto values = a->get_const_values(); + const int bs = a->get_block_size(); + const IndexType mb = a->get_num_block_rows(); + const IndexType nb = a->get_num_block_cols(); + const auto nnzb = static_cast(a->get_num_stored_blocks()); + const auto nrhs = static_cast(b->get_size()[1]); + const auto nrows = a->get_size()[0]; + const auto ncols = a->get_size()[1]; + const auto in_stride = b->get_stride(); + const auto out_stride = c->get_stride(); + if (nrhs == 1 && in_stride == 1 && out_stride == 1) { + sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, + nnzb, &alpha, descr, values, row_ptrs, col_idxs, + bs, b->get_const_values(), &beta, c->get_values()); + } else { + const auto trans_stride = nrows; + auto trans_c = array(exec, nrows * nrhs); + sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + &alpha, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), in_stride, &beta, + trans_c.get_data(), trans_stride); + dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), + out_stride, c->get_values()); + } + sparselib::destroy(descr); + } else { + GKO_NOT_IMPLEMENTED; + } +} + + +template +void advanced_spmv(std::shared_ptr exec, + const matrix::Dense* const alpha, + const matrix::Fbcsr* const a, + const matrix::Dense* const b, + const matrix::Dense* const beta, + matrix::Dense* const c) +{ + if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { + // empty output: nothing to do + return; + } + if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { + // empty input: scale output + dense::scale(exec, beta, c); + return; + } + if (sparselib::is_supported::value) { + auto handle = exec->get_sparselib_handle(); + const auto alphp = alpha->get_const_values(); + 
const auto betap = beta->get_const_values(); + auto descr = sparselib::create_mat_descr(); + const auto row_ptrs = a->get_const_row_ptrs(); + const auto col_idxs = a->get_const_col_idxs(); + const auto values = a->get_const_values(); + const int bs = a->get_block_size(); + const IndexType mb = a->get_num_block_rows(); + const IndexType nb = a->get_num_block_cols(); + const auto nnzb = static_cast(a->get_num_stored_blocks()); + const auto nrhs = static_cast(b->get_size()[1]); + const auto nrows = a->get_size()[0]; + const auto ncols = a->get_size()[1]; + const auto in_stride = b->get_stride(); + const auto out_stride = c->get_stride(); + if (nrhs == 1 && in_stride == 1 && out_stride == 1) { + sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, + nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), betap, c->get_values()); + } else { + const auto trans_stride = nrows; + auto trans_c = array(exec, nrows * nrhs); + dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(), + trans_stride, trans_c.get_data()); + sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, + SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, + alphp, descr, values, row_ptrs, col_idxs, bs, + b->get_const_values(), in_stride, betap, + trans_c.get_data(), trans_stride); + dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), + out_stride, c->get_values()); + } + sparselib::destroy(descr); + } else { + GKO_NOT_IMPLEMENTED; + } +} + + +namespace { + + +template +void transpose_blocks_impl(syn::value_list, + std::shared_ptr exec, + matrix::Fbcsr* const mat) +{ + constexpr int subwarp_size = config::warp_size; + const auto nbnz = mat->get_num_stored_blocks(); + const auto numthreads = nbnz * subwarp_size; + const auto block_size = default_block_size; + const auto grid_dim = ceildiv(numthreads, block_size); + if (grid_dim > 0) { + kernel::transpose_blocks + <<get_stream()>>>( + nbnz, mat->get_values()); + } +} + 
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks, + transpose_blocks_impl); + + +} // namespace + + +template +void transpose(const std::shared_ptr exec, + const matrix::Fbcsr* const orig, + matrix::Fbcsr* const trans) +{ +#ifdef GKO_COMPILING_CUDA + if (sparselib::is_supported::value) { + const int bs = orig->get_block_size(); + const IndexType nnzb = + static_cast(orig->get_num_stored_blocks()); + cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; + cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; + const IndexType buffer_size = sparselib::bsr_transpose_buffersize( + exec->get_sparselib_handle(), orig->get_num_block_rows(), + orig->get_num_block_cols(), nnzb, orig->get_const_values(), + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs); + array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + sparselib::bsr_transpose( + exec->get_sparselib_handle(), orig->get_num_block_rows(), + orig->get_num_block_cols(), nnzb, orig->get_const_values(), + orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs, + trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(), + copyValues, idxBase, buffer); + + // transpose blocks + select_transpose_blocks( + fixedblock::compiled_kernels(), + [bs](int compiled_block_size) { return bs == compiled_block_size; }, + syn::value_list(), syn::type_list<>(), exec, trans); + } else +#endif + { + fallback_transpose(exec, orig, trans); + } +} + + +template +void conj_transpose(std::shared_ptr exec, + const matrix::Fbcsr* orig, + matrix::Fbcsr* trans) +{ + const int grid_size = + ceildiv(trans->get_num_stored_elements(), default_block_size); + transpose(exec, orig, trans); + if (grid_size > 0 && is_complex()) { + kernel:: + conjugate<<get_stream()>>>( + trans->get_num_stored_elements(), + as_device_type(trans->get_values())); + } +} + + +} // namespace fbcsr +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git 
a/common/cuda_hip/matrix/sellp_kernels.hpp.inc b/common/cuda_hip/matrix/sellp_kernels.cpp similarity index 83% rename from common/cuda_hip/matrix/sellp_kernels.hpp.inc rename to common/cuda_hip/matrix/sellp_kernels.cpp index f4f0035c276..af7f22ee7d5 100644 --- a/common/cuda_hip/matrix/sellp_kernels.hpp.inc +++ b/common/cuda_hip/matrix/sellp_kernels.cpp @@ -2,6 +2,37 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include +#include +#include +#include + + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/sellp_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The SELL-P matrix format namespace. + * + * @ingroup sellp + */ +namespace sellp { + + +constexpr int default_block_size = 512; + + template __global__ __launch_bounds__(default_block_size) void spmv_kernel( size_type num_rows, size_type num_right_hand_sides, size_type b_stride, @@ -102,3 +133,9 @@ void advanced_spmv(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL); + + +} // namespace sellp +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/common/cuda_hip/matrix/sparsity_csr_kernels.cpp similarity index 58% rename from hip/matrix/sparsity_csr_kernels.hip.cpp rename to common/cuda_hip/matrix/sparsity_csr_kernels.cpp index b662f07257e..540722d843c 100644 --- a/hip/matrix/sparsity_csr_kernels.hip.cpp +++ b/common/cuda_hip/matrix/sparsity_csr_kernels.cpp @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/matrix/sparsity_csr_kernels.hpp" +#include #include @@ -11,27 
+11,27 @@ #include -#include "accessor/hip_helper.hpp" #include "accessor/reduced_row_major.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/sparselib_bindings.hpp" +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" #include "core/base/mixed_precision_types.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" +#include "core/matrix/sparsity_csr_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" namespace gko { namespace kernels { -namespace hip { +namespace GKO_DEVICE_NAMESPACE { /** * @brief The Compressed sparse row matrix format namespace. 
* @@ -54,7 +54,114 @@ using classical_kernels = syn::value_list; #include "common/cuda_hip/matrix/csr_common.hpp.inc" -#include "common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc" + +namespace kernel { + + +template +__device__ void device_classical_spmv(const size_type num_rows, + const MatrixValueType* __restrict__ val, + const IndexType* __restrict__ col_idxs, + const IndexType* __restrict__ row_ptrs, + acc::range b, + acc::range c, + Closure scale) +{ + using arithmetic_type = typename output_accessor::arithmetic_type; + auto subwarp_tile = + group::tiled_partition(group::this_thread_block()); + const auto subrow = thread::get_subwarp_num_flat(); + const auto subid = subwarp_tile.thread_rank(); + const IndexType column_id = blockIdx.y; + const arithmetic_type value = val[0]; + auto row = thread::get_subwarp_id_flat(); + for (; row < num_rows; row += subrow) { + const auto ind_end = row_ptrs[row + 1]; + arithmetic_type temp_val = zero(); + for (auto ind = row_ptrs[row] + subid; ind < ind_end; + ind += subwarp_size) { + temp_val += value * b(col_idxs[ind], column_id); + } + auto subwarp_result = + reduce(subwarp_tile, temp_val, + [](const arithmetic_type& a, const arithmetic_type& b) { + return a + b; + }); + if (subid == 0) { + c(row, column_id) = scale(subwarp_result, c(row, column_id)); + } + } +} + + +template +__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv( + const size_type num_rows, const MatrixValueType* __restrict__ val, + const IndexType* __restrict__ col_idxs, + const IndexType* __restrict__ row_ptrs, acc::range b, + acc::range c) +{ + using type = typename output_accessor::arithmetic_type; + device_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, c, + [](const type& x, const type& y) { return x; }); +} + + +template +__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv( + const size_type num_rows, const MatrixValueType* __restrict__ alpha, + const MatrixValueType* __restrict__ val, + const 
IndexType* __restrict__ col_idxs, + const IndexType* __restrict__ row_ptrs, acc::range b, + const typename output_accessor::storage_type* __restrict__ beta, + acc::range c) +{ + using type = typename output_accessor::arithmetic_type; + const type alpha_val = alpha[0]; + const type beta_val = beta[0]; + device_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, c, + [&alpha_val, &beta_val](const type& x, const type& y) { + return alpha_val * x + beta_val * y; + }); +} + + +} // namespace kernel + + +template +void transpose(std::shared_ptr exec, + const matrix::SparsityCsr* orig, + matrix::SparsityCsr* trans) + GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL); + + +template +void fallback_sort(std::shared_ptr exec, + matrix::SparsityCsr* to_sort) +{ + const auto row_ptrs = to_sort->get_const_row_ptrs(); + const auto col_idxs = to_sort->get_col_idxs(); + const auto nnz = to_sort->get_num_nonzeros(); + const auto num_rows = to_sort->get_size()[0]; + array row_idx_array(exec, nnz); + const auto row_idxs = row_idx_array.get_data(); + components::convert_ptrs_to_idxs(exec, row_ptrs, num_rows, row_idxs); + // two sorts by integer keys hopefully enable Thrust to use cub's RadixSort + thrust::sort_by_key(thrust_policy(exec), col_idxs, col_idxs + nnz, + row_idxs); + thrust::stable_sort_by_key(thrust_policy(exec), row_idxs, row_idxs + nnz, + col_idxs); +} namespace host_kernel { @@ -63,7 +170,7 @@ namespace host_kernel { template void classical_spmv(syn::value_list, - std::shared_ptr exec, + std::shared_ptr exec, const matrix::SparsityCsr* a, const matrix::Dense* b, matrix::Dense* c, @@ -110,16 +217,16 @@ void classical_spmv(syn::value_list, a->get_size()[0], as_device_type(a->get_const_value()), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), - acc::as_hip_range(b_vals), acc::as_hip_range(c_vals)); + acc::as_device_range(b_vals), acc::as_device_range(c_vals)); } else if (alpha != nullptr 
&& beta != nullptr) { kernel::abstract_classical_spmv <<get_stream()>>>( a->get_size()[0], as_device_type(alpha->get_const_values()), as_device_type(a->get_const_value()), a->get_const_col_idxs(), as_device_type(a->get_const_row_ptrs()), - acc::as_hip_range(b_vals), + acc::as_device_range(b_vals), as_device_type(beta->get_const_values()), - acc::as_hip_range(c_vals)); + acc::as_device_range(c_vals)); } else { GKO_KERNEL_NOT_FOUND; } @@ -132,7 +239,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv); template -void spmv(std::shared_ptr exec, +void spmv(std::shared_ptr exec, const matrix::SparsityCsr* a, const matrix::Dense* b, matrix::Dense* c) @@ -148,7 +255,7 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( template -void advanced_spmv(std::shared_ptr exec, +void advanced_spmv(std::shared_ptr exec, const matrix::Dense* alpha, const matrix::SparsityCsr* a, const matrix::Dense* b, @@ -221,6 +328,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( } // namespace sparsity_csr -} // namespace hip +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc b/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc deleted file mode 100644 index aedf9638888..00000000000 --- a/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc +++ /dev/null @@ -1,111 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -namespace kernel { - - -template -__device__ void device_classical_spmv(const size_type num_rows, - const MatrixValueType* __restrict__ val, - const IndexType* __restrict__ col_idxs, - const IndexType* __restrict__ row_ptrs, - acc::range b, - acc::range c, - Closure scale) -{ - using arithmetic_type = typename output_accessor::arithmetic_type; - auto subwarp_tile = - group::tiled_partition(group::this_thread_block()); - const auto subrow = thread::get_subwarp_num_flat(); - const auto subid = 
subwarp_tile.thread_rank(); - const IndexType column_id = blockIdx.y; - const arithmetic_type value = val[0]; - auto row = thread::get_subwarp_id_flat(); - for (; row < num_rows; row += subrow) { - const auto ind_end = row_ptrs[row + 1]; - arithmetic_type temp_val = zero(); - for (auto ind = row_ptrs[row] + subid; ind < ind_end; - ind += subwarp_size) { - temp_val += value * b(col_idxs[ind], column_id); - } - auto subwarp_result = - reduce(subwarp_tile, temp_val, - [](const arithmetic_type& a, const arithmetic_type& b) { - return a + b; - }); - if (subid == 0) { - c(row, column_id) = scale(subwarp_result, c(row, column_id)); - } - } -} - - -template -__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv( - const size_type num_rows, const MatrixValueType* __restrict__ val, - const IndexType* __restrict__ col_idxs, - const IndexType* __restrict__ row_ptrs, acc::range b, - acc::range c) -{ - using type = typename output_accessor::arithmetic_type; - device_classical_spmv( - num_rows, val, col_idxs, row_ptrs, b, c, - [](const type& x, const type& y) { return x; }); -} - - -template -__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv( - const size_type num_rows, const MatrixValueType* __restrict__ alpha, - const MatrixValueType* __restrict__ val, - const IndexType* __restrict__ col_idxs, - const IndexType* __restrict__ row_ptrs, acc::range b, - const typename output_accessor::storage_type* __restrict__ beta, - acc::range c) -{ - using type = typename output_accessor::arithmetic_type; - const type alpha_val = alpha[0]; - const type beta_val = beta[0]; - device_classical_spmv( - num_rows, val, col_idxs, row_ptrs, b, c, - [&alpha_val, &beta_val](const type& x, const type& y) { - return alpha_val * x + beta_val * y; - }); -} - - -} // namespace kernel - - -template -void transpose(std::shared_ptr exec, - const matrix::SparsityCsr* orig, - matrix::SparsityCsr* trans) - GKO_NOT_IMPLEMENTED; - 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL); - - -template -void fallback_sort(std::shared_ptr exec, - matrix::SparsityCsr* to_sort) -{ - const auto row_ptrs = to_sort->get_const_row_ptrs(); - const auto col_idxs = to_sort->get_col_idxs(); - const auto nnz = to_sort->get_num_nonzeros(); - const auto num_rows = to_sort->get_size()[0]; - array row_idx_array(exec, nnz); - const auto row_idxs = row_idx_array.get_data(); - components::convert_ptrs_to_idxs(exec, row_ptrs, num_rows, row_idxs); - // two sorts by integer keys hopefully enable Thrust to use cub's RadixSort - thrust::sort_by_key(thrust_policy(exec), col_idxs, col_idxs + nnz, - row_idxs); - thrust::stable_sort_by_key(thrust_policy(exec), row_idxs, row_idxs + nnz, - col_idxs); -} diff --git a/common/cuda_hip/multigrid/pgm_kernels.hpp.inc b/common/cuda_hip/multigrid/pgm_kernels.cpp similarity index 77% rename from common/cuda_hip/multigrid/pgm_kernels.hpp.inc rename to common/cuda_hip/multigrid/pgm_kernels.cpp index 9b2a5735c71..60dea00cc12 100644 --- a/common/cuda_hip/multigrid/pgm_kernels.hpp.inc +++ b/common/cuda_hip/multigrid/pgm_kernels.cpp @@ -2,6 +2,36 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include + + +#include +#include +#include +#include +#include + + +#include +#include + + +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "core/multigrid/pgm_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The PGM solver namespace. 
+ * + * @ingroup pgm + */ +namespace pgm { + + template void sort_agg(std::shared_ptr exec, IndexType num, IndexType* row_idxs, IndexType* col_idxs) @@ -52,3 +82,9 @@ void compute_coarse_coo(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_PGM_COMPUTE_COARSE_COO); + + +} // namespace pgm +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/preconditioner/isai_kernels.hpp.inc b/common/cuda_hip/preconditioner/isai_kernels.cpp similarity index 94% rename from common/cuda_hip/preconditioner/isai_kernels.hpp.inc rename to common/cuda_hip/preconditioner/isai_kernels.cpp index 86d47680e0e..a79b8f711d3 100644 --- a/common/cuda_hip/preconditioner/isai_kernels.hpp.inc +++ b/common/cuda_hip/preconditioner/isai_kernels.cpp @@ -2,6 +2,42 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include +#include + + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/merging.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "common/cuda_hip/components/warp_blas.hpp" +#include "core/components/prefix_sum_kernels.hpp" +#include "core/matrix/csr_builder.hpp" +#include "core/preconditioner/isai_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Isai preconditioner namespace. 
+ * @ref Isai + * @ingroup isai + */ +namespace isai { + + +constexpr int subwarp_size{row_size_limit}; +constexpr int subwarps_per_block{2}; +constexpr int default_block_size{subwarps_per_block * subwarp_size}; + + namespace kernel { @@ -559,3 +595,9 @@ void scatter_excess_solution(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); + + +} // namespace isai +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc b/common/cuda_hip/preconditioner/jacobi_kernels.cpp similarity index 91% rename from common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc rename to common/cuda_hip/preconditioner/jacobi_kernels.cpp index e0d7cfef0e9..45d32493f25 100644 --- a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc +++ b/common/cuda_hip/preconditioner/jacobi_kernels.cpp @@ -2,6 +2,48 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include + + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/base/extended_float.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Jacobi preconditioner namespace. 
+ * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +// a total of 32/16 warps (1024 threads) +#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC +constexpr int default_num_warps = 16; +#else // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC +constexpr int default_num_warps = 32; +#endif +// with current architectures, at most 32 warps can be scheduled per SM (and +// current GPUs have at most 84 SMs) +constexpr int default_grid_size = 32 * 32 * 128; + + namespace { @@ -369,3 +411,9 @@ void convert_to_dense( GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL); + + +} // namespace jacobi +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/reorder/rcm_kernels.hpp.inc b/common/cuda_hip/reorder/rcm_kernels.cpp similarity index 95% rename from common/cuda_hip/reorder/rcm_kernels.hpp.inc rename to common/cuda_hip/reorder/rcm_kernels.cpp index 05fe3bce07e..12f2eca9580 100644 --- a/common/cuda_hip/reorder/rcm_kernels.hpp.inc +++ b/common/cuda_hip/reorder/rcm_kernels.cpp @@ -2,6 +2,47 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include +#include + + +#include "common/cuda_hip/base/thrust.hpp" +#include "common/cuda_hip/components/memory.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/base/array_access.hpp" +#include "core/reorder/rcm_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The reordering namespace. 
+ * + * @ingroup reorder + */ +namespace rcm { + + +constexpr int default_block_size = 512; + + template array compute_node_degrees( std::shared_ptr exec, @@ -613,3 +654,9 @@ void compute_permutation(std::shared_ptr exec, } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_RCM_COMPUTE_PERMUTATION_KERNEL); + + +} // namespace rcm +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/solver/cb_gmres_kernels.hpp.inc b/common/cuda_hip/solver/cb_gmres_kernels.cpp similarity index 50% rename from common/cuda_hip/solver/cb_gmres_kernels.hpp.inc rename to common/cuda_hip/solver/cb_gmres_kernels.cpp index 2a5a6c3f7f9..9be99c094fc 100644 --- a/common/cuda_hip/solver/cb_gmres_kernels.hpp.inc +++ b/common/cuda_hip/solver/cb_gmres_kernels.cpp @@ -2,6 +2,52 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include + + +#include +#include +#include +#include + + +#include "accessor/range.hpp" +#include "accessor/reduced_row_major.hpp" +#include "accessor/scaled_reduced_row_major.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/components/uninitialized_array.hpp" +#include "core/base/array_access.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/solver/cb_gmres_accessor.hpp" +#include "core/solver/cb_gmres_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The CB_GMRES solver namespace. + * + * @ingroup cb_gmres + */ +namespace cb_gmres { + + +constexpr int default_block_size = 512; +// default_dot_dim can not be 64 in hip because 64 * 64 exceeds their max block +// size limit. 
+constexpr int default_dot_dim = 32; +constexpr int default_dot_size = default_dot_dim * default_dot_dim; + + #include "common/cuda_hip/solver/common_gmres_kernels.hpp.inc" @@ -551,3 +597,457 @@ __global__ __launch_bounds__(block_size) void calculate_Qy_kernel( before_preconditioner[global_id] = temp; } } + + +template +void zero_matrix(std::shared_ptr exec, size_type m, + size_type n, size_type stride, ValueType* array) +{ + const auto block_size = default_block_size; + const auto grid_size = ceildiv(n, block_size); + zero_matrix_kernel<<get_stream()>>>( + m, n, stride, as_device_type(array)); +} + + +template +void initialize(std::shared_ptr exec, + const matrix::Dense* b, + matrix::Dense* residual, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + array* stop_status, size_type krylov_dim) +{ + const auto num_threads = std::max(b->get_size()[0] * b->get_stride(), + krylov_dim * b->get_size()[1]); + const auto grid_dim = ceildiv(num_threads, default_block_size); + const auto block_dim = default_block_size; + constexpr auto block_size = default_block_size; + + initialize_kernel + <<get_stream()>>>( + b->get_size()[0], b->get_size()[1], krylov_dim, + as_device_type(b->get_const_values()), b->get_stride(), + as_device_type(residual->get_values()), residual->get_stride(), + as_device_type(givens_sin->get_values()), givens_sin->get_stride(), + as_device_type(givens_cos->get_values()), givens_cos->get_stride(), + as_device_type(stop_status->get_data())); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); + + +template +void restart(std::shared_ptr exec, + const matrix::Dense* residual, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + matrix::Dense>* arnoldi_norm, + Accessor3d krylov_bases, + matrix::Dense* next_krylov_basis, + array* final_iter_nums, array& reduction_tmp, + size_type krylov_dim) +{ + constexpr bool use_scalar = + gko::cb_gmres::detail::has_3d_scaled_accessor::value; + const auto 
num_rows = residual->get_size()[0]; + const auto num_rhs = residual->get_size()[1]; + const auto krylov_stride = + gko::cb_gmres::helper_functions_accessor::get_stride( + krylov_bases); + const auto grid_dim_1 = + ceildiv((krylov_dim + 1) * krylov_stride[0], default_block_size); + const auto block_dim = default_block_size; + constexpr auto block_size = default_block_size; + const auto stride_arnoldi = arnoldi_norm->get_stride(); + + restart_1_kernel + <<get_stream()>>>( + residual->get_size()[0], residual->get_size()[1], krylov_dim, + acc::as_device_range(krylov_bases), + as_device_type(residual_norm_collection->get_values()), + residual_norm_collection->get_stride()); + kernels::GKO_DEVICE_NAMESPACE::dense::compute_norm2_dispatch( + exec, residual, residual_norm, reduction_tmp); + + if (use_scalar) { + components::fill_array(exec, + arnoldi_norm->get_values() + 2 * stride_arnoldi, + num_rhs, zero>()); + const dim3 grid_size_nrm(ceildiv(num_rhs, default_dot_dim), + exec->get_num_multiprocessor() * 2); + const dim3 block_size_nrm(default_dot_dim, default_dot_dim); + multinorminf_without_stop_kernel<<get_stream()>>>( + num_rows, num_rhs, as_device_type(residual->get_const_values()), + residual->get_stride(), + as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), 0); + } + + if (gko::cb_gmres::detail::has_3d_scaled_accessor::value) { + set_scalar_kernel + <<get_stream()>>>( + num_rhs, krylov_dim + 1, + as_device_type(residual_norm->get_const_values()), + residual_norm->get_stride(), + as_device_type(arnoldi_norm->get_const_values() + + 2 * stride_arnoldi), + stride_arnoldi, acc::as_device_range(krylov_bases)); + } + + const auto grid_dim_2 = + ceildiv(std::max(num_rows, 1) * krylov_stride[1], + default_block_size); + restart_2_kernel + <<get_stream()>>>( + residual->get_size()[0], residual->get_size()[1], + as_device_type(residual->get_const_values()), + residual->get_stride(), + as_device_type(residual_norm->get_const_values()), + 
as_device_type(residual_norm_collection->get_values()), + acc::as_device_range(krylov_bases), + as_device_type(next_krylov_basis->get_values()), + next_krylov_basis->get_stride(), + as_device_type(final_iter_nums->get_data())); +} + +GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL); + + +template +void finish_arnoldi_CGS(std::shared_ptr exec, + matrix::Dense* next_krylov_basis, + Accessor3dim krylov_bases, + matrix::Dense* hessenberg_iter, + matrix::Dense* buffer_iter, + matrix::Dense>* arnoldi_norm, + size_type iter, const stopping_status* stop_status, + stopping_status* reorth_status, + array* num_reorth) +{ + const auto dim_size = next_krylov_basis->get_size(); + if (dim_size[1] == 0) { + return; + } + using non_complex = remove_complex; + // optimization parameter + constexpr int singledot_block_size = default_dot_dim; + constexpr bool use_scalar = + gko::cb_gmres::detail::has_3d_scaled_accessor::value; + const auto stride_next_krylov = next_krylov_basis->get_stride(); + const auto stride_hessenberg = hessenberg_iter->get_stride(); + const auto stride_buffer = buffer_iter->get_stride(); + const auto stride_arnoldi = arnoldi_norm->get_stride(); + const dim3 grid_size(ceildiv(dim_size[1], default_dot_dim), + exec->get_num_multiprocessor() * 2); + const dim3 grid_size_num_iters(ceildiv(dim_size[1], default_dot_dim), + exec->get_num_multiprocessor() * 2, + iter + 1); + const dim3 block_size(default_dot_dim, default_dot_dim); + // Note: having iter first (instead of row_idx information) is likely + // beneficial for avoiding atomic_add conflicts, but that needs + // further investigation. 
+ const dim3 grid_size_iters_single(exec->get_num_multiprocessor() * 2, + iter + 1); + const auto block_size_iters_single = singledot_block_size; + size_type num_reorth_host; + + components::fill_array(exec, arnoldi_norm->get_values(), dim_size[1], + zero()); + multinorm2_kernel<<get_stream()>>>( + dim_size[0], dim_size[1], + as_device_type(next_krylov_basis->get_const_values()), + stride_next_krylov, as_device_type(arnoldi_norm->get_values()), + as_device_type(stop_status)); + // nrmP = norm(next_krylov_basis) + zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg, + hessenberg_iter->get_values()); + if (dim_size[1] > 1) { + multidot_kernel + <<get_stream()>>>( + dim_size[0], dim_size[1], + as_device_type(next_krylov_basis->get_const_values()), + stride_next_krylov, acc::as_device_range(krylov_bases), + as_device_type(hessenberg_iter->get_values()), + stride_hessenberg, as_device_type(stop_status)); + } else { + singledot_kernel + <<get_stream()>>>( + dim_size[0], + as_device_type(next_krylov_basis->get_const_values()), + stride_next_krylov, acc::as_device_range(krylov_bases), + as_device_type(hessenberg_iter->get_values()), + stride_hessenberg, as_device_type(stop_status)); + } + // for i in 1:iter + // hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i) + // end + update_next_krylov_kernel + <<get_stream()>>>( + iter + 1, dim_size[0], dim_size[1], + as_device_type(next_krylov_basis->get_values()), stride_next_krylov, + acc::as_device_range(krylov_bases), + as_device_type(hessenberg_iter->get_const_values()), + stride_hessenberg, as_device_type(stop_status)); + + // for i in 1:iter + // next_krylov_basis -= hessenberg(iter, i) * krylov_bases(:, i) + // end + components::fill_array(exec, arnoldi_norm->get_values() + stride_arnoldi, + dim_size[1], zero()); + if (use_scalar) { + components::fill_array(exec, + arnoldi_norm->get_values() + 2 * stride_arnoldi, + dim_size[1], zero()); + } + multinorm2_inf_kernel + <<get_stream()>>>( + dim_size[0], 
dim_size[1], + as_device_type(next_krylov_basis->get_const_values()), + stride_next_krylov, + as_device_type(arnoldi_norm->get_values() + stride_arnoldi), + as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), + as_device_type(stop_status)); + // nrmN = norm(next_krylov_basis) + components::fill_array(exec, num_reorth->get_data(), 1, zero()); + check_arnoldi_norms + <<get_stream()>>>( + dim_size[1], as_device_type(arnoldi_norm->get_values()), + stride_arnoldi, as_device_type(hessenberg_iter->get_values()), + stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases), + as_device_type(stop_status), as_device_type(reorth_status), + as_device_type(num_reorth->get_data())); + num_reorth_host = get_element(*num_reorth, 0); + // num_reorth_host := number of next_krylov vector to be reorthogonalization + for (size_type l = 1; (num_reorth_host > 0) && (l < 3); l++) { + zero_matrix(exec, iter + 1, dim_size[1], stride_buffer, + buffer_iter->get_values()); + if (dim_size[1] > 1) { + multidot_kernel + <<get_stream()>>>( + dim_size[0], dim_size[1], + as_device_type(next_krylov_basis->get_const_values()), + stride_next_krylov, acc::as_device_range(krylov_bases), + as_device_type(buffer_iter->get_values()), stride_buffer, + as_device_type(stop_status)); + } else { + singledot_kernel + <<get_stream()>>>( + dim_size[0], + as_device_type(next_krylov_basis->get_const_values()), + stride_next_krylov, acc::as_device_range(krylov_bases), + as_device_type(buffer_iter->get_values()), stride_buffer, + as_device_type(stop_status)); + } + // for i in 1:iter + // hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i) + // end + update_next_krylov_and_add_kernel + <<get_stream()>>>( + iter + 1, dim_size[0], dim_size[1], + as_device_type(next_krylov_basis->get_values()), + stride_next_krylov, acc::as_device_range(krylov_bases), + as_device_type(hessenberg_iter->get_values()), + stride_hessenberg, + as_device_type(buffer_iter->get_const_values()), stride_buffer, + 
as_device_type(stop_status), as_device_type(reorth_status)); + // for i in 1:iter + // next_krylov_basis -= hessenberg(iter, i) * krylov_bases(:, i) + // end + components::fill_array(exec, + arnoldi_norm->get_values() + stride_arnoldi, + dim_size[1], zero()); + if (use_scalar) { + components::fill_array( + exec, arnoldi_norm->get_values() + 2 * stride_arnoldi, + dim_size[1], zero()); + } + multinorm2_inf_kernel + <<get_stream()>>>( + dim_size[0], dim_size[1], + as_device_type(next_krylov_basis->get_const_values()), + stride_next_krylov, + as_device_type(arnoldi_norm->get_values() + stride_arnoldi), + as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), + as_device_type(stop_status)); + // nrmN = norm(next_krylov_basis) + components::fill_array(exec, num_reorth->get_data(), 1, + zero()); + check_arnoldi_norms + <<get_stream()>>>( + dim_size[1], as_device_type(arnoldi_norm->get_values()), + stride_arnoldi, as_device_type(hessenberg_iter->get_values()), + stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases), + as_device_type(stop_status), as_device_type(reorth_status), + num_reorth->get_data()); + num_reorth_host = get_element(*num_reorth, 0); + // num_reorth_host := number of next_krylov vector to be + // reorthogonalization + } + update_krylov_next_krylov_kernel + <<get_stream()>>>( + iter, dim_size[0], dim_size[1], + as_device_type(next_krylov_basis->get_values()), stride_next_krylov, + acc::as_device_range(krylov_bases), + as_device_type(hessenberg_iter->get_const_values()), + stride_hessenberg, as_device_type(stop_status)); + // next_krylov_basis /= hessenberg(iter, iter + 1) + // krylov_bases(:, iter + 1) = next_krylov_basis + // End of arnoldi +} + +template +void givens_rotation(std::shared_ptr exec, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + matrix::Dense* hessenberg_iter, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + size_type iter, const array* stop_status) +{ + // TODO: tune block_size 
for optimal performance + constexpr auto block_size = default_block_size; + const auto num_cols = hessenberg_iter->get_size()[1]; + const auto block_dim = block_size; + const auto grid_dim = + static_cast(ceildiv(num_cols, block_size)); + + givens_rotation_kernel + <<get_stream()>>>( + hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1], + iter, as_device_type(hessenberg_iter->get_values()), + hessenberg_iter->get_stride(), + as_device_type(givens_sin->get_values()), givens_sin->get_stride(), + as_device_type(givens_cos->get_values()), givens_cos->get_stride(), + as_device_type(residual_norm->get_values()), + as_device_type(residual_norm_collection->get_values()), + residual_norm_collection->get_stride(), + stop_status->get_const_data()); +} + + +template +void arnoldi(std::shared_ptr exec, + matrix::Dense* next_krylov_basis, + matrix::Dense* givens_sin, + matrix::Dense* givens_cos, + matrix::Dense>* residual_norm, + matrix::Dense* residual_norm_collection, + Accessor3d krylov_bases, matrix::Dense* hessenberg_iter, + matrix::Dense* buffer_iter, + matrix::Dense>* arnoldi_norm, + size_type iter, array* final_iter_nums, + const array* stop_status, + array* reorth_status, + array* num_reorth) +{ + increase_final_iteration_numbers_kernel<<< + static_cast( + ceildiv(final_iter_nums->get_size(), default_block_size)), + default_block_size, 0, exec->get_stream()>>>( + as_device_type(final_iter_nums->get_data()), + stop_status->get_const_data(), final_iter_nums->get_size()); + finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter, + buffer_iter, arnoldi_norm, iter, + stop_status->get_const_data(), reorth_status->get_data(), + num_reorth); + givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter, + residual_norm, residual_norm_collection, iter, stop_status); +} + +GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL); + + +template +void solve_upper_triangular( + std::shared_ptr exec, + const matrix::Dense* 
residual_norm_collection, + const matrix::Dense* hessenberg, matrix::Dense* y, + const array* final_iter_nums) +{ + // TODO: tune block_size for optimal performance + constexpr auto block_size = default_block_size; + const auto num_rhs = residual_norm_collection->get_size()[1]; + const auto block_dim = block_size; + const auto grid_dim = + static_cast(ceildiv(num_rhs, block_size)); + + solve_upper_triangular_kernel + <<get_stream()>>>( + hessenberg->get_size()[1], num_rhs, + as_device_type(residual_norm_collection->get_const_values()), + residual_norm_collection->get_stride(), + as_device_type(hessenberg->get_const_values()), + hessenberg->get_stride(), as_device_type(y->get_values()), + y->get_stride(), as_device_type(final_iter_nums->get_const_data())); +} + + +template +void calculate_qy(std::shared_ptr exec, + ConstAccessor3d krylov_bases, size_type num_krylov_bases, + const matrix::Dense* y, + matrix::Dense* before_preconditioner, + const array* final_iter_nums) +{ + const auto num_rows = before_preconditioner->get_size()[0]; + const auto num_cols = before_preconditioner->get_size()[1]; + const auto stride_before_preconditioner = + before_preconditioner->get_stride(); + + constexpr auto block_size = default_block_size; + const auto grid_dim = static_cast( + ceildiv(num_rows * stride_before_preconditioner, block_size)); + const auto block_dim = block_size; + + calculate_Qy_kernel + <<get_stream()>>>( + num_rows, num_cols, acc::as_device_range(krylov_bases), + as_device_type(y->get_const_values()), y->get_stride(), + as_device_type(before_preconditioner->get_values()), + stride_before_preconditioner, + as_device_type(final_iter_nums->get_const_data())); + // Calculate qy + // before_preconditioner = krylov_bases * y +} + + +template +void solve_krylov(std::shared_ptr exec, + const matrix::Dense* residual_norm_collection, + ConstAccessor3d krylov_bases, + const matrix::Dense* hessenberg, + matrix::Dense* y, + matrix::Dense* before_preconditioner, + const array* 
final_iter_nums) +{ + if (before_preconditioner->get_size()[1] == 0) { + return; + } + // since hessenberg has dims: iters x iters * num_rhs + // krylov_bases has dims: (iters + 1) x sysmtx[0] x num_rhs + const auto iters = + hessenberg->get_size()[1] / before_preconditioner->get_size()[1]; + const auto num_krylov_bases = iters + 1; + solve_upper_triangular(exec, residual_norm_collection, hessenberg, y, + final_iter_nums); + calculate_qy(exec, krylov_bases, num_krylov_bases, y, before_preconditioner, + final_iter_nums); +} + +GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE( + GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL); + + +} // namespace cb_gmres +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/cuda/solver/idr_kernels.cu b/common/cuda_hip/solver/idr_kernels.cpp similarity index 51% rename from cuda/solver/idr_kernels.cu rename to common/cuda_hip/solver/idr_kernels.cpp index fcb84920265..6b3f001af0c 100644 --- a/cuda/solver/idr_kernels.cu +++ b/common/cuda_hip/solver/idr_kernels.cpp @@ -2,9 +2,6 @@ // // SPDX-License-Identifier: BSD-3-Clause -#include "core/solver/idr_kernels.hpp" - - #include #include @@ -13,22 +10,23 @@ #include +#include "common/cuda_hip/base/blas_bindings.hpp" +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" +#include "common/cuda_hip/base/randlib_bindings.hpp" #include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/atomic.hpp" +#include "common/cuda_hip/components/cooperative_groups.hpp" +#include "common/cuda_hip/components/reduction.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" #include "core/components/fill_array_kernels.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/base/curand_bindings.hpp" -#include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/atomic.cuh" -#include 
"cuda/components/cooperative_groups.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" +#include "core/solver/idr_kernels.hpp" namespace gko { namespace kernels { -namespace cuda { +namespace GKO_DEVICE_NAMESPACE { /** * @brief The IDR solver namespace. * @@ -42,7 +40,320 @@ constexpr int default_dot_dim = 32; constexpr int default_dot_size = default_dot_dim * default_dot_dim; -#include "common/cuda_hip/solver/idr_kernels.hpp.inc" +template +__global__ __launch_bounds__(default_block_size) void initialize_m_kernel( + size_type subspace_dim, size_type nrhs, ValueType* __restrict__ m_values, + size_type m_stride, stopping_status* __restrict__ stop_status) +{ + const auto global_id = thread::get_thread_id_flat(); + const auto row = global_id / m_stride; + const auto col = global_id % m_stride; + + if (global_id < nrhs) { + stop_status[global_id].reset(); + } + + if (row < subspace_dim && col < nrhs * subspace_dim) { + m_values[row * m_stride + col] = + (row == col / nrhs) ? one() : zero(); + } +} + + +template +__global__ +__launch_bounds__(block_size) void orthonormalize_subspace_vectors_kernel( + size_type num_rows, size_type num_cols, ValueType* __restrict__ values, + size_type stride) +{ + const auto tidx = thread::get_thread_id_flat(); + + __shared__ uninitialized_array + reduction_helper_array; + // they are not be used in the same time. 
+ ValueType* reduction_helper = reduction_helper_array; + auto reduction_helper_real = + reinterpret_cast*>(reduction_helper); + + for (size_type row = 0; row < num_rows; row++) { + for (size_type i = 0; i < row; i++) { + auto dot = zero(); + for (size_type j = tidx; j < num_cols; j += block_size) { + dot += values[row * stride + j] * conj(values[i * stride + j]); + } + + // Ensure already finish reading this shared memory + __syncthreads(); + reduction_helper[tidx] = dot; + reduce( + group::this_thread_block(), reduction_helper, + [](const ValueType& a, const ValueType& b) { return a + b; }); + __syncthreads(); + + dot = reduction_helper[0]; + for (size_type j = tidx; j < num_cols; j += block_size) { + values[row * stride + j] -= dot * values[i * stride + j]; + } + } + + auto norm = zero>(); + for (size_type j = tidx; j < num_cols; j += block_size) { + norm += squared_norm(values[row * stride + j]); + } + // Ensure already finish reading this shared memory + __syncthreads(); + reduction_helper_real[tidx] = norm; + reduce(group::this_thread_block(), reduction_helper_real, + [](const remove_complex& a, + const remove_complex& b) { return a + b; }); + __syncthreads(); + + norm = sqrt(reduction_helper_real[0]); + for (size_type j = tidx; j < num_cols; j += block_size) { + values[row * stride + j] /= norm; + } + } +} + + +template +__global__ +__launch_bounds__(default_block_size) void solve_lower_triangular_kernel( + size_type subspace_dim, size_type nrhs, + const ValueType* __restrict__ m_values, size_type m_stride, + const ValueType* __restrict__ f_values, size_type f_stride, + ValueType* __restrict__ c_values, size_type c_stride, + const stopping_status* __restrict__ stop_status) +{ + const auto global_id = thread::get_thread_id_flat(); + + if (global_id >= nrhs) { + return; + } + + if (!stop_status[global_id].has_stopped()) { + for (size_type row = 0; row < subspace_dim; row++) { + auto temp = f_values[row * f_stride + global_id]; + for (size_type col = 0; col < 
row; col++) { + temp -= m_values[row * m_stride + col * nrhs + global_id] * + c_values[col * c_stride + global_id]; + } + c_values[row * c_stride + global_id] = + temp / m_values[row * m_stride + row * nrhs + global_id]; + } + } +} + + +template +__global__ __launch_bounds__(default_block_size) void step_1_kernel( + size_type k, size_type num_rows, size_type subspace_dim, size_type nrhs, + const ValueType* __restrict__ residual_values, size_type residual_stride, + const ValueType* __restrict__ c_values, size_type c_stride, + const ValueType* __restrict__ g_values, size_type g_stride, + ValueType* __restrict__ v_values, size_type v_stride, + const stopping_status* __restrict__ stop_status) +{ + const auto global_id = thread::get_thread_id_flat(); + const auto row = global_id / nrhs; + const auto col = global_id % nrhs; + + if (row >= num_rows) { + return; + } + + if (!stop_status[col].has_stopped()) { + auto temp = residual_values[row * residual_stride + col]; + for (size_type j = k; j < subspace_dim; j++) { + temp -= c_values[j * c_stride + col] * + g_values[row * g_stride + j * nrhs + col]; + } + v_values[row * v_stride + col] = temp; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void step_2_kernel( + size_type k, size_type num_rows, size_type subspace_dim, size_type nrhs, + const ValueType* __restrict__ omega_values, + const ValueType* __restrict__ v_values, size_type v_stride, + const ValueType* __restrict__ c_values, size_type c_stride, + ValueType* __restrict__ u_values, size_type u_stride, + const stopping_status* __restrict__ stop_status) +{ + const auto global_id = thread::get_thread_id_flat(); + const auto row = global_id / nrhs; + const auto col = global_id % nrhs; + + if (row >= num_rows) { + return; + } + + if (!stop_status[col].has_stopped()) { + auto temp = omega_values[col] * v_values[row * v_stride + col]; + for (size_type j = k; j < subspace_dim; j++) { + temp += c_values[j * c_stride + col] * + u_values[row * u_stride + j 
* nrhs + col]; + } + u_values[row * u_stride + k * nrhs + col] = temp; + } +} + + +template +__global__ __launch_bounds__(default_dot_size) void multidot_kernel( + size_type num_rows, size_type nrhs, const ValueType* __restrict__ p_i, + const ValueType* __restrict__ g_k, size_type g_k_stride, + ValueType* __restrict__ alpha, + const stopping_status* __restrict__ stop_status) +{ + const auto tidx = threadIdx.x; + const auto tidy = threadIdx.y; + const auto rhs = blockIdx.x * default_dot_dim + tidx; + const auto num = ceildiv(num_rows, gridDim.y); + const auto start_row = blockIdx.y * num; + const auto end_row = + ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num; + // Used that way to get around dynamic initialization warning and + // template error when using `reduction_helper_array` directly in `reduce` + __shared__ + uninitialized_array + reduction_helper_array; + ValueType* __restrict__ reduction_helper = reduction_helper_array; + + ValueType local_res = zero(); + if (rhs < nrhs && !stop_status[rhs].has_stopped()) { + for (size_type i = start_row + tidy; i < end_row; + i += default_dot_dim) { + const auto g_idx = i * g_k_stride + rhs; + local_res += p_i[i] * g_k[g_idx]; + } + } + reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res; + __syncthreads(); + local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx]; + const auto tile_block = + group::tiled_partition(group::this_thread_block()); + const auto sum = + reduce(tile_block, local_res, + [](const ValueType& a, const ValueType& b) { return a + b; }); + const auto new_rhs = blockIdx.x * default_dot_dim + tidy; + if (tidx == 0 && new_rhs < nrhs && !stop_status[new_rhs].has_stopped()) { + atomic_add(alpha + new_rhs, sum); + } +} + + +template +__global__ __launch_bounds__(block_size) void update_g_k_and_u_kernel( + size_type k, size_type i, size_type size, size_type nrhs, + const ValueType* __restrict__ alpha, const ValueType* __restrict__ m_values, + size_type 
m_stride, const ValueType* __restrict__ g_values, + size_type g_stride, ValueType* __restrict__ g_k_values, + size_type g_k_stride, ValueType* __restrict__ u_values, size_type u_stride, + const stopping_status* __restrict__ stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + const auto row = tidx / g_k_stride; + const auto rhs = tidx % g_k_stride; + + if (row >= size || rhs >= nrhs) { + return; + } + + if (!stop_status[rhs].has_stopped()) { + const auto fact = alpha[rhs] / m_values[i * m_stride + i * nrhs + rhs]; + g_k_values[row * g_k_stride + rhs] -= + fact * g_values[row * g_stride + i * nrhs + rhs]; + u_values[row * u_stride + k * nrhs + rhs] -= + fact * u_values[row * u_stride + i * nrhs + rhs]; + } +} + + +template +__global__ __launch_bounds__(block_size) void update_g_kernel( + size_type k, size_type size, size_type nrhs, + const ValueType* __restrict__ g_k_values, size_type g_k_stride, + ValueType* __restrict__ g_values, size_type g_stride, + const stopping_status* __restrict__ stop_status) +{ + const auto tidx = thread::get_thread_id_flat(); + const auto row = tidx / g_k_stride; + const auto rhs = tidx % nrhs; + + if (row >= size || rhs >= nrhs) { + return; + } + + if (!stop_status[rhs].has_stopped()) { + g_values[row * g_stride + k * nrhs + rhs] = + g_k_values[row * g_k_stride + rhs]; + } +} + + +template +__global__ __launch_bounds__(default_block_size) void update_x_r_and_f_kernel( + size_type k, size_type size, size_type subspace_dim, size_type nrhs, + const ValueType* __restrict__ m_values, size_type m_stride, + const ValueType* __restrict__ g_values, size_type g_stride, + const ValueType* __restrict__ u_values, size_type u_stride, + ValueType* __restrict__ f_values, size_type f_stride, + ValueType* __restrict__ r_values, size_type r_stride, + ValueType* __restrict__ x_values, size_type x_stride, + const stopping_status* __restrict__ stop_status) +{ + const auto global_id = thread::get_thread_id_flat(); + const auto row = global_id / 
x_stride; + const auto col = global_id % x_stride; + + if (row >= size || col >= nrhs) { + return; + } + + if (!stop_status[col].has_stopped()) { + const auto beta = f_values[k * f_stride + col] / + m_values[k * m_stride + k * nrhs + col]; + r_values[row * r_stride + col] -= + beta * g_values[row * g_stride + k * nrhs + col]; + x_values[row * x_stride + col] += + beta * u_values[row * u_stride + k * nrhs + col]; + + if (k < row && k + 1 < subspace_dim && row < subspace_dim) { + f_values[row * f_stride + col] -= + beta * m_values[row * m_stride + k * nrhs + col]; + } + } +} + + +template +__global__ __launch_bounds__(config::warp_size) void compute_omega_kernel( + size_type nrhs, const remove_complex kappa, + const ValueType* __restrict__ tht, + const remove_complex* __restrict__ residual_norm, + ValueType* __restrict__ omega, + const stopping_status* __restrict__ stop_status) +{ + const auto global_id = thread::get_thread_id_flat(); + + if (global_id >= nrhs) { + return; + } + + if (!stop_status[global_id].has_stopped()) { + auto thr = omega[global_id]; + omega[global_id] /= tht[global_id]; + auto absrho = + abs(thr / (sqrt(real(tht[global_id])) * residual_norm[global_id])); + + if (absrho < kappa) { + omega[global_id] *= kappa / absrho; + } + } +} namespace { @@ -338,6 +649,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); } // namespace idr -} // namespace cuda +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko diff --git a/common/cuda_hip/solver/idr_kernels.hpp.inc b/common/cuda_hip/solver/idr_kernels.hpp.inc deleted file mode 100644 index 465417a6edb..00000000000 --- a/common/cuda_hip/solver/idr_kernels.hpp.inc +++ /dev/null @@ -1,318 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -template -__global__ __launch_bounds__(default_block_size) void initialize_m_kernel( - size_type subspace_dim, size_type nrhs, ValueType* __restrict__ 
m_values, - size_type m_stride, stopping_status* __restrict__ stop_status) -{ - const auto global_id = thread::get_thread_id_flat(); - const auto row = global_id / m_stride; - const auto col = global_id % m_stride; - - if (global_id < nrhs) { - stop_status[global_id].reset(); - } - - if (row < subspace_dim && col < nrhs * subspace_dim) { - m_values[row * m_stride + col] = - (row == col / nrhs) ? one() : zero(); - } -} - - -template -__global__ -__launch_bounds__(block_size) void orthonormalize_subspace_vectors_kernel( - size_type num_rows, size_type num_cols, ValueType* __restrict__ values, - size_type stride) -{ - const auto tidx = thread::get_thread_id_flat(); - - __shared__ uninitialized_array - reduction_helper_array; - // they are not be used in the same time. - ValueType* reduction_helper = reduction_helper_array; - auto reduction_helper_real = - reinterpret_cast*>(reduction_helper); - - for (size_type row = 0; row < num_rows; row++) { - for (size_type i = 0; i < row; i++) { - auto dot = zero(); - for (size_type j = tidx; j < num_cols; j += block_size) { - dot += values[row * stride + j] * conj(values[i * stride + j]); - } - - // Ensure already finish reading this shared memory - __syncthreads(); - reduction_helper[tidx] = dot; - reduce( - group::this_thread_block(), reduction_helper, - [](const ValueType& a, const ValueType& b) { return a + b; }); - __syncthreads(); - - dot = reduction_helper[0]; - for (size_type j = tidx; j < num_cols; j += block_size) { - values[row * stride + j] -= dot * values[i * stride + j]; - } - } - - auto norm = zero>(); - for (size_type j = tidx; j < num_cols; j += block_size) { - norm += squared_norm(values[row * stride + j]); - } - // Ensure already finish reading this shared memory - __syncthreads(); - reduction_helper_real[tidx] = norm; - reduce(group::this_thread_block(), reduction_helper_real, - [](const remove_complex& a, - const remove_complex& b) { return a + b; }); - __syncthreads(); - - norm = 
sqrt(reduction_helper_real[0]); - for (size_type j = tidx; j < num_cols; j += block_size) { - values[row * stride + j] /= norm; - } - } -} - - -template -__global__ -__launch_bounds__(default_block_size) void solve_lower_triangular_kernel( - size_type subspace_dim, size_type nrhs, - const ValueType* __restrict__ m_values, size_type m_stride, - const ValueType* __restrict__ f_values, size_type f_stride, - ValueType* __restrict__ c_values, size_type c_stride, - const stopping_status* __restrict__ stop_status) -{ - const auto global_id = thread::get_thread_id_flat(); - - if (global_id >= nrhs) { - return; - } - - if (!stop_status[global_id].has_stopped()) { - for (size_type row = 0; row < subspace_dim; row++) { - auto temp = f_values[row * f_stride + global_id]; - for (size_type col = 0; col < row; col++) { - temp -= m_values[row * m_stride + col * nrhs + global_id] * - c_values[col * c_stride + global_id]; - } - c_values[row * c_stride + global_id] = - temp / m_values[row * m_stride + row * nrhs + global_id]; - } - } -} - - -template -__global__ __launch_bounds__(default_block_size) void step_1_kernel( - size_type k, size_type num_rows, size_type subspace_dim, size_type nrhs, - const ValueType* __restrict__ residual_values, size_type residual_stride, - const ValueType* __restrict__ c_values, size_type c_stride, - const ValueType* __restrict__ g_values, size_type g_stride, - ValueType* __restrict__ v_values, size_type v_stride, - const stopping_status* __restrict__ stop_status) -{ - const auto global_id = thread::get_thread_id_flat(); - const auto row = global_id / nrhs; - const auto col = global_id % nrhs; - - if (row >= num_rows) { - return; - } - - if (!stop_status[col].has_stopped()) { - auto temp = residual_values[row * residual_stride + col]; - for (size_type j = k; j < subspace_dim; j++) { - temp -= c_values[j * c_stride + col] * - g_values[row * g_stride + j * nrhs + col]; - } - v_values[row * v_stride + col] = temp; - } -} - - -template -__global__ 
__launch_bounds__(default_block_size) void step_2_kernel( - size_type k, size_type num_rows, size_type subspace_dim, size_type nrhs, - const ValueType* __restrict__ omega_values, - const ValueType* __restrict__ v_values, size_type v_stride, - const ValueType* __restrict__ c_values, size_type c_stride, - ValueType* __restrict__ u_values, size_type u_stride, - const stopping_status* __restrict__ stop_status) -{ - const auto global_id = thread::get_thread_id_flat(); - const auto row = global_id / nrhs; - const auto col = global_id % nrhs; - - if (row >= num_rows) { - return; - } - - if (!stop_status[col].has_stopped()) { - auto temp = omega_values[col] * v_values[row * v_stride + col]; - for (size_type j = k; j < subspace_dim; j++) { - temp += c_values[j * c_stride + col] * - u_values[row * u_stride + j * nrhs + col]; - } - u_values[row * u_stride + k * nrhs + col] = temp; - } -} - - -template -__global__ __launch_bounds__(default_dot_size) void multidot_kernel( - size_type num_rows, size_type nrhs, const ValueType* __restrict__ p_i, - const ValueType* __restrict__ g_k, size_type g_k_stride, - ValueType* __restrict__ alpha, - const stopping_status* __restrict__ stop_status) -{ - const auto tidx = threadIdx.x; - const auto tidy = threadIdx.y; - const auto rhs = blockIdx.x * default_dot_dim + tidx; - const auto num = ceildiv(num_rows, gridDim.y); - const auto start_row = blockIdx.y * num; - const auto end_row = - ((blockIdx.y + 1) * num > num_rows) ? 
num_rows : (blockIdx.y + 1) * num; - // Used that way to get around dynamic initialization warning and - // template error when using `reduction_helper_array` directly in `reduce` - __shared__ - uninitialized_array - reduction_helper_array; - ValueType* __restrict__ reduction_helper = reduction_helper_array; - - ValueType local_res = zero(); - if (rhs < nrhs && !stop_status[rhs].has_stopped()) { - for (size_type i = start_row + tidy; i < end_row; - i += default_dot_dim) { - const auto g_idx = i * g_k_stride + rhs; - local_res += p_i[i] * g_k[g_idx]; - } - } - reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res; - __syncthreads(); - local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx]; - const auto tile_block = - group::tiled_partition(group::this_thread_block()); - const auto sum = - reduce(tile_block, local_res, - [](const ValueType& a, const ValueType& b) { return a + b; }); - const auto new_rhs = blockIdx.x * default_dot_dim + tidy; - if (tidx == 0 && new_rhs < nrhs && !stop_status[new_rhs].has_stopped()) { - atomic_add(alpha + new_rhs, sum); - } -} - - -template -__global__ __launch_bounds__(block_size) void update_g_k_and_u_kernel( - size_type k, size_type i, size_type size, size_type nrhs, - const ValueType* __restrict__ alpha, const ValueType* __restrict__ m_values, - size_type m_stride, const ValueType* __restrict__ g_values, - size_type g_stride, ValueType* __restrict__ g_k_values, - size_type g_k_stride, ValueType* __restrict__ u_values, size_type u_stride, - const stopping_status* __restrict__ stop_status) -{ - const auto tidx = thread::get_thread_id_flat(); - const auto row = tidx / g_k_stride; - const auto rhs = tidx % g_k_stride; - - if (row >= size || rhs >= nrhs) { - return; - } - - if (!stop_status[rhs].has_stopped()) { - const auto fact = alpha[rhs] / m_values[i * m_stride + i * nrhs + rhs]; - g_k_values[row * g_k_stride + rhs] -= - fact * g_values[row * g_stride + i * nrhs + rhs]; - u_values[row * u_stride + k * nrhs + 
rhs] -= - fact * u_values[row * u_stride + i * nrhs + rhs]; - } -} - - -template -__global__ __launch_bounds__(block_size) void update_g_kernel( - size_type k, size_type size, size_type nrhs, - const ValueType* __restrict__ g_k_values, size_type g_k_stride, - ValueType* __restrict__ g_values, size_type g_stride, - const stopping_status* __restrict__ stop_status) -{ - const auto tidx = thread::get_thread_id_flat(); - const auto row = tidx / g_k_stride; - const auto rhs = tidx % nrhs; - - if (row >= size || rhs >= nrhs) { - return; - } - - if (!stop_status[rhs].has_stopped()) { - g_values[row * g_stride + k * nrhs + rhs] = - g_k_values[row * g_k_stride + rhs]; - } -} - - -template -__global__ __launch_bounds__(default_block_size) void update_x_r_and_f_kernel( - size_type k, size_type size, size_type subspace_dim, size_type nrhs, - const ValueType* __restrict__ m_values, size_type m_stride, - const ValueType* __restrict__ g_values, size_type g_stride, - const ValueType* __restrict__ u_values, size_type u_stride, - ValueType* __restrict__ f_values, size_type f_stride, - ValueType* __restrict__ r_values, size_type r_stride, - ValueType* __restrict__ x_values, size_type x_stride, - const stopping_status* __restrict__ stop_status) -{ - const auto global_id = thread::get_thread_id_flat(); - const auto row = global_id / x_stride; - const auto col = global_id % x_stride; - - if (row >= size || col >= nrhs) { - return; - } - - if (!stop_status[col].has_stopped()) { - const auto beta = f_values[k * f_stride + col] / - m_values[k * m_stride + k * nrhs + col]; - r_values[row * r_stride + col] -= - beta * g_values[row * g_stride + k * nrhs + col]; - x_values[row * x_stride + col] += - beta * u_values[row * u_stride + k * nrhs + col]; - - if (k < row && k + 1 < subspace_dim && row < subspace_dim) { - f_values[row * f_stride + col] -= - beta * m_values[row * m_stride + k * nrhs + col]; - } - } -} - - -template -__global__ __launch_bounds__(config::warp_size) void 
compute_omega_kernel( - size_type nrhs, const remove_complex kappa, - const ValueType* __restrict__ tht, - const remove_complex* __restrict__ residual_norm, - ValueType* __restrict__ omega, - const stopping_status* __restrict__ stop_status) -{ - const auto global_id = thread::get_thread_id_flat(); - - if (global_id >= nrhs) { - return; - } - - if (!stop_status[global_id].has_stopped()) { - auto thr = omega[global_id]; - omega[global_id] /= tht[global_id]; - auto absrho = - abs(thr / (sqrt(real(tht[global_id])) * residual_norm[global_id])); - - if (absrho < kappa) { - omega[global_id] *= kappa / absrho; - } - } -} diff --git a/common/cuda_hip/solver/multigrid_kernels.hpp.inc b/common/cuda_hip/solver/multigrid_kernels.cpp similarity index 89% rename from common/cuda_hip/solver/multigrid_kernels.hpp.inc rename to common/cuda_hip/solver/multigrid_kernels.cpp index 98b1fcfeff4..e3ccc923b2c 100644 --- a/common/cuda_hip/solver/multigrid_kernels.hpp.inc +++ b/common/cuda_hip/solver/multigrid_kernels.cpp @@ -2,6 +2,34 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#include +#include +#include + + +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/base/array_access.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/solver/multigrid_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The MULTIGRID solver namespace. 
+ * + * @ingroup multigrid + */ +namespace multigrid { + + +constexpr int default_block_size = 512; + + namespace kernel { @@ -171,3 +199,9 @@ void kcycle_check_stop(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE( GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL); + + +} // namespace multigrid +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/cuda_hip/stop/batch_criteria.hpp.inc b/common/cuda_hip/stop/batch_criteria.hpp similarity index 75% rename from common/cuda_hip/stop/batch_criteria.hpp.inc rename to common/cuda_hip/stop/batch_criteria.hpp index 38072467765..7491a143a31 100644 --- a/common/cuda_hip/stop/batch_criteria.hpp.inc +++ b/common/cuda_hip/stop/batch_criteria.hpp @@ -2,6 +2,19 @@ // // SPDX-License-Identifier: BSD-3-Clause +#ifndef GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_ +#define GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_ + + +#include + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace batch_stop { + + /** * @see reference/stop/batch_criteria.hpp */ @@ -49,3 +62,11 @@ class SimpleAbsResidual { private: const real_type abs_tol_; }; + + +} // namespace batch_stop +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko + +#endif // GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_ diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 88ae83e9005..11c00a1f8e1 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -7,9 +7,7 @@ add_instantiation_files(. 
matrix/fbcsr_kernels.instantiate.cu FBCSR_INSTANTIATE) list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) target_sources(ginkgo_cuda PRIVATE - base/batch_multi_vector_kernels.cu base/device.cpp - base/device_matrix_data_kernels.cu base/exception.cpp base/executor.cpp base/index_set_kernels.cpp @@ -19,59 +17,32 @@ target_sources(ginkgo_cuda base/stream.cpp base/timer.cpp base/version.cpp - components/prefix_sum_kernels.cu - distributed/index_map_kernels.cu - distributed/matrix_kernels.cu - distributed/partition_helpers_kernels.cu - distributed/partition_kernels.cu - distributed/vector_kernels.cu - factorization/cholesky_kernels.cu - factorization/factorization_kernels.cu factorization/ic_kernels.cu factorization/ilu_kernels.cu - factorization/lu_kernels.cu - factorization/par_ic_kernels.cu - factorization/par_ict_kernels.cu - factorization/par_ilu_kernels.cu factorization/par_ilut_approx_filter_kernel.cu factorization/par_ilut_filter_kernel.cu factorization/par_ilut_select_common.cu factorization/par_ilut_select_kernel.cu factorization/par_ilut_spgeam_kernel.cu factorization/par_ilut_sweep_kernel.cu - matrix/batch_csr_kernels.cu - matrix/batch_dense_kernels.cu - matrix/batch_ell_kernels.cu - matrix/coo_kernels.cu ${CSR_INSTANTIATE} - matrix/dense_kernels.cu - matrix/diagonal_kernels.cu - matrix/ell_kernels.cu ${FBCSR_INSTANTIATE} matrix/fft_kernels.cu - matrix/sellp_kernels.cu - matrix/sparsity_csr_kernels.cu - multigrid/pgm_kernels.cu preconditioner/batch_jacobi_kernels.cu - preconditioner/isai_kernels.cu preconditioner/jacobi_advanced_apply_kernel.cu preconditioner/jacobi_generate_kernel.cu - preconditioner/jacobi_kernels.cu preconditioner/jacobi_simple_apply_kernel.cu - reorder/rcm_kernels.cu solver/batch_bicgstab_kernels.cu solver/batch_cg_kernels.cu - solver/cb_gmres_kernels.cu - solver/idr_kernels.cu solver/lower_trs_kernels.cu - solver/multigrid_kernels.cu solver/upper_trs_kernels.cu 
stop/criterion_kernels.cu stop/residual_norm_kernels.cu ${GKO_UNIFIED_COMMON_SOURCES} + ${GKO_CUDA_HIP_COMMON_SOURCES} ) # override the default language mapping for the common files, set them to CUDA -foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES) +foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES) set_source_files_properties(${source_file} PROPERTIES LANGUAGE CUDA) endforeach(source_file) if(GINKGO_JACOBI_FULL_OPTIMIZATIONS) diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu deleted file mode 100644 index 0e42278740e..00000000000 --- a/cuda/base/batch_multi_vector_kernels.cu +++ /dev/null @@ -1,59 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/base/batch_multi_vector_kernels.hpp" - - -#include -#include - - -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/batch_struct.hpp" -#include "cuda/base/batch_struct.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/base/pointer_mode_guard.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The MultiVector matrix format namespace. 
- * - * @ingroup batch_multi_vector - */ -namespace batch_multi_vector { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" - - -#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_multi_vector -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/base/device_matrix_data_kernels.cu b/cuda/base/device_matrix_data_kernels.cu deleted file mode 100644 index ed5601f57a5..00000000000 --- a/cuda/base/device_matrix_data_kernels.cu +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/base/device_matrix_data_kernels.hpp" - - -#include -#include -#include -#include -#include -#include -#include - - -#include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace components { - - -#include "common/cuda_hip/base/device_matrix_data_kernels.hpp.inc" - - -} // namespace components -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/base/kernel_launch.cuh b/cuda/base/kernel_launch.cuh deleted file mode 100644 index 7b929b9ba7c..00000000000 --- a/cuda/base/kernel_launch.cuh +++ /dev/null @@ -1,57 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_HPP_ -#error \ - "This file can only be used from inside common/unified/base/kernel_launch.hpp" -#endif - - -#include - - -#include "accessor/cuda_helper.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { - - -template 
-struct to_device_type_impl&> { - using type = std::decay_t>()))>; - static type map_to_device(gko::acc::range& range) - { - return gko::acc::as_cuda_range(range); - } -}; - -template -struct to_device_type_impl&> { - using type = std::decay_t>()))>; - static type map_to_device(const gko::acc::range& range) - { - return gko::acc::as_cuda_range(range); - } -}; - - -namespace device_std = thrust; - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/base/kernel_launch.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh deleted file mode 100644 index 6146d7248d0..00000000000 --- a/cuda/base/kernel_launch_reduction.cuh +++ /dev/null @@ -1,28 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ -#error \ - "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" -#endif - - -#include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/base/kernel_launch_reduction.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/base/kernel_launch_solver.cuh b/cuda/base/kernel_launch_solver.cuh deleted file mode 100644 index 0d9eaeb2653..00000000000 --- a/cuda/base/kernel_launch_solver.cuh +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_SOLVER_HPP_ -#error \ - "This file can only be used from inside common/unified/base/kernel_launch_solver.hpp" -#endif - - -#include 
"common/cuda_hip/base/runtime.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/base/kernel_launch_solver.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/base/math.hpp b/cuda/base/math.hpp deleted file mode 100644 index d86a85a083e..00000000000 --- a/cuda/base/math.hpp +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_BASE_MATH_HPP_ -#define GKO_CUDA_BASE_MATH_HPP_ - - -#include - - -#include - - -namespace gko { - - -#include "common/cuda_hip/base/math.hpp.inc" - - -} // namespace gko - - -#endif // GKO_CUDA_BASE_MATH_HPP_ diff --git a/cuda/components/atomic.cuh b/cuda/components/atomic.cuh deleted file mode 100644 index ad76dd0e0ce..00000000000 --- a/cuda/components/atomic.cuh +++ /dev/null @@ -1,29 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_ATOMIC_CUH_ -#define GKO_CUDA_COMPONENTS_ATOMIC_CUH_ - - -#include - - -#include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/atomic.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_ATOMIC_CUH_ diff --git a/cuda/components/diagonal_block_manipulation.cuh b/cuda/components/diagonal_block_manipulation.cuh deleted file mode 100644 index d748fcab2e5..00000000000 --- a/cuda/components/diagonal_block_manipulation.cuh +++ /dev/null @@ -1,32 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_CUH_ -#define GKO_CUDA_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_CUH_ - - -#include - - -#include "cuda/base/config.hpp" -#include "cuda/base/types.hpp" -#include 
"cuda/components/cooperative_groups.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace csr { - - -#include "common/cuda_hip/components/diagonal_block_manipulation.hpp.inc" - - -} // namespace csr -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_CUH_ diff --git a/cuda/components/intrinsics.cuh b/cuda/components/intrinsics.cuh deleted file mode 100644 index d35043c34ce..00000000000 --- a/cuda/components/intrinsics.cuh +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_INTRINSICS_CUH_ -#define GKO_CUDA_COMPONENTS_INTRINSICS_CUH_ - - -#include - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/intrinsics.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_INTRINSICS_CUH_ diff --git a/cuda/components/merging.cuh b/cuda/components/merging.cuh deleted file mode 100644 index 3c7f5e52d47..00000000000 --- a/cuda/components/merging.cuh +++ /dev/null @@ -1,28 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_MERGING_CUH_ -#define GKO_CUDA_COMPONENTS_MERGING_CUH_ - - -#include "core/base/utils.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/searching.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/merging.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_MERGING_CUH_ diff --git a/cuda/components/prefix_sum.cuh b/cuda/components/prefix_sum.cuh deleted file mode 100644 index 653de4e9e15..00000000000 --- a/cuda/components/prefix_sum.cuh +++ /dev/null @@ -1,31 +0,0 @@ -// 
SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_PREFIX_SUM_CUH_ -#define GKO_CUDA_COMPONENTS_PREFIX_SUM_CUH_ - - -#include - - -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/prefix_sum.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_PREFIX_SUM_CUH_ diff --git a/cuda/components/prefix_sum_kernels.cu b/cuda/components/prefix_sum_kernels.cu deleted file mode 100644 index d330ce0a2b0..00000000000 --- a/cuda/components/prefix_sum_kernels.cu +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/components/prefix_sum_kernels.hpp" - - -#include - - -#include - - -#include -#include -#include - - -#include "cuda/base/thrust.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace components { - - -#include "common/cuda_hip/components/prefix_sum_kernels.hpp.inc" - - -} // namespace components -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh deleted file mode 100644 index e53e1451d7f..00000000000 --- a/cuda/components/reduction.cuh +++ /dev/null @@ -1,82 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_REDUCTION_CUH_ -#define GKO_CUDA_COMPONENTS_REDUCTION_CUH_ - - -#include - - -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/array_access.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" 
-#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { - - -constexpr int default_reduce_block_size = 512; - - -#include "common/cuda_hip/components/reduction.hpp.inc" - - -/** - * Compute a reduction using add operation (+). - * - * @param exec Executor associated to the array - * @param size size of the array - * @param source the pointer of the array - * - * @return the reduction result - */ -template -__host__ ValueType reduce_add_array(std::shared_ptr exec, - size_type size, const ValueType* source) -{ - auto block_results_val = source; - size_type grid_dim = size; - auto block_results = array(exec); - if (size > default_reduce_block_size) { - const auto n = ceildiv(size, default_reduce_block_size); - grid_dim = - (n <= default_reduce_block_size) ? n : default_reduce_block_size; - - block_results.resize_and_reset(grid_dim); - - reduce_add_array<<get_stream()>>>( - size, as_device_type(source), - as_device_type(block_results.get_data())); - - block_results_val = block_results.get_const_data(); - } - - auto d_result = array(exec, 1); - - reduce_add_array<<<1, default_reduce_block_size, 0, exec->get_stream()>>>( - grid_dim, as_device_type(block_results_val), - as_device_type(d_result.get_data())); - auto answer = get_element(d_result, 0); - return answer; -} - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_REDUCTION_CUH_ diff --git a/cuda/components/searching.cuh b/cuda/components/searching.cuh deleted file mode 100644 index 1dc1304a82a..00000000000 --- a/cuda/components/searching.cuh +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_SEARCHING_CUH_ -#define GKO_CUDA_COMPONENTS_SEARCHING_CUH_ - - -#include "cuda/base/config.hpp" -#include "cuda/components/intrinsics.cuh" - - -namespace gko { 
-namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/searching.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_SEARCHING_CUH_ diff --git a/cuda/components/segment_scan.cuh b/cuda/components/segment_scan.cuh deleted file mode 100644 index 842f1e06760..00000000000 --- a/cuda/components/segment_scan.cuh +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_SEGMENT_SCAN_CUH_ -#define GKO_CUDA_COMPONENTS_SEGMENT_SCAN_CUH_ - - -#include "cuda/components/cooperative_groups.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/segment_scan.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_SEGMENT_SCAN_CUH_ diff --git a/cuda/components/sorting.cuh b/cuda/components/sorting.cuh deleted file mode 100644 index e6eb17ec8e4..00000000000 --- a/cuda/components/sorting.cuh +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_SORTING_CUH_ -#define GKO_CUDA_COMPONENTS_SORTING_CUH_ - - -#include "cuda/base/config.hpp" -#include "cuda/components/cooperative_groups.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/sorting.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_SORTING_CUH_ diff --git a/cuda/components/syncfree.cuh b/cuda/components/syncfree.cuh deleted file mode 100644 index 0d45c8db516..00000000000 --- a/cuda/components/syncfree.cuh +++ /dev/null @@ -1,32 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_SYNCFREE_CUH_ -#define 
GKO_CUDA_COMPONENTS_SYNCFREE_CUH_ - - -#include - - -#include "core/components/fill_array_kernels.hpp" -#include "cuda/base/config.hpp" -#include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/memory.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/syncfree.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_SYNCFREE_CUH_ diff --git a/cuda/components/thread_ids.cuh b/cuda/components/thread_ids.cuh deleted file mode 100644 index 965053dd3b9..00000000000 --- a/cuda/components/thread_ids.cuh +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_THREAD_IDS_CUH_ -#define GKO_CUDA_COMPONENTS_THREAD_IDS_CUH_ - - -#include "cuda/base/config.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace thread { - - -#include "common/cuda_hip/components/thread_ids.hpp.inc" - - -} // namespace thread -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_THREAD_IDS_CUH_ diff --git a/cuda/components/uninitialized_array.hpp b/cuda/components/uninitialized_array.hpp deleted file mode 100644 index b98c812c16d..00000000000 --- a/cuda/components/uninitialized_array.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ -#define GKO_CUDA_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ - - -#include - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/uninitialized_array.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ diff --git a/cuda/components/warp_blas.cuh 
b/cuda/components/warp_blas.cuh deleted file mode 100644 index fa5e3d3ae3b..00000000000 --- a/cuda/components/warp_blas.cuh +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_COMPONENTS_WARP_BLAS_CUH_ -#define GKO_CUDA_COMPONENTS_WARP_BLAS_CUH_ - - -#include -#include - - -#include - - -#include "cuda/base/math.hpp" -#include "cuda/components/reduction.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { - - -#include "common/cuda_hip/components/warp_blas.hpp.inc" - - -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_COMPONENTS_WARP_BLAS_CUH_ diff --git a/cuda/distributed/index_map_kernels.cu b/cuda/distributed/index_map_kernels.cu deleted file mode 100644 index a5d838e901f..00000000000 --- a/cuda/distributed/index_map_kernels.cu +++ /dev/null @@ -1,42 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/index_map_kernels.hpp" - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include - - -#include "cuda/base/thrust.cuh" -#include "cuda/components/atomic.cuh" -#include "cuda/components/searching.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace index_map { - - -#include "common/cuda_hip/distributed/index_map_kernels.hpp.inc" - - -} // namespace index_map -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/distributed/matrix_kernels.cu b/cuda/distributed/matrix_kernels.cu deleted file mode 100644 index 3ad815d7090..00000000000 --- a/cuda/distributed/matrix_kernels.cu +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/matrix_kernels.hpp" - - -#include -#include -#include 
-#include -#include -#include -#include -#include -#include -#include -#include - - -#include - - -#include "cuda/base/thrust.cuh" -#include "cuda/components/atomic.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace distributed_matrix { - - -#include "common/cuda_hip/distributed/matrix_kernels.hpp.inc" - - -} // namespace distributed_matrix -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/distributed/partition_helpers_kernels.cu b/cuda/distributed/partition_helpers_kernels.cu deleted file mode 100644 index b478477ce18..00000000000 --- a/cuda/distributed/partition_helpers_kernels.cu +++ /dev/null @@ -1,29 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/partition_helpers_kernels.hpp" - - -#include -#include -#include -#include - - -#include "cuda/base/thrust.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace partition_helpers { - - -#include "common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc" - - -} // namespace partition_helpers -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/distributed/partition_kernels.cu b/cuda/distributed/partition_kernels.cu deleted file mode 100644 index de6c5bc6c02..00000000000 --- a/cuda/distributed/partition_kernels.cu +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/partition_kernels.hpp" - - -#include -#include -#include -#include -#include -#include - - -#include "common/unified/base/kernel_launch.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "cuda/base/thrust.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace partition { - - -#include "common/cuda_hip/distributed/partition_kernels.hpp.inc" - - -} // namespace partition -} // namespace cuda -} // namespace 
kernels -} // namespace gko diff --git a/cuda/distributed/vector_kernels.cu b/cuda/distributed/vector_kernels.cu deleted file mode 100644 index 7b06ada9f0e..00000000000 --- a/cuda/distributed/vector_kernels.cu +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/vector_kernels.hpp" - - -#include - - -#include -#include -#include -#include -#include -#include - - -#include - - -#include "cuda/base/thrust.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -namespace distributed_vector { - - -#include "common/cuda_hip/distributed/vector_kernels.hpp.inc" - - -} // namespace distributed_vector -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/factorization/cholesky_kernels.cu b/cuda/factorization/cholesky_kernels.cu deleted file mode 100644 index e4ff3f4d4d5..00000000000 --- a/cuda/factorization/cholesky_kernels.cu +++ /dev/null @@ -1,115 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/cholesky_kernels.hpp" - - -#include -#include - - -#include -#include -#include -#include -#include -#include - - -#include - - -#include "core/components/fill_array_kernels.hpp" -#include "core/components/format_conversion_kernels.hpp" -#include "core/factorization/elimination_forest.hpp" -#include "core/factorization/lu_kernels.hpp" -#include "core/matrix/csr_lookup.hpp" -#include "cuda/base/cusparse_bindings.hpp" -#include "cuda/base/math.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/syncfree.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Cholesky namespace. 
- * - * @ingroup factor - */ -namespace cholesky { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/factorization/cholesky_kernels.hpp.inc" - - -template -void symbolic_count(std::shared_ptr exec, - const matrix::Csr* mtx, - const factorization::elimination_forest& forest, - IndexType* row_nnz, array& tmp_storage) -{ - const auto num_rows = static_cast(mtx->get_size()[0]); - if (num_rows == 0) { - return; - } - const auto mtx_nnz = static_cast(mtx->get_num_stored_elements()); - tmp_storage.resize_and_reset(mtx_nnz + num_rows); - const auto postorder_cols = tmp_storage.get_data(); - const auto lower_ends = postorder_cols + mtx_nnz; - const auto row_ptrs = mtx->get_const_row_ptrs(); - const auto cols = mtx->get_const_col_idxs(); - const auto inv_postorder = forest.inv_postorder.get_const_data(); - const auto postorder_parent = forest.postorder_parents.get_const_data(); - // transform col indices to postorder indices - { - const auto num_blocks = ceildiv(num_rows, default_block_size); - kernel::build_postorder_cols<<get_stream()>>>( - num_rows, cols, row_ptrs, inv_postorder, postorder_cols, - lower_ends); - } - // sort postorder_cols inside rows - { - const auto handle = exec->get_sparselib_handle(); - auto descr = sparselib::create_mat_descr(); - array permutation_array(exec, mtx_nnz); - auto permutation = permutation_array.get_data(); - components::fill_seq_array(exec, permutation, mtx_nnz); - size_type buffer_size{}; - sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz, - row_ptrs, postorder_cols, buffer_size); - array buffer_array{exec, buffer_size}; - auto buffer = buffer_array.get_data(); - sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs, - postorder_cols, permutation, buffer); - sparselib::destroy(descr); - } - // count nonzeros per row of L - { - const auto num_blocks = - ceildiv(num_rows, default_block_size / config::warp_size); - kernel::symbolic_count - <<get_stream()>>>( - num_rows, 
row_ptrs, lower_ends, inv_postorder, postorder_cols, - postorder_parent, row_nnz); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT); - - -} // namespace cholesky -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/factorization/factorization_kernels.cu b/cuda/factorization/factorization_kernels.cu deleted file mode 100644 index ac5c14481e9..00000000000 --- a/cuda/factorization/factorization_kernels.cu +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/factorization_kernels.hpp" - - -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/array_access.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/csr_builder.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/searching.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The factorization namespace. 
- * - * @ingroup factor - */ -namespace factorization { - - -constexpr int default_block_size{512}; - - -#include "common/cuda_hip/factorization/factorization_kernels.hpp.inc" - - -} // namespace factorization -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/factorization/lu_kernels.cu b/cuda/factorization/lu_kernels.cu deleted file mode 100644 index 583bf51fb67..00000000000 --- a/cuda/factorization/lu_kernels.cu +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/lu_kernels.hpp" - - -#include -#include - - -#include -#include -#include - - -#include - - -#include "core/base/allocator.hpp" -#include "core/matrix/csr_lookup.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/syncfree.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The LU namespace. 
- * - * @ingroup factor - */ -namespace lu_factorization { - - -constexpr static int default_block_size = 512; - - -#include "common/cuda_hip/factorization/lu_kernels.hpp.inc" - - -} // namespace lu_factorization -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/factorization/par_ic_kernels.cu b/cuda/factorization/par_ic_kernels.cu deleted file mode 100644 index a9de634f1f9..00000000000 --- a/cuda/factorization/par_ic_kernels.cu +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ic_kernels.hpp" - - -#include -#include -#include - - -#include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/memory.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The parallel ic factorization namespace. - * - * @ingroup factor - */ -namespace par_ic_factorization { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/factorization/par_ic_kernels.hpp.inc" - - -} // namespace par_ic_factorization -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu deleted file mode 100644 index 9285e786adf..00000000000 --- a/cuda/factorization/par_ict_kernels.cu +++ /dev/null @@ -1,189 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ict_kernels.hpp" - - -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/coo_builder.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/math.hpp" -#include 
"cuda/components/intrinsics.cuh" -#include "cuda/components/memory.cuh" -#include "cuda/components/merging.cuh" -#include "cuda/components/prefix_sum.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/searching.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The parallel ICT factorization namespace. - * - * @ingroup factor - */ -namespace par_ict_factorization { - - -constexpr int default_block_size = 512; - - -// subwarp sizes for all warp-parallel kernels (filter, add_candidates) -using compiled_kernels = - syn::value_list; - - -#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc" - - -namespace { - - -template -void add_candidates(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* llh, - const matrix::Csr* a, - const matrix::Csr* l, - matrix::Csr* l_new) -{ - auto num_rows = static_cast(llh->get_size()[0]); - auto subwarps_per_block = default_block_size / subwarp_size; - auto num_blocks = ceildiv(num_rows, subwarps_per_block); - matrix::CsrBuilder l_new_builder(l_new); - auto llh_row_ptrs = llh->get_const_row_ptrs(); - auto llh_col_idxs = llh->get_const_col_idxs(); - auto llh_vals = llh->get_const_values(); - auto a_row_ptrs = a->get_const_row_ptrs(); - auto a_col_idxs = a->get_const_col_idxs(); - auto a_vals = a->get_const_values(); - auto l_row_ptrs = l->get_const_row_ptrs(); - auto l_col_idxs = l->get_const_col_idxs(); - auto l_vals = l->get_const_values(); - auto l_new_row_ptrs = l_new->get_row_ptrs(); - // count non-zeros per row - if (num_blocks > 0) { - kernel::ict_tri_spgeam_nnz - <<get_stream()>>>( - llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs, - l_new_row_ptrs, num_rows); - } - - // build row ptrs - components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1); - - // resize output arrays - auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); - 
l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); - l_new_builder.get_value_array().resize_and_reset(l_new_nnz); - - auto l_new_col_idxs = l_new->get_col_idxs(); - auto l_new_vals = l_new->get_values(); - - // fill columns and values - if (num_blocks > 0) { - kernel::ict_tri_spgeam_init - <<get_stream()>>>( - llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals), - a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs, - l_col_idxs, as_device_type(l_vals), l_new_row_ptrs, - l_new_col_idxs, as_device_type(l_new_vals), num_rows); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); - - -template -void compute_factor(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* a, - matrix::Csr* l, - const matrix::Coo* l_coo) -{ - auto total_nnz = static_cast(l->get_num_stored_elements()); - auto block_size = default_block_size / subwarp_size; - auto num_blocks = ceildiv(total_nnz, block_size); - if (num_blocks > 0) { - kernel::ict_sweep - <<get_stream()>>>( - a->get_const_row_ptrs(), a->get_const_col_idxs(), - as_device_type(a->get_const_values()), l->get_const_row_ptrs(), - l_coo->get_const_row_idxs(), l->get_const_col_idxs(), - as_device_type(l->get_values()), - static_cast(l->get_num_stored_elements())); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor); - - -} // namespace - - -template -void add_candidates(std::shared_ptr exec, - const matrix::Csr* llh, - const matrix::Csr* a, - const matrix::Csr* l, - matrix::Csr* l_new) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = - llh->get_num_stored_elements() + a->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_add_candidates( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, llh, a, l, l_new); -} - 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); - - -template -void compute_factor(std::shared_ptr exec, - const matrix::Csr* a, - matrix::Csr* l, - const matrix::Coo* l_coo) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = 2 * l->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_compute_factor( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); - - -} // namespace par_ict_factorization -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu deleted file mode 100644 index 11c1ab1b3e2..00000000000 --- a/cuda/factorization/par_ilu_kernels.cu +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ilu_kernels.hpp" - - -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/memory.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The parallel ilu factorization namespace. 
- * - * @ingroup factor - */ -namespace par_ilu_factorization { - - -constexpr int default_block_size{512}; - - -#include "common/cuda_hip/factorization/par_ilu_kernels.hpp.inc" - - -} // namespace par_ilu_factorization -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/factorization/par_ilut_filter_kernels.cu b/cuda/factorization/par_ilut_filter_kernels.cu deleted file mode 100644 index ddd4b428d55..00000000000 --- a/cuda/factorization/par_ilut_filter_kernels.cu +++ /dev/null @@ -1,140 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ilut_kernels.hpp" - - -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/coo_builder.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The parallel ILUT factorization namespace. 
- * - * @ingroup factor - */ -namespace par_ilut_factorization { - - -constexpr int default_block_size = 512; - - -// subwarp sizes for filter kernels -using compiled_kernels = - syn::value_list; - - -#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" - - -namespace { - - -template -void threshold_filter(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* a, - remove_complex threshold, - matrix::Csr* m_out, - matrix::Coo* m_out_coo, bool lower) -{ - auto old_row_ptrs = a->get_const_row_ptrs(); - auto old_col_idxs = a->get_const_col_idxs(); - auto old_vals = a->get_const_values(); - // compute nnz for each row - auto num_rows = static_cast(a->get_size()[0]); - auto block_size = default_block_size / subwarp_size; - auto num_blocks = ceildiv(num_rows, block_size); - auto new_row_ptrs = m_out->get_row_ptrs(); - if (num_blocks > 0) { - kernel::threshold_filter_nnz - <<get_stream()>>>( - old_row_ptrs, as_device_type(old_vals), num_rows, - as_device_type(threshold), new_row_ptrs, lower); - } - - // build row pointers - components::prefix_sum_nonnegative(exec, new_row_ptrs, num_rows + 1); - - // build matrix - auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows); - // resize arrays and update aliases - matrix::CsrBuilder builder{m_out}; - builder.get_col_idx_array().resize_and_reset(new_nnz); - builder.get_value_array().resize_and_reset(new_nnz); - auto new_col_idxs = m_out->get_col_idxs(); - auto new_vals = m_out->get_values(); - IndexType* new_row_idxs{}; - if (m_out_coo) { - matrix::CooBuilder coo_builder{m_out_coo}; - coo_builder.get_row_idx_array().resize_and_reset(new_nnz); - coo_builder.get_col_idx_array() = - make_array_view(exec, new_nnz, new_col_idxs); - coo_builder.get_value_array() = - make_array_view(exec, new_nnz, new_vals); - new_row_idxs = m_out_coo->get_row_idxs(); - } - if (num_blocks > 0) { - kernel::threshold_filter - <<get_stream()>>>( - old_row_ptrs, old_col_idxs, as_device_type(old_vals), num_rows, - 
as_device_type(threshold), new_row_ptrs, new_row_idxs, - new_col_idxs, as_device_type(new_vals), lower); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter, threshold_filter); - - -} // namespace - -template -void threshold_filter(std::shared_ptr exec, - const matrix::Csr* a, - remove_complex threshold, - matrix::Csr* m_out, - matrix::Coo* m_out_coo, bool lower) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = a->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_threshold_filter( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, a, threshold, m_out, - m_out_coo, lower); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL); - - -} // namespace par_ilut_factorization -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/factorization/par_ilut_select_kernels.cu b/cuda/factorization/par_ilut_select_kernels.cu deleted file mode 100644 index 6a7bd53c1c4..00000000000 --- a/cuda/factorization/par_ilut_select_kernels.cu +++ /dev/null @@ -1,162 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ilut_kernels.hpp" - - -#include - - -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/atomic.cuh" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/prefix_sum.cuh" -#include "cuda/components/searching.cuh" -#include "cuda/components/sorting.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/factorization/par_ilut_select_common.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The parallel ILUT 
factorization namespace. - * - * @ingroup factor - */ -namespace par_ilut_factorization { - - -#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc" - - -template -void sampleselect_filter(std::shared_ptr exec, - const ValueType* values, IndexType size, - const unsigned char* oracles, - const IndexType* partial_counts, IndexType bucket, - remove_complex* out) -{ - auto num_threads_total = ceildiv(size, items_per_thread); - auto num_blocks = - static_cast(ceildiv(num_threads_total, default_block_size)); - if (num_blocks > 0) { - kernel::filter_bucket<<get_stream()>>>( - as_device_type(values), size, bucket, oracles, partial_counts, - as_device_type(out), items_per_thread); - } -} - - -template -void threshold_select(std::shared_ptr exec, - const matrix::Csr* m, - IndexType rank, array& tmp1, - array>& tmp2, - remove_complex& threshold) -{ - auto values = m->get_const_values(); - IndexType size = m->get_num_stored_elements(); - using AbsType = remove_complex; - constexpr auto bucket_count = kernel::searchtree_width; - auto max_num_threads = ceildiv(size, items_per_thread); - auto max_num_blocks = ceildiv(max_num_threads, default_block_size); - - size_type tmp_size_totals = - ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType)); - size_type tmp_size_partials = ceildiv( - bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType)); - size_type tmp_size_oracles = - ceildiv(size * sizeof(unsigned char), sizeof(ValueType)); - size_type tmp_size_tree = - ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType)); - size_type tmp_size_vals = - size / bucket_count * 4; // pessimistic estimate for temporary storage - size_type tmp_size = - tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree; - tmp1.resize_and_reset(tmp_size); - tmp2.resize_and_reset(tmp_size_vals); - - auto total_counts = reinterpret_cast(tmp1.get_data()); - auto partial_counts = - reinterpret_cast(tmp1.get_data() + tmp_size_totals); - 
auto oracles = reinterpret_cast( - tmp1.get_data() + tmp_size_totals + tmp_size_partials); - auto tree = - reinterpret_cast(tmp1.get_data() + tmp_size_totals + - tmp_size_partials + tmp_size_oracles); - - sampleselect_count(exec, values, size, tree, oracles, partial_counts, - total_counts); - - // determine bucket with correct rank, use bucket-local rank - auto bucket = sampleselect_find_bucket(exec, total_counts, rank); - rank -= bucket.begin; - - if (bucket.size * 2 > tmp_size_vals) { - // we need to reallocate tmp2 - tmp2.resize_and_reset(bucket.size * 2); - } - auto tmp21 = tmp2.get_data(); - auto tmp22 = tmp2.get_data() + bucket.size; - // extract target bucket - sampleselect_filter(exec, values, size, oracles, partial_counts, bucket.idx, - tmp22); - - // recursively select from smaller buckets - int step{}; - while (bucket.size > kernel::basecase_size) { - std::swap(tmp21, tmp22); - const auto* tmp_in = tmp21; - auto tmp_out = tmp22; - - sampleselect_count(exec, tmp_in, bucket.size, tree, oracles, - partial_counts, total_counts); - auto new_bucket = sampleselect_find_bucket(exec, total_counts, rank); - sampleselect_filter(exec, tmp_in, bucket.size, oracles, partial_counts, - bucket.idx, tmp_out); - - rank -= new_bucket.begin; - bucket.size = new_bucket.size; - // we should never need more than 5 recursion steps, this would mean - // 256^5 = 2^40. fall back to standard library algorithm in that case. 
- ++step; - if (step > 5) { - array cpu_out_array{ - exec->get_master(), - make_array_view(exec, bucket.size, tmp_out)}; - auto begin = cpu_out_array.get_data(); - auto end = begin + bucket.size; - auto middle = begin + rank; - std::nth_element(begin, middle, end); - threshold = *middle; - return; - } - } - - // base case - auto out_ptr = reinterpret_cast(tmp1.get_data()); - kernel::basecase_select<<<1, kernel::basecase_block_size, 0, - exec->get_stream()>>>( - as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr)); - threshold = exec->copy_val_to_host(out_ptr); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL); - - -} // namespace par_ilut_factorization -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/factorization/par_ilut_spgeam_kernels.cu b/cuda/factorization/par_ilut_spgeam_kernels.cu deleted file mode 100644 index 7f59e4edc37..00000000000 --- a/cuda/factorization/par_ilut_spgeam_kernels.cu +++ /dev/null @@ -1,159 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ilut_kernels.hpp" - - -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/coo_builder.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/merging.cuh" -#include "cuda/components/prefix_sum.cuh" -#include "cuda/components/searching.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The parallel ILUT factorization namespace. 
- * - * @ingroup factor - */ -namespace par_ilut_factorization { - - -constexpr int default_block_size = 512; - - -// subwarp sizes for add_candidates kernels -using compiled_kernels = - syn::value_list; - - -#include "common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc" - - -namespace { - - -template -void add_candidates(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* lu, - const matrix::Csr* a, - const matrix::Csr* l, - const matrix::Csr* u, - matrix::Csr* l_new, - matrix::Csr* u_new) -{ - auto num_rows = static_cast(lu->get_size()[0]); - auto subwarps_per_block = default_block_size / subwarp_size; - auto num_blocks = ceildiv(num_rows, subwarps_per_block); - matrix::CsrBuilder l_new_builder(l_new); - matrix::CsrBuilder u_new_builder(u_new); - auto lu_row_ptrs = lu->get_const_row_ptrs(); - auto lu_col_idxs = lu->get_const_col_idxs(); - auto lu_vals = lu->get_const_values(); - auto a_row_ptrs = a->get_const_row_ptrs(); - auto a_col_idxs = a->get_const_col_idxs(); - auto a_vals = a->get_const_values(); - auto l_row_ptrs = l->get_const_row_ptrs(); - auto l_col_idxs = l->get_const_col_idxs(); - auto l_vals = l->get_const_values(); - auto u_row_ptrs = u->get_const_row_ptrs(); - auto u_col_idxs = u->get_const_col_idxs(); - auto u_vals = u->get_const_values(); - auto l_new_row_ptrs = l_new->get_row_ptrs(); - auto u_new_row_ptrs = u_new->get_row_ptrs(); - if (num_blocks > 0) { - // count non-zeros per row - kernel::tri_spgeam_nnz - <<get_stream()>>>( - lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs, - l_new_row_ptrs, u_new_row_ptrs, num_rows); - } - - // build row ptrs - components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1); - components::prefix_sum_nonnegative(exec, u_new_row_ptrs, num_rows + 1); - - // resize output arrays - auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); - auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows); - l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); - 
l_new_builder.get_value_array().resize_and_reset(l_new_nnz); - u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz); - u_new_builder.get_value_array().resize_and_reset(u_new_nnz); - - auto l_new_col_idxs = l_new->get_col_idxs(); - auto l_new_vals = l_new->get_values(); - auto u_new_col_idxs = u_new->get_col_idxs(); - auto u_new_vals = u_new->get_values(); - - if (num_blocks > 0) { - // fill columns and values - kernel::tri_spgeam_init - <<get_stream()>>>( - lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs, - a_col_idxs, as_device_type(a_vals), l_row_ptrs, l_col_idxs, - as_device_type(l_vals), u_row_ptrs, u_col_idxs, - as_device_type(u_vals), l_new_row_ptrs, l_new_col_idxs, - as_device_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs, - as_device_type(u_new_vals), num_rows); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); - - -} // namespace - - -template -void add_candidates(std::shared_ptr exec, - const matrix::Csr* lu, - const matrix::Csr* a, - const matrix::Csr* l, - const matrix::Csr* u, - matrix::Csr* l_new, - matrix::Csr* u_new) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = - lu->get_num_stored_elements() + a->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_add_candidates( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, lu, a, l, u, l_new, - u_new); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); - - -} // namespace par_ilut_factorization -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/factorization/par_ilut_sweep_kernels.cu b/cuda/factorization/par_ilut_sweep_kernels.cu deleted file mode 100644 index 5ec8dd81325..00000000000 --- a/cuda/factorization/par_ilut_sweep_kernels.cu +++ /dev/null @@ -1,123 +0,0 
@@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ilut_kernels.hpp" - - -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/coo_builder.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/math.hpp" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/memory.cuh" -#include "cuda/components/merging.cuh" -#include "cuda/components/prefix_sum.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/searching.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The parallel ILUT factorization namespace. - * - * @ingroup factor - */ -namespace par_ilut_factorization { - - -constexpr int default_block_size = 512; - - -// subwarp sizes for all warp-parallel kernels (filter, add_candidates) -using compiled_kernels = - syn::value_list; - - -#include "common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc" - - -namespace { - - -template -void compute_l_u_factors(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* a, - matrix::Csr* l, - const matrix::Coo* l_coo, - matrix::Csr* u, - const matrix::Coo* u_coo, - matrix::Csr* u_csc) -{ - auto total_nnz = static_cast(l->get_num_stored_elements() + - u->get_num_stored_elements()); - auto block_size = default_block_size / subwarp_size; - auto num_blocks = ceildiv(total_nnz, block_size); - if (num_blocks > 0) { - kernel::sweep - <<get_stream()>>>( - a->get_const_row_ptrs(), a->get_const_col_idxs(), - as_device_type(a->get_const_values()), l->get_const_row_ptrs(), - l_coo->get_const_row_idxs(), l->get_const_col_idxs(), - as_device_type(l->get_values()), - 
static_cast(l->get_num_stored_elements()), - u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(), - as_device_type(u->get_values()), u_csc->get_const_row_ptrs(), - u_csc->get_const_col_idxs(), - as_device_type(u_csc->get_values()), - static_cast(u->get_num_stored_elements())); - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors, - compute_l_u_factors); - - -} // namespace - - -template -void compute_l_u_factors(std::shared_ptr exec, - const matrix::Csr* a, - matrix::Csr* l, - const matrix::Coo* l_coo, - matrix::Csr* u, - const matrix::Coo* u_coo, - matrix::Csr* u_csc) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = - l->get_num_stored_elements() + u->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_compute_l_u_factors( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo, - u_csc); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL); - - -} // namespace par_ilut_factorization -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/log/batch_logger.cuh b/cuda/log/batch_logger.cuh deleted file mode 100644 index 3e53d6ef0a6..00000000000 --- a/cuda/log/batch_logger.cuh +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_LOG_BATCH_LOGGER_CUH_ -#define GKO_CUDA_LOG_BATCH_LOGGER_CUH_ - - -#include - - -namespace gko { -namespace kernels { -namespace cuda { -namespace batch_log { - - -#include "common/cuda_hip/log/batch_logger.hpp.inc" - - -} // namespace batch_log -} // namespace cuda -} // namespace kernels -} // namespace gko - - -#endif // GKO_CUDA_LOG_BATCH_LOGGER_CUH_ diff --git a/cuda/matrix/batch_csr_kernels.cu 
b/cuda/matrix/batch_csr_kernels.cu deleted file mode 100644 index 0d7da274ca8..00000000000 --- a/cuda/matrix/batch_csr_kernels.cu +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_csr_kernels.hpp" - - -#include - - -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "cuda/base/batch_struct.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" -#include "cuda/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Csr matrix format namespace. - * @ref Csr - * @ingroup batch_csr - */ -namespace batch_csr { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_csr -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu deleted file mode 100644 index ea10d088b32..00000000000 --- a/cuda/matrix/batch_dense_kernels.cu +++ /dev/null @@ -1,59 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_dense_kernels.hpp" - - -#include - - -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "cuda/base/batch_struct.hpp" 
-#include "cuda/base/config.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" -#include "cuda/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Dense matrix format namespace. - * - * @ingroup batch_dense - */ -namespace batch_dense { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" - - -// clang-format on - - -} // namespace batch_dense -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu deleted file mode 100644 index 15d6d6bbd5b..00000000000 --- a/cuda/matrix/batch_ell_kernels.cu +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_ell_kernels.hpp" - - -#include - - -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "cuda/base/batch_struct.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" -#include "cuda/matrix/batch_struct.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Ell matrix format namespace. 
- * @ref Ell - * @ingroup batch_ell - */ -namespace batch_ell { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_ell -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/coo_kernels.cu b/cuda/matrix/coo_kernels.cu deleted file mode 100644 index 38df6a91c9f..00000000000 --- a/cuda/matrix/coo_kernels.cu +++ /dev/null @@ -1,49 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/coo_kernels.hpp" - - -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cusparse_bindings.hpp" -#include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/format_conversion.cuh" -#include "cuda/components/segment_scan.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Coordinate matrix format namespace. 
- * - * @ingroup coo - */ -namespace coo { - - -constexpr int warps_in_block = 4; -constexpr int spmv_block_size = warps_in_block * config::warp_size; - - -#include "common/cuda_hip/matrix/coo_kernels.hpp.inc" - - -} // namespace coo -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu deleted file mode 100644 index 2d159282e31..00000000000 --- a/cuda/matrix/dense_kernels.cu +++ /dev/null @@ -1,232 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/dense_kernels.hpp" - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/utils.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/base/pointer_mode_guard.hpp" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/intrinsics.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Dense matrix format namespace. 
- * - * @ingroup dense - */ -namespace dense { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/matrix/dense_kernels.hpp.inc" - - -template -void compute_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result, array& tmp) -{ - if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - blas::dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), y->get_stride(), - result->get_values()); - } else { - compute_dot(exec, x, y, result, tmp); - } - } else { - compute_dot(exec, x, y, result, tmp); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); - - -template -void compute_conj_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result, - array& tmp) -{ - if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - blas::conj_dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), - y->get_stride(), result->get_values()); - } else { - compute_conj_dot(exec, x, y, result, tmp); - } - } else { - compute_conj_dot(exec, x, y, result, tmp); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); - - -template -void compute_norm2_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result, - array& tmp) -{ - if (x->get_size()[1] == 1) { - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - blas::norm2(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), result->get_values()); - } else { - compute_norm2(exec, x, result, tmp); - } - } else { - compute_norm2(exec, x, result, tmp); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); 
- - -template -void simple_apply(std::shared_ptr exec, - const matrix::Dense* a, - const matrix::Dense* b, - matrix::Dense* c) -{ - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { - if (a->get_size()[1] > 0) { - blas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1], - c->get_size()[0], a->get_size()[1], &alpha, - b->get_const_values(), b->get_stride(), - a->get_const_values(), a->get_stride(), &beta, - c->get_values(), c->get_stride()); - } else { - dense::fill(exec, c, zero()); - } - } - } else { - GKO_NOT_IMPLEMENTED; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); - - -template -void apply(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Dense* a, const matrix::Dense* b, - const matrix::Dense* beta, matrix::Dense* c) -{ - if (blas::is_supported::value) { - if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { - if (a->get_size()[1] > 0) { - blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N, - c->get_size()[1], c->get_size()[0], a->get_size()[1], - alpha->get_const_values(), b->get_const_values(), - b->get_stride(), a->get_const_values(), - a->get_stride(), beta->get_const_values(), - c->get_values(), c->get_stride()); - } else { - dense::scale(exec, beta, c); - } - } - } else { - GKO_NOT_IMPLEMENTED; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); - - -template -void transpose(std::shared_ptr exec, - const matrix::Dense* orig, - matrix::Dense* trans) -{ - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { - blas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0], - orig->get_size()[1], &alpha, orig->get_const_values(), - 
orig->get_stride(), &beta, trans->get_const_values(), - trans->get_stride(), trans->get_values(), - trans->get_stride()); - } - } else { - GKO_NOT_IMPLEMENTED; - } -}; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); - - -template -void conj_transpose(std::shared_ptr exec, - const matrix::Dense* orig, - matrix::Dense* trans) -{ - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { - blas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0], - orig->get_size()[1], &alpha, orig->get_const_values(), - orig->get_stride(), &beta, trans->get_const_values(), - trans->get_stride(), trans->get_values(), - trans->get_stride()); - } - } else { - GKO_NOT_IMPLEMENTED; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); - - -} // namespace dense -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/diagonal_kernels.cu b/cuda/matrix/diagonal_kernels.cu deleted file mode 100644 index 7eaa35a638a..00000000000 --- a/cuda/matrix/diagonal_kernels.cu +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/diagonal_kernels.hpp" - - -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Diagonal matrix format namespace. 
- * - * @ingroup diagonal - */ -namespace diagonal { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/matrix/diagonal_kernels.hpp.inc" - - -} // namespace diagonal -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/fbcsr_kernels.template.cu b/cuda/matrix/fbcsr_kernels.template.cu deleted file mode 100644 index 3ad7f106049..00000000000 --- a/cuda/matrix/fbcsr_kernels.template.cu +++ /dev/null @@ -1,303 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/fbcsr_kernels.hpp" - - -#include - - -#include -#include -#include -#include -#include -#include -#include - - -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "common/unified/base/kernel_launch.hpp" -#include "core/base/array_access.hpp" -#include "core/base/block_sizes.hpp" -#include "core/base/device_matrix_data_kernels.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "core/components/format_conversion_kernels.hpp" -#include "core/matrix/csr_lookup.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cublas_bindings.hpp" -#include "cuda/base/cusparse_bindings.hpp" -#include "cuda/base/cusparse_block_bindings.hpp" -#include "cuda/base/math.hpp" -#include "cuda/base/pointer_mode_guard.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" -#include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/merging.cuh" -#include "cuda/components/prefix_sum.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { - - -/** - * @brief The fixed-size block compressed sparse row matrix 
format namespace. - * - * @ingroup fbcsr - */ -namespace fbcsr { - - -constexpr int default_block_size{512}; - - -#include "common/cuda_hip/matrix/csr_common.hpp.inc" -#include "common/cuda_hip/matrix/fbcsr_kernels.hpp.inc" - - -namespace { - - -template -void dense_transpose(std::shared_ptr exec, - const size_type nrows, const size_type ncols, - const size_type orig_stride, const ValueType* const orig, - const size_type trans_stride, ValueType* const trans) -{ - if (nrows == 0) { - return; - } - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - { - blas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig, - orig_stride, &beta, trans, trans_stride, trans, - trans_stride); - } - } else { - GKO_NOT_IMPLEMENTED; - } -} - - -} // namespace - - -template -void spmv(std::shared_ptr exec, - const matrix::Fbcsr* const a, - const matrix::Dense* const b, - matrix::Dense* const c) -{ - if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { - // empty output: nothing to do - return; - } - if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { - // empty input: fill output with zero - dense::fill(exec, c, zero()); - return; - } - if (sparselib::is_supported::value) { - auto handle = exec->get_sparselib_handle(); - sparselib::pointer_mode_guard pm_guard(handle); - const auto alpha = one(); - const auto beta = zero(); - auto descr = sparselib::create_mat_descr(); - const auto row_ptrs = a->get_const_row_ptrs(); - const auto col_idxs = a->get_const_col_idxs(); - const auto values = a->get_const_values(); - const int bs = a->get_block_size(); - const IndexType mb = a->get_num_block_rows(); - const IndexType nb = a->get_num_block_cols(); - const auto nnzb = static_cast(a->get_num_stored_blocks()); - const auto nrhs = static_cast(b->get_size()[1]); - const auto nrows = a->get_size()[0]; - const auto ncols = a->get_size()[1]; - const auto in_stride = 
b->get_stride(); - const auto out_stride = c->get_stride(); - if (nrhs == 1 && in_stride == 1 && out_stride == 1) { - sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, - nnzb, &alpha, descr, values, row_ptrs, col_idxs, - bs, b->get_const_values(), &beta, c->get_values()); - } else { - const auto trans_stride = nrows; - auto trans_c = array(exec, nrows * nrhs); - sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, - SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, - &alpha, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), in_stride, &beta, - trans_c.get_data(), trans_stride); - dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), - out_stride, c->get_values()); - } - sparselib::destroy(descr); - } else { - GKO_NOT_IMPLEMENTED; - } -} - - -template -void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* const alpha, - const matrix::Fbcsr* const a, - const matrix::Dense* const b, - const matrix::Dense* const beta, - matrix::Dense* const c) -{ - if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { - // empty output: nothing to do - return; - } - if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { - // empty input: scale output - dense::scale(exec, beta, c); - return; - } - if (sparselib::is_supported::value) { - auto handle = exec->get_sparselib_handle(); - const auto alphp = alpha->get_const_values(); - const auto betap = beta->get_const_values(); - auto descr = sparselib::create_mat_descr(); - const auto row_ptrs = a->get_const_row_ptrs(); - const auto col_idxs = a->get_const_col_idxs(); - const auto values = a->get_const_values(); - const int bs = a->get_block_size(); - const IndexType mb = a->get_num_block_rows(); - const IndexType nb = a->get_num_block_cols(); - const auto nnzb = static_cast(a->get_num_stored_blocks()); - const auto nrhs = static_cast(b->get_size()[1]); - const auto nrows = a->get_size()[0]; - const auto ncols = a->get_size()[1]; - const auto in_stride = 
b->get_stride(); - const auto out_stride = c->get_stride(); - if (nrhs == 1 && in_stride == 1 && out_stride == 1) { - sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, - nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), betap, c->get_values()); - } else { - const auto trans_stride = nrows; - auto trans_c = array(exec, nrows * nrhs); - dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(), - trans_stride, trans_c.get_data()); - sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, - SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, - alphp, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), in_stride, betap, - trans_c.get_data(), trans_stride); - dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), - out_stride, c->get_values()); - } - sparselib::destroy(descr); - } else { - GKO_NOT_IMPLEMENTED; - } -} - - -namespace { - - -template -void transpose_blocks_impl(syn::value_list, - std::shared_ptr exec, - matrix::Fbcsr* const mat) -{ - constexpr int subwarp_size = config::warp_size; - const auto nbnz = mat->get_num_stored_blocks(); - const auto numthreads = nbnz * subwarp_size; - const auto block_size = default_block_size; - const auto grid_dim = ceildiv(numthreads, block_size); - if (grid_dim > 0) { - kernel::transpose_blocks - <<get_stream()>>>( - nbnz, mat->get_values()); - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks, - transpose_blocks_impl); - - -} // namespace - - -template -void transpose(const std::shared_ptr exec, - const matrix::Fbcsr* const orig, - matrix::Fbcsr* const trans) -{ -#ifdef GKO_COMPILING_CUDA - if (sparselib::is_supported::value) { - const int bs = orig->get_block_size(); - const IndexType nnzb = - static_cast(orig->get_num_stored_blocks()); - cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; - cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; - const IndexType buffer_size = sparselib::bsr_transpose_buffersize( - 
exec->get_sparselib_handle(), orig->get_num_block_rows(), - orig->get_num_block_cols(), nnzb, orig->get_const_values(), - orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs); - array buffer_array(exec, buffer_size); - auto buffer = buffer_array.get_data(); - sparselib::bsr_transpose( - exec->get_sparselib_handle(), orig->get_num_block_rows(), - orig->get_num_block_cols(), nnzb, orig->get_const_values(), - orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs, - trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(), - copyValues, idxBase, buffer); - - // transpose blocks - select_transpose_blocks( - fixedblock::compiled_kernels(), - [bs](int compiled_block_size) { return bs == compiled_block_size; }, - syn::value_list(), syn::type_list<>(), exec, trans); - } else -#endif - { - fallback_transpose(exec, orig, trans); - } -} - - -template -void conj_transpose(std::shared_ptr exec, - const matrix::Fbcsr* orig, - matrix::Fbcsr* trans) -{ - const int grid_size = - ceildiv(trans->get_num_stored_elements(), default_block_size); - transpose(exec, orig, trans); - if (grid_size > 0 && is_complex()) { - kernel:: - conjugate<<get_stream()>>>( - trans->get_num_stored_elements(), - as_device_type(trans->get_values())); - } -} - - -} // namespace fbcsr -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu deleted file mode 100644 index 4dcc756a186..00000000000 --- a/cuda/matrix/sellp_kernels.cu +++ /dev/null @@ -1,44 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/sellp_kernels.hpp" - - -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cusparse_bindings.hpp" -#include "cuda/base/types.hpp" -#include 
"cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The SELL-P matrix format namespace. - * - * @ingroup sellp - */ -namespace sellp { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/matrix/sellp_kernels.hpp.inc" - - -} // namespace sellp -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu deleted file mode 100644 index 8176581859b..00000000000 --- a/cuda/matrix/sparsity_csr_kernels.cu +++ /dev/null @@ -1,226 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/sparsity_csr_kernels.hpp" - - -#include - - -#include - - -#include "accessor/cuda_helper.hpp" -#include "accessor/reduced_row_major.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/mixed_precision_types.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "core/components/format_conversion_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/cusparse_bindings.hpp" -#include "cuda/base/math.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Compressed sparse row matrix format namespace. 
- * - * @ingroup sparsity - */ -namespace sparsity_csr { - - -constexpr int classical_oversubscription = 32; -constexpr int default_block_size = 512; -#ifdef GKO_COMPILING_HIP -constexpr int spmv_block_size = 256; -#else -constexpr int spmv_block_size = 128; -#endif -constexpr int warps_in_block = 4; - - -using classical_kernels = syn::value_list; - - -#include "common/cuda_hip/matrix/csr_common.hpp.inc" -#include "common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc" - - -namespace host_kernel { - - -template -void classical_spmv(syn::value_list, - std::shared_ptr exec, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - matrix::Dense* c, - const matrix::Dense* alpha = nullptr, - const matrix::Dense* beta = nullptr) -{ - using arithmetic_type = - highest_precision; - using input_accessor = - gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>; - using output_accessor = - gko::acc::reduced_row_major<2, arithmetic_type, OutputValueType>; - - const auto nwarps = exec->get_num_warps_per_sm() * - exec->get_num_multiprocessor() * - classical_oversubscription; - const auto gridx = - std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size), - int64(nwarps / warps_in_block)); - const dim3 grid(gridx, b->get_size()[1]); - const auto block = spmv_block_size; - - const auto b_vals = gko::acc::range( - std::array{ - {static_cast(b->get_size()[0]), - static_cast(b->get_size()[1])}}, - b->get_const_values(), - std::array{ - {static_cast(b->get_stride())}}); - auto c_vals = gko::acc::range( - std::array{ - {static_cast(c->get_size()[0]), - static_cast(c->get_size()[1])}}, - c->get_values(), - std::array{ - {static_cast(c->get_stride())}}); - if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { - // empty output: nothing to do - return; - } - if (alpha == nullptr && beta == nullptr) { - kernel::abstract_classical_spmv - <<get_stream()>>>( - a->get_size()[0], as_device_type(a->get_const_value()), - a->get_const_col_idxs(), - 
as_device_type(a->get_const_row_ptrs()), - acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals)); - } else if (alpha != nullptr && beta != nullptr) { - kernel::abstract_classical_spmv - <<get_stream()>>>( - a->get_size()[0], as_device_type(alpha->get_const_values()), - as_device_type(a->get_const_value()), a->get_const_col_idxs(), - as_device_type(a->get_const_row_ptrs()), - acc::as_cuda_range(b_vals), - as_device_type(beta->get_const_values()), - acc::as_cuda_range(c_vals)); - } else { - GKO_KERNEL_NOT_FOUND; - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv); - - -} // namespace host_kernel - -template -void spmv(std::shared_ptr exec, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - matrix::Dense* c) -{ - host_kernel::select_classical_spmv( - classical_kernels(), [](int compiled_info) { return true; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); -} - -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL); - - -template -void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::SparsityCsr* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) -{ - host_kernel::select_classical_spmv( - classical_kernels(), [](int compiled_info) { return true; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, beta); -} - -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL); - - -template -void sort_by_column_index(std::shared_ptr exec, - matrix::SparsityCsr* to_sort) -{ - const auto nnz = static_cast(to_sort->get_num_nonzeros()); - const auto num_rows = static_cast(to_sort->get_size()[0]); - const auto num_cols = static_cast(to_sort->get_size()[1]); - const auto row_ptrs = to_sort->get_const_row_ptrs(); - const auto col_idxs = to_sort->get_col_idxs(); - if (sparselib::is_supported::value) { - const auto handle = exec->get_sparselib_handle(); - auto descr = 
sparselib::create_mat_descr(); - array permutation_array(exec, to_sort->get_num_nonzeros()); - auto permutation = permutation_array.get_data(); - components::fill_seq_array(exec, permutation, - to_sort->get_num_nonzeros()); - size_type buffer_size{}; - sparselib::csrsort_buffer_size(handle, num_rows, num_cols, nnz, - row_ptrs, col_idxs, buffer_size); - array buffer_array{exec, buffer_size}; - auto buffer = buffer_array.get_data(); - sparselib::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs, - col_idxs, permutation, buffer); - sparselib::destroy(descr); - } else { - fallback_sort(exec, to_sort); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX); - - -template -void is_sorted_by_column_index( - std::shared_ptr exec, - const matrix::SparsityCsr* to_check, bool* is_sorted) -{ - *is_sorted = true; - auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted); - auto gpu_array = array{exec, cpu_array}; - const auto num_rows = static_cast(to_check->get_size()[0]); - auto num_blocks = ceildiv(num_rows, default_block_size); - if (num_blocks > 0) { - kernel::check_unsorted<<get_stream()>>>( - to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(), - num_rows, gpu_array.get_data()); - } - cpu_array = gpu_array; -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX); - - -} // namespace sparsity_csr -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/multigrid/pgm_kernels.cu b/cuda/multigrid/pgm_kernels.cu deleted file mode 100644 index 1b3915c82e9..00000000000 --- a/cuda/multigrid/pgm_kernels.cu +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/multigrid/pgm_kernels.hpp" - - -#include - - -#include -#include -#include -#include -#include - - -#include -#include - - -#include "cuda/base/thrust.cuh" -#include 
"cuda/base/types.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The PGM solver namespace. - * - * @ingroup pgm - */ -namespace pgm { - - -#include "common/cuda_hip/multigrid/pgm_kernels.hpp.inc" - - -} // namespace pgm -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/preconditioner/isai_kernels.cu b/cuda/preconditioner/isai_kernels.cu deleted file mode 100644 index 0912b4c25f5..00000000000 --- a/cuda/preconditioner/isai_kernels.cu +++ /dev/null @@ -1,49 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/preconditioner/isai_kernels.hpp" - - -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/csr_builder.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/merging.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" -#include "cuda/components/warp_blas.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Isai preconditioner namespace. 
- * @ref Isai - * @ingroup isai - */ -namespace isai { - - -constexpr int subwarp_size{row_size_limit}; -constexpr int subwarps_per_block{2}; -constexpr int default_block_size{subwarps_per_block * subwarp_size}; - - -#include "common/cuda_hip/preconditioner/isai_kernels.hpp.inc" - - -} // namespace isai -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu deleted file mode 100644 index dfe8d042b29..00000000000 --- a/cuda/preconditioner/jacobi_kernels.cu +++ /dev/null @@ -1,51 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/preconditioner/jacobi_kernels.hpp" - - -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/extended_float.hpp" -#include "core/preconditioner/jacobi_utils.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/preconditioner/jacobi_common.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Jacobi preconditioner namespace. 
- * @ref Jacobi - * @ingroup jacobi - */ -namespace jacobi { - - -// a total of 32/16 warps (1024 threads) -#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC -constexpr int default_num_warps = 16; -#else // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC -constexpr int default_num_warps = 32; -#endif -// with current architectures, at most 32 warps can be scheduled per SM (and -// current GPUs have at most 84 SMs) -constexpr int default_grid_size = 32 * 32 * 128; - - -#include "common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc" - - -} // namespace jacobi -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/reorder/rcm_kernels.cu b/cuda/reorder/rcm_kernels.cu deleted file mode 100644 index d699d00dfb6..00000000000 --- a/cuda/reorder/rcm_kernels.cu +++ /dev/null @@ -1,54 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/reorder/rcm_kernels.hpp" - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include -#include -#include -#include -#include -#include - - -#include "core/base/array_access.hpp" -#include "cuda/base/thrust.cuh" -#include "cuda/components/memory.cuh" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The reordering namespace. 
- * - * @ingroup reorder - */ -namespace rcm { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/reorder/rcm_kernels.hpp.inc" - - -} // namespace rcm -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu deleted file mode 100644 index 3cbe036f55f..00000000000 --- a/cuda/solver/cb_gmres_kernels.cu +++ /dev/null @@ -1,507 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/solver/cb_gmres_kernels.hpp" - - -#include - - -#include -#include -#include -#include - - -#include "accessor/cuda_helper.hpp" -#include "accessor/range.hpp" -#include "accessor/reduced_row_major.hpp" -#include "accessor/scaled_reduced_row_major.hpp" -#include "core/base/array_access.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "core/solver/cb_gmres_accessor.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/atomic.cuh" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/reduction.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The CB_GMRES solver namespace. - * - * @ingroup cb_gmres - */ -namespace cb_gmres { - - -constexpr int default_block_size = 512; -// default_dot_dim can not be 64 in hip because 64 * 64 exceeds their max block -// size limit. 
-constexpr int default_dot_dim = 32; -constexpr int default_dot_size = default_dot_dim * default_dot_dim; - - -#include "common/cuda_hip/solver/cb_gmres_kernels.hpp.inc" - - -template -void zero_matrix(std::shared_ptr exec, size_type m, - size_type n, size_type stride, ValueType* array) -{ - const auto block_size = default_block_size; - const auto grid_size = ceildiv(n, block_size); - zero_matrix_kernel<<get_stream()>>>( - m, n, stride, as_device_type(array)); -} - - -template -void initialize(std::shared_ptr exec, - const matrix::Dense* b, - matrix::Dense* residual, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - array* stop_status, size_type krylov_dim) -{ - const auto num_threads = std::max(b->get_size()[0] * b->get_stride(), - krylov_dim * b->get_size()[1]); - const auto grid_dim = ceildiv(num_threads, default_block_size); - const auto block_dim = default_block_size; - constexpr auto block_size = default_block_size; - - initialize_kernel - <<get_stream()>>>( - b->get_size()[0], b->get_size()[1], krylov_dim, - as_device_type(b->get_const_values()), b->get_stride(), - as_device_type(residual->get_values()), residual->get_stride(), - as_device_type(givens_sin->get_values()), givens_sin->get_stride(), - as_device_type(givens_cos->get_values()), givens_cos->get_stride(), - as_device_type(stop_status->get_data())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); - - -template -void restart(std::shared_ptr exec, - const matrix::Dense* residual, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense>* arnoldi_norm, - Accessor3d krylov_bases, - matrix::Dense* next_krylov_basis, - array* final_iter_nums, array& reduction_tmp, - size_type krylov_dim) -{ - constexpr bool use_scalar = - gko::cb_gmres::detail::has_3d_scaled_accessor::value; - const auto num_rows = residual->get_size()[0]; - const auto num_rhs = residual->get_size()[1]; - const auto krylov_stride = - 
gko::cb_gmres::helper_functions_accessor::get_stride( - krylov_bases); - const auto grid_dim_1 = - ceildiv((krylov_dim + 1) * krylov_stride[0], default_block_size); - const auto block_dim = default_block_size; - constexpr auto block_size = default_block_size; - const auto stride_arnoldi = arnoldi_norm->get_stride(); - - restart_1_kernel - <<get_stream()>>>( - residual->get_size()[0], residual->get_size()[1], krylov_dim, - acc::as_cuda_range(krylov_bases), - as_device_type(residual_norm_collection->get_values()), - residual_norm_collection->get_stride()); - kernels::cuda::dense::compute_norm2_dispatch(exec, residual, residual_norm, - reduction_tmp); - - if (use_scalar) { - components::fill_array(exec, - arnoldi_norm->get_values() + 2 * stride_arnoldi, - num_rhs, zero>()); - const dim3 grid_size_nrm(ceildiv(num_rhs, default_dot_dim), - exec->get_num_multiprocessor() * 2); - const dim3 block_size_nrm(default_dot_dim, default_dot_dim); - multinorminf_without_stop_kernel<<get_stream()>>>( - num_rows, num_rhs, as_device_type(residual->get_const_values()), - residual->get_stride(), - as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), 0); - } - - if (gko::cb_gmres::detail::has_3d_scaled_accessor::value) { - set_scalar_kernel - <<get_stream()>>>( - num_rhs, krylov_dim + 1, - as_device_type(residual_norm->get_const_values()), - residual_norm->get_stride(), - as_device_type(arnoldi_norm->get_const_values() + - 2 * stride_arnoldi), - stride_arnoldi, acc::as_cuda_range(krylov_bases)); - } - - const auto grid_dim_2 = - ceildiv(std::max(num_rows, 1) * krylov_stride[1], - default_block_size); - restart_2_kernel - <<get_stream()>>>( - residual->get_size()[0], residual->get_size()[1], - as_device_type(residual->get_const_values()), - residual->get_stride(), - as_device_type(residual_norm->get_const_values()), - as_device_type(residual_norm_collection->get_values()), - acc::as_cuda_range(krylov_bases), - as_device_type(next_krylov_basis->get_values()), - 
next_krylov_basis->get_stride(), - as_device_type(final_iter_nums->get_data())); -} - -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL); - - -template -void finish_arnoldi_CGS(std::shared_ptr exec, - matrix::Dense* next_krylov_basis, - Accessor3dim krylov_bases, - matrix::Dense* hessenberg_iter, - matrix::Dense* buffer_iter, - matrix::Dense>* arnoldi_norm, - size_type iter, const stopping_status* stop_status, - stopping_status* reorth_status, - array* num_reorth) -{ - const auto dim_size = next_krylov_basis->get_size(); - if (dim_size[1] == 0) { - return; - } - using non_complex = remove_complex; - // optimization parameter - constexpr int singledot_block_size = default_dot_dim; - constexpr bool use_scalar = - gko::cb_gmres::detail::has_3d_scaled_accessor::value; - const auto stride_next_krylov = next_krylov_basis->get_stride(); - const auto stride_hessenberg = hessenberg_iter->get_stride(); - const auto stride_buffer = buffer_iter->get_stride(); - const auto stride_arnoldi = arnoldi_norm->get_stride(); - const dim3 grid_size(ceildiv(dim_size[1], default_dot_dim), - exec->get_num_multiprocessor() * 2); - const dim3 grid_size_num_iters(ceildiv(dim_size[1], default_dot_dim), - exec->get_num_multiprocessor() * 2, - iter + 1); - const dim3 block_size(default_dot_dim, default_dot_dim); - // Note: having iter first (instead of row_idx information) is likely - // beneficial for avoiding atomic_add conflicts, but that needs - // further investigation. 
- const dim3 grid_size_iters_single(exec->get_num_multiprocessor() * 2, - iter + 1); - const auto block_size_iters_single = singledot_block_size; - size_type num_reorth_host; - - components::fill_array(exec, arnoldi_norm->get_values(), dim_size[1], - zero()); - multinorm2_kernel<<get_stream()>>>( - dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, as_device_type(arnoldi_norm->get_values()), - as_device_type(stop_status)); - // nrmP = norm(next_krylov_basis) - zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg, - hessenberg_iter->get_values()); - if (dim_size[1] > 1) { - multidot_kernel - <<get_stream()>>>( - dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_cuda_range(krylov_bases), - as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, as_device_type(stop_status)); - } else { - singledot_kernel - <<get_stream()>>>( - dim_size[0], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_cuda_range(krylov_bases), - as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, as_device_type(stop_status)); - } - // for i in 1:iter - // hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i) - // end - update_next_krylov_kernel - <<get_stream()>>>( - iter + 1, dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_values()), stride_next_krylov, - acc::as_cuda_range(krylov_bases), - as_device_type(hessenberg_iter->get_const_values()), - stride_hessenberg, as_device_type(stop_status)); - - // for i in 1:iter - // next_krylov_basis -= hessenberg(iter, i) * krylov_bases(:, i) - // end - components::fill_array(exec, arnoldi_norm->get_values() + stride_arnoldi, - dim_size[1], zero()); - if (use_scalar) { - components::fill_array(exec, - arnoldi_norm->get_values() + 2 * stride_arnoldi, - dim_size[1], zero()); - } - multinorm2_inf_kernel - <<get_stream()>>>( - dim_size[0], dim_size[1], 
- as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, - as_device_type(arnoldi_norm->get_values() + stride_arnoldi), - as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), - as_device_type(stop_status)); - // nrmN = norm(next_krylov_basis) - components::fill_array(exec, num_reorth->get_data(), 1, zero()); - check_arnoldi_norms - <<get_stream()>>>( - dim_size[1], as_device_type(arnoldi_norm->get_values()), - stride_arnoldi, as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, iter + 1, acc::as_cuda_range(krylov_bases), - as_device_type(stop_status), as_device_type(reorth_status), - as_device_type(num_reorth->get_data())); - num_reorth_host = get_element(*num_reorth, 0); - // num_reorth_host := number of next_krylov vector to be reorthogonalization - for (size_type l = 1; (num_reorth_host > 0) && (l < 3); l++) { - zero_matrix(exec, iter + 1, dim_size[1], stride_buffer, - buffer_iter->get_values()); - if (dim_size[1] > 1) { - multidot_kernel - <<get_stream()>>>( - dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_cuda_range(krylov_bases), - as_device_type(buffer_iter->get_values()), stride_buffer, - as_device_type(stop_status)); - } else { - singledot_kernel - <<get_stream()>>>( - dim_size[0], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_cuda_range(krylov_bases), - as_device_type(buffer_iter->get_values()), stride_buffer, - as_device_type(stop_status)); - } - // for i in 1:iter - // hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i) - // end - update_next_krylov_and_add_kernel - <<get_stream()>>>( - iter + 1, dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_values()), - stride_next_krylov, acc::as_cuda_range(krylov_bases), - as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, - as_device_type(buffer_iter->get_const_values()), stride_buffer, - as_device_type(stop_status), 
as_device_type(reorth_status)); - // for i in 1:iter - // next_krylov_basis -= hessenberg(iter, i) * krylov_bases(:, i) - // end - components::fill_array(exec, - arnoldi_norm->get_values() + stride_arnoldi, - dim_size[1], zero()); - if (use_scalar) { - components::fill_array( - exec, arnoldi_norm->get_values() + 2 * stride_arnoldi, - dim_size[1], zero()); - } - multinorm2_inf_kernel - <<get_stream()>>>( - dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, - as_device_type(arnoldi_norm->get_values() + stride_arnoldi), - as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), - as_device_type(stop_status)); - // nrmN = norm(next_krylov_basis) - components::fill_array(exec, num_reorth->get_data(), 1, - zero()); - check_arnoldi_norms - <<get_stream()>>>( - dim_size[1], as_device_type(arnoldi_norm->get_values()), - stride_arnoldi, as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, iter + 1, acc::as_cuda_range(krylov_bases), - as_device_type(stop_status), as_device_type(reorth_status), - num_reorth->get_data()); - num_reorth_host = get_element(*num_reorth, 0); - // num_reorth_host := number of next_krylov vector to be - // reorthogonalization - } - update_krylov_next_krylov_kernel - <<get_stream()>>>( - iter, dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_values()), stride_next_krylov, - acc::as_cuda_range(krylov_bases), - as_device_type(hessenberg_iter->get_const_values()), - stride_hessenberg, as_device_type(stop_status)); - // next_krylov_basis /= hessenberg(iter, iter + 1) - // krylov_bases(:, iter + 1) = next_krylov_basis - // End of arnoldi -} - -template -void givens_rotation(std::shared_ptr exec, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense* hessenberg_iter, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - size_type iter, const array* stop_status) -{ - // TODO: tune block_size for optimal performance - 
constexpr auto block_size = default_block_size; - const auto num_cols = hessenberg_iter->get_size()[1]; - const auto block_dim = block_size; - const auto grid_dim = - static_cast(ceildiv(num_cols, block_size)); - - givens_rotation_kernel - <<get_stream()>>>( - hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1], - iter, as_device_type(hessenberg_iter->get_values()), - hessenberg_iter->get_stride(), - as_device_type(givens_sin->get_values()), givens_sin->get_stride(), - as_device_type(givens_cos->get_values()), givens_cos->get_stride(), - as_device_type(residual_norm->get_values()), - as_device_type(residual_norm_collection->get_values()), - residual_norm_collection->get_stride(), - stop_status->get_const_data()); -} - - -template -void arnoldi(std::shared_ptr exec, - matrix::Dense* next_krylov_basis, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - Accessor3d krylov_bases, matrix::Dense* hessenberg_iter, - matrix::Dense* buffer_iter, - matrix::Dense>* arnoldi_norm, - size_type iter, array* final_iter_nums, - const array* stop_status, - array* reorth_status, - array* num_reorth) -{ - increase_final_iteration_numbers_kernel<<< - static_cast( - ceildiv(final_iter_nums->get_size(), default_block_size)), - default_block_size, 0, exec->get_stream()>>>( - as_device_type(final_iter_nums->get_data()), - stop_status->get_const_data(), final_iter_nums->get_size()); - finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter, - buffer_iter, arnoldi_norm, iter, - stop_status->get_const_data(), reorth_status->get_data(), - num_reorth); - givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter, - residual_norm, residual_norm_collection, iter, stop_status); -} - -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL); - - -template -void solve_upper_triangular( - std::shared_ptr exec, - const matrix::Dense* residual_norm_collection, - const 
matrix::Dense* hessenberg, matrix::Dense* y, - const array* final_iter_nums) -{ - // TODO: tune block_size for optimal performance - constexpr auto block_size = default_block_size; - const auto num_rhs = residual_norm_collection->get_size()[1]; - const auto block_dim = block_size; - const auto grid_dim = - static_cast(ceildiv(num_rhs, block_size)); - - solve_upper_triangular_kernel - <<get_stream()>>>( - hessenberg->get_size()[1], num_rhs, - as_device_type(residual_norm_collection->get_const_values()), - residual_norm_collection->get_stride(), - as_device_type(hessenberg->get_const_values()), - hessenberg->get_stride(), as_device_type(y->get_values()), - y->get_stride(), as_device_type(final_iter_nums->get_const_data())); -} - - -template -void calculate_qy(std::shared_ptr exec, - ConstAccessor3d krylov_bases, size_type num_krylov_bases, - const matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) -{ - const auto num_rows = before_preconditioner->get_size()[0]; - const auto num_cols = before_preconditioner->get_size()[1]; - const auto stride_before_preconditioner = - before_preconditioner->get_stride(); - - constexpr auto block_size = default_block_size; - const auto grid_dim = static_cast( - ceildiv(num_rows * stride_before_preconditioner, block_size)); - const auto block_dim = block_size; - - calculate_Qy_kernel - <<get_stream()>>>( - num_rows, num_cols, acc::as_cuda_range(krylov_bases), - as_device_type(y->get_const_values()), y->get_stride(), - as_device_type(before_preconditioner->get_values()), - stride_before_preconditioner, - as_device_type(final_iter_nums->get_const_data())); - // Calculate qy - // before_preconditioner = krylov_bases * y -} - - -template -void solve_krylov(std::shared_ptr exec, - const matrix::Dense* residual_norm_collection, - ConstAccessor3d krylov_bases, - const matrix::Dense* hessenberg, - matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) -{ - if 
(before_preconditioner->get_size()[1] == 0) { - return; - } - // since hessenberg has dims: iters x iters * num_rhs - // krylov_bases has dims: (iters + 1) x sysmtx[0] x num_rhs - const auto iters = - hessenberg->get_size()[1] / before_preconditioner->get_size()[1]; - const auto num_krylov_bases = iters + 1; - solve_upper_triangular(exec, residual_norm_collection, hessenberg, y, - final_iter_nums); - calculate_qy(exec, krylov_bases, num_krylov_bases, y, before_preconditioner, - final_iter_nums); -} - -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE( - GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL); - - -} // namespace cb_gmres -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/solver/multigrid_kernels.cu b/cuda/solver/multigrid_kernels.cu deleted file mode 100644 index eaa913aa064..00000000000 --- a/cuda/solver/multigrid_kernels.cu +++ /dev/null @@ -1,41 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/solver/multigrid_kernels.hpp" - - -#include -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/array_access.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/thread_ids.cuh" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The MULTIGRID solver namespace. 
- * - * @ingroup multigrid - */ -namespace multigrid { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/solver/multigrid_kernels.hpp.inc" - - -} // namespace multigrid -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/stop/batch_criteria.cuh b/cuda/stop/batch_criteria.cuh deleted file mode 100644 index f4f434dda11..00000000000 --- a/cuda/stop/batch_criteria.cuh +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_CUDA_STOP_BATCH_CRITERIA_CUH_ -#define GKO_CUDA_STOP_BATCH_CRITERIA_CUH_ - - -#include - - -namespace gko { -namespace kernels { -namespace cuda { -namespace batch_stop { - - -#include "common/cuda_hip/stop/batch_criteria.hpp.inc" - - -} // namespace batch_stop -} // namespace cuda -} // namespace kernels -} // namespace gko - -#endif // GKO_CUDA_STOP_BATCH_CRITERIA_CUH_ diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index de44eb20682..5bcc1de1f21 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -5,9 +5,7 @@ add_instantiation_files(. 
matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_INSTANT # we don't split up the dense kernels into distinct compilations list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp) set(GINKGO_HIP_SOURCES - base/batch_multi_vector_kernels.hip.cpp base/device.hip.cpp - base/device_matrix_data_kernels.hip.cpp base/exception.hip.cpp base/executor.hip.cpp base/index_set_kernels.hip.cpp @@ -17,55 +15,28 @@ set(GINKGO_HIP_SOURCES base/stream.hip.cpp base/timer.hip.cpp base/version.hip.cpp - components/prefix_sum_kernels.hip.cpp - distributed/index_map_kernels.hip.cpp - distributed/matrix_kernels.hip.cpp - distributed/partition_helpers_kernels.hip.cpp - distributed/partition_kernels.hip.cpp - distributed/vector_kernels.hip.cpp - factorization/cholesky_kernels.hip.cpp - factorization/factorization_kernels.hip.cpp factorization/ic_kernels.hip.cpp factorization/ilu_kernels.hip.cpp - factorization/lu_kernels.hip.cpp - factorization/par_ic_kernels.hip.cpp - factorization/par_ict_kernels.hip.cpp - factorization/par_ilu_kernels.hip.cpp factorization/par_ilut_approx_filter_kernel.hip.cpp factorization/par_ilut_filter_kernel.hip.cpp factorization/par_ilut_select_common.hip.cpp factorization/par_ilut_select_kernel.hip.cpp factorization/par_ilut_spgeam_kernel.hip.cpp factorization/par_ilut_sweep_kernel.hip.cpp - matrix/batch_csr_kernels.hip.cpp - matrix/batch_dense_kernels.hip.cpp - matrix/batch_ell_kernels.hip.cpp - matrix/coo_kernels.hip.cpp ${CSR_INSTANTIATE} - matrix/dense_kernels.hip.cpp - matrix/diagonal_kernels.hip.cpp - matrix/ell_kernels.hip.cpp ${FBCSR_INSTANTIATE} - matrix/sellp_kernels.hip.cpp - matrix/sparsity_csr_kernels.hip.cpp - multigrid/pgm_kernels.hip.cpp preconditioner/batch_jacobi_kernels.hip.cpp - preconditioner/isai_kernels.hip.cpp preconditioner/jacobi_advanced_apply_kernel.hip.cpp preconditioner/jacobi_generate_kernel.hip.cpp - preconditioner/jacobi_kernels.hip.cpp 
preconditioner/jacobi_simple_apply_kernel.hip.cpp - reorder/rcm_kernels.hip.cpp solver/batch_bicgstab_kernels.hip.cpp solver/batch_cg_kernels.hip.cpp - solver/cb_gmres_kernels.hip.cpp - solver/idr_kernels.hip.cpp solver/lower_trs_kernels.hip.cpp - solver/multigrid_kernels.hip.cpp solver/upper_trs_kernels.hip.cpp stop/criterion_kernels.hip.cpp stop/residual_norm_kernels.hip.cpp ${GKO_UNIFIED_COMMON_SOURCES} + ${GKO_CUDA_HIP_COMMON_SOURCES} ) if(hipfft_FOUND) diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp deleted file mode 100644 index 14a915630a5..00000000000 --- a/hip/base/batch_multi_vector_kernels.hip.cpp +++ /dev/null @@ -1,59 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/base/batch_multi_vector_kernels.hpp" - - -#include -#include - - -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/batch_struct.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipblas_bindings.hip.hpp" -#include "hip/base/pointer_mode_guard.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The MultiVector matrix format namespace. 
- * - * @ingroup batch_multi_vector - */ -namespace batch_multi_vector { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" - - -#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_multi_vector -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/base/device_matrix_data_kernels.hip.cpp b/hip/base/device_matrix_data_kernels.hip.cpp deleted file mode 100644 index 745ba955014..00000000000 --- a/hip/base/device_matrix_data_kernels.hip.cpp +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/base/device_matrix_data_kernels.hpp" - - -#include -#include -#include -#include -#include -#include -#include - - -#include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace components { - - -#include "common/cuda_hip/base/device_matrix_data_kernels.hpp.inc" - - -} // namespace components -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/base/kernel_launch.hip.hpp b/hip/base/kernel_launch.hip.hpp deleted file mode 100644 index 2889314f498..00000000000 --- a/hip/base/kernel_launch.hip.hpp +++ /dev/null @@ -1,57 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_HPP_ -#error \ - "This file can only be used from inside common/unified/base/kernel_launch.hpp" -#endif - - -#include - - -#include "accessor/hip_helper.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { 
-namespace hip { - - -template -struct to_device_type_impl&> { - using type = std::decay_t>()))>; - static type map_to_device(gko::acc::range& range) - { - return gko::acc::as_hip_range(range); - } -}; - -template -struct to_device_type_impl&> { - using type = std::decay_t>()))>; - static type map_to_device(const gko::acc::range& range) - { - return gko::acc::as_hip_range(range); - } -}; - - -namespace device_std = thrust; - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/base/kernel_launch.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp deleted file mode 100644 index 7c5d0c01c9c..00000000000 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ /dev/null @@ -1,28 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ -#error \ - "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" -#endif - - -#include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/base/kernel_launch_reduction.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/base/kernel_launch_solver.hip.hpp b/hip/base/kernel_launch_solver.hip.hpp deleted file mode 100644 index eda18f35eab..00000000000 --- a/hip/base/kernel_launch_solver.hip.hpp +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_SOLVER_HPP_ -#error \ - "This file can only be used from inside 
common/unified/base/kernel_launch_solver.hpp" -#endif - - -#include "common/cuda_hip/base/runtime.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/base/kernel_launch_solver.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/base/math.hip.hpp b/hip/base/math.hip.hpp deleted file mode 100644 index f9427089126..00000000000 --- a/hip/base/math.hip.hpp +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_BASE_MATH_HIP_HPP_ -#define GKO_HIP_BASE_MATH_HIP_HPP_ - - -#include - - -#include - - -namespace gko { - - -#include "common/cuda_hip/base/math.hpp.inc" - - -} // namespace gko - - -#endif // GKO_HIP_BASE_MATH_HIP_HPP_ diff --git a/hip/components/atomic.hip.hpp b/hip/components/atomic.hip.hpp deleted file mode 100644 index 6c3eaaeb82a..00000000000 --- a/hip/components/atomic.hip.hpp +++ /dev/null @@ -1,29 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_ATOMIC_HIP_HPP_ -#define GKO_HIP_COMPONENTS_ATOMIC_HIP_HPP_ - - -#include - - -#include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/atomic.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_ATOMIC_HIP_HPP_ diff --git a/hip/components/diagonal_block_manipulation.hip.hpp b/hip/components/diagonal_block_manipulation.hip.hpp deleted file mode 100644 index 0261c7549c5..00000000000 --- a/hip/components/diagonal_block_manipulation.hip.hpp +++ /dev/null @@ -1,32 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HIP_HPP_ -#define 
GKO_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HIP_HPP_ - - -#include - - -#include "hip/base/config.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace csr { - - -#include "common/cuda_hip/components/diagonal_block_manipulation.hpp.inc" - - -} // namespace csr -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HIP_HPP_ diff --git a/hip/components/intrinsics.hip.hpp b/hip/components/intrinsics.hip.hpp deleted file mode 100644 index af849d4471a..00000000000 --- a/hip/components/intrinsics.hip.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_INTRINSICS_HIP_HPP_ -#define GKO_HIP_COMPONENTS_INTRINSICS_HIP_HPP_ - - -#include - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/intrinsics.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_INTRINSICS_HIP_HPP_ diff --git a/hip/components/merging.hip.hpp b/hip/components/merging.hip.hpp deleted file mode 100644 index 3f031947940..00000000000 --- a/hip/components/merging.hip.hpp +++ /dev/null @@ -1,28 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_MERGING_HIP_HPP_ -#define GKO_HIP_COMPONENTS_MERGING_HIP_HPP_ - - -#include "core/base/utils.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/searching.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/merging.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_MERGING_HIP_HPP_ diff --git 
a/hip/components/prefix_sum.hip.hpp b/hip/components/prefix_sum.hip.hpp deleted file mode 100644 index b5065589d8e..00000000000 --- a/hip/components/prefix_sum.hip.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_PREFIX_SUM_HIP_HPP_ -#define GKO_HIP_COMPONENTS_PREFIX_SUM_HIP_HPP_ - - -#include - - -#include "hip/base/hipblas_bindings.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/prefix_sum.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_PREFIX_SUM_HIP_HPP_ diff --git a/hip/components/prefix_sum_kernels.hip.cpp b/hip/components/prefix_sum_kernels.hip.cpp deleted file mode 100644 index ad55c0954d1..00000000000 --- a/hip/components/prefix_sum_kernels.hip.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/components/prefix_sum_kernels.hpp" - - -#include - - -#include - - -#include -#include -#include - - -#include "hip/base/thrust.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace components { - - -#include "common/cuda_hip/components/prefix_sum_kernels.hpp.inc" - - -} // namespace components -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/components/reduction.hip.hpp b/hip/components/reduction.hip.hpp deleted file mode 100644 index bcff77707ca..00000000000 --- a/hip/components/reduction.hip.hpp +++ /dev/null @@ -1,82 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_REDUCTION_HIP_HPP_ -#define 
GKO_HIP_COMPONENTS_REDUCTION_HIP_HPP_ - - -#include - - -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/array_access.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -constexpr int default_reduce_block_size = 512; - - -#include "common/cuda_hip/components/reduction.hpp.inc" - - -/** - * Compute a reduction using add operation (+). - * - * @param exec Executor associated to the array - * @param size size of the array - * @param source the pointer of the array - * - * @return the reduction result - */ -template -__host__ ValueType reduce_add_array(std::shared_ptr exec, - size_type size, const ValueType* source) -{ - auto block_results_val = source; - size_type grid_dim = size; - auto block_results = array(exec); - if (size > default_reduce_block_size) { - const auto n = ceildiv(size, default_reduce_block_size); - grid_dim = - (n <= default_reduce_block_size) ? 
n : default_reduce_block_size; - - block_results.resize_and_reset(grid_dim); - - reduce_add_array<<get_stream()>>>( - size, as_device_type(source), - as_device_type(block_results.get_data())); - - block_results_val = block_results.get_const_data(); - } - - auto d_result = array(exec, 1); - - reduce_add_array<<<1, default_reduce_block_size, 0, exec->get_stream()>>>( - grid_dim, as_device_type(block_results_val), - as_device_type(d_result.get_data())); - auto answer = get_element(d_result, 0); - return answer; -} - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_REDUCTION_HIP_HPP_ diff --git a/hip/components/searching.hip.hpp b/hip/components/searching.hip.hpp deleted file mode 100644 index 2a6be767c2c..00000000000 --- a/hip/components/searching.hip.hpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_ -#define GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_ - - -#include "hip/base/config.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/searching.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_ diff --git a/hip/components/segment_scan.hip.hpp b/hip/components/segment_scan.hip.hpp deleted file mode 100644 index 7f98d08cf69..00000000000 --- a/hip/components/segment_scan.hip.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_ -#define GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_ - - -#include "hip/components/cooperative_groups.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/segment_scan.hpp.inc" - - -} // namespace 
hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_ diff --git a/hip/components/sorting.hip.hpp b/hip/components/sorting.hip.hpp deleted file mode 100644 index 730c3c56401..00000000000 --- a/hip/components/sorting.hip.hpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_SORTING_HIP_HPP_ -#define GKO_HIP_COMPONENTS_SORTING_HIP_HPP_ - - -#include "hip/base/config.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/sorting.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_SORTING_HIP_HPP_ diff --git a/hip/components/syncfree.hip.hpp b/hip/components/syncfree.hip.hpp deleted file mode 100644 index 9fe48944b56..00000000000 --- a/hip/components/syncfree.hip.hpp +++ /dev/null @@ -1,32 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_SYNCFREE_HIP_HPP_ -#define GKO_HIP_COMPONENTS_SYNCFREE_HIP_HPP_ - - -#include - - -#include "core/components/fill_array_kernels.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/memory.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/syncfree.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_SYNCFREE_HIP_HPP_ diff --git a/hip/components/thread_ids.hip.hpp b/hip/components/thread_ids.hip.hpp deleted file mode 100644 index 8cd204438ae..00000000000 --- a/hip/components/thread_ids.hip.hpp +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// 
SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_ -#define GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_ - - -#include "hip/base/config.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace thread { - - -#include "common/cuda_hip/components/thread_ids.hpp.inc" - - -} // namespace thread -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_ diff --git a/hip/components/uninitialized_array.hip.hpp b/hip/components/uninitialized_array.hip.hpp deleted file mode 100644 index e59d2c21a63..00000000000 --- a/hip/components/uninitialized_array.hip.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HIP_HPP_ -#define GKO_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HIP_HPP_ - - -#include - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/uninitialized_array.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HIP_HPP_ diff --git a/hip/components/warp_blas.hip.hpp b/hip/components/warp_blas.hip.hpp deleted file mode 100644 index 8ac59719aa7..00000000000 --- a/hip/components/warp_blas.hip.hpp +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_COMPONENTS_WARP_BLAS_HIP_HPP_ -#define GKO_HIP_COMPONENTS_WARP_BLAS_HIP_HPP_ - - -#include -#include - - -#include - - -#include "hip/base/math.hip.hpp" -#include "hip/components/reduction.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -#include "common/cuda_hip/components/warp_blas.hpp.inc" - - -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_COMPONENTS_WARP_BLAS_HIP_HPP_ diff --git 
a/hip/distributed/index_map_kernels.hip.cpp b/hip/distributed/index_map_kernels.hip.cpp deleted file mode 100644 index d45674a66a3..00000000000 --- a/hip/distributed/index_map_kernels.hip.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/index_map_kernels.hpp" - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include - - -#include "hip/base/thrust.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/searching.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace index_map { - - -#include "common/cuda_hip/distributed/index_map_kernels.hpp.inc" - - -} // namespace index_map -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/distributed/matrix_kernels.hip.cpp b/hip/distributed/matrix_kernels.hip.cpp deleted file mode 100644 index 54cde64c429..00000000000 --- a/hip/distributed/matrix_kernels.hip.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/matrix_kernels.hpp" - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include - - -#include "hip/base/thrust.hip.hpp" -#include "hip/components/atomic.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace distributed_matrix { - - -#include "common/cuda_hip/distributed/matrix_kernels.hpp.inc" - - -} // namespace distributed_matrix -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/distributed/partition_helpers_kernels.hip.cpp b/hip/distributed/partition_helpers_kernels.hip.cpp deleted file mode 100644 index 744d8de887b..00000000000 --- a/hip/distributed/partition_helpers_kernels.hip.cpp +++ /dev/null @@ -1,29 +0,0 
@@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/partition_helpers_kernels.hpp" - - -#include -#include -#include -#include - - -#include "hip/base/thrust.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace partition_helpers { - - -#include "common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc" - - -} // namespace partition_helpers -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/distributed/partition_kernels.hip.cpp b/hip/distributed/partition_kernels.hip.cpp deleted file mode 100644 index 00dc74b910f..00000000000 --- a/hip/distributed/partition_kernels.hip.cpp +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/partition_kernels.hpp" - - -#include -#include -#include -#include -#include -#include - - -#include "common/unified/base/kernel_launch.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "hip/base/thrust.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace partition { - - -#include "common/cuda_hip/distributed/partition_kernels.hpp.inc" - - -} // namespace partition -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/distributed/vector_kernels.hip.cpp b/hip/distributed/vector_kernels.hip.cpp deleted file mode 100644 index 320d847ed85..00000000000 --- a/hip/distributed/vector_kernels.hip.cpp +++ /dev/null @@ -1,37 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/distributed/vector_kernels.hpp" - - -#include - - -#include -#include -#include -#include -#include -#include - - -#include - - -#include "hip/base/thrust.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -namespace distributed_vector { - - -#include 
"common/cuda_hip/distributed/vector_kernels.hpp.inc" - - -} // namespace distributed_vector -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/factorization/cholesky_kernels.hip.cpp b/hip/factorization/cholesky_kernels.hip.cpp deleted file mode 100644 index 04aa8da65ca..00000000000 --- a/hip/factorization/cholesky_kernels.hip.cpp +++ /dev/null @@ -1,115 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/cholesky_kernels.hpp" - - -#include -#include - - -#include -#include -#include -#include -#include -#include - - -#include - - -#include "core/components/fill_array_kernels.hpp" -#include "core/components/format_conversion_kernels.hpp" -#include "core/factorization/elimination_forest.hpp" -#include "core/factorization/lu_kernels.hpp" -#include "core/matrix/csr_lookup.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/syncfree.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Cholesky namespace. 
- * - * @ingroup factor - */ -namespace cholesky { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/factorization/cholesky_kernels.hpp.inc" - - -template -void symbolic_count(std::shared_ptr exec, - const matrix::Csr* mtx, - const factorization::elimination_forest& forest, - IndexType* row_nnz, array& tmp_storage) -{ - const auto num_rows = static_cast(mtx->get_size()[0]); - if (num_rows == 0) { - return; - } - const auto mtx_nnz = static_cast(mtx->get_num_stored_elements()); - tmp_storage.resize_and_reset(mtx_nnz + num_rows); - const auto postorder_cols = tmp_storage.get_data(); - const auto lower_ends = postorder_cols + mtx_nnz; - const auto row_ptrs = mtx->get_const_row_ptrs(); - const auto cols = mtx->get_const_col_idxs(); - const auto inv_postorder = forest.inv_postorder.get_const_data(); - const auto postorder_parent = forest.postorder_parents.get_const_data(); - // transform col indices to postorder indices - { - const auto num_blocks = ceildiv(num_rows, default_block_size); - kernel::build_postorder_cols<<get_stream()>>>( - num_rows, cols, row_ptrs, inv_postorder, postorder_cols, - lower_ends); - } - // sort postorder_cols inside rows - { - const auto handle = exec->get_sparselib_handle(); - auto descr = sparselib::create_mat_descr(); - array permutation_array(exec, mtx_nnz); - auto permutation = permutation_array.get_data(); - components::fill_seq_array(exec, permutation, mtx_nnz); - size_type buffer_size{}; - sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz, - row_ptrs, postorder_cols, buffer_size); - array buffer_array{exec, buffer_size}; - auto buffer = buffer_array.get_data(); - sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs, - postorder_cols, permutation, buffer); - sparselib::destroy(descr); - } - // count nonzeros per row of L - { - const auto num_blocks = - ceildiv(num_rows, default_block_size / config::warp_size); - kernel::symbolic_count - <<get_stream()>>>( - num_rows, 
row_ptrs, lower_ends, inv_postorder, postorder_cols, - postorder_parent, row_nnz); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT); - - -} // namespace cholesky -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/factorization/factorization_kernels.hip.cpp b/hip/factorization/factorization_kernels.hip.cpp deleted file mode 100644 index 6ad176645f2..00000000000 --- a/hip/factorization/factorization_kernels.hip.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/factorization_kernels.hpp" - - -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/array_access.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/csr_builder.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/searching.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The factorization namespace. 
- * - * @ingroup factor - */ -namespace factorization { - - -constexpr int default_block_size{512}; - - -#include "common/cuda_hip/factorization/factorization_kernels.hpp.inc" - - -} // namespace factorization -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/factorization/lu_kernels.hip.cpp b/hip/factorization/lu_kernels.hip.cpp deleted file mode 100644 index e1c60103dd3..00000000000 --- a/hip/factorization/lu_kernels.hip.cpp +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/lu_kernels.hpp" - - -#include -#include - - -#include -#include -#include - - -#include - - -#include "core/base/allocator.hpp" -#include "core/matrix/csr_lookup.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/syncfree.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The LU namespace. 
- * - * @ingroup factor - */ -namespace lu_factorization { - - -constexpr static int default_block_size = 512; - - -#include "common/cuda_hip/factorization/lu_kernels.hpp.inc" - - -} // namespace lu_factorization -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/factorization/par_ic_kernels.hip.cpp b/hip/factorization/par_ic_kernels.hip.cpp deleted file mode 100644 index dd91ac27339..00000000000 --- a/hip/factorization/par_ic_kernels.hip.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ic_kernels.hpp" - - -#include -#include -#include - - -#include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/memory.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The parallel ic factorization namespace. - * - * @ingroup factor - */ -namespace par_ic_factorization { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/factorization/par_ic_kernels.hpp.inc" - - -} // namespace par_ic_factorization -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp deleted file mode 100644 index 1d5e412e9dd..00000000000 --- a/hip/factorization/par_ict_kernels.hip.cpp +++ /dev/null @@ -1,189 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ict_kernels.hpp" - - -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/coo_builder.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include 
"hip/base/math.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/memory.hip.hpp" -#include "hip/components/merging.hip.hpp" -#include "hip/components/prefix_sum.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/searching.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The parallel ICT factorization namespace. - * - * @ingroup factor - */ -namespace par_ict_factorization { - - -constexpr int default_block_size = 512; - - -// subwarp sizes for all warp-parallel kernels (filter, add_candidates) -using compiled_kernels = - syn::value_list; - - -#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc" - - -namespace { - - -template -void add_candidates(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* llh, - const matrix::Csr* a, - const matrix::Csr* l, - matrix::Csr* l_new) -{ - auto num_rows = static_cast(llh->get_size()[0]); - auto subwarps_per_block = default_block_size / subwarp_size; - auto num_blocks = ceildiv(num_rows, subwarps_per_block); - matrix::CsrBuilder l_new_builder(l_new); - auto llh_row_ptrs = llh->get_const_row_ptrs(); - auto llh_col_idxs = llh->get_const_col_idxs(); - auto llh_vals = llh->get_const_values(); - auto a_row_ptrs = a->get_const_row_ptrs(); - auto a_col_idxs = a->get_const_col_idxs(); - auto a_vals = a->get_const_values(); - auto l_row_ptrs = l->get_const_row_ptrs(); - auto l_col_idxs = l->get_const_col_idxs(); - auto l_vals = l->get_const_values(); - auto l_new_row_ptrs = l_new->get_row_ptrs(); - // count non-zeros per row - if (num_blocks > 0) { - kernel::ict_tri_spgeam_nnz - <<get_stream()>>>( - llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs, - l_new_row_ptrs, num_rows); - } - - // build row ptrs - components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1); - - // resize output arrays - auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); - 
l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); - l_new_builder.get_value_array().resize_and_reset(l_new_nnz); - - auto l_new_col_idxs = l_new->get_col_idxs(); - auto l_new_vals = l_new->get_values(); - - // fill columns and values - if (num_blocks > 0) { - kernel::ict_tri_spgeam_init - <<get_stream()>>>( - llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals), - a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs, - l_col_idxs, as_device_type(l_vals), l_new_row_ptrs, - l_new_col_idxs, as_device_type(l_new_vals), num_rows); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); - - -template -void compute_factor(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* a, - matrix::Csr* l, - const matrix::Coo* l_coo) -{ - auto total_nnz = static_cast(l->get_num_stored_elements()); - auto block_size = default_block_size / subwarp_size; - auto num_blocks = ceildiv(total_nnz, block_size); - if (num_blocks > 0) { - kernel::ict_sweep - <<get_stream()>>>( - a->get_const_row_ptrs(), a->get_const_col_idxs(), - as_device_type(a->get_const_values()), l->get_const_row_ptrs(), - l_coo->get_const_row_idxs(), l->get_const_col_idxs(), - as_device_type(l->get_values()), - static_cast(l->get_num_stored_elements())); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor); - - -} // namespace - - -template -void add_candidates(std::shared_ptr exec, - const matrix::Csr* llh, - const matrix::Csr* a, - const matrix::Csr* l, - matrix::Csr* l_new) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = - llh->get_num_stored_elements() + a->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_add_candidates( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, llh, a, l, l_new); -} - 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL); - - -template -void compute_factor(std::shared_ptr exec, - const matrix::Csr* a, - matrix::Csr* l, - const matrix::Coo* l_coo) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = 2 * l->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_compute_factor( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL); - - -} // namespace par_ict_factorization -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp deleted file mode 100644 index 20537a35965..00000000000 --- a/hip/factorization/par_ilu_kernels.hip.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ilu_kernels.hpp" - - -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/memory.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The parallel ilu factorization namespace. 
- * - * @ingroup factor - */ -namespace par_ilu_factorization { - - -constexpr int default_block_size{512}; - - -#include "common/cuda_hip/factorization/par_ilu_kernels.hpp.inc" - - -} // namespace par_ilu_factorization -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/factorization/par_ilut_filter_kernels.hip.cpp b/hip/factorization/par_ilut_filter_kernels.hip.cpp deleted file mode 100644 index 2777d218149..00000000000 --- a/hip/factorization/par_ilut_filter_kernels.hip.cpp +++ /dev/null @@ -1,140 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ilut_kernels.hpp" - - -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/coo_builder.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The parallel ILUT factorization namespace. 
- * - * @ingroup factor - */ -namespace par_ilut_factorization { - - -constexpr int default_block_size = 512; - - -// subwarp sizes for filter kernels -using compiled_kernels = - syn::value_list; - - -#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" - - -namespace { - - -template -void threshold_filter(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* a, - remove_complex threshold, - matrix::Csr* m_out, - matrix::Coo* m_out_coo, bool lower) -{ - auto old_row_ptrs = a->get_const_row_ptrs(); - auto old_col_idxs = a->get_const_col_idxs(); - auto old_vals = a->get_const_values(); - // compute nnz for each row - auto num_rows = static_cast(a->get_size()[0]); - auto block_size = default_block_size / subwarp_size; - auto num_blocks = ceildiv(num_rows, block_size); - auto new_row_ptrs = m_out->get_row_ptrs(); - if (num_blocks > 0) { - kernel::threshold_filter_nnz - <<get_stream()>>>( - old_row_ptrs, as_device_type(old_vals), num_rows, - as_device_type(threshold), new_row_ptrs, lower); - } - - // build row pointers - components::prefix_sum_nonnegative(exec, new_row_ptrs, num_rows + 1); - - // build matrix - auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows); - // resize arrays and update aliases - matrix::CsrBuilder builder{m_out}; - builder.get_col_idx_array().resize_and_reset(new_nnz); - builder.get_value_array().resize_and_reset(new_nnz); - auto new_col_idxs = m_out->get_col_idxs(); - auto new_vals = m_out->get_values(); - IndexType* new_row_idxs{}; - if (m_out_coo) { - matrix::CooBuilder coo_builder{m_out_coo}; - coo_builder.get_row_idx_array().resize_and_reset(new_nnz); - coo_builder.get_col_idx_array() = - make_array_view(exec, new_nnz, new_col_idxs); - coo_builder.get_value_array() = - make_array_view(exec, new_nnz, new_vals); - new_row_idxs = m_out_coo->get_row_idxs(); - } - if (num_blocks > 0) { - kernel::threshold_filter - <<get_stream()>>>( - old_row_ptrs, old_col_idxs, as_device_type(old_vals), num_rows, - 
as_device_type(threshold), new_row_ptrs, new_row_idxs, - new_col_idxs, as_device_type(new_vals), lower); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter, threshold_filter); - - -} // namespace - -template -void threshold_filter(std::shared_ptr exec, - const matrix::Csr* a, - remove_complex threshold, - matrix::Csr* m_out, - matrix::Coo* m_out_coo, bool lower) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = a->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_threshold_filter( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, a, threshold, m_out, - m_out_coo, lower); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL); - - -} // namespace par_ilut_factorization -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/factorization/par_ilut_select_kernels.hip.cpp b/hip/factorization/par_ilut_select_kernels.hip.cpp deleted file mode 100644 index b259133b95d..00000000000 --- a/hip/factorization/par_ilut_select_kernels.hip.cpp +++ /dev/null @@ -1,162 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ilut_kernels.hpp" - - -#include - - -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/prefix_sum.hip.hpp" -#include "hip/components/searching.hip.hpp" -#include "hip/components/sorting.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/factorization/par_ilut_select_common.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { 
-/** - * @brief The parallel ILUT factorization namespace. - * - * @ingroup factor - */ -namespace par_ilut_factorization { - - -#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc" - - -template -void sampleselect_filter(std::shared_ptr exec, - const ValueType* values, IndexType size, - const unsigned char* oracles, - const IndexType* partial_counts, IndexType bucket, - remove_complex* out) -{ - auto num_threads_total = ceildiv(size, items_per_thread); - auto num_blocks = - static_cast(ceildiv(num_threads_total, default_block_size)); - if (num_blocks > 0) { - kernel::filter_bucket<<get_stream()>>>( - as_device_type(values), size, bucket, oracles, partial_counts, - as_device_type(out), items_per_thread); - } -} - - -template -void threshold_select(std::shared_ptr exec, - const matrix::Csr* m, - IndexType rank, array& tmp1, - array>& tmp2, - remove_complex& threshold) -{ - auto values = m->get_const_values(); - IndexType size = m->get_num_stored_elements(); - using AbsType = remove_complex; - constexpr auto bucket_count = kernel::searchtree_width; - auto max_num_threads = ceildiv(size, items_per_thread); - auto max_num_blocks = ceildiv(max_num_threads, default_block_size); - - size_type tmp_size_totals = - ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType)); - size_type tmp_size_partials = ceildiv( - bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType)); - size_type tmp_size_oracles = - ceildiv(size * sizeof(unsigned char), sizeof(ValueType)); - size_type tmp_size_tree = - ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType)); - size_type tmp_size_vals = - size / bucket_count * 4; // pessimistic estimate for temporary storage - size_type tmp_size = - tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree; - tmp1.resize_and_reset(tmp_size); - tmp2.resize_and_reset(tmp_size_vals); - - auto total_counts = reinterpret_cast(tmp1.get_data()); - auto partial_counts = - 
reinterpret_cast(tmp1.get_data() + tmp_size_totals); - auto oracles = reinterpret_cast( - tmp1.get_data() + tmp_size_totals + tmp_size_partials); - auto tree = - reinterpret_cast(tmp1.get_data() + tmp_size_totals + - tmp_size_partials + tmp_size_oracles); - - sampleselect_count(exec, values, size, tree, oracles, partial_counts, - total_counts); - - // determine bucket with correct rank, use bucket-local rank - auto bucket = sampleselect_find_bucket(exec, total_counts, rank); - rank -= bucket.begin; - - if (bucket.size * 2 > tmp_size_vals) { - // we need to reallocate tmp2 - tmp2.resize_and_reset(bucket.size * 2); - } - auto tmp21 = tmp2.get_data(); - auto tmp22 = tmp2.get_data() + bucket.size; - // extract target bucket - sampleselect_filter(exec, values, size, oracles, partial_counts, bucket.idx, - tmp22); - - // recursively select from smaller buckets - int step{}; - while (bucket.size > kernel::basecase_size) { - std::swap(tmp21, tmp22); - const auto* tmp_in = tmp21; - auto tmp_out = tmp22; - - sampleselect_count(exec, tmp_in, bucket.size, tree, oracles, - partial_counts, total_counts); - auto new_bucket = sampleselect_find_bucket(exec, total_counts, rank); - sampleselect_filter(exec, tmp_in, bucket.size, oracles, partial_counts, - bucket.idx, tmp_out); - - rank -= new_bucket.begin; - bucket.size = new_bucket.size; - // we should never need more than 5 recursion steps, this would mean - // 256^5 = 2^40. fall back to standard library algorithm in that case. 
- ++step; - if (step > 5) { - array cpu_out_array{ - exec->get_master(), - make_array_view(exec, bucket.size, tmp_out)}; - auto begin = cpu_out_array.get_data(); - auto end = begin + bucket.size; - auto middle = begin + rank; - std::nth_element(begin, middle, end); - threshold = *middle; - return; - } - } - - // base case - auto out_ptr = reinterpret_cast(tmp1.get_data()); - kernel::basecase_select<<<1, kernel::basecase_block_size, 0, - exec->get_stream()>>>( - as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr)); - threshold = exec->copy_val_to_host(out_ptr); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL); - - -} // namespace par_ilut_factorization -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/factorization/par_ilut_spgeam_kernels.hip.cpp b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp deleted file mode 100644 index cd9d7b7124a..00000000000 --- a/hip/factorization/par_ilut_spgeam_kernels.hip.cpp +++ /dev/null @@ -1,159 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ilut_kernels.hpp" - - -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/coo_builder.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/merging.hip.hpp" -#include "hip/components/prefix_sum.hip.hpp" -#include "hip/components/searching.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The parallel ILUT factorization namespace. 
- * - * @ingroup factor - */ -namespace par_ilut_factorization { - - -constexpr int default_block_size = 512; - - -// subwarp sizes for add_candidates kernels -using compiled_kernels = - syn::value_list; - - -#include "common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc" - - -namespace { - - -template -void add_candidates(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* lu, - const matrix::Csr* a, - const matrix::Csr* l, - const matrix::Csr* u, - matrix::Csr* l_new, - matrix::Csr* u_new) -{ - auto num_rows = static_cast(lu->get_size()[0]); - auto subwarps_per_block = default_block_size / subwarp_size; - auto num_blocks = ceildiv(num_rows, subwarps_per_block); - matrix::CsrBuilder l_new_builder(l_new); - matrix::CsrBuilder u_new_builder(u_new); - auto lu_row_ptrs = lu->get_const_row_ptrs(); - auto lu_col_idxs = lu->get_const_col_idxs(); - auto lu_vals = lu->get_const_values(); - auto a_row_ptrs = a->get_const_row_ptrs(); - auto a_col_idxs = a->get_const_col_idxs(); - auto a_vals = a->get_const_values(); - auto l_row_ptrs = l->get_const_row_ptrs(); - auto l_col_idxs = l->get_const_col_idxs(); - auto l_vals = l->get_const_values(); - auto u_row_ptrs = u->get_const_row_ptrs(); - auto u_col_idxs = u->get_const_col_idxs(); - auto u_vals = u->get_const_values(); - auto l_new_row_ptrs = l_new->get_row_ptrs(); - auto u_new_row_ptrs = u_new->get_row_ptrs(); - if (num_blocks > 0) { - // count non-zeros per row - kernel::tri_spgeam_nnz - <<get_stream()>>>( - lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs, - l_new_row_ptrs, u_new_row_ptrs, num_rows); - } - - // build row ptrs - components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1); - components::prefix_sum_nonnegative(exec, u_new_row_ptrs, num_rows + 1); - - // resize output arrays - auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows); - auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows); - l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz); - 
l_new_builder.get_value_array().resize_and_reset(l_new_nnz); - u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz); - u_new_builder.get_value_array().resize_and_reset(u_new_nnz); - - auto l_new_col_idxs = l_new->get_col_idxs(); - auto l_new_vals = l_new->get_values(); - auto u_new_col_idxs = u_new->get_col_idxs(); - auto u_new_vals = u_new->get_values(); - - if (num_blocks > 0) { - // fill columns and values - kernel::tri_spgeam_init - <<get_stream()>>>( - lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs, - a_col_idxs, as_device_type(a_vals), l_row_ptrs, l_col_idxs, - as_device_type(l_vals), u_row_ptrs, u_col_idxs, - as_device_type(u_vals), l_new_row_ptrs, l_new_col_idxs, - as_device_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs, - as_device_type(u_new_vals), num_rows); - } -} - - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); - - -} // namespace - - -template -void add_candidates(std::shared_ptr exec, - const matrix::Csr* lu, - const matrix::Csr* a, - const matrix::Csr* l, - const matrix::Csr* u, - matrix::Csr* l_new, - matrix::Csr* u_new) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = - lu->get_num_stored_elements() + a->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_add_candidates( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, lu, a, l, u, l_new, - u_new); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL); - - -} // namespace par_ilut_factorization -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/factorization/par_ilut_sweep_kernels.hip.cpp b/hip/factorization/par_ilut_sweep_kernels.hip.cpp deleted file mode 100644 index 26672fd2acb..00000000000 --- a/hip/factorization/par_ilut_sweep_kernels.hip.cpp +++ /dev/null @@ 
-1,123 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/factorization/par_ilut_kernels.hpp" - - -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/coo_builder.hpp" -#include "core/matrix/csr_builder.hpp" -#include "core/matrix/csr_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/memory.hip.hpp" -#include "hip/components/merging.hip.hpp" -#include "hip/components/prefix_sum.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/searching.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The parallel ILUT factorization namespace. - * - * @ingroup factor - */ -namespace par_ilut_factorization { - - -constexpr int default_block_size = 512; - - -// subwarp sizes for all warp-parallel kernels (filter, add_candidates) -using compiled_kernels = - syn::value_list; - - -#include "common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc" - - -namespace { - - -template -void compute_l_u_factors(syn::value_list, - std::shared_ptr exec, - const matrix::Csr* a, - matrix::Csr* l, - const matrix::Coo* l_coo, - matrix::Csr* u, - const matrix::Coo* u_coo, - matrix::Csr* u_csc) -{ - auto total_nnz = static_cast(l->get_num_stored_elements() + - u->get_num_stored_elements()); - auto block_size = default_block_size / subwarp_size; - auto num_blocks = ceildiv(total_nnz, block_size); - if (num_blocks > 0) { - kernel::sweep - <<get_stream()>>>( - a->get_const_row_ptrs(), a->get_const_col_idxs(), - as_device_type(a->get_const_values()), l->get_const_row_ptrs(), - l_coo->get_const_row_idxs(), l->get_const_col_idxs(), - as_device_type(l->get_values()), - 
static_cast(l->get_num_stored_elements()), - u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(), - as_device_type(u->get_values()), u_csc->get_const_row_ptrs(), - u_csc->get_const_col_idxs(), - as_device_type(u_csc->get_values()), - static_cast(u->get_num_stored_elements())); - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors, - compute_l_u_factors); - - -} // namespace - - -template -void compute_l_u_factors(std::shared_ptr exec, - const matrix::Csr* a, - matrix::Csr* l, - const matrix::Coo* l_coo, - matrix::Csr* u, - const matrix::Coo* u_coo, - matrix::Csr* u_csc) -{ - auto num_rows = a->get_size()[0]; - auto total_nnz = - l->get_num_stored_elements() + u->get_num_stored_elements(); - auto total_nnz_per_row = total_nnz / num_rows; - select_compute_l_u_factors( - compiled_kernels(), - [&](int compiled_subwarp_size) { - return total_nnz_per_row <= compiled_subwarp_size || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo, - u_csc); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL); - - -} // namespace par_ilut_factorization -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/log/batch_logger.hip.hpp b/hip/log/batch_logger.hip.hpp deleted file mode 100644 index a2540f2bd9d..00000000000 --- a/hip/log/batch_logger.hip.hpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_ -#define GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_ - - -#include - - -namespace gko { -namespace kernels { -namespace hip { -namespace batch_log { - -#include "common/cuda_hip/log/batch_logger.hpp.inc" - - -} // namespace batch_log -} // namespace hip -} // namespace kernels -} // namespace gko - - -#endif // GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_ diff --git a/hip/matrix/batch_csr_kernels.hip.cpp 
b/hip/matrix/batch_csr_kernels.hip.cpp deleted file mode 100644 index 2b5e02a1c31..00000000000 --- a/hip/matrix/batch_csr_kernels.hip.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_csr_kernels.hpp" - - -#include - - -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Csr matrix format namespace. - * @ref Csr - * @ingroup batch_csr - */ -namespace batch_csr { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_csr -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp deleted file mode 100644 index c53a271598e..00000000000 --- a/hip/matrix/batch_dense_kernels.hip.cpp +++ /dev/null @@ -1,59 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_dense_kernels.hpp" - - -#include - - -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/batch_struct.hpp" -#include 
"core/matrix/batch_struct.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Dense matrix format namespace. - * - * @ingroup batch_dense - */ -namespace batch_dense { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc" - - -// clang-format on - - -} // namespace batch_dense -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp deleted file mode 100644 index c6ef298803b..00000000000 --- a/hip/matrix/batch_ell_kernels.hip.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/batch_ell_kernels.hpp" - - -#include - - -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/batch_struct.hpp" -#include "core/matrix/batch_struct.hpp" -#include "hip/base/batch_struct.hip.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" -#include "hip/matrix/batch_struct.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Ell matrix format namespace. 
- * @ref Ell - * @ingroup batch_ell - */ -namespace batch_ell { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc" - - -#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_ell -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/coo_kernels.hip.cpp b/hip/matrix/coo_kernels.hip.cpp deleted file mode 100644 index 35bc698a4de..00000000000 --- a/hip/matrix/coo_kernels.hip.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/coo_kernels.hpp" - - -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/format_conversion.hip.hpp" -#include "hip/components/segment_scan.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Coordinate matrix format namespace. 
- * - * @ingroup coo - */ -namespace coo { - - -constexpr int warps_in_block = 4; -constexpr int spmv_block_size = warps_in_block * config::warp_size; - - -#include "common/cuda_hip/matrix/coo_kernels.hpp.inc" - - -} // namespace coo -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp deleted file mode 100644 index 50bd975411e..00000000000 --- a/hip/matrix/dense_kernels.hip.cpp +++ /dev/null @@ -1,232 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/dense_kernels.hpp" - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/utils.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipblas_bindings.hip.hpp" -#include "hip/base/pointer_mode_guard.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/intrinsics.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Dense matrix format namespace. 
- * - * @ingroup dense - */ -namespace dense { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/matrix/dense_kernels.hpp.inc" - - -template -void compute_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result, array& tmp) -{ - if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - blas::dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), y->get_stride(), - result->get_values()); - } else { - compute_dot(exec, x, y, result, tmp); - } - } else { - compute_dot(exec, x, y, result, tmp); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); - - -template -void compute_conj_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result, - array& tmp) -{ - if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - blas::conj_dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), - y->get_stride(), result->get_values()); - } else { - compute_conj_dot(exec, x, y, result, tmp); - } - } else { - compute_conj_dot(exec, x, y, result, tmp); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); - - -template -void compute_norm2_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result, - array& tmp) -{ - if (x->get_size()[1] == 1) { - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - blas::norm2(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), result->get_values()); - } else { - compute_norm2(exec, x, result, tmp); - } - } else { - compute_norm2(exec, x, result, tmp); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); 
- - -template -void simple_apply(std::shared_ptr exec, - const matrix::Dense* a, - const matrix::Dense* b, - matrix::Dense* c) -{ - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { - if (a->get_size()[1] > 0) { - blas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1], - c->get_size()[0], a->get_size()[1], &alpha, - b->get_const_values(), b->get_stride(), - a->get_const_values(), a->get_stride(), &beta, - c->get_values(), c->get_stride()); - } else { - dense::fill(exec, c, zero()); - } - } - } else { - GKO_NOT_IMPLEMENTED; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); - - -template -void apply(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Dense* a, const matrix::Dense* b, - const matrix::Dense* beta, matrix::Dense* c) -{ - if (blas::is_supported::value) { - if (c->get_size()[0] > 0 && c->get_size()[1] > 0) { - if (a->get_size()[1] > 0) { - blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N, - c->get_size()[1], c->get_size()[0], a->get_size()[1], - alpha->get_const_values(), b->get_const_values(), - b->get_stride(), a->get_const_values(), - a->get_stride(), beta->get_const_values(), - c->get_values(), c->get_stride()); - } else { - dense::scale(exec, beta, c); - } - } - } else { - GKO_NOT_IMPLEMENTED; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); - - -template -void transpose(std::shared_ptr exec, - const matrix::Dense* orig, - matrix::Dense* trans) -{ - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { - blas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0], - orig->get_size()[1], &alpha, orig->get_const_values(), - 
orig->get_stride(), &beta, trans->get_const_values(), - trans->get_stride(), trans->get_values(), - trans->get_stride()); - } - } else { - GKO_NOT_IMPLEMENTED; - } -}; - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); - - -template -void conj_transpose(std::shared_ptr exec, - const matrix::Dense* orig, - matrix::Dense* trans) -{ - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) { - blas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0], - orig->get_size()[1], &alpha, orig->get_const_values(), - orig->get_stride(), &beta, trans->get_const_values(), - trans->get_stride(), trans->get_values(), - trans->get_stride()); - } - } else { - GKO_NOT_IMPLEMENTED; - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); - - -} // namespace dense -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/diagonal_kernels.hip.cpp b/hip/matrix/diagonal_kernels.hip.cpp deleted file mode 100644 index d707fda9108..00000000000 --- a/hip/matrix/diagonal_kernels.hip.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/diagonal_kernels.hpp" - - -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Diagonal matrix format namespace. 
- * - * @ingroup diagonal - */ -namespace diagonal { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/matrix/diagonal_kernels.hpp.inc" - - -} // namespace diagonal -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp deleted file mode 100644 index 669b0934165..00000000000 --- a/hip/matrix/ell_kernels.hip.cpp +++ /dev/null @@ -1,272 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/ell_kernels.hpp" - - -#include - - -#include -#include -#include -#include -#include - - -#include "accessor/hip_helper.hpp" -#include "accessor/reduced_row_major.hpp" -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/mixed_precision_types.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/format_conversion.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The ELL matrix format namespace. - * - * @ingroup ell - */ -namespace ell { - - -constexpr int default_block_size = 512; - - -// TODO: num_threads_per_core and ratio are parameters should be tuned -/** - * num_threads_per_core is the oversubscribing parameter. There are - * `num_threads_per_core` threads assigned to each physical core. - */ -constexpr int num_threads_per_core = 4; - - -/** - * ratio is the parameter to decide when to use threads to do reduction on each - * row. 
(#cols/#rows > ratio) - */ -constexpr double ratio = 1e-2; - - -/** - * max_thread_per_worker is the max number of thread per worker. The - * `compiled_kernels` must be a list <0, 1, 2, ..., max_thread_per_worker> - */ -constexpr int max_thread_per_worker = 32; - - -/** - * A compile-time list of sub-warp sizes for which the spmv kernels should be - * compiled. - * 0 is a special case where it uses a sub-warp size of warp_size in - * combination with atomic_adds. - */ -using compiled_kernels = syn::value_list; - - -#include "common/cuda_hip/matrix/ell_kernels.hpp.inc" - - -namespace { - - -template -void abstract_spmv(syn::value_list, - std::shared_ptr exec, - int num_worker_per_row, - const matrix::Ell* a, - const matrix::Dense* b, - matrix::Dense* c, - const matrix::Dense* alpha = nullptr, - const matrix::Dense* beta = nullptr) -{ - using arithmetic_type = - highest_precision; - using a_accessor = - acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>; - using b_accessor = - acc::reduced_row_major<2, arithmetic_type, const InputValueType>; - - const auto nrows = a->get_size()[0]; - const auto stride = a->get_stride(); - const auto num_stored_elements_per_row = - a->get_num_stored_elements_per_row(); - - constexpr int num_thread_per_worker = - (info == 0) ? 
max_thread_per_worker : info; - constexpr bool atomic = (info == 0); - const dim3 block_size(default_block_size / num_thread_per_worker, - num_thread_per_worker, 1); - const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x), - b->get_size()[1], 1); - - const auto a_vals = acc::range( - std::array{{static_cast( - num_stored_elements_per_row * stride)}}, - a->get_const_values()); - const auto b_vals = acc::range( - std::array{ - {static_cast(b->get_size()[0]), - static_cast(b->get_size()[1])}}, - b->get_const_values(), - std::array{ - {static_cast(b->get_stride())}}); - - if (alpha == nullptr && beta == nullptr) { - if (grid_size.x > 0 && grid_size.y > 0) { - kernel::spmv - <<get_stream()>>>( - nrows, num_worker_per_row, acc::as_hip_range(a_vals), - a->get_const_col_idxs(), stride, - num_stored_elements_per_row, acc::as_hip_range(b_vals), - as_device_type(c->get_values()), c->get_stride()); - } - } else if (alpha != nullptr && beta != nullptr) { - const auto alpha_val = acc::range( - std::array{1}, alpha->get_const_values()); - if (grid_size.x > 0 && grid_size.y > 0) { - kernel::spmv - <<get_stream()>>>( - nrows, num_worker_per_row, acc::as_hip_range(alpha_val), - acc::as_hip_range(a_vals), a->get_const_col_idxs(), stride, - num_stored_elements_per_row, acc::as_hip_range(b_vals), - as_device_type(beta->get_const_values()), - as_device_type(c->get_values()), c->get_stride()); - } - } else { - GKO_KERNEL_NOT_FOUND; - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_abstract_spmv, abstract_spmv); - - -template -std::array compute_thread_worker_and_atomicity( - std::shared_ptr exec, - const matrix::Ell* a) -{ - int num_thread_per_worker = 1; - int atomic = 0; - int num_worker_per_row = 1; - - const auto nrows = a->get_size()[0]; - const auto ell_ncols = a->get_num_stored_elements_per_row(); - // TODO: num_threads_per_core should be tuned for AMD gpu - const auto nwarps = exec->get_num_warps_per_sm() * - exec->get_num_multiprocessor() * 
num_threads_per_core; - - // Use multithreads to perform the reduction on each row when the matrix is - // wide. - // To make every thread have computation, so pick the value which is the - // power of 2 less than max_thread_per_worker and is less than or equal to - // ell_ncols. If the num_thread_per_worker is max_thread_per_worker and - // allow more than one worker to work on the same row, use atomic add to - // handle the worker write the value into the same position. The #worker is - // decided according to the number of worker allowed on GPU. - if (static_cast(ell_ncols) / nrows > ratio) { - while (num_thread_per_worker < max_thread_per_worker && - (num_thread_per_worker << 1) <= ell_ncols) { - num_thread_per_worker <<= 1; - } - if (num_thread_per_worker == max_thread_per_worker) { - num_worker_per_row = - std::min(ell_ncols / max_thread_per_worker, nwarps / nrows); - num_worker_per_row = std::max(num_worker_per_row, 1); - } - if (num_worker_per_row > 1) { - atomic = 1; - } - } - return {num_thread_per_worker, atomic, num_worker_per_row}; -} - - -} // namespace - - -template -void spmv(std::shared_ptr exec, - const matrix::Ell* a, - const matrix::Dense* b, - matrix::Dense* c) -{ - const auto data = compute_thread_worker_and_atomicity(exec, a); - const int num_thread_per_worker = std::get<0>(data); - const int atomic = std::get<1>(data); - const int num_worker_per_row = std::get<2>(data); - - /** - * info is the parameter for selecting the device kernel. 
- * for info == 0, it uses the kernel by warp_size threads with atomic - * operation for other value, it uses the kernel without atomic_add - */ - const int info = (!atomic) * num_thread_per_worker; - if (atomic) { - dense::fill(exec, c, zero()); - } - select_abstract_spmv( - compiled_kernels(), - [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, num_worker_per_row, a, - b, c); -} - -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_SPMV_KERNEL); - - -template -void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Ell* a, - const matrix::Dense* b, - const matrix::Dense* beta, - matrix::Dense* c) -{ - const auto data = compute_thread_worker_and_atomicity(exec, a); - const int num_thread_per_worker = std::get<0>(data); - const int atomic = std::get<1>(data); - const int num_worker_per_row = std::get<2>(data); - - /** - * info is the parameter for selecting the device kernel. - * for info == 0, it uses the kernel by warp_size threads with atomic - * operation for other value, it uses the kernel without atomic_add - */ - const int info = (!atomic) * num_thread_per_worker; - if (atomic) { - dense::scale(exec, beta, c); - } - select_abstract_spmv( - compiled_kernels(), - [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, num_worker_per_row, a, - b, c, alpha, beta); -} - -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL); - - -} // namespace ell -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/fbcsr_kernels.template.hip.cpp b/hip/matrix/fbcsr_kernels.template.hip.cpp deleted file mode 100644 index 8e4519e74e5..00000000000 --- a/hip/matrix/fbcsr_kernels.template.hip.cpp +++ /dev/null @@ -1,303 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include 
"core/matrix/fbcsr_kernels.hpp" - - -#include - - -#include -#include -#include -#include -#include -#include -#include - - -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "common/unified/base/kernel_launch.hpp" -#include "core/base/array_access.hpp" -#include "core/base/block_sizes.hpp" -#include "core/base/device_matrix_data_kernels.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "core/components/format_conversion_kernels.hpp" -#include "core/matrix/csr_lookup.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipblas_bindings.hip.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" -#include "hip/base/hipsparse_block_bindings.hip.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/base/pointer_mode_guard.hip.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/merging.hip.hpp" -#include "hip/components/prefix_sum.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { - - -/** - * @brief The fixed-size block compressed sparse row matrix format namespace. 
- * - * @ingroup fbcsr - */ -namespace fbcsr { - - -constexpr int default_block_size{512}; - - -#include "common/cuda_hip/matrix/csr_common.hpp.inc" -#include "common/cuda_hip/matrix/fbcsr_kernels.hpp.inc" - - -namespace { - - -template -void dense_transpose(std::shared_ptr exec, - const size_type nrows, const size_type ncols, - const size_type orig_stride, const ValueType* const orig, - const size_type trans_stride, ValueType* const trans) -{ - if (nrows == 0) { - return; - } - if (blas::is_supported::value) { - auto handle = exec->get_blas_handle(); - { - blas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig, - orig_stride, &beta, trans, trans_stride, trans, - trans_stride); - } - } else { - GKO_NOT_IMPLEMENTED; - } -} - - -} // namespace - - -template -void spmv(std::shared_ptr exec, - const matrix::Fbcsr* const a, - const matrix::Dense* const b, - matrix::Dense* const c) -{ - if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { - // empty output: nothing to do - return; - } - if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { - // empty input: fill output with zero - dense::fill(exec, c, zero()); - return; - } - if (sparselib::is_supported::value) { - auto handle = exec->get_sparselib_handle(); - sparselib::pointer_mode_guard pm_guard(handle); - const auto alpha = one(); - const auto beta = zero(); - auto descr = sparselib::create_mat_descr(); - const auto row_ptrs = a->get_const_row_ptrs(); - const auto col_idxs = a->get_const_col_idxs(); - const auto values = a->get_const_values(); - const int bs = a->get_block_size(); - const IndexType mb = a->get_num_block_rows(); - const IndexType nb = a->get_num_block_cols(); - const auto nnzb = static_cast(a->get_num_stored_blocks()); - const auto nrhs = static_cast(b->get_size()[1]); - const auto nrows = a->get_size()[0]; - const auto ncols = a->get_size()[1]; - const auto in_stride = b->get_stride(); - 
const auto out_stride = c->get_stride(); - if (nrhs == 1 && in_stride == 1 && out_stride == 1) { - sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, - nnzb, &alpha, descr, values, row_ptrs, col_idxs, - bs, b->get_const_values(), &beta, c->get_values()); - } else { - const auto trans_stride = nrows; - auto trans_c = array(exec, nrows * nrhs); - sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, - SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, - &alpha, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), in_stride, &beta, - trans_c.get_data(), trans_stride); - dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), - out_stride, c->get_values()); - } - sparselib::destroy(descr); - } else { - GKO_NOT_IMPLEMENTED; - } -} - - -template -void advanced_spmv(std::shared_ptr exec, - const matrix::Dense* const alpha, - const matrix::Fbcsr* const a, - const matrix::Dense* const b, - const matrix::Dense* const beta, - matrix::Dense* const c) -{ - if (c->get_size()[0] == 0 || c->get_size()[1] == 0) { - // empty output: nothing to do - return; - } - if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) { - // empty input: scale output - dense::scale(exec, beta, c); - return; - } - if (sparselib::is_supported::value) { - auto handle = exec->get_sparselib_handle(); - const auto alphp = alpha->get_const_values(); - const auto betap = beta->get_const_values(); - auto descr = sparselib::create_mat_descr(); - const auto row_ptrs = a->get_const_row_ptrs(); - const auto col_idxs = a->get_const_col_idxs(); - const auto values = a->get_const_values(); - const int bs = a->get_block_size(); - const IndexType mb = a->get_num_block_rows(); - const IndexType nb = a->get_num_block_cols(); - const auto nnzb = static_cast(a->get_num_stored_blocks()); - const auto nrhs = static_cast(b->get_size()[1]); - const auto nrows = a->get_size()[0]; - const auto ncols = a->get_size()[1]; - const auto in_stride = b->get_stride(); - const auto 
out_stride = c->get_stride(); - if (nrhs == 1 && in_stride == 1 && out_stride == 1) { - sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb, - nnzb, alphp, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), betap, c->get_values()); - } else { - const auto trans_stride = nrows; - auto trans_c = array(exec, nrows * nrhs); - dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(), - trans_stride, trans_c.get_data()); - sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, - SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb, - alphp, descr, values, row_ptrs, col_idxs, bs, - b->get_const_values(), in_stride, betap, - trans_c.get_data(), trans_stride); - dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(), - out_stride, c->get_values()); - } - sparselib::destroy(descr); - } else { - GKO_NOT_IMPLEMENTED; - } -} - - -namespace { - - -template -void transpose_blocks_impl(syn::value_list, - std::shared_ptr exec, - matrix::Fbcsr* const mat) -{ - constexpr int subwarp_size = config::warp_size; - const auto nbnz = mat->get_num_stored_blocks(); - const auto numthreads = nbnz * subwarp_size; - const auto block_size = default_block_size; - const auto grid_dim = ceildiv(numthreads, block_size); - if (grid_dim > 0) { - kernel::transpose_blocks - <<get_stream()>>>( - nbnz, mat->get_values()); - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks, - transpose_blocks_impl); - - -} // namespace - - -template -void transpose(const std::shared_ptr exec, - const matrix::Fbcsr* const orig, - matrix::Fbcsr* const trans) -{ -#ifdef GKO_COMPILING_CUDA - if (sparselib::is_supported::value) { - const int bs = orig->get_block_size(); - const IndexType nnzb = - static_cast(orig->get_num_stored_blocks()); - cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC; - cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO; - const IndexType buffer_size = sparselib::bsr_transpose_buffersize( - exec->get_sparselib_handle(), 
orig->get_num_block_rows(), - orig->get_num_block_cols(), nnzb, orig->get_const_values(), - orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs); - array buffer_array(exec, buffer_size); - auto buffer = buffer_array.get_data(); - sparselib::bsr_transpose( - exec->get_sparselib_handle(), orig->get_num_block_rows(), - orig->get_num_block_cols(), nnzb, orig->get_const_values(), - orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs, - trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(), - copyValues, idxBase, buffer); - - // transpose blocks - select_transpose_blocks( - fixedblock::compiled_kernels(), - [bs](int compiled_block_size) { return bs == compiled_block_size; }, - syn::value_list(), syn::type_list<>(), exec, trans); - } else -#endif - { - fallback_transpose(exec, orig, trans); - } -} - - -template -void conj_transpose(std::shared_ptr exec, - const matrix::Fbcsr* orig, - matrix::Fbcsr* trans) -{ - const int grid_size = - ceildiv(trans->get_num_stored_elements(), default_block_size); - transpose(exec, orig, trans); - if (grid_size > 0 && is_complex()) { - kernel:: - conjugate<<get_stream()>>>( - trans->get_num_stored_elements(), - as_device_type(trans->get_values())); - } -} - - -} // namespace fbcsr -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp deleted file mode 100644 index 16b139987a2..00000000000 --- a/hip/matrix/sellp_kernels.hip.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/matrix/sellp_kernels.hpp" - - -#include -#include -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipsparse_bindings.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/reduction.hip.hpp" 
-#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The SELL-P matrix format namespace. - * - * @ingroup sellp - */ -namespace sellp { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/matrix/sellp_kernels.hpp.inc" - - -} // namespace sellp -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/multigrid/pgm_kernels.hip.cpp b/hip/multigrid/pgm_kernels.hip.cpp deleted file mode 100644 index ed81d1c66dc..00000000000 --- a/hip/multigrid/pgm_kernels.hip.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/multigrid/pgm_kernels.hpp" - - -#include - - -#include -#include -#include -#include -#include - - -#include -#include - - -#include "hip/base/thrust.hip.hpp" -#include "hip/base/types.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The PGM solver namespace. 
- * - * @ingroup pgm - */ -namespace pgm { - - -#include "common/cuda_hip/multigrid/pgm_kernels.hpp.inc" - - -} // namespace pgm -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/preconditioner/isai_kernels.hip.cpp b/hip/preconditioner/isai_kernels.hip.cpp deleted file mode 100644 index 11e0e229abc..00000000000 --- a/hip/preconditioner/isai_kernels.hip.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/preconditioner/isai_kernels.hpp" - - -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/csr_builder.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/merging.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" -#include "hip/components/warp_blas.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Isai preconditioner namespace. 
- * @ref Isai - * @ingroup isai - */ -namespace isai { - - -constexpr int subwarp_size{row_size_limit}; -constexpr int subwarps_per_block{2}; -constexpr int default_block_size{subwarps_per_block * subwarp_size}; - - -#include "common/cuda_hip/preconditioner/isai_kernels.hpp.inc" - - -} // namespace isai -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp deleted file mode 100644 index 292b040ff1a..00000000000 --- a/hip/preconditioner/jacobi_kernels.hip.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/preconditioner/jacobi_kernels.hpp" - - -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/extended_float.hpp" -#include "core/preconditioner/jacobi_utils.hpp" -#include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/preconditioner/jacobi_common.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Jacobi preconditioner namespace. 
- * @ref Jacobi - * @ingroup jacobi - */ -namespace jacobi { - - -// a total of 32/16 warps (1024 threads) -#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC -constexpr int default_num_warps = 16; -#else // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC -constexpr int default_num_warps = 32; -#endif -// with current architectures, at most 32 warps can be scheduled per SM (and -// current GPUs have at most 84 SMs) -constexpr int default_grid_size = 32 * 32 * 128; - - -#include "common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc" - - -} // namespace jacobi -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/reorder/rcm_kernels.hip.cpp b/hip/reorder/rcm_kernels.hip.cpp deleted file mode 100644 index 0c83c728e79..00000000000 --- a/hip/reorder/rcm_kernels.hip.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/reorder/rcm_kernels.hpp" - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include -#include -#include -#include -#include -#include - - -#include "core/base/array_access.hpp" -#include "hip/base/thrust.hip.hpp" -#include "hip/components/memory.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The reordering namespace. 
- * - * @ingroup reorder - */ -namespace rcm { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/reorder/rcm_kernels.hpp.inc" - - -} // namespace rcm -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp deleted file mode 100644 index 75bb5475b4c..00000000000 --- a/hip/solver/cb_gmres_kernels.hip.cpp +++ /dev/null @@ -1,507 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/solver/cb_gmres_kernels.hpp" - - -#include - - -#include -#include -#include -#include - - -#include "accessor/hip_helper.hpp" -#include "accessor/range.hpp" -#include "accessor/reduced_row_major.hpp" -#include "accessor/scaled_reduced_row_major.hpp" -#include "core/base/array_access.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "core/matrix/dense_kernels.hpp" -#include "core/solver/cb_gmres_accessor.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/uninitialized_array.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The CB_GMRES solver namespace. - * - * @ingroup cb_gmres - */ -namespace cb_gmres { - - -constexpr int default_block_size = 512; -// default_dot_dim can not be 64 in hip because 64 * 64 exceeds their max block -// size limit. 
-constexpr int default_dot_dim = 32; -constexpr int default_dot_size = default_dot_dim * default_dot_dim; - - -#include "common/cuda_hip/solver/cb_gmres_kernels.hpp.inc" - - -template -void zero_matrix(std::shared_ptr exec, size_type m, - size_type n, size_type stride, ValueType* array) -{ - const auto block_size = default_block_size; - const auto grid_size = ceildiv(n, block_size); - zero_matrix_kernel<<get_stream()>>>( - m, n, stride, as_device_type(array)); -} - - -template -void initialize(std::shared_ptr exec, - const matrix::Dense* b, - matrix::Dense* residual, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - array* stop_status, size_type krylov_dim) -{ - const auto num_threads = std::max(b->get_size()[0] * b->get_stride(), - krylov_dim * b->get_size()[1]); - const auto grid_dim = ceildiv(num_threads, default_block_size); - const auto block_dim = default_block_size; - constexpr auto block_size = default_block_size; - - initialize_kernel - <<get_stream()>>>( - b->get_size()[0], b->get_size()[1], krylov_dim, - as_device_type(b->get_const_values()), b->get_stride(), - as_device_type(residual->get_values()), residual->get_stride(), - as_device_type(givens_sin->get_values()), givens_sin->get_stride(), - as_device_type(givens_cos->get_values()), givens_cos->get_stride(), - as_device_type(stop_status->get_data())); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); - - -template -void restart(std::shared_ptr exec, - const matrix::Dense* residual, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - matrix::Dense>* arnoldi_norm, - Accessor3d krylov_bases, - matrix::Dense* next_krylov_basis, - array* final_iter_nums, array& reduction_tmp, - size_type krylov_dim) -{ - constexpr bool use_scalar = - gko::cb_gmres::detail::has_3d_scaled_accessor::value; - const auto num_rows = residual->get_size()[0]; - const auto num_rhs = residual->get_size()[1]; - const auto krylov_stride = - 
gko::cb_gmres::helper_functions_accessor::get_stride( - krylov_bases); - const auto grid_dim_1 = - ceildiv((krylov_dim + 1) * krylov_stride[0], default_block_size); - const auto block_dim = default_block_size; - constexpr auto block_size = default_block_size; - const auto stride_arnoldi = arnoldi_norm->get_stride(); - - restart_1_kernel - <<get_stream()>>>( - residual->get_size()[0], residual->get_size()[1], krylov_dim, - acc::as_hip_range(krylov_bases), - as_device_type(residual_norm_collection->get_values()), - residual_norm_collection->get_stride()); - kernels::hip::dense::compute_norm2_dispatch(exec, residual, residual_norm, - reduction_tmp); - - if (use_scalar) { - components::fill_array(exec, - arnoldi_norm->get_values() + 2 * stride_arnoldi, - num_rhs, zero>()); - const dim3 grid_size_nrm(ceildiv(num_rhs, default_dot_dim), - exec->get_num_multiprocessor() * 2); - const dim3 block_size_nrm(default_dot_dim, default_dot_dim); - multinorminf_without_stop_kernel<<get_stream()>>>( - num_rows, num_rhs, as_device_type(residual->get_const_values()), - residual->get_stride(), - as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), 0); - } - - if (gko::cb_gmres::detail::has_3d_scaled_accessor::value) { - set_scalar_kernel - <<get_stream()>>>( - num_rhs, krylov_dim + 1, - as_device_type(residual_norm->get_const_values()), - residual_norm->get_stride(), - as_device_type(arnoldi_norm->get_const_values() + - 2 * stride_arnoldi), - stride_arnoldi, acc::as_hip_range(krylov_bases)); - } - - const auto grid_dim_2 = - ceildiv(std::max(num_rows, 1) * krylov_stride[1], - default_block_size); - restart_2_kernel - <<get_stream()>>>( - residual->get_size()[0], residual->get_size()[1], - as_device_type(residual->get_const_values()), - residual->get_stride(), - as_device_type(residual_norm->get_const_values()), - as_device_type(residual_norm_collection->get_values()), - acc::as_hip_range(krylov_bases), - as_device_type(next_krylov_basis->get_values()), - 
next_krylov_basis->get_stride(), - as_device_type(final_iter_nums->get_data())); -} - -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL); - - -template -void finish_arnoldi_CGS(std::shared_ptr exec, - matrix::Dense* next_krylov_basis, - Accessor3dim krylov_bases, - matrix::Dense* hessenberg_iter, - matrix::Dense* buffer_iter, - matrix::Dense>* arnoldi_norm, - size_type iter, const stopping_status* stop_status, - stopping_status* reorth_status, - array* num_reorth) -{ - const auto dim_size = next_krylov_basis->get_size(); - if (dim_size[1] == 0) { - return; - } - using non_complex = remove_complex; - // optimization parameter - constexpr int singledot_block_size = default_dot_dim; - constexpr bool use_scalar = - gko::cb_gmres::detail::has_3d_scaled_accessor::value; - const auto stride_next_krylov = next_krylov_basis->get_stride(); - const auto stride_hessenberg = hessenberg_iter->get_stride(); - const auto stride_buffer = buffer_iter->get_stride(); - const auto stride_arnoldi = arnoldi_norm->get_stride(); - const dim3 grid_size(ceildiv(dim_size[1], default_dot_dim), - exec->get_num_multiprocessor() * 2); - const dim3 grid_size_num_iters(ceildiv(dim_size[1], default_dot_dim), - exec->get_num_multiprocessor() * 2, - iter + 1); - const dim3 block_size(default_dot_dim, default_dot_dim); - // Note: having iter first (instead of row_idx information) is likely - // beneficial for avoiding atomic_add conflicts, but that needs - // further investigation. 
- const dim3 grid_size_iters_single(exec->get_num_multiprocessor() * 2, - iter + 1); - const auto block_size_iters_single = singledot_block_size; - size_type num_reorth_host; - - components::fill_array(exec, arnoldi_norm->get_values(), dim_size[1], - zero()); - multinorm2_kernel<<get_stream()>>>( - dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, as_device_type(arnoldi_norm->get_values()), - as_device_type(stop_status)); - // nrmP = norm(next_krylov_basis) - zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg, - hessenberg_iter->get_values()); - if (dim_size[1] > 1) { - multidot_kernel - <<get_stream()>>>( - dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_hip_range(krylov_bases), - as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, as_device_type(stop_status)); - } else { - singledot_kernel - <<get_stream()>>>( - dim_size[0], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_hip_range(krylov_bases), - as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, as_device_type(stop_status)); - } - // for i in 1:iter - // hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i) - // end - update_next_krylov_kernel - <<get_stream()>>>( - iter + 1, dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_values()), stride_next_krylov, - acc::as_hip_range(krylov_bases), - as_device_type(hessenberg_iter->get_const_values()), - stride_hessenberg, as_device_type(stop_status)); - - // for i in 1:iter - // next_krylov_basis -= hessenberg(iter, i) * krylov_bases(:, i) - // end - components::fill_array(exec, arnoldi_norm->get_values() + stride_arnoldi, - dim_size[1], zero()); - if (use_scalar) { - components::fill_array(exec, - arnoldi_norm->get_values() + 2 * stride_arnoldi, - dim_size[1], zero()); - } - multinorm2_inf_kernel - <<get_stream()>>>( - dim_size[0], dim_size[1], - 
as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, - as_device_type(arnoldi_norm->get_values() + stride_arnoldi), - as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), - as_device_type(stop_status)); - // nrmN = norm(next_krylov_basis) - components::fill_array(exec, num_reorth->get_data(), 1, zero()); - check_arnoldi_norms - <<get_stream()>>>( - dim_size[1], as_device_type(arnoldi_norm->get_values()), - stride_arnoldi, as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, iter + 1, acc::as_hip_range(krylov_bases), - as_device_type(stop_status), as_device_type(reorth_status), - as_device_type(num_reorth->get_data())); - num_reorth_host = get_element(*num_reorth, 0); - // num_reorth_host := number of next_krylov vector to be reorthogonalization - for (size_type l = 1; (num_reorth_host > 0) && (l < 3); l++) { - zero_matrix(exec, iter + 1, dim_size[1], stride_buffer, - buffer_iter->get_values()); - if (dim_size[1] > 1) { - multidot_kernel - <<get_stream()>>>( - dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_hip_range(krylov_bases), - as_device_type(buffer_iter->get_values()), stride_buffer, - as_device_type(stop_status)); - } else { - singledot_kernel - <<get_stream()>>>( - dim_size[0], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, acc::as_hip_range(krylov_bases), - as_device_type(buffer_iter->get_values()), stride_buffer, - as_device_type(stop_status)); - } - // for i in 1:iter - // hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i) - // end - update_next_krylov_and_add_kernel - <<get_stream()>>>( - iter + 1, dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_values()), - stride_next_krylov, acc::as_hip_range(krylov_bases), - as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, - as_device_type(buffer_iter->get_const_values()), stride_buffer, - as_device_type(stop_status), 
as_device_type(reorth_status)); - // for i in 1:iter - // next_krylov_basis -= hessenberg(iter, i) * krylov_bases(:, i) - // end - components::fill_array(exec, - arnoldi_norm->get_values() + stride_arnoldi, - dim_size[1], zero()); - if (use_scalar) { - components::fill_array( - exec, arnoldi_norm->get_values() + 2 * stride_arnoldi, - dim_size[1], zero()); - } - multinorm2_inf_kernel - <<get_stream()>>>( - dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_const_values()), - stride_next_krylov, - as_device_type(arnoldi_norm->get_values() + stride_arnoldi), - as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), - as_device_type(stop_status)); - // nrmN = norm(next_krylov_basis) - components::fill_array(exec, num_reorth->get_data(), 1, - zero()); - check_arnoldi_norms - <<get_stream()>>>( - dim_size[1], as_device_type(arnoldi_norm->get_values()), - stride_arnoldi, as_device_type(hessenberg_iter->get_values()), - stride_hessenberg, iter + 1, acc::as_hip_range(krylov_bases), - as_device_type(stop_status), as_device_type(reorth_status), - num_reorth->get_data()); - num_reorth_host = get_element(*num_reorth, 0); - // num_reorth_host := number of next_krylov vector to be - // reorthogonalization - } - update_krylov_next_krylov_kernel - <<get_stream()>>>( - iter, dim_size[0], dim_size[1], - as_device_type(next_krylov_basis->get_values()), stride_next_krylov, - acc::as_hip_range(krylov_bases), - as_device_type(hessenberg_iter->get_const_values()), - stride_hessenberg, as_device_type(stop_status)); - // next_krylov_basis /= hessenberg(iter, iter + 1) - // krylov_bases(:, iter + 1) = next_krylov_basis - // End of arnoldi -} - -template -void givens_rotation(std::shared_ptr exec, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense* hessenberg_iter, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - size_type iter, const array* stop_status) -{ - // TODO: tune block_size for optimal performance - 
constexpr auto block_size = default_block_size; - const auto num_cols = hessenberg_iter->get_size()[1]; - const auto block_dim = block_size; - const auto grid_dim = - static_cast(ceildiv(num_cols, block_size)); - - givens_rotation_kernel - <<get_stream()>>>( - hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1], - iter, as_device_type(hessenberg_iter->get_values()), - hessenberg_iter->get_stride(), - as_device_type(givens_sin->get_values()), givens_sin->get_stride(), - as_device_type(givens_cos->get_values()), givens_cos->get_stride(), - as_device_type(residual_norm->get_values()), - as_device_type(residual_norm_collection->get_values()), - residual_norm_collection->get_stride(), - stop_status->get_const_data()); -} - - -template -void arnoldi(std::shared_ptr exec, - matrix::Dense* next_krylov_basis, - matrix::Dense* givens_sin, - matrix::Dense* givens_cos, - matrix::Dense>* residual_norm, - matrix::Dense* residual_norm_collection, - Accessor3d krylov_bases, matrix::Dense* hessenberg_iter, - matrix::Dense* buffer_iter, - matrix::Dense>* arnoldi_norm, - size_type iter, array* final_iter_nums, - const array* stop_status, - array* reorth_status, - array* num_reorth) -{ - increase_final_iteration_numbers_kernel<<< - static_cast( - ceildiv(final_iter_nums->get_size(), default_block_size)), - default_block_size, 0, exec->get_stream()>>>( - as_device_type(final_iter_nums->get_data()), - stop_status->get_const_data(), final_iter_nums->get_size()); - finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter, - buffer_iter, arnoldi_norm, iter, - stop_status->get_const_data(), reorth_status->get_data(), - num_reorth); - givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter, - residual_norm, residual_norm_collection, iter, stop_status); -} - -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL); - - -template -void solve_upper_triangular( - std::shared_ptr exec, - const matrix::Dense* residual_norm_collection, - const 
matrix::Dense* hessenberg, matrix::Dense* y, - const array* final_iter_nums) -{ - // TODO: tune block_size for optimal performance - constexpr auto block_size = default_block_size; - const auto num_rhs = residual_norm_collection->get_size()[1]; - const auto block_dim = block_size; - const auto grid_dim = - static_cast(ceildiv(num_rhs, block_size)); - - solve_upper_triangular_kernel - <<get_stream()>>>( - hessenberg->get_size()[1], num_rhs, - as_device_type(residual_norm_collection->get_const_values()), - residual_norm_collection->get_stride(), - as_device_type(hessenberg->get_const_values()), - hessenberg->get_stride(), as_device_type(y->get_values()), - y->get_stride(), as_device_type(final_iter_nums->get_const_data())); -} - - -template -void calculate_qy(std::shared_ptr exec, - ConstAccessor3d krylov_bases, size_type num_krylov_bases, - const matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) -{ - const auto num_rows = before_preconditioner->get_size()[0]; - const auto num_cols = before_preconditioner->get_size()[1]; - const auto stride_before_preconditioner = - before_preconditioner->get_stride(); - - constexpr auto block_size = default_block_size; - const auto grid_dim = static_cast( - ceildiv(num_rows * stride_before_preconditioner, block_size)); - const auto block_dim = block_size; - - calculate_Qy_kernel - <<get_stream()>>>( - num_rows, num_cols, acc::as_hip_range(krylov_bases), - as_device_type(y->get_const_values()), y->get_stride(), - as_device_type(before_preconditioner->get_values()), - stride_before_preconditioner, - as_device_type(final_iter_nums->get_const_data())); - // Calculate qy - // before_preconditioner = krylov_bases * y -} - - -template -void solve_krylov(std::shared_ptr exec, - const matrix::Dense* residual_norm_collection, - ConstAccessor3d krylov_bases, - const matrix::Dense* hessenberg, - matrix::Dense* y, - matrix::Dense* before_preconditioner, - const array* final_iter_nums) -{ - if 
(before_preconditioner->get_size()[1] == 0) { - return; - } - // since hessenberg has dims: iters x iters * num_rhs - // krylov_bases has dims: (iters + 1) x sysmtx[0] x num_rhs - const auto iters = - hessenberg->get_size()[1] / before_preconditioner->get_size()[1]; - const auto num_krylov_bases = iters + 1; - solve_upper_triangular(exec, residual_norm_collection, hessenberg, y, - final_iter_nums); - calculate_qy(exec, krylov_bases, num_krylov_bases, y, before_preconditioner, - final_iter_nums); -} - -GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE( - GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL); - - -} // namespace cb_gmres -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp deleted file mode 100644 index 049b05c5750..00000000000 --- a/hip/solver/idr_kernels.hip.cpp +++ /dev/null @@ -1,343 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/solver/idr_kernels.hpp" - - -#include -#include - - -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/components/fill_array_kernels.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/hipblas_bindings.hip.hpp" -#include "hip/base/hiprand_bindings.hip.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/atomic.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/reduction.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The IDR solver namespace. 
- * - * @ingroup idr - */ -namespace idr { - - -constexpr int default_block_size = 512; -constexpr int default_dot_dim = 32; -constexpr int default_dot_size = default_dot_dim * default_dot_dim; - - -#include "common/cuda_hip/solver/idr_kernels.hpp.inc" - - -namespace { - - -template -void initialize_m(std::shared_ptr exec, - const size_type nrhs, matrix::Dense* m, - array* stop_status) -{ - const auto subspace_dim = m->get_size()[0]; - const auto m_stride = m->get_stride(); - - const auto grid_dim = ceildiv(m_stride * subspace_dim, default_block_size); - initialize_m_kernel<<get_stream()>>>( - subspace_dim, nrhs, as_device_type(m->get_values()), m_stride, - as_device_type(stop_status->get_data())); -} - - -template -void initialize_subspace_vectors(std::shared_ptr exec, - matrix::Dense* subspace_vectors, - bool deterministic) -{ - if (!deterministic) { - auto gen = randlib::rand_generator(std::random_device{}(), - RANDLIB_RNG_PSEUDO_DEFAULT, - exec->get_stream()); - randlib::rand_vector( - gen, - subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), - 0.0, 1.0, subspace_vectors->get_values()); - randlib::destroy(gen); - } -} - - -template -void orthonormalize_subspace_vectors( - std::shared_ptr exec, - matrix::Dense* subspace_vectors) -{ - orthonormalize_subspace_vectors_kernel - <<<1, default_block_size, 0, exec->get_stream()>>>( - subspace_vectors->get_size()[0], subspace_vectors->get_size()[1], - as_device_type(subspace_vectors->get_values()), - subspace_vectors->get_stride()); -} - - -template -void solve_lower_triangular(std::shared_ptr exec, - const size_type nrhs, - const matrix::Dense* m, - const matrix::Dense* f, - matrix::Dense* c, - const array* stop_status) -{ - const auto subspace_dim = m->get_size()[0]; - - const auto grid_dim = ceildiv(nrhs, default_block_size); - solve_lower_triangular_kernel<<get_stream()>>>( - subspace_dim, nrhs, as_device_type(m->get_const_values()), - m->get_stride(), as_device_type(f->get_const_values()), 
f->get_stride(), - as_device_type(c->get_values()), c->get_stride(), - stop_status->get_const_data()); -} - - -template -void update_g_and_u(std::shared_ptr exec, - const size_type nrhs, const size_type k, - const matrix::Dense* p, - const matrix::Dense* m, - matrix::Dense* alpha, - matrix::Dense* g, matrix::Dense* g_k, - matrix::Dense* u, - const array* stop_status) -{ - if (nrhs == 0) { - return; - } - const auto size = g->get_size()[0]; - const auto p_stride = p->get_stride(); - - const dim3 grid_dim(ceildiv(nrhs, default_dot_dim), - exec->get_num_multiprocessor() * 2); - const dim3 block_dim(default_dot_dim, default_dot_dim); - - for (size_type i = 0; i < k; i++) { - const auto p_i = p->get_const_values() + i * p_stride; - if (nrhs > 1 || is_complex()) { - components::fill_array(exec, alpha->get_values(), nrhs, - zero()); - multidot_kernel<<get_stream()>>>( - size, nrhs, as_device_type(p_i), - as_device_type(g_k->get_values()), g_k->get_stride(), - as_device_type(alpha->get_values()), - stop_status->get_const_data()); - } else { - blas::dot(exec->get_blas_handle(), size, p_i, 1, g_k->get_values(), - g_k->get_stride(), alpha->get_values()); - } - update_g_k_and_u_kernel - <<get_stride(), default_block_size), - default_block_size, 0, exec->get_stream()>>>( - k, i, size, nrhs, as_device_type(alpha->get_const_values()), - as_device_type(m->get_const_values()), m->get_stride(), - as_device_type(g->get_const_values()), g->get_stride(), - as_device_type(g_k->get_values()), g_k->get_stride(), - as_device_type(u->get_values()), u->get_stride(), - stop_status->get_const_data()); - } - update_g_kernel - <<get_stride(), default_block_size), - default_block_size, 0, exec->get_stream()>>>( - k, size, nrhs, as_device_type(g_k->get_const_values()), - g_k->get_stride(), as_device_type(g->get_values()), g->get_stride(), - stop_status->get_const_data()); -} - - -template -void update_m(std::shared_ptr exec, const size_type nrhs, - const size_type k, const matrix::Dense* p, - 
const matrix::Dense* g_k, matrix::Dense* m, - const array* stop_status) -{ - if (nrhs == 0) { - return; - } - const auto size = g_k->get_size()[0]; - const auto subspace_dim = m->get_size()[0]; - const auto p_stride = p->get_stride(); - const auto m_stride = m->get_stride(); - - const dim3 grid_dim(ceildiv(nrhs, default_dot_dim), - exec->get_num_multiprocessor() * 2); - const dim3 block_dim(default_dot_dim, default_dot_dim); - - for (size_type i = k; i < subspace_dim; i++) { - const auto p_i = p->get_const_values() + i * p_stride; - auto m_i = m->get_values() + i * m_stride + k * nrhs; - if (nrhs > 1 || is_complex()) { - components::fill_array(exec, m_i, nrhs, zero()); - multidot_kernel<<get_stream()>>>( - size, nrhs, as_device_type(p_i), - as_device_type(g_k->get_const_values()), g_k->get_stride(), - as_device_type(m_i), stop_status->get_const_data()); - } else { - blas::dot(exec->get_blas_handle(), size, p_i, 1, - g_k->get_const_values(), g_k->get_stride(), m_i); - } - } -} - - -template -void update_x_r_and_f(std::shared_ptr exec, - const size_type nrhs, const size_type k, - const matrix::Dense* m, - const matrix::Dense* g, - const matrix::Dense* u, - matrix::Dense* f, matrix::Dense* r, - matrix::Dense* x, - const array* stop_status) -{ - const auto size = x->get_size()[0]; - const auto subspace_dim = m->get_size()[0]; - - const auto grid_dim = ceildiv(size * x->get_stride(), default_block_size); - update_x_r_and_f_kernel<<get_stream()>>>( - k, size, subspace_dim, nrhs, as_device_type(m->get_const_values()), - m->get_stride(), as_device_type(g->get_const_values()), g->get_stride(), - as_device_type(u->get_const_values()), u->get_stride(), - as_device_type(f->get_values()), f->get_stride(), - as_device_type(r->get_values()), r->get_stride(), - as_device_type(x->get_values()), x->get_stride(), - stop_status->get_const_data()); - components::fill_array(exec, f->get_values() + k * f->get_stride(), nrhs, - zero()); -} - - -} // namespace - - -template -void 
initialize(std::shared_ptr exec, - const size_type nrhs, matrix::Dense* m, - matrix::Dense* subspace_vectors, bool deterministic, - array* stop_status) -{ - initialize_m(exec, nrhs, m, stop_status); - initialize_subspace_vectors(exec, subspace_vectors, deterministic); - orthonormalize_subspace_vectors(exec, subspace_vectors); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL); - - -template -void step_1(std::shared_ptr exec, const size_type nrhs, - const size_type k, const matrix::Dense* m, - const matrix::Dense* f, - const matrix::Dense* residual, - const matrix::Dense* g, matrix::Dense* c, - matrix::Dense* v, - const array* stop_status) -{ - solve_lower_triangular(exec, nrhs, m, f, c, stop_status); - - const auto num_rows = v->get_size()[0]; - const auto subspace_dim = m->get_size()[0]; - - const auto grid_dim = ceildiv(nrhs * num_rows, default_block_size); - step_1_kernel<<get_stream()>>>( - k, num_rows, subspace_dim, nrhs, - as_device_type(residual->get_const_values()), residual->get_stride(), - as_device_type(c->get_const_values()), c->get_stride(), - as_device_type(g->get_const_values()), g->get_stride(), - as_device_type(v->get_values()), v->get_stride(), - stop_status->get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL); - - -template -void step_2(std::shared_ptr exec, const size_type nrhs, - const size_type k, const matrix::Dense* omega, - const matrix::Dense* preconditioned_vector, - const matrix::Dense* c, matrix::Dense* u, - const array* stop_status) -{ - if (nrhs == 0) { - return; - } - const auto num_rows = preconditioned_vector->get_size()[0]; - const auto subspace_dim = u->get_size()[1] / nrhs; - - const auto grid_dim = ceildiv(nrhs * num_rows, default_block_size); - step_2_kernel<<get_stream()>>>( - k, num_rows, subspace_dim, nrhs, - as_device_type(omega->get_const_values()), - as_device_type(preconditioned_vector->get_const_values()), - preconditioned_vector->get_stride(), - 
as_device_type(c->get_const_values()), c->get_stride(), - as_device_type(u->get_values()), u->get_stride(), - stop_status->get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL); - - -template -void step_3(std::shared_ptr exec, const size_type nrhs, - const size_type k, const matrix::Dense* p, - matrix::Dense* g, matrix::Dense* g_k, - matrix::Dense* u, matrix::Dense* m, - matrix::Dense* f, matrix::Dense* alpha, - matrix::Dense* residual, matrix::Dense* x, - const array* stop_status) -{ - update_g_and_u(exec, nrhs, k, p, m, alpha, g, g_k, u, stop_status); - update_m(exec, nrhs, k, p, g_k, m, stop_status); - update_x_r_and_f(exec, nrhs, k, m, g, u, f, residual, x, stop_status); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL); - - -template -void compute_omega( - std::shared_ptr exec, const size_type nrhs, - const remove_complex kappa, const matrix::Dense* tht, - const matrix::Dense>* residual_norm, - matrix::Dense* omega, const array* stop_status) -{ - const auto grid_dim = ceildiv(nrhs, config::warp_size); - compute_omega_kernel<<get_stream()>>>( - nrhs, as_device_type(kappa), as_device_type(tht->get_const_values()), - as_device_type(residual_norm->get_const_values()), - as_device_type(omega->get_values()), stop_status->get_const_data()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL); - - -} // namespace idr -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/solver/multigrid_kernels.hip.cpp b/hip/solver/multigrid_kernels.hip.cpp deleted file mode 100644 index d09bb2d0a21..00000000000 --- a/hip/solver/multigrid_kernels.hip.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/solver/multigrid_kernels.hpp" - - -#include -#include -#include -#include - - -#include "common/cuda_hip/base/runtime.hpp" -#include "core/base/array_access.hpp" 
-#include "core/components/fill_array_kernels.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The MULTIGRID solver namespace. - * - * @ingroup multigrid - */ -namespace multigrid { - - -constexpr int default_block_size = 512; - - -#include "common/cuda_hip/solver/multigrid_kernels.hpp.inc" - - -} // namespace multigrid -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/stop/batch_criteria.hip.hpp b/hip/stop/batch_criteria.hip.hpp deleted file mode 100644 index 1f721e36aaf..00000000000 --- a/hip/stop/batch_criteria.hip.hpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#ifndef GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_ -#define GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_ - - -#include - - -namespace gko { -namespace kernels { -namespace hip { -namespace batch_stop { - - -#include "common/cuda_hip/stop/batch_criteria.hpp.inc" - - -} // namespace batch_stop -} // namespace hip -} // namespace kernels -} // namespace gko - -#endif // GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_