From 5f4c65287420f3b1d398ab53b11a3804020f8e0f Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sun, 19 May 2024 19:23:19 +0200
Subject: [PATCH] apply script

---
 common/CMakeLists.txt                         |   1 +
 common/cuda_hip/CMakeLists.txt                |  58 ++
 ...hpp.inc => batch_multi_vector_kernels.cpp} |  55 ++
 ...hpp.inc => device_matrix_data_kernels.cpp} |  26 +
 ...ernel_launch.hpp.inc => kernel_launch.hpp} |  54 ++
 ...on.hpp.inc => kernel_launch_reduction.hpp} |  23 +
 ...olver.hpp.inc => kernel_launch_solver.hpp} |  19 +
 .../cuda_hip/base/{math.hpp.inc => math.hpp}  |  19 +
 .../components/{atomic.hpp.inc => atomic.hpp} |  24 +
 ...pp.inc => diagonal_block_manipulation.hpp} |  27 +
 .../{intrinsics.hpp.inc => intrinsics.hpp}    |  20 +
 .../{merging.hpp.inc => merging.hpp}          |  23 +
 .../{prefix_sum.hpp.inc => prefix_sum.hpp}    |  26 +
 ...kernels.hpp.inc => prefix_sum_kernels.cpp} |  27 +
 .../{reduction.hpp.inc => reduction.hpp}      |  77 +++
 .../{searching.hpp.inc => searching.hpp}      |  21 +
 ...{segment_scan.hpp.inc => segment_scan.hpp} |  20 +
 .../{sorting.hpp.inc => sorting.hpp}          |  21 +
 .../{syncfree.hpp.inc => syncfree.hpp}        |  27 +
 .../{thread_ids.hpp.inc => thread_ids.hpp}    |  22 +
 ..._array.hpp.inc => uninitialized_array.hpp} |  20 +
 .../{warp_blas.hpp.inc => warp_blas.hpp}      |  28 +
 ..._kernels.hpp.inc => index_map_kernels.cpp} |  35 ++
 ...rix_kernels.hpp.inc => matrix_kernels.cpp} |  33 ++
 ....hpp.inc => partition_helpers_kernels.cpp} |  22 +
 ..._kernels.hpp.inc => partition_kernels.cpp} |  26 +
 ...tor_kernels.hpp.inc => vector_kernels.cpp} |  30 ++
 ...y_kernels.hpp.inc => cholesky_kernels.cpp} | 108 ++++
 ...nels.hpp.inc => factorization_kernels.cpp} |  36 ++
 .../{lu_kernels.hpp.inc => lu_kernels.cpp}    |  43 ++
 ..._ic_kernels.hpp.inc => par_ic_kernels.cpp} |  32 ++
 ...ct_kernels.hpp.inc => par_ict_kernels.cpp} | 184 +++++++
 ...lu_kernels.hpp.inc => par_ilu_kernels.cpp} |  32 ++
 ...ls.hpp.inc => par_ilut_filter_kernels.cpp} | 135 +++++
 ...ls.hpp.inc => par_ilut_select_kernels.cpp} | 157 ++++++
 ...ls.hpp.inc => par_ilut_spgeam_kernels.cpp} | 154 ++++++
 .../factorization/par_ilut_sweep_kernels.cpp  | 212 ++++++++
 .../par_ilut_sweep_kernels.hpp.inc            |  94 ----
 ...{batch_logger.hpp.inc => batch_logger.hpp} |  22 +
 ..._kernels.hpp.inc => batch_csr_kernels.cpp} |  54 ++
 ...ernels.hpp.inc => batch_dense_kernels.cpp} |  55 ++
 ..._kernels.hpp.inc => batch_ell_kernels.cpp} |  54 ++
 .../{coo_kernels.hpp.inc => coo_kernels.cpp}  |  42 ++
 ...ense_kernels.hpp.inc => dense_kernels.cpp} | 225 ++++++++
 ...l_kernels.hpp.inc => diagonal_kernels.cpp} |  32 ++
 .../cuda_hip/matrix/ell_kernels.cpp           | 173 +++++-
 common/cuda_hip/matrix/ell_kernels.hpp.inc    | 133 -----
 ...bcsr_kernels.hpp.inc => fbcsr_kernels.cpp} | 297 ++++++++++
 ...ellp_kernels.hpp.inc => sellp_kernels.cpp} |  37 ++
 .../cuda_hip/matrix/sparsity_csr_kernels.cpp  | 147 ++++-
 .../matrix/sparsity_csr_kernels.hpp.inc       | 111 ----
 .../{pgm_kernels.hpp.inc => pgm_kernels.cpp}  |  36 ++
 ...{isai_kernels.hpp.inc => isai_kernels.cpp} |  42 ++
 ...obi_kernels.hpp.inc => jacobi_kernels.cpp} |  48 ++
 .../{rcm_kernels.hpp.inc => rcm_kernels.cpp}  |  47 ++
 ...s_kernels.hpp.inc => cb_gmres_kernels.cpp} | 500 +++++++++++++++++
 .../cuda_hip/solver/idr_kernels.cpp           | 341 +++++++++++-
 common/cuda_hip/solver/idr_kernels.hpp.inc    | 318 -----------
 ..._kernels.hpp.inc => multigrid_kernels.cpp} |  34 ++
 ...ch_criteria.hpp.inc => batch_criteria.hpp} |  21 +
 cuda/CMakeLists.txt                           |  33 +-
 cuda/base/batch_multi_vector_kernels.cu       |  59 --
 cuda/base/device_matrix_data_kernels.cu       |  33 --
 cuda/base/kernel_launch.cuh                   |  57 --
 cuda/base/kernel_launch_reduction.cuh         |  28 -
 cuda/base/kernel_launch_solver.cuh            |  24 -
 cuda/base/math.hpp                            |  24 -
 cuda/components/atomic.cuh                    |  29 -
 .../diagonal_block_manipulation.cuh           |  32 --
 cuda/components/intrinsics.cuh                |  25 -
 cuda/components/merging.cuh                   |  28 -
 cuda/components/prefix_sum.cuh                |  31 --
 cuda/components/prefix_sum_kernels.cu         |  34 --
 cuda/components/reduction.cuh                 |  82 ---
 cuda/components/searching.cuh                 |  26 -
 cuda/components/segment_scan.cuh              |  25 -
 cuda/components/sorting.cuh                   |  26 -
 cuda/components/syncfree.cuh                  |  32 --
 cuda/components/thread_ids.cuh                |  27 -
 cuda/components/uninitialized_array.hpp       |  25 -
 cuda/components/warp_blas.cuh                 |  33 --
 cuda/distributed/index_map_kernels.cu         |  42 --
 cuda/distributed/matrix_kernels.cu            |  40 --
 cuda/distributed/partition_helpers_kernels.cu |  29 -
 cuda/distributed/partition_kernels.cu         |  33 --
 cuda/distributed/vector_kernels.cu            |  37 --
 cuda/factorization/cholesky_kernels.cu        | 115 ----
 cuda/factorization/factorization_kernels.cu   |  43 --
 cuda/factorization/lu_kernels.cu              |  50 --
 cuda/factorization/par_ic_kernels.cu          |  39 --
 cuda/factorization/par_ict_kernels.cu         | 189 -------
 cuda/factorization/par_ilu_kernels.cu         |  39 --
 cuda/factorization/par_ilut_filter_kernels.cu | 140 -----
 cuda/factorization/par_ilut_select_kernels.cu | 162 ------
 cuda/factorization/par_ilut_spgeam_kernels.cu | 159 ------
 cuda/factorization/par_ilut_sweep_kernels.cu  | 123 -----
 cuda/log/batch_logger.cuh                     |  27 -
 cuda/matrix/batch_csr_kernels.cu              |  58 --
 cuda/matrix/batch_dense_kernels.cu            |  59 --
 cuda/matrix/batch_ell_kernels.cu              |  58 --
 cuda/matrix/coo_kernels.cu                    |  49 --
 cuda/matrix/dense_kernels.cu                  | 232 --------
 cuda/matrix/diagonal_kernels.cu               |  39 --
 cuda/matrix/fbcsr_kernels.template.cu         | 303 -----------
 cuda/matrix/sellp_kernels.cu                  |  44 --
 cuda/matrix/sparsity_csr_kernels.cu           | 226 --------
 cuda/multigrid/pgm_kernels.cu                 |  43 --
 cuda/preconditioner/isai_kernels.cu           |  49 --
 cuda/preconditioner/jacobi_kernels.cu         |  51 --
 cuda/reorder/rcm_kernels.cu                   |  54 --
 cuda/solver/cb_gmres_kernels.cu               | 507 ------------------
 cuda/solver/multigrid_kernels.cu              |  41 --
 cuda/stop/batch_criteria.cuh                  |  26 -
 hip/CMakeLists.txt                            |  31 +-
 hip/base/batch_multi_vector_kernels.hip.cpp   |  59 --
 hip/base/device_matrix_data_kernels.hip.cpp   |  33 --
 hip/base/kernel_launch.hip.hpp                |  57 --
 hip/base/kernel_launch_reduction.hip.hpp      |  28 -
 hip/base/kernel_launch_solver.hip.hpp         |  24 -
 hip/base/math.hip.hpp                         |  24 -
 hip/components/atomic.hip.hpp                 |  29 -
 .../diagonal_block_manipulation.hip.hpp       |  32 --
 hip/components/intrinsics.hip.hpp             |  25 -
 hip/components/merging.hip.hpp                |  28 -
 hip/components/prefix_sum.hip.hpp             |  31 --
 hip/components/prefix_sum_kernels.hip.cpp     |  34 --
 hip/components/reduction.hip.hpp              |  82 ---
 hip/components/searching.hip.hpp              |  26 -
 hip/components/segment_scan.hip.hpp           |  25 -
 hip/components/sorting.hip.hpp                |  26 -
 hip/components/syncfree.hip.hpp               |  32 --
 hip/components/thread_ids.hip.hpp             |  27 -
 hip/components/uninitialized_array.hip.hpp    |  25 -
 hip/components/warp_blas.hip.hpp              |  33 --
 hip/distributed/index_map_kernels.hip.cpp     |  42 --
 hip/distributed/matrix_kernels.hip.cpp        |  40 --
 .../partition_helpers_kernels.hip.cpp         |  29 -
 hip/distributed/partition_kernels.hip.cpp     |  33 --
 hip/distributed/vector_kernels.hip.cpp        |  37 --
 hip/factorization/cholesky_kernels.hip.cpp    | 115 ----
 .../factorization_kernels.hip.cpp             |  43 --
 hip/factorization/lu_kernels.hip.cpp          |  50 --
 hip/factorization/par_ic_kernels.hip.cpp      |  39 --
 hip/factorization/par_ict_kernels.hip.cpp     | 189 -------
 hip/factorization/par_ilu_kernels.hip.cpp     |  39 --
 .../par_ilut_filter_kernels.hip.cpp           | 140 -----
 .../par_ilut_select_kernels.hip.cpp           | 162 ------
 .../par_ilut_spgeam_kernels.hip.cpp           | 159 ------
 .../par_ilut_sweep_kernels.hip.cpp            | 123 -----
 hip/log/batch_logger.hip.hpp                  |  26 -
 hip/matrix/batch_csr_kernels.hip.cpp          |  58 --
 hip/matrix/batch_dense_kernels.hip.cpp        |  59 --
 hip/matrix/batch_ell_kernels.hip.cpp          |  58 --
 hip/matrix/coo_kernels.hip.cpp                |  49 --
 hip/matrix/dense_kernels.hip.cpp              | 232 --------
 hip/matrix/diagonal_kernels.hip.cpp           |  39 --
 hip/matrix/ell_kernels.hip.cpp                | 272 ----------
 hip/matrix/fbcsr_kernels.template.hip.cpp     | 303 -----------
 hip/matrix/sellp_kernels.hip.cpp              |  44 --
 hip/multigrid/pgm_kernels.hip.cpp             |  43 --
 hip/preconditioner/isai_kernels.hip.cpp       |  49 --
 hip/preconditioner/jacobi_kernels.hip.cpp     |  51 --
 hip/reorder/rcm_kernels.hip.cpp               |  54 --
 hip/solver/cb_gmres_kernels.hip.cpp           | 507 ------------------
 hip/solver/idr_kernels.hip.cpp                | 343 ------------
 hip/solver/multigrid_kernels.hip.cpp          |  41 --
 hip/stop/batch_criteria.hip.hpp               |  26 -
 167 files changed, 4029 insertions(+), 8735 deletions(-)
 create mode 100644 common/cuda_hip/CMakeLists.txt
 rename common/cuda_hip/base/{batch_multi_vector_kernels.hpp.inc => batch_multi_vector_kernels.cpp} (89%)
 rename common/cuda_hip/base/{device_matrix_data_kernels.hpp.inc => device_matrix_data_kernels.cpp} (88%)
 rename common/cuda_hip/base/{kernel_launch.hpp.inc => kernel_launch.hpp} (58%)
 rename common/cuda_hip/base/{kernel_launch_reduction.hpp.inc => kernel_launch_reduction.hpp} (97%)
 rename common/cuda_hip/base/{kernel_launch_solver.hpp.inc => kernel_launch_solver.hpp} (77%)
 rename common/cuda_hip/base/{math.hpp.inc => math.hpp} (80%)
 rename common/cuda_hip/components/{atomic.hpp.inc => atomic.hpp} (95%)
 rename common/cuda_hip/components/{diagonal_block_manipulation.hpp.inc => diagonal_block_manipulation.hpp} (81%)
 rename common/cuda_hip/components/{intrinsics.hpp.inc => intrinsics.hpp} (74%)
 rename common/cuda_hip/components/{merging.hpp.inc => merging.hpp} (96%)
 rename common/cuda_hip/components/{prefix_sum.hpp.inc => prefix_sum.hpp} (91%)
 rename common/cuda_hip/components/{prefix_sum_kernels.hpp.inc => prefix_sum_kernels.cpp} (80%)
 rename common/cuda_hip/components/{reduction.hpp.inc => reduction.hpp} (78%)
 rename common/cuda_hip/components/{searching.hpp.inc => searching.hpp} (95%)
 rename common/cuda_hip/components/{segment_scan.hpp.inc => segment_scan.hpp} (73%)
 rename common/cuda_hip/components/{sorting.hpp.inc => sorting.hpp} (96%)
 rename common/cuda_hip/components/{syncfree.hpp.inc => syncfree.hpp} (86%)
 rename common/cuda_hip/components/{thread_ids.hpp.inc => thread_ids.hpp} (94%)
 rename common/cuda_hip/components/{uninitialized_array.hpp.inc => uninitialized_array.hpp} (82%)
 rename common/cuda_hip/components/{warp_blas.hpp.inc => warp_blas.hpp} (97%)
 rename common/cuda_hip/distributed/{index_map_kernels.hpp.inc => index_map_kernels.cpp} (92%)
 rename common/cuda_hip/distributed/{matrix_kernels.hpp.inc => matrix_kernels.cpp} (94%)
 rename common/cuda_hip/distributed/{partition_helpers_kernels.hpp.inc => partition_helpers_kernels.cpp} (70%)
 rename common/cuda_hip/distributed/{partition_kernels.hpp.inc => partition_kernels.cpp} (89%)
 rename common/cuda_hip/distributed/{vector_kernels.hpp.inc => vector_kernels.cpp} (83%)
 rename common/cuda_hip/factorization/{cholesky_kernels.hpp.inc => cholesky_kernels.cpp} (78%)
 rename common/cuda_hip/factorization/{factorization_kernels.hpp.inc => factorization_kernels.cpp} (95%)
 rename common/cuda_hip/factorization/{lu_kernels.hpp.inc => lu_kernels.cpp} (92%)
 rename common/cuda_hip/factorization/{par_ic_kernels.hpp.inc => par_ic_kernels.cpp} (84%)
 rename common/cuda_hip/factorization/{par_ict_kernels.hpp.inc => par_ict_kernels.cpp} (62%)
 rename common/cuda_hip/factorization/{par_ilu_kernels.hpp.inc => par_ilu_kernels.cpp} (83%)
 rename common/cuda_hip/factorization/{par_ilut_filter_kernels.hpp.inc => par_ilut_filter_kernels.cpp} (57%)
 rename common/cuda_hip/factorization/{par_ilut_select_kernels.hpp.inc => par_ilut_select_kernels.cpp} (63%)
 rename common/cuda_hip/factorization/{par_ilut_spgeam_kernels.hpp.inc => par_ilut_spgeam_kernels.cpp} (63%)
 create mode 100644 common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp
 delete mode 100644 common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc
 rename common/cuda_hip/log/{batch_logger.hpp.inc => batch_logger.hpp} (68%)
 rename common/cuda_hip/matrix/{batch_csr_kernels.hpp.inc => batch_csr_kernels.cpp} (87%)
 rename common/cuda_hip/matrix/{batch_dense_kernels.hpp.inc => batch_dense_kernels.cpp} (89%)
 rename common/cuda_hip/matrix/{batch_ell_kernels.hpp.inc => batch_ell_kernels.cpp} (87%)
 rename common/cuda_hip/matrix/{coo_kernels.hpp.inc => coo_kernels.cpp} (91%)
 rename common/cuda_hip/matrix/{dense_kernels.hpp.inc => dense_kernels.cpp} (75%)
 rename common/cuda_hip/matrix/{diagonal_kernels.hpp.inc => diagonal_kernels.cpp} (73%)
 rename cuda/matrix/ell_kernels.cu => common/cuda_hip/matrix/ell_kernels.cpp (57%)
 delete mode 100644 common/cuda_hip/matrix/ell_kernels.hpp.inc
 rename common/cuda_hip/matrix/{fbcsr_kernels.hpp.inc => fbcsr_kernels.cpp} (57%)
 rename common/cuda_hip/matrix/{sellp_kernels.hpp.inc => sellp_kernels.cpp} (83%)
 rename hip/matrix/sparsity_csr_kernels.hip.cpp => common/cuda_hip/matrix/sparsity_csr_kernels.cpp (58%)
 delete mode 100644 common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc
 rename common/cuda_hip/multigrid/{pgm_kernels.hpp.inc => pgm_kernels.cpp} (77%)
 rename common/cuda_hip/preconditioner/{isai_kernels.hpp.inc => isai_kernels.cpp} (94%)
 rename common/cuda_hip/preconditioner/{jacobi_kernels.hpp.inc => jacobi_kernels.cpp} (91%)
 rename common/cuda_hip/reorder/{rcm_kernels.hpp.inc => rcm_kernels.cpp} (95%)
 rename common/cuda_hip/solver/{cb_gmres_kernels.hpp.inc => cb_gmres_kernels.cpp} (50%)
 rename cuda/solver/idr_kernels.cu => common/cuda_hip/solver/idr_kernels.cpp (51%)
 delete mode 100644 common/cuda_hip/solver/idr_kernels.hpp.inc
 rename common/cuda_hip/solver/{multigrid_kernels.hpp.inc => multigrid_kernels.cpp} (89%)
 rename common/cuda_hip/stop/{batch_criteria.hpp.inc => batch_criteria.hpp} (75%)
 delete mode 100644 cuda/base/batch_multi_vector_kernels.cu
 delete mode 100644 cuda/base/device_matrix_data_kernels.cu
 delete mode 100644 cuda/base/kernel_launch.cuh
 delete mode 100644 cuda/base/kernel_launch_reduction.cuh
 delete mode 100644 cuda/base/kernel_launch_solver.cuh
 delete mode 100644 cuda/base/math.hpp
 delete mode 100644 cuda/components/atomic.cuh
 delete mode 100644 cuda/components/diagonal_block_manipulation.cuh
 delete mode 100644 cuda/components/intrinsics.cuh
 delete mode 100644 cuda/components/merging.cuh
 delete mode 100644 cuda/components/prefix_sum.cuh
 delete mode 100644 cuda/components/prefix_sum_kernels.cu
 delete mode 100644 cuda/components/reduction.cuh
 delete mode 100644 cuda/components/searching.cuh
 delete mode 100644 cuda/components/segment_scan.cuh
 delete mode 100644 cuda/components/sorting.cuh
 delete mode 100644 cuda/components/syncfree.cuh
 delete mode 100644 cuda/components/thread_ids.cuh
 delete mode 100644 cuda/components/uninitialized_array.hpp
 delete mode 100644 cuda/components/warp_blas.cuh
 delete mode 100644 cuda/distributed/index_map_kernels.cu
 delete mode 100644 cuda/distributed/matrix_kernels.cu
 delete mode 100644 cuda/distributed/partition_helpers_kernels.cu
 delete mode 100644 cuda/distributed/partition_kernels.cu
 delete mode 100644 cuda/distributed/vector_kernels.cu
 delete mode 100644 cuda/factorization/cholesky_kernels.cu
 delete mode 100644 cuda/factorization/factorization_kernels.cu
 delete mode 100644 cuda/factorization/lu_kernels.cu
 delete mode 100644 cuda/factorization/par_ic_kernels.cu
 delete mode 100644 cuda/factorization/par_ict_kernels.cu
 delete mode 100644 cuda/factorization/par_ilu_kernels.cu
 delete mode 100644 cuda/factorization/par_ilut_filter_kernels.cu
 delete mode 100644 cuda/factorization/par_ilut_select_kernels.cu
 delete mode 100644 cuda/factorization/par_ilut_spgeam_kernels.cu
 delete mode 100644 cuda/factorization/par_ilut_sweep_kernels.cu
 delete mode 100644 cuda/log/batch_logger.cuh
 delete mode 100644 cuda/matrix/batch_csr_kernels.cu
 delete mode 100644 cuda/matrix/batch_dense_kernels.cu
 delete mode 100644 cuda/matrix/batch_ell_kernels.cu
 delete mode 100644 cuda/matrix/coo_kernels.cu
 delete mode 100644 cuda/matrix/dense_kernels.cu
 delete mode 100644 cuda/matrix/diagonal_kernels.cu
 delete mode 100644 cuda/matrix/fbcsr_kernels.template.cu
 delete mode 100644 cuda/matrix/sellp_kernels.cu
 delete mode 100644 cuda/matrix/sparsity_csr_kernels.cu
 delete mode 100644 cuda/multigrid/pgm_kernels.cu
 delete mode 100644 cuda/preconditioner/isai_kernels.cu
 delete mode 100644 cuda/preconditioner/jacobi_kernels.cu
 delete mode 100644 cuda/reorder/rcm_kernels.cu
 delete mode 100644 cuda/solver/cb_gmres_kernels.cu
 delete mode 100644 cuda/solver/multigrid_kernels.cu
 delete mode 100644 cuda/stop/batch_criteria.cuh
 delete mode 100644 hip/base/batch_multi_vector_kernels.hip.cpp
 delete mode 100644 hip/base/device_matrix_data_kernels.hip.cpp
 delete mode 100644 hip/base/kernel_launch.hip.hpp
 delete mode 100644 hip/base/kernel_launch_reduction.hip.hpp
 delete mode 100644 hip/base/kernel_launch_solver.hip.hpp
 delete mode 100644 hip/base/math.hip.hpp
 delete mode 100644 hip/components/atomic.hip.hpp
 delete mode 100644 hip/components/diagonal_block_manipulation.hip.hpp
 delete mode 100644 hip/components/intrinsics.hip.hpp
 delete mode 100644 hip/components/merging.hip.hpp
 delete mode 100644 hip/components/prefix_sum.hip.hpp
 delete mode 100644 hip/components/prefix_sum_kernels.hip.cpp
 delete mode 100644 hip/components/reduction.hip.hpp
 delete mode 100644 hip/components/searching.hip.hpp
 delete mode 100644 hip/components/segment_scan.hip.hpp
 delete mode 100644 hip/components/sorting.hip.hpp
 delete mode 100644 hip/components/syncfree.hip.hpp
 delete mode 100644 hip/components/thread_ids.hip.hpp
 delete mode 100644 hip/components/uninitialized_array.hip.hpp
 delete mode 100644 hip/components/warp_blas.hip.hpp
 delete mode 100644 hip/distributed/index_map_kernels.hip.cpp
 delete mode 100644 hip/distributed/matrix_kernels.hip.cpp
 delete mode 100644 hip/distributed/partition_helpers_kernels.hip.cpp
 delete mode 100644 hip/distributed/partition_kernels.hip.cpp
 delete mode 100644 hip/distributed/vector_kernels.hip.cpp
 delete mode 100644 hip/factorization/cholesky_kernels.hip.cpp
 delete mode 100644 hip/factorization/factorization_kernels.hip.cpp
 delete mode 100644 hip/factorization/lu_kernels.hip.cpp
 delete mode 100644 hip/factorization/par_ic_kernels.hip.cpp
 delete mode 100644 hip/factorization/par_ict_kernels.hip.cpp
 delete mode 100644 hip/factorization/par_ilu_kernels.hip.cpp
 delete mode 100644 hip/factorization/par_ilut_filter_kernels.hip.cpp
 delete mode 100644 hip/factorization/par_ilut_select_kernels.hip.cpp
 delete mode 100644 hip/factorization/par_ilut_spgeam_kernels.hip.cpp
 delete mode 100644 hip/factorization/par_ilut_sweep_kernels.hip.cpp
 delete mode 100644 hip/log/batch_logger.hip.hpp
 delete mode 100644 hip/matrix/batch_csr_kernels.hip.cpp
 delete mode 100644 hip/matrix/batch_dense_kernels.hip.cpp
 delete mode 100644 hip/matrix/batch_ell_kernels.hip.cpp
 delete mode 100644 hip/matrix/coo_kernels.hip.cpp
 delete mode 100644 hip/matrix/dense_kernels.hip.cpp
 delete mode 100644 hip/matrix/diagonal_kernels.hip.cpp
 delete mode 100644 hip/matrix/ell_kernels.hip.cpp
 delete mode 100644 hip/matrix/fbcsr_kernels.template.hip.cpp
 delete mode 100644 hip/matrix/sellp_kernels.hip.cpp
 delete mode 100644 hip/multigrid/pgm_kernels.hip.cpp
 delete mode 100644 hip/preconditioner/isai_kernels.hip.cpp
 delete mode 100644 hip/preconditioner/jacobi_kernels.hip.cpp
 delete mode 100644 hip/reorder/rcm_kernels.hip.cpp
 delete mode 100644 hip/solver/cb_gmres_kernels.hip.cpp
 delete mode 100644 hip/solver/idr_kernels.hip.cpp
 delete mode 100644 hip/solver/multigrid_kernels.hip.cpp
 delete mode 100644 hip/stop/batch_criteria.hip.hpp

diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 77bdd7230b9..e7c665640b3 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -1,2 +1,6 @@
 add_subdirectory(unified)
 set(GKO_UNIFIED_COMMON_SOURCES ${GKO_UNIFIED_COMMON_SOURCES} PARENT_SCOPE)
+# cuda_hip/CMakeLists.txt sets GKO_CUDA_HIP_COMMON_SOURCES in this scope, so it
+# has to be processed before the list can be forwarded to the parent directory.
+add_subdirectory(cuda_hip)
+set(GKO_CUDA_HIP_COMMON_SOURCES ${GKO_CUDA_HIP_COMMON_SOURCES} PARENT_SCOPE)
diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
new file mode 100644
index 00000000000..af919e90897
--- /dev/null
+++ b/common/cuda_hip/CMakeLists.txt
@@ -0,0 +1,44 @@
+# Translation units shared by the CUDA and HIP backends. Only compilable
+# sources are listed: the headers in this directory are reached via #include,
+# and some of them (e.g. base/kernel_launch.hpp) #error out when used alone.
+set(CUDA_HIP_SOURCES
+    base/batch_multi_vector_kernels.cpp
+    base/device_matrix_data_kernels.cpp
+    components/prefix_sum_kernels.cpp
+    distributed/index_map_kernels.cpp
+    distributed/matrix_kernels.cpp
+    distributed/partition_helpers_kernels.cpp
+    distributed/partition_kernels.cpp
+    distributed/vector_kernels.cpp
+    factorization/cholesky_kernels.cpp
+    factorization/factorization_kernels.cpp
+    factorization/lu_kernels.cpp
+    factorization/par_ic_kernels.cpp
+    factorization/par_ict_kernels.cpp
+    factorization/par_ilu_kernels.cpp
+    factorization/par_ilut_filter_kernels.cpp
+    factorization/par_ilut_select_kernels.cpp
+    factorization/par_ilut_spgeam_kernels.cpp
+    factorization/par_ilut_sweep_kernels.cpp
+    matrix/batch_csr_kernels.cpp
+    matrix/batch_dense_kernels.cpp
+    matrix/batch_ell_kernels.cpp
+    matrix/coo_kernels.cpp
+    matrix/dense_kernels.cpp
+    matrix/diagonal_kernels.cpp
+    matrix/ell_kernels.cpp
+    matrix/fbcsr_kernels.cpp
+    matrix/sellp_kernels.cpp
+    matrix/sparsity_csr_kernels.cpp
+    multigrid/pgm_kernels.cpp
+    preconditioner/isai_kernels.cpp
+    preconditioner/jacobi_kernels.cpp
+    reorder/rcm_kernels.cpp
+    solver/cb_gmres_kernels.cpp
+    solver/idr_kernels.cpp
+    solver/multigrid_kernels.cpp
+    )
+# Prefix every entry with this directory so the paths are absolute, then
+# publish the list to the scope that add_subdirectory()'d this file.
+list(TRANSFORM CUDA_HIP_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/)
+set(GKO_CUDA_HIP_COMMON_SOURCES ${CUDA_HIP_SOURCES} PARENT_SCOPE)
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
similarity index 89%
rename from common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
rename to common/cuda_hip/base/batch_multi_vector_kernels.cpp
index 9b6301674be..2a4618f32aa 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
@@ -2,6 +2,50 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <common/cuda_hip/base/batch_struct.hpp>
+
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+
+
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/batch_multi_vector_kernels.hpp"
+#include "core/base/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The MultiVector matrix format namespace.
+ *
+ * @ingroup batch_multi_vector
+ */
+namespace batch_multi_vector {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+
+// clang-format off
+
+// NOTE: keep the launcher #include at the end of this file after the kernels
+
+
 template <typename ValueType, typename Mapping>
 __device__ __forceinline__ void scale(
     const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
@@ -299,3 +343,14 @@ __launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel(
         copy(src_b, dst_b);
     }
 }
+
+
+#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_multi_vector
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc b/common/cuda_hip/base/device_matrix_data_kernels.cpp
similarity index 88%
rename from common/cuda_hip/base/device_matrix_data_kernels.hpp.inc
rename to common/cuda_hip/base/device_matrix_data_kernels.cpp
index 70cbd9e7391..192facc6950 100644
--- a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc
+++ b/common/cuda_hip/base/device_matrix_data_kernels.cpp
@@ -2,6 +2,26 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/device_ptr.h>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sort.h>
+#include <thrust/tuple.h>
+
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "core/base/device_matrix_data_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace components {
+
+
 template <typename ValueType, typename IndexType>
 void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
                   array<ValueType>& values, array<IndexType>& row_idxs,
@@ -99,3 +119,9 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL);
+
+
+}  // namespace components
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/base/kernel_launch.hpp.inc b/common/cuda_hip/base/kernel_launch.hpp
similarity index 58%
rename from common/cuda_hip/base/kernel_launch.hpp.inc
rename to common/cuda_hip/base/kernel_launch.hpp
index c46e6c879cb..3d59e145a86 100644
--- a/common/cuda_hip/base/kernel_launch.hpp.inc
+++ b/common/cuda_hip/base/kernel_launch.hpp
@@ -2,6 +2,55 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_HPP_
+#error \
+    "This file can only be used from inside common/unified/base/kernel_launch.hpp"
+#endif
+
+
+#include <accessor/device_helper.hpp>
+
+
+#include <thrust/tuple.h>
+
+
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+// Non-const accessor range reference: forwarded through as_device_range().
+template <typename AccessorType>
+struct to_device_type_impl<gko::acc::range<AccessorType>&> {
+    using type = std::decay_t<decltype(gko::acc::as_device_range(
+        std::declval<gko::acc::range<AccessorType>>()))>;
+    static type map_to_device(gko::acc::range<AccessorType>& range)
+    {
+        return gko::acc::as_device_range(range);
+    }
+};
+// Const accessor range reference: same `type` alias, built from a const range.
+template <typename AccessorType>
+struct to_device_type_impl<const gko::acc::range<AccessorType>&> {
+    using type = std::decay_t<decltype(gko::acc::as_device_range(
+        std::declval<gko::acc::range<AccessorType>>()))>;
+    static type map_to_device(const gko::acc::range<AccessorType>& range)
+    {
+        return gko::acc::as_device_range(range);
+    }
+};
+
+
+namespace device_std = thrust;
+
+
+constexpr int default_block_size = 512;
+
+
 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_1d(
     int64 size, KernelFunction fn, KernelArgs... args)
@@ -52,3 +101,8 @@ void run_kernel(std::shared_ptr<const DefaultExecutor> exec, KernelFunction fn,
             map_to_device(args)...);
     }
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/base/kernel_launch_reduction.hpp.inc b/common/cuda_hip/base/kernel_launch_reduction.hpp
similarity index 97%
rename from common/cuda_hip/base/kernel_launch_reduction.hpp.inc
rename to common/cuda_hip/base/kernel_launch_reduction.hpp
index e5caedacb1f..4c4fb366802 100644
--- a/common/cuda_hip/base/kernel_launch_reduction.hpp.inc
+++ b/common/cuda_hip/base/kernel_launch_reduction.hpp
@@ -2,6 +2,24 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
+#error \
+    "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp"
+#endif
+
+
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 template <typename ValueType, typename KernelFunction, typename ReductionOp,
           typename FinalizeOp, typename... KernelArgs>
 __global__ __launch_bounds__(
@@ -505,3 +523,8 @@ void run_kernel_col_reduction_cached(
         }
     }
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/base/kernel_launch_solver.hpp.inc b/common/cuda_hip/base/kernel_launch_solver.hpp
similarity index 77%
rename from common/cuda_hip/base/kernel_launch_solver.hpp.inc
rename to common/cuda_hip/base/kernel_launch_solver.hpp
index cef3c8a3adc..e32ba52e79a 100644
--- a/common/cuda_hip/base/kernel_launch_solver.hpp.inc
+++ b/common/cuda_hip/base/kernel_launch_solver.hpp
@@ -2,6 +2,20 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_SOLVER_HPP_
+#error \
+    "This file can only be used from inside common/unified/base/kernel_launch_solver.hpp"
+#endif
+
+
+#include "common/cuda_hip/base/runtime.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver(
     int64 rows, int64 cols, int64 default_stride, KernelFunction fn,
@@ -32,3 +46,8 @@ void run_kernel_solver(std::shared_ptr<const DefaultExecutor> exec,
             static_cast<int64>(default_stride), fn, map_to_device(args)...);
     }
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/base/math.hpp.inc b/common/cuda_hip/base/math.hpp
similarity index 80%
rename from common/cuda_hip/base/math.hpp.inc
rename to common/cuda_hip/base/math.hpp
index 430163f3791..c328b299d70 100644
--- a/common/cuda_hip/base/math.hpp.inc
+++ b/common/cuda_hip/base/math.hpp
@@ -2,6 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_
+
+
+#include <thrust/complex.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+namespace gko {
+
+
 // We need this struct, because otherwise we would call a __host__ function in a
 // __device__ function (even though it is constexpr)
 template <typename T>
@@ -37,3 +50,9 @@ struct truncate_type_impl<thrust::complex<T>> {
 
 
 }  // namespace detail
+
+
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_
diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp
similarity index 95%
rename from common/cuda_hip/components/atomic.hpp.inc
rename to common/cuda_hip/components/atomic.hpp
index 60eaf5a9dd9..b71ca1f8b5b 100644
--- a/common/cuda_hip/components/atomic.hpp.inc
+++ b/common/cuda_hip/components/atomic.hpp
@@ -2,6 +2,22 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_
+
+
+#include <type_traits>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/types.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 namespace detail {
 
 
@@ -228,3 +244,11 @@ __forceinline__ __device__ thrust::complex<double> atomic_add(
     auto imag = atomic_add(addr + 1, val.imag());
     return {real, imag};
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_ATOMIC_HPP_
diff --git a/common/cuda_hip/components/diagonal_block_manipulation.hpp.inc b/common/cuda_hip/components/diagonal_block_manipulation.hpp
similarity index 81%
rename from common/cuda_hip/components/diagonal_block_manipulation.hpp.inc
rename to common/cuda_hip/components/diagonal_block_manipulation.hpp
index a8e7004b5aa..923b62bf5e5 100644
--- a/common/cuda_hip/components/diagonal_block_manipulation.hpp.inc
+++ b/common/cuda_hip/components/diagonal_block_manipulation.hpp
@@ -2,6 +2,24 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_
+
+
+#include <type_traits>
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace csr {
+
+
 /**
  * @internal
  *
@@ -63,3 +81,12 @@ __device__ __forceinline__ void extract_transposed_diag_blocks(
         }
     }
 }
+
+
+}  // namespace csr
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HPP_
diff --git a/common/cuda_hip/components/intrinsics.hpp.inc b/common/cuda_hip/components/intrinsics.hpp
similarity index 74%
rename from common/cuda_hip/components/intrinsics.hpp.inc
rename to common/cuda_hip/components/intrinsics.hpp
index 3fc28cee871..df3b5ad4c7f 100644
--- a/common/cuda_hip/components/intrinsics.hpp.inc
+++ b/common/cuda_hip/components/intrinsics.hpp
@@ -2,6 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 /**
  * @internal
  * Returns the number of set bits in the given mask.
@@ -36,3 +48,11 @@ __forceinline__ __device__ int clz(uint32 mask) { return __clz(mask); }
 
 /** @copydoc clz */
 __forceinline__ __device__ int clz(uint64 mask) { return __clzll(mask); }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_INTRINSICS_HPP_
diff --git a/common/cuda_hip/components/merging.hpp.inc b/common/cuda_hip/components/merging.hpp
similarity index 96%
rename from common/cuda_hip/components/merging.hpp.inc
rename to common/cuda_hip/components/merging.hpp
index d77707795a1..ab070741fbd 100644
--- a/common/cuda_hip/components/merging.hpp.inc
+++ b/common/cuda_hip/components/merging.hpp
@@ -2,6 +2,21 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "core/base/utils.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 namespace detail {
 
 
@@ -280,3 +295,11 @@ __forceinline__ __device__ void sequential_match(const ValueType* a,
                          return a_idx < a_size && b_idx < b_size;
                      });
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_MERGING_HPP_
diff --git a/common/cuda_hip/components/prefix_sum.hpp.inc b/common/cuda_hip/components/prefix_sum.hpp
similarity index 91%
rename from common/cuda_hip/components/prefix_sum.hpp.inc
rename to common/cuda_hip/components/prefix_sum.hpp
index 474b0b88cd1..defd2be5e0e 100644
--- a/common/cuda_hip/components/prefix_sum.hpp.inc
+++ b/common/cuda_hip/components/prefix_sum.hpp
@@ -2,6 +2,24 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_
+
+
+#include <type_traits>
+
+
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 /**
  * @internal
  * Computes the prefix sum and total sum of `element` over a subwarp.
@@ -158,3 +176,11 @@ __global__ __launch_bounds__(block_size) void finalize_prefix_sum(
         elements[tidx] += prefix_block_sum;
     }
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_PREFIX_SUM_HPP_
diff --git a/common/cuda_hip/components/prefix_sum_kernels.hpp.inc b/common/cuda_hip/components/prefix_sum_kernels.cpp
similarity index 80%
rename from common/cuda_hip/components/prefix_sum_kernels.hpp.inc
rename to common/cuda_hip/components/prefix_sum_kernels.cpp
index c232e115a22..4583e2456f9 100644
--- a/common/cuda_hip/components/prefix_sum_kernels.hpp.inc
+++ b/common/cuda_hip/components/prefix_sum_kernels.cpp
@@ -2,6 +2,27 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <limits>
+
+
+#include <thrust/scan.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/name_demangling.hpp>
+
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace components {
+
+
 template <typename IndexType>
 struct overflowing_sum {
     constexpr static IndexType max = std::numeric_limits<IndexType>::max();
@@ -56,3 +77,9 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_NONNEGATIVE_KERNEL);
 // instantiate for size_type as well, as this is used in the Sellp format
 template void prefix_sum_nonnegative<size_type>(
     std::shared_ptr<const DefaultExecutor>, size_type*, size_type);
+
+
+}  // namespace components
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/components/reduction.hpp.inc b/common/cuda_hip/components/reduction.hpp
similarity index 78%
rename from common/cuda_hip/components/reduction.hpp.inc
rename to common/cuda_hip/components/reduction.hpp
index 1a6a64d6fb7..e2c49836149 100644
--- a/common/cuda_hip/components/reduction.hpp.inc
+++ b/common/cuda_hip/components/reduction.hpp
@@ -2,6 +2,34 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_
+
+
+#include <type_traits>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/array_access.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
+constexpr int default_reduce_block_size = 512;
+
+
 /**
  * @internal
  *
@@ -222,3 +250,52 @@ __launch_bounds__(default_reduce_block_size) void reduce_add_array_with_initial_
         result[blockIdx.x] += block_sum[0];
     }
 }
+
+
+/**
+ * Computes the sum (+) of all entries of an array.
+ *
+ * @param exec  executor providing the stream the reduction kernels run on
+ * @param size  number of entries of the array
+ * @param source  pointer to the first entry of the array
+ *
+ * @return the sum of the `size` entries starting at `source`
+ */
+template <typename ValueType>
+__host__ ValueType reduce_add_array(std::shared_ptr<const DefaultExecutor> exec,
+                                    size_type size, const ValueType* source)
+{
+    // input of the final single-block launch: `source` or per-block results
+    const ValueType* final_input = source;
+    size_type final_size = size;
+    array<ValueType> block_sums(exec);
+    if (size > default_reduce_block_size) {
+        // large input: pre-reduce with up to default_reduce_block_size blocks
+        const auto needed_blocks = ceildiv(size, default_reduce_block_size);
+        final_size = (needed_blocks > default_reduce_block_size)
+                         ? default_reduce_block_size
+                         : needed_blocks;
+        block_sums.resize_and_reset(final_size);
+        reduce_add_array<<<final_size, default_reduce_block_size, 0,
+                           exec->get_stream()>>>(
+            size, as_device_type(source),
+            as_device_type(block_sums.get_data()));
+        final_input = block_sums.get_const_data();
+    }
+
+    // a single block reduces the remaining `final_size` entries to one value
+    array<ValueType> d_result(exec, 1);
+    reduce_add_array<<<1, default_reduce_block_size, 0, exec->get_stream()>>>(
+        final_size, as_device_type(final_input),
+        as_device_type(d_result.get_data()));
+    // fetch the single result entry and hand it back by value
+    return get_element(d_result, 0);
+}
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_REDUCTION_HPP_
diff --git a/common/cuda_hip/components/searching.hpp.inc b/common/cuda_hip/components/searching.hpp
similarity index 95%
rename from common/cuda_hip/components/searching.hpp.inc
rename to common/cuda_hip/components/searching.hpp
index a0f842dca35..cb219c58b0b 100644
--- a/common/cuda_hip/components/searching.hpp.inc
+++ b/common/cuda_hip/components/searching.hpp
@@ -2,6 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 /**
  * @internal
  * Generic binary search that finds the first index where a predicate is true.
@@ -208,3 +221,11 @@ __forceinline__ __device__ IndexType group_ary_search(IndexType offset,
     auto pos = mask == 0 ? group.size() : ffs(mask) - 1;
     return offset + pos;
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SEARCHING_HPP_
diff --git a/common/cuda_hip/components/segment_scan.hpp.inc b/common/cuda_hip/components/segment_scan.hpp
similarity index 73%
rename from common/cuda_hip/components/segment_scan.hpp.inc
rename to common/cuda_hip/components/segment_scan.hpp
index 75cc0654531..0ab34fd093b 100644
--- a/common/cuda_hip/components/segment_scan.hpp.inc
+++ b/common/cuda_hip/components/segment_scan.hpp
@@ -2,6 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_
+
+
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 /**
  * @internal
  *
@@ -33,3 +45,11 @@ __device__ __forceinline__ bool segment_scan(
     }
     return head;
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SEGMENT_SCAN_HPP_
diff --git a/common/cuda_hip/components/sorting.hpp.inc b/common/cuda_hip/components/sorting.hpp
similarity index 96%
rename from common/cuda_hip/components/sorting.hpp.inc
rename to common/cuda_hip/components/sorting.hpp
index 10db7eb6daa..7603d41a8ba 100644
--- a/common/cuda_hip/components/sorting.hpp.inc
+++ b/common/cuda_hip/components/sorting.hpp
@@ -2,6 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 namespace detail {
 
 
@@ -291,3 +304,11 @@ __forceinline__ __device__ void bitonic_sort(ValueType* local_elements,
             local_elements, false);
     }
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SORTING_HPP_
diff --git a/common/cuda_hip/components/syncfree.hpp.inc b/common/cuda_hip/components/syncfree.hpp
similarity index 86%
rename from common/cuda_hip/components/syncfree.hpp.inc
rename to common/cuda_hip/components/syncfree.hpp
index f0d0bbe4d22..9524c68637e 100644
--- a/common/cuda_hip/components/syncfree.hpp.inc
+++ b/common/cuda_hip/components/syncfree.hpp
@@ -2,6 +2,25 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/memory.hpp"
+#include "core/components/fill_array_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 struct syncfree_storage {
     using status_word = int;
 
@@ -110,3 +129,11 @@ class syncfree_scheduler {
     IndexType work_id;
     IndexType block_id;
 };
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_SYNCFREE_HPP_
diff --git a/common/cuda_hip/components/thread_ids.hpp.inc b/common/cuda_hip/components/thread_ids.hpp
similarity index 94%
rename from common/cuda_hip/components/thread_ids.hpp.inc
rename to common/cuda_hip/components/thread_ids.hpp
index 1befa428f3c..e73296f92a9 100644
--- a/common/cuda_hip/components/thread_ids.hpp.inc
+++ b/common/cuda_hip/components/thread_ids.hpp
@@ -2,6 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_
+
+
+#include "common/cuda_hip/base/config.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace thread {
+
+
 /**
  * @internal
  *
@@ -242,3 +255,12 @@ __device__ __forceinline__ IndexType get_subwarp_num_flat()
                   "subwarp_size must be a power of two");
     return blockDim.x / subwarp_size * static_cast<IndexType>(gridDim.x);
 }
+
+
+}  // namespace thread
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_THREAD_IDS_HPP_
diff --git a/common/cuda_hip/components/uninitialized_array.hpp.inc b/common/cuda_hip/components/uninitialized_array.hpp
similarity index 82%
rename from common/cuda_hip/components/uninitialized_array.hpp.inc
rename to common/cuda_hip/components/uninitialized_array.hpp
index 932ae8a5caa..44fcbfd0d85 100644
--- a/common/cuda_hip/components/uninitialized_array.hpp.inc
+++ b/common/cuda_hip/components/uninitialized_array.hpp
@@ -2,6 +2,18 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 /**
  * Stores an array with uninitialized contents.
  *
@@ -63,3 +75,11 @@ class uninitialized_array {
 private:
     unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size];
 };
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
diff --git a/common/cuda_hip/components/warp_blas.hpp.inc b/common/cuda_hip/components/warp_blas.hpp
similarity index 97%
rename from common/cuda_hip/components/warp_blas.hpp.inc
rename to common/cuda_hip/components/warp_blas.hpp
index 61b2ae25e7f..eb98466c6b8 100644
--- a/common/cuda_hip/components/warp_blas.hpp.inc
+++ b/common/cuda_hip/components/warp_blas.hpp
@@ -2,6 +2,26 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_
+
+
+#include <cassert>
+#include <type_traits>
+
+
+#include <ginkgo/config.hpp>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
 /**
  * @internal
  *
@@ -409,3 +429,11 @@ __device__ __forceinline__ remove_complex<ValueType> compute_infinity_norm(
     return reduce(group, sum,
                   [](result_type x, result_type y) { return max(x, y); });
 }
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_WARP_BLAS_HPP_
diff --git a/common/cuda_hip/distributed/index_map_kernels.hpp.inc b/common/cuda_hip/distributed/index_map_kernels.cpp
similarity index 92%
rename from common/cuda_hip/distributed/index_map_kernels.hpp.inc
rename to common/cuda_hip/distributed/index_map_kernels.cpp
index 9d312cc43aa..706ab2a2355 100644
--- a/common/cuda_hip/distributed/index_map_kernels.hpp.inc
+++ b/common/cuda_hip/distributed/index_map_kernels.cpp
@@ -2,6 +2,35 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <thrust/binary_search.h>
+#include <thrust/copy.h>
+#include <thrust/distance.h>
+#include <thrust/execution_policy.h>
+#include <thrust/for_each.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#include <thrust/transform_reduce.h>
+#include <thrust/unique.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "core/distributed/index_map_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace index_map {
+
+
 /**
  * This struct is necessary, since the `transform_output_iterator` seemingly
  * doesn't support non-copyable tranfsorm function (this excludes lambdas)
@@ -266,3 +295,9 @@ void map_to_local(
 
 GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_INDEX_MAP_MAP_TO_LOCAL);
+
+
+}  // namespace index_map
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/distributed/matrix_kernels.hpp.inc b/common/cuda_hip/distributed/matrix_kernels.cpp
similarity index 94%
rename from common/cuda_hip/distributed/matrix_kernels.hpp.inc
rename to common/cuda_hip/distributed/matrix_kernels.cpp
index 5caf3522f62..90d7b4a09f9 100644
--- a/common/cuda_hip/distributed/matrix_kernels.hpp.inc
+++ b/common/cuda_hip/distributed/matrix_kernels.cpp
@@ -2,6 +2,33 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <thrust/binary_search.h>
+#include <thrust/copy.h>
+#include <thrust/distance.h>
+#include <thrust/execution_policy.h>
+#include <thrust/for_each.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#include <thrust/transform_reduce.h>
+#include <thrust/unique.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "core/distributed/matrix_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace distributed_matrix {
+
+
 template <typename ValueType, typename GlobalIndexType>
 struct input_type {
     GlobalIndexType row;
@@ -261,3 +288,9 @@ void build_local_nonlocal(
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_BUILD_LOCAL_NONLOCAL);
+
+
+}  // namespace distributed_matrix
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc b/common/cuda_hip/distributed/partition_helpers_kernels.cpp
similarity index 70%
rename from common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc
rename to common/cuda_hip/distributed/partition_helpers_kernels.cpp
index 88343370d99..9081e36019f 100644
--- a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc
+++ b/common/cuda_hip/distributed/partition_helpers_kernels.cpp
@@ -2,6 +2,22 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <thrust/device_ptr.h>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sort.h>
+
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "core/distributed/partition_helpers_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace partition_helpers {
+
+
 template <typename GlobalIndexType>
 void sort_by_range_start(
     std::shared_ptr<const DefaultExecutor> exec,
@@ -24,3 +40,9 @@ void sort_by_range_start(
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
     GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START);
+
+
+}  // namespace partition_helpers
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/distributed/partition_kernels.hpp.inc b/common/cuda_hip/distributed/partition_kernels.cpp
similarity index 89%
rename from common/cuda_hip/distributed/partition_kernels.hpp.inc
rename to common/cuda_hip/distributed/partition_kernels.cpp
index 20f3ebd47dc..9830ba94faf 100644
--- a/common/cuda_hip/distributed/partition_kernels.hpp.inc
+++ b/common/cuda_hip/distributed/partition_kernels.cpp
@@ -2,6 +2,26 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <thrust/count.h>
+#include <thrust/device_ptr.h>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/scan.h>
+#include <thrust/sort.h>
+
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/unified/base/kernel_launch.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "core/distributed/partition_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace partition {
+
+
 namespace kernel {
 
 
@@ -110,3 +130,9 @@ void build_starting_indices(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_PARTITION_BUILD_STARTING_INDICES);
+
+
+}  // namespace partition
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/distributed/vector_kernels.hpp.inc b/common/cuda_hip/distributed/vector_kernels.cpp
similarity index 83%
rename from common/cuda_hip/distributed/vector_kernels.hpp.inc
rename to common/cuda_hip/distributed/vector_kernels.cpp
index 6a0497db78a..4dd2d4f59c3 100644
--- a/common/cuda_hip/distributed/vector_kernels.hpp.inc
+++ b/common/cuda_hip/distributed/vector_kernels.cpp
@@ -2,6 +2,30 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <functional>
+
+
+#include <thrust/binary_search.h>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/scatter.h>
+#include <thrust/tuple.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "core/distributed/vector_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace distributed_vector {
+
+
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void build_local(
     std::shared_ptr<const DefaultExecutor> exec,
@@ -65,3 +89,9 @@ void build_local(
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL);
+
+
+}  // namespace distributed_vector
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/cholesky_kernels.hpp.inc b/common/cuda_hip/factorization/cholesky_kernels.cpp
similarity index 78%
rename from common/cuda_hip/factorization/cholesky_kernels.hpp.inc
rename to common/cuda_hip/factorization/cholesky_kernels.cpp
index e6220019d22..be906086f04 100644
--- a/common/cuda_hip/factorization/cholesky_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/cholesky_kernels.cpp
@@ -2,6 +2,51 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <algorithm>
+#include <memory>
+
+
+#include <thrust/execution_policy.h>
+#include <thrust/functional.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#include <thrust/transform.h>
+#include <thrust/tuple.h>
+
+
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/syncfree.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "core/components/format_conversion_kernels.hpp"
+#include "core/factorization/cholesky_kernels.hpp"
+#include "core/factorization/elimination_forest.hpp"
+#include "core/factorization/lu_kernels.hpp"
+#include "core/matrix/csr_lookup.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The Cholesky namespace.
+ *
+ * @ingroup factor
+ */
+namespace cholesky {
+
+
+constexpr int default_block_size = 512;
+
+
 #include "core/factorization/elimination_forest.hpp"
 namespace kernel {
 
@@ -330,3 +375,66 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE);
+
+
+/**
+ * Counts the nonzeros per row of L for `mtx` and writes them to `row_nnz`.
+ * `tmp_storage` is resized to hold one entry per stored element of `mtx`
+ * followed by one entry per row; nothing is done for a matrix without rows.
+ */
+template <typename ValueType, typename IndexType>
+void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* mtx,
+                    const factorization::elimination_forest<IndexType>& forest,
+                    IndexType* row_nnz, array<IndexType>& tmp_storage)
+{
+    const auto num_rows = static_cast<IndexType>(mtx->get_size()[0]);
+    if (num_rows == 0) {
+        return;
+    }
+    const auto mtx_nnz = static_cast<IndexType>(mtx->get_num_stored_elements());
+    tmp_storage.resize_and_reset(mtx_nnz + num_rows);
+    const auto postorder_cols = tmp_storage.get_data();
+    const auto lower_ends = postorder_cols + mtx_nnz;
+    const auto row_ptrs = mtx->get_const_row_ptrs();
+    const auto cols = mtx->get_const_col_idxs();
+    const auto inv_postorder = forest.inv_postorder.get_const_data();
+    const auto postorder_parent = forest.postorder_parents.get_const_data();
+    // step 1: translate the column indices into postorder indices
+    const auto build_blocks = ceildiv(num_rows, default_block_size);
+    kernel::build_postorder_cols<<<build_blocks, default_block_size, 0,
+                                   exec->get_stream()>>>(
+        num_rows, cols, row_ptrs, inv_postorder, postorder_cols, lower_ends);
+    // step 2: sort the postorder indices within every row
+    {
+        const auto handle = exec->get_sparselib_handle();
+        auto descr = sparselib::create_mat_descr();
+        array<IndexType> perm_array(exec, mtx_nnz);
+        auto perm = perm_array.get_data();
+        components::fill_seq_array(exec, perm, mtx_nnz);
+        size_type work_size{};
+        sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz,
+                                       row_ptrs, postorder_cols, work_size);
+        array<char> workspace_array{exec, work_size};
+        auto workspace = workspace_array.get_data();
+        sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs,
+                           postorder_cols, perm, workspace);
+        sparselib::destroy(descr);
+    }
+    // step 3: count the nonzeros in each row of L
+    const auto count_blocks =
+        ceildiv(num_rows, default_block_size / config::warp_size);
+    kernel::symbolic_count<config::warp_size>
+        <<<count_blocks, default_block_size, 0, exec->get_stream()>>>(
+            num_rows, row_ptrs, lower_ends, inv_postorder, postorder_cols,
+            postorder_parent, row_nnz);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);
+
+
+}  // namespace cholesky
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/factorization_kernels.hpp.inc b/common/cuda_hip/factorization/factorization_kernels.cpp
similarity index 95%
rename from common/cuda_hip/factorization/factorization_kernels.hpp.inc
rename to common/cuda_hip/factorization/factorization_kernels.cpp
index 806797e60d8..bcdf90ec969 100644
--- a/common/cuda_hip/factorization/factorization_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/factorization_kernels.cpp
@@ -2,6 +2,36 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <ginkgo/core/base/array.hpp>
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/base/array_access.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/factorization/factorization_kernels.hpp"
+#include "core/matrix/csr_builder.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace factorization {
+
+
+constexpr int default_block_size{512};
+
+
 namespace kernel {
 
 
@@ -520,3 +550,9 @@ void initialize_l(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);
+
+
+}  // namespace factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/lu_kernels.hpp.inc b/common/cuda_hip/factorization/lu_kernels.cpp
similarity index 92%
rename from common/cuda_hip/factorization/lu_kernels.hpp.inc
rename to common/cuda_hip/factorization/lu_kernels.cpp
index f8f317bc6a5..12ec5c7b10d 100644
--- a/common/cuda_hip/factorization/lu_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/lu_kernels.cpp
@@ -2,6 +2,43 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <algorithm>
+#include <memory>
+
+
+#include <thrust/copy.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+
+
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/syncfree.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/base/allocator.hpp"
+#include "core/factorization/lu_kernels.hpp"
+#include "core/matrix/csr_lookup.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The LU namespace.
+ *
+ * @ingroup factor
+ */
+namespace lu_factorization {
+
+
+constexpr static int default_block_size = 512;
+
+
 namespace kernel {
 
 
@@ -301,3 +338,9 @@ void symbolic_factorize_simple_finalize(
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
     GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE);
+
+
+}  // namespace lu_factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ic_kernels.hpp.inc b/common/cuda_hip/factorization/par_ic_kernels.cpp
similarity index 84%
rename from common/cuda_hip/factorization/par_ic_kernels.hpp.inc
rename to common/cuda_hip/factorization/par_ic_kernels.cpp
index dd30eb2fc1c..785540c56fc 100644
--- a/common/cuda_hip/factorization/par_ic_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/par_ic_kernels.cpp
@@ -2,6 +2,32 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/memory.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/factorization/par_ic_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The parallel ic factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ic_factorization {
+
+
+constexpr int default_block_size = 512;
+
+
 namespace kernel {
 
 
@@ -111,3 +137,9 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL);
+
+
+}  // namespace par_ic_factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ict_kernels.hpp.inc b/common/cuda_hip/factorization/par_ict_kernels.cpp
similarity index 62%
rename from common/cuda_hip/factorization/par_ict_kernels.hpp.inc
rename to common/cuda_hip/factorization/par_ict_kernels.cpp
index 87aa8297345..523f89082af 100644
--- a/common/cuda_hip/factorization/par_ict_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/par_ict_kernels.cpp
@@ -2,6 +2,51 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/factorization/par_ict_kernels.hpp"
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/memory.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The parallel ICT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ict_factorization {
+
+
+constexpr int default_block_size = 512;
+
+
+// subwarp sizes for all warp-parallel kernels (add_candidates, compute_factor)
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
 namespace kernel {
 
 
@@ -275,3 +320,142 @@ __global__ __launch_bounds__(default_block_size) void ict_sweep(
 
 
 }  // namespace kernel
+
+
+namespace {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void add_candidates(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* llh,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    const matrix::Csr<ValueType, IndexType>* l,
+                    matrix::Csr<ValueType, IndexType>* l_new)
+{
+    auto num_rows = static_cast<IndexType>(llh->get_size()[0]);
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
+    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
+    auto llh_row_ptrs = llh->get_const_row_ptrs();
+    auto llh_col_idxs = llh->get_const_col_idxs();
+    auto llh_vals = llh->get_const_values();
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto a_vals = a->get_const_values();
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_const_values();
+    auto l_new_row_ptrs = l_new->get_row_ptrs();
+    // pass 1: count the non-zeros per row of l_new into l_new_row_ptrs
+    if (num_blocks > 0) {  // no launch for a matrix without rows
+        kernel::ict_tri_spgeam_nnz<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs,
+                l_new_row_ptrs, num_rows);
+    }
+
+    // turn the per-row counts into row ptrs (prefix sum over num_rows + 1)
+    components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);
+
+    // the last row ptr is the total nnz: fetch it and resize output arrays
+    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
+    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
+    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
+
+    auto l_new_col_idxs = l_new->get_col_idxs();  // fetched after the resize
+    auto l_new_vals = l_new->get_values();
+
+    // pass 2: fill columns and values of l_new
+    if (num_blocks > 0) {
+        kernel::ict_tri_spgeam_init<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals),
+                a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs,
+                l_col_idxs, as_device_type(l_vals), l_new_row_ptrs,
+                l_new_col_idxs, as_device_type(l_new_vals), num_rows);
+    }
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void compute_factor(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    matrix::Csr<ValueType, IndexType>* l,
+                    const matrix::Coo<ValueType, IndexType>* l_coo)
+{
+    auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements());
+    auto block_size = default_block_size / subwarp_size;  // subwarps per block
+    auto num_blocks = ceildiv(total_nnz, block_size);
+    if (num_blocks > 0) {  // an empty l needs no sweep launch
+        kernel::ict_sweep<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                a->get_const_row_ptrs(), a->get_const_col_idxs(),
+                as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
+                l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
+                as_device_type(l->get_values()),  // updated in place
+                static_cast<IndexType>(l->get_num_stored_elements()));
+    }
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor);
+
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* llh,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    const matrix::Csr<ValueType, IndexType>* l,
+                    matrix::Csr<ValueType, IndexType>* l_new)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz =
+        llh->get_num_stored_elements() + a->get_num_stored_elements();
+    // a matrix without rows must not trigger an integer division by zero
+    auto total_nnz_per_row = num_rows > 0 ? total_nnz / num_rows : 0;
+    select_add_candidates(
+        compiled_kernels(),
+        [&](int size) {  // fits the average row, or is the full warp
+            return total_nnz_per_row <= size || size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, llh, a, l, l_new);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    matrix::Csr<ValueType, IndexType>* l,
+                    const matrix::Coo<ValueType, IndexType>* l_coo)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz = 2 * l->get_num_stored_elements();
+    // a matrix without rows must not trigger an integer division by zero
+    auto total_nnz_per_row = num_rows > 0 ? total_nnz / num_rows : 0;
+    select_compute_factor(
+        compiled_kernels(),
+        [&](int size) {  // fits the average row, or is the full warp
+            return total_nnz_per_row <= size || size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
+
+
+}  // namespace par_ict_factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilu_kernels.cpp
similarity index 83%
rename from common/cuda_hip/factorization/par_ilu_kernels.hpp.inc
rename to common/cuda_hip/factorization/par_ilu_kernels.cpp
index 1029c0d08f6..abecf288e49 100644
--- a/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/par_ilu_kernels.cpp
@@ -2,6 +2,32 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <ginkgo/core/base/std_extensions.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/memory.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/factorization/par_ilu_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The parallel ilu factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilu_factorization {
+
+
+constexpr int default_block_size{512};
+
+
 namespace kernel {
 
 
@@ -85,3 +111,9 @@ void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL);
+
+
+}  // namespace par_ilu_factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp
similarity index 57%
rename from common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc
rename to common/cuda_hip/factorization/par_ilut_filter_kernels.cpp
index 68794bfc8d1..3622f971878 100644
--- a/common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp
@@ -2,6 +2,49 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+constexpr int default_block_size = 512;
+
+
+// subwarp sizes for filter kernels
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
 namespace kernel {
 
 
@@ -162,3 +205,95 @@ __global__ __launch_bounds__(default_block_size) void bucket_filter(
 
 
 }  // namespace kernel
+
+
+namespace {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void threshold_filter(syn::value_list<int, subwarp_size>,
+                      std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType>* a,
+                      remove_complex<ValueType> threshold,
+                      matrix::Csr<ValueType, IndexType>* m_out,
+                      matrix::Coo<ValueType, IndexType>* m_out_coo, bool lower)
+{
+    auto old_row_ptrs = a->get_const_row_ptrs();
+    auto old_col_idxs = a->get_const_col_idxs();
+    auto old_vals = a->get_const_values();
+    // pass 1: compute the filtered nnz of each row into m_out's row ptrs
+    auto num_rows = static_cast<IndexType>(a->get_size()[0]);
+    auto block_size = default_block_size / subwarp_size;  // subwarps per block
+    auto num_blocks = ceildiv(num_rows, block_size);
+    auto new_row_ptrs = m_out->get_row_ptrs();
+    if (num_blocks > 0) {  // no launch for a matrix without rows
+        kernel::threshold_filter_nnz<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                old_row_ptrs, as_device_type(old_vals), num_rows,
+                as_device_type(threshold), new_row_ptrs, lower);
+    }
+
+    // build row pointers from the per-row counts via prefix sum
+    components::prefix_sum_nonnegative(exec, new_row_ptrs, num_rows + 1);
+
+    // build matrix: the last row pointer holds the total filtered nnz
+    auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows);
+    // resize m_out's arrays and update the aliases pointing into them
+    matrix::CsrBuilder<ValueType, IndexType> builder{m_out};
+    builder.get_col_idx_array().resize_and_reset(new_nnz);
+    builder.get_value_array().resize_and_reset(new_nnz);
+    auto new_col_idxs = m_out->get_col_idxs();
+    auto new_vals = m_out->get_values();
+    IndexType* new_row_idxs{};  // stays null without a COO output
+    if (m_out_coo) {  // COO aliases m_out's cols/vals via array views
+        matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
+        coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
+        coo_builder.get_col_idx_array() =
+            make_array_view(exec, new_nnz, new_col_idxs);
+        coo_builder.get_value_array() =
+            make_array_view(exec, new_nnz, new_vals);
+        new_row_idxs = m_out_coo->get_row_idxs();
+    }
+    if (num_blocks > 0) {  // pass 2: write the entries of m_out
+        kernel::threshold_filter<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                old_row_ptrs, old_col_idxs, as_device_type(old_vals), num_rows,
+                as_device_type(threshold), new_row_ptrs, new_row_idxs,
+                new_col_idxs, as_device_type(new_vals), lower);
+    }
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter, threshold_filter);
+
+
+}  // namespace
+
+template <typename ValueType, typename IndexType>
+void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType>* a,
+                      remove_complex<ValueType> threshold,
+                      matrix::Csr<ValueType, IndexType>* m_out,
+                      matrix::Coo<ValueType, IndexType>* m_out_coo, bool lower)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz = a->get_num_stored_elements();
+    // a matrix without rows must not trigger an integer division by zero
+    auto total_nnz_per_row = num_rows > 0 ? total_nnz / num_rows : 0;
+    select_threshold_filter(
+        compiled_kernels(),
+        [&](int size) {  // fits the average row, or is the full warp
+            return total_nnz_per_row <= size || size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, a, threshold, m_out,
+        m_out_coo, lower);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_select_kernels.cpp
similarity index 63%
rename from common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc
rename to common/cuda_hip/factorization/par_ilut_select_kernels.cpp
index 2ee5061d4c5..5c00503923a 100644
--- a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/par_ilut_select_kernels.cpp
@@ -2,6 +2,40 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <algorithm>
+#include <common/cuda_hip/factorization/par_ilut_select_common.hpp>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/sorting.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
 namespace kernel {
 
 
@@ -278,3 +312,126 @@ __global__ __launch_bounds__(config::warp_size) void find_bucket(
 
 
 }  // namespace kernel
+
+
+template <typename ValueType, typename IndexType>
+void sampleselect_filter(std::shared_ptr<const DefaultExecutor> exec,
+                         const ValueType* values, IndexType size,
+                         const unsigned char* oracles,
+                         const IndexType* partial_counts, IndexType bucket,
+                         remove_complex<ValueType>* out)
+{
+    auto num_threads_total = ceildiv(size, items_per_thread);
+    auto num_blocks =  // grid size for items_per_thread values per thread
+        static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
+    if (num_blocks > 0) {  // no launch for an empty input
+        kernel::filter_bucket<<<num_blocks, default_block_size, 0,
+                                exec->get_stream()>>>(
+            as_device_type(values), size, bucket, oracles, partial_counts,
+            as_device_type(out), items_per_thread);
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType>* m,
+                      IndexType rank, array<ValueType>& tmp1,
+                      array<remove_complex<ValueType>>& tmp2,
+                      remove_complex<ValueType>& threshold)
+{
+    auto values = m->get_const_values();
+    IndexType size = m->get_num_stored_elements();
+    using AbsType = remove_complex<ValueType>;
+    constexpr auto bucket_count = kernel::searchtree_width;
+    auto max_num_threads = ceildiv(size, items_per_thread);
+    auto max_num_blocks = ceildiv(max_num_threads, default_block_size);
+
+    size_type tmp_size_totals =  // all tmp1 sizes are in units of ValueType
+        ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType));
+    size_type tmp_size_partials = ceildiv(
+        bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType));
+    size_type tmp_size_oracles =
+        ceildiv(size * sizeof(unsigned char), sizeof(ValueType));
+    size_type tmp_size_tree =
+        ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType));
+    size_type tmp_size_vals =
+        size / bucket_count * 4;  // pessimistic estimate for temporary storage
+    size_type tmp_size =
+        tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree;
+    tmp1.resize_and_reset(tmp_size);  // totals | partials | oracles | tree
+    tmp2.resize_and_reset(tmp_size_vals);
+
+    auto total_counts = reinterpret_cast<IndexType*>(tmp1.get_data());
+    auto partial_counts =
+        reinterpret_cast<IndexType*>(tmp1.get_data() + tmp_size_totals);
+    auto oracles = reinterpret_cast<unsigned char*>(
+        tmp1.get_data() + tmp_size_totals + tmp_size_partials);
+    auto tree =
+        reinterpret_cast<AbsType*>(tmp1.get_data() + tmp_size_totals +
+                                   tmp_size_partials + tmp_size_oracles);
+
+    sampleselect_count(exec, values, size, tree, oracles, partial_counts,
+                       total_counts);
+
+    // determine bucket with correct rank, use bucket-local rank
+    auto bucket = sampleselect_find_bucket(exec, total_counts, rank);
+    rank -= bucket.begin;
+
+    if (bucket.size * 2 > tmp_size_vals) {
+        // we need to reallocate tmp2 to hold two buffers of bucket.size
+        tmp2.resize_and_reset(bucket.size * 2);
+    }
+    auto tmp21 = tmp2.get_data();  // two buffers, swapped on every step
+    auto tmp22 = tmp2.get_data() + bucket.size;
+    // extract target bucket
+    sampleselect_filter(exec, values, size, oracles, partial_counts, bucket.idx,
+                        tmp22);
+
+    // recursively select from smaller buckets
+    int step{};
+    while (bucket.size > kernel::basecase_size) {
+        std::swap(tmp21, tmp22);
+        const auto* tmp_in = tmp21;
+        auto tmp_out = tmp22;
+
+        sampleselect_count(exec, tmp_in, bucket.size, tree, oracles,
+                           partial_counts, total_counts);
+        auto new_bucket = sampleselect_find_bucket(exec, total_counts, rank);
+        sampleselect_filter(exec, tmp_in, bucket.size, oracles, partial_counts,
+                            new_bucket.idx, tmp_out);  // not the stale bucket
+
+        rank -= new_bucket.begin;
+        bucket.size = new_bucket.size;
+        // we should never need more than 5 recursion steps, this would mean
+        // 256^5 = 2^40. fall back to standard library algorithm in that case.
+        ++step;
+        if (step > 5) {
+            array<AbsType> cpu_out_array{
+                exec->get_master(),
+                make_array_view(exec, bucket.size, tmp_out)};
+            auto begin = cpu_out_array.get_data();
+            auto end = begin + bucket.size;
+            auto middle = begin + rank;
+            std::nth_element(begin, middle, end);
+            threshold = *middle;
+            return;
+        }
+    }
+
+    // base case: select within the remaining bucket, result lands in tmp1
+    auto out_ptr = reinterpret_cast<AbsType*>(tmp1.get_data());
+    kernel::basecase_select<<<1, kernel::basecase_block_size, 0,
+                              exec->get_stream()>>>(
+        as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr));
+    threshold = exec->copy_val_to_host(out_ptr);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
similarity index 63%
rename from common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc
rename to common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
index a97f0f08937..b9658f69f70 100644
--- a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
@@ -2,6 +2,50 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+constexpr int default_block_size = 512;
+
+
+// subwarp sizes for add_candidates kernels
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
 namespace kernel {
 
 
@@ -246,3 +290,113 @@ __global__ __launch_bounds__(default_block_size) void tri_spgeam_init(
 
 
 }  // namespace kernel
+
+
+namespace {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void add_candidates(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* lu,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    const matrix::Csr<ValueType, IndexType>* l,
+                    const matrix::Csr<ValueType, IndexType>* u,
+                    matrix::Csr<ValueType, IndexType>* l_new,
+                    matrix::Csr<ValueType, IndexType>* u_new)
+{
+    auto num_rows = static_cast<IndexType>(lu->get_size()[0]);
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
+    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
+    matrix::CsrBuilder<ValueType, IndexType> u_new_builder(u_new);
+    auto lu_row_ptrs = lu->get_const_row_ptrs();
+    auto lu_col_idxs = lu->get_const_col_idxs();
+    auto lu_vals = lu->get_const_values();
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto a_vals = a->get_const_values();
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_const_values();
+    auto u_row_ptrs = u->get_const_row_ptrs();
+    auto u_col_idxs = u->get_const_col_idxs();
+    auto u_vals = u->get_const_values();
+    auto l_new_row_ptrs = l_new->get_row_ptrs();
+    auto u_new_row_ptrs = u_new->get_row_ptrs();
+    if (num_blocks > 0) {  // no launch for a matrix without rows
+        // pass 1: count non-zeros per row of l_new and u_new
+        kernel::tri_spgeam_nnz<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs,
+                l_new_row_ptrs, u_new_row_ptrs, num_rows);
+    }
+
+    // turn the per-row counts into row ptrs (prefix sum over num_rows + 1)
+    components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);
+    components::prefix_sum_nonnegative(exec, u_new_row_ptrs, num_rows + 1);
+
+    // the last row ptr is the total nnz: fetch it and resize output arrays
+    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
+    auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows);
+    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
+    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
+    u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz);
+    u_new_builder.get_value_array().resize_and_reset(u_new_nnz);
+
+    auto l_new_col_idxs = l_new->get_col_idxs();  // fetched after the resize
+    auto l_new_vals = l_new->get_values();
+    auto u_new_col_idxs = u_new->get_col_idxs();
+    auto u_new_vals = u_new->get_values();
+
+    if (num_blocks > 0) {
+        // pass 2: fill columns and values of l_new and u_new
+        kernel::tri_spgeam_init<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs,
+                a_col_idxs, as_device_type(a_vals), l_row_ptrs, l_col_idxs,
+                as_device_type(l_vals), u_row_ptrs, u_col_idxs,
+                as_device_type(u_vals), l_new_row_ptrs, l_new_col_idxs,
+                as_device_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs,
+                as_device_type(u_new_vals), num_rows);
+    }
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
+
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* lu,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    const matrix::Csr<ValueType, IndexType>* l,
+                    const matrix::Csr<ValueType, IndexType>* u,
+                    matrix::Csr<ValueType, IndexType>* l_new,
+                    matrix::Csr<ValueType, IndexType>* u_new)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz =
+        lu->get_num_stored_elements() + a->get_num_stored_elements();
+    // a matrix without rows must not trigger an integer division by zero
+    auto total_nnz_per_row = num_rows > 0 ? total_nnz / num_rows : 0;
+    select_add_candidates(
+        compiled_kernels(),
+        [&](int size) {  // fits the average row, or is the full warp
+            return total_nnz_per_row <= size || size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, lu, a, l, u, l_new,
+        u_new);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp b/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp
new file mode 100644
index 00000000000..6ae783133e5
--- /dev/null
+++ b/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp
@@ -0,0 +1,212 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/memory.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+constexpr int default_block_size = 512;  // threads per block of the sweep
+
+
+// subwarp sizes the sweep kernel is compiled for (selected at runtime)
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+namespace kernel {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void sweep(
+    const IndexType* __restrict__ a_row_ptrs,
+    const IndexType* __restrict__ a_col_idxs,
+    const ValueType* __restrict__ a_vals,
+    const IndexType* __restrict__ l_row_ptrs,
+    const IndexType* __restrict__ l_row_idxs,
+    const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals,
+    IndexType l_nnz, const IndexType* __restrict__ u_row_idxs,
+    const IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_vals,
+    const IndexType* __restrict__ ut_col_ptrs,
+    const IndexType* __restrict__ ut_row_idxs, ValueType* __restrict__ ut_vals,
+    IndexType u_nnz)
+{  // Updates the stored entries of L and U, one subwarp per entry.
+    auto tidx = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+    if (tidx >= l_nnz + u_nnz) {  // past the last entry of L and U
+        return;
+    }
+    // indices [0, l_nnz) handle entries of L, the remaining ones entries of U
+    auto l_nz = tidx;
+    auto u_nz = l_nz - l_nnz;
+    auto lower = u_nz < 0;
+    auto row = lower ? l_row_idxs[l_nz] : u_row_idxs[u_nz];
+    auto col = lower ? l_col_idxs[l_nz] : u_col_idxs[u_nz];
+    if (lower && row == col) {
+        // don't update the diagonal twice
+        return;
+    }
+    auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    // find entry of A at (row, col), treating a missing entry as zero
+    auto a_row_begin = a_row_ptrs[row];
+    auto a_row_end = a_row_ptrs[row + 1];
+    auto a_row_size = a_row_end - a_row_begin;
+    auto a_idx =
+        group_wide_search(a_row_begin, a_row_size, subwarp,
+                          [&](IndexType i) { return a_col_idxs[i] >= col; });
+    bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col;
+    auto a_val = has_a ? a_vals[a_idx] : zero<ValueType>();
+    auto l_row_begin = l_row_ptrs[row];
+    auto l_row_size = l_row_ptrs[row + 1] - l_row_begin;
+    auto ut_col_begin = ut_col_ptrs[col];
+    auto ut_col_size = ut_col_ptrs[col + 1] - ut_col_begin;
+    ValueType sum{};    // per-thread partial sum of L(row,k) * U(k,col)
+    IndexType ut_nz{};  // index of entry (row, col) in the ut_* arrays
+    auto last_entry = min(row, col);
+    group_merge<subwarp_size>(
+        l_col_idxs + l_row_begin, l_row_size, ut_row_idxs + ut_col_begin,
+        ut_col_size, subwarp,
+        [&](IndexType l_idx, IndexType l_col, IndexType ut_idx,
+            IndexType ut_row, IndexType, bool) {
+            // we don't need to use the `bool valid` because last_entry is
+            // already a smaller sentinel value than the one used in group_merge
+            if (l_col == ut_row && l_col < last_entry) {
+                sum += load_relaxed(l_vals + (l_idx + l_row_begin)) *
+                       load_relaxed(ut_vals + (ut_idx + ut_col_begin));
+            }
+            // remember the position of entry (row, col) in the transposed U
+            auto found_transp = subwarp.ballot(ut_row == row);
+            if (found_transp) {
+                ut_nz =
+                    subwarp.shfl(ut_idx + ut_col_begin, ffs(found_transp) - 1);
+            }
+            return true;
+        });
+    // accumulate the partial sums from all threads of the subwarp
+    sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; });
+
+    if (subwarp.thread_rank() == 0) {
+        if (lower) {  // divide by the last stored entry of column col in ut
+            auto to_write = (a_val - sum) /
+                            load_relaxed(ut_vals + (ut_col_ptrs[col + 1] - 1));
+            if (is_finite(to_write)) {  // keep the old value otherwise
+                store_relaxed(l_vals + l_nz, to_write);
+            }
+        } else {  // entry of U: update both u_vals and its copy in ut_vals
+            auto to_write = a_val - sum;
+            if (is_finite(to_write)) {
+                store_relaxed(u_vals + u_nz, to_write);
+                store_relaxed(ut_vals + ut_nz, to_write);
+            }
+        }
+    }
+}
+
+
+}  // namespace kernel
+
+
+namespace {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void compute_l_u_factors(syn::value_list<int, subwarp_size>,
+                         std::shared_ptr<const DefaultExecutor> exec,
+                         const matrix::Csr<ValueType, IndexType>* a,
+                         matrix::Csr<ValueType, IndexType>* l,
+                         const matrix::Coo<ValueType, IndexType>* l_coo,
+                         matrix::Csr<ValueType, IndexType>* u,
+                         const matrix::Coo<ValueType, IndexType>* u_coo,
+                         matrix::Csr<ValueType, IndexType>* u_csc)
+{  // Launches kernel::sweep with one subwarp per stored entry of L and U.
+    auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements() +
+                                            u->get_num_stored_elements());
+    auto block_size = default_block_size / subwarp_size;  // subwarps per block
+    auto num_blocks = ceildiv(total_nnz, block_size);
+    if (num_blocks > 0) {  // skip the launch if L and U store no entries
+        kernel::sweep<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                a->get_const_row_ptrs(), a->get_const_col_idxs(),
+                as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
+                l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
+                as_device_type(l->get_values()),
+                static_cast<IndexType>(l->get_num_stored_elements()),
+                u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(),
+                as_device_type(u->get_values()), u_csc->get_const_row_ptrs(),
+                u_csc->get_const_col_idxs(),
+                as_device_type(u_csc->get_values()),
+                static_cast<IndexType>(u->get_num_stored_elements()));
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors,
+                                    compute_l_u_factors);
+
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
+                         const matrix::Csr<ValueType, IndexType>* a,
+                         matrix::Csr<ValueType, IndexType>* l,
+                         const matrix::Coo<ValueType, IndexType>* l_coo,
+                         matrix::Csr<ValueType, IndexType>* u,
+                         const matrix::Coo<ValueType, IndexType>* u_coo,
+                         matrix::Csr<ValueType, IndexType>* u_csc)
+{  // Dispatches on the avg. nnz per row of l + u (0 for an empty matrix).
+    auto num_rows = a->get_size()[0];
+    auto total_nnz =
+        l->get_num_stored_elements() + u->get_num_stored_elements();
+    auto total_nnz_per_row = num_rows > 0 ? total_nnz / num_rows : 0;
+    select_compute_l_u_factors(
+        compiled_kernels(),
+        [&](int compiled_subwarp_size) {  // avg. row fits, or warp_size
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo,
+        u_csc);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc
deleted file mode 100644
index 9da94a878b3..00000000000
--- a/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc
+++ /dev/null
@@ -1,94 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-namespace kernel {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void sweep(
-    const IndexType* __restrict__ a_row_ptrs,
-    const IndexType* __restrict__ a_col_idxs,
-    const ValueType* __restrict__ a_vals,
-    const IndexType* __restrict__ l_row_ptrs,
-    const IndexType* __restrict__ l_row_idxs,
-    const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals,
-    IndexType l_nnz, const IndexType* __restrict__ u_row_idxs,
-    const IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_vals,
-    const IndexType* __restrict__ ut_col_ptrs,
-    const IndexType* __restrict__ ut_row_idxs, ValueType* __restrict__ ut_vals,
-    IndexType u_nnz)
-{
-    auto tidx = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
-    if (tidx >= l_nnz + u_nnz) {
-        return;
-    }
-    // split the subwarps into two halves for lower and upper triangle
-    auto l_nz = tidx;
-    auto u_nz = l_nz - l_nnz;
-    auto lower = u_nz < 0;
-    auto row = lower ? l_row_idxs[l_nz] : u_row_idxs[u_nz];
-    auto col = lower ? l_col_idxs[l_nz] : u_col_idxs[u_nz];
-    if (lower && row == col) {
-        // don't update the diagonal twice
-        return;
-    }
-    auto subwarp =
-        group::tiled_partition<subwarp_size>(group::this_thread_block());
-    // find entry of A at (row, col)
-    auto a_row_begin = a_row_ptrs[row];
-    auto a_row_end = a_row_ptrs[row + 1];
-    auto a_row_size = a_row_end - a_row_begin;
-    auto a_idx =
-        group_wide_search(a_row_begin, a_row_size, subwarp,
-                          [&](IndexType i) { return a_col_idxs[i] >= col; });
-    bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col;
-    auto a_val = has_a ? a_vals[a_idx] : zero<ValueType>();
-    auto l_row_begin = l_row_ptrs[row];
-    auto l_row_size = l_row_ptrs[row + 1] - l_row_begin;
-    auto ut_col_begin = ut_col_ptrs[col];
-    auto ut_col_size = ut_col_ptrs[col + 1] - ut_col_begin;
-    ValueType sum{};
-    IndexType ut_nz{};
-    auto last_entry = min(row, col);
-    group_merge<subwarp_size>(
-        l_col_idxs + l_row_begin, l_row_size, ut_row_idxs + ut_col_begin,
-        ut_col_size, subwarp,
-        [&](IndexType l_idx, IndexType l_col, IndexType ut_idx,
-            IndexType ut_row, IndexType, bool) {
-            // we don't need to use the `bool valid` because last_entry is
-            // already a smaller sentinel value than the one used in group_merge
-            if (l_col == ut_row && l_col < last_entry) {
-                sum += load_relaxed(l_vals + (l_idx + l_row_begin)) *
-                       load_relaxed(ut_vals + (ut_idx + ut_col_begin));
-            }
-            // remember the transposed element
-            auto found_transp = subwarp.ballot(ut_row == row);
-            if (found_transp) {
-                ut_nz =
-                    subwarp.shfl(ut_idx + ut_col_begin, ffs(found_transp) - 1);
-            }
-            return true;
-        });
-    // accumulate result from all threads
-    sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; });
-
-    if (subwarp.thread_rank() == 0) {
-        if (lower) {
-            auto to_write = (a_val - sum) /
-                            load_relaxed(ut_vals + (ut_col_ptrs[col + 1] - 1));
-            if (is_finite(to_write)) {
-                store_relaxed(l_vals + l_nz, to_write);
-            }
-        } else {
-            auto to_write = a_val - sum;
-            if (is_finite(to_write)) {
-                store_relaxed(u_vals + u_nz, to_write);
-                store_relaxed(ut_vals + ut_nz, to_write);
-            }
-        }
-    }
-}
-
-
-}  // namespace kernel
diff --git a/common/cuda_hip/log/batch_logger.hpp.inc b/common/cuda_hip/log/batch_logger.hpp
similarity index 68%
rename from common/cuda_hip/log/batch_logger.hpp.inc
rename to common/cuda_hip/log/batch_logger.hpp
index 04b614b50f9..77ec84fb7bd 100644
--- a/common/cuda_hip/log/batch_logger.hpp.inc
+++ b/common/cuda_hip/log/batch_logger.hpp
@@ -2,6 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_
+#define GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_log {
+
+
 /**
  * @see reference/log/batch_logger.hpp
  */
@@ -28,3 +41,12 @@ class SimpleFinalLogger final {
     real_type* const final_residuals_;
     idx_type* const final_iters_;
 };
+
+
+}  // namespace batch_log
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_
diff --git a/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc b/common/cuda_hip/matrix/batch_csr_kernels.cpp
similarity index 87%
rename from common/cuda_hip/matrix/batch_csr_kernels.hpp.inc
rename to common/cuda_hip/matrix/batch_csr_kernels.cpp
index e041dadaa3e..a07074e29e8 100644
--- a/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/batch_csr_kernels.cpp
@@ -2,6 +2,49 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <common/cuda_hip/base/batch_struct.hpp>
+#include <common/cuda_hip/matrix/batch_struct.hpp>
+
+
+#include <thrust/functional.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_csr_kernels.hpp"
+#include "core/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The Csr matrix format namespace.
+ * @ref Csr
+ * @ingroup batch_csr
+ */
+namespace batch_csr {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+
 template <typename ValueType, typename IndexType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& mat,
@@ -196,3 +239,14 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
+
+
+#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_csr
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernels.cpp
similarity index 89%
rename from common/cuda_hip/matrix/batch_dense_kernels.hpp.inc
rename to common/cuda_hip/matrix/batch_dense_kernels.cpp
index f8abf9131a1..b5c2dbe1d5d 100644
--- a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/batch_dense_kernels.cpp
@@ -2,6 +2,49 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <common/cuda_hip/base/batch_struct.hpp>
+#include <common/cuda_hip/matrix/batch_struct.hpp>
+
+
+#include <thrust/functional.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_dense_kernels.hpp"
+#include "core/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The Dense matrix format namespace.
+ *
+ * @ingroup batch_dense
+ */
+namespace batch_dense {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+
 template <typename ValueType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::dense::batch_item<const ValueType>& mat,
@@ -243,3 +286,15 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
+
+
+#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc"
+
+
+// clang-format on
+
+
+}  // namespace batch_dense
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.cpp
similarity index 87%
rename from common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
rename to common/cuda_hip/matrix/batch_ell_kernels.cpp
index 0a6d1927c96..c3bf21c7744 100644
--- a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/batch_ell_kernels.cpp
@@ -2,6 +2,49 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <common/cuda_hip/base/batch_struct.hpp>
+#include <common/cuda_hip/matrix/batch_struct.hpp>
+
+
+#include <thrust/functional.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_ell_kernels.hpp"
+#include "core/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The Ell matrix format namespace.
+ * @ref Ell
+ * @ingroup batch_ell
+ */
+namespace batch_ell {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+
 template <typename ValueType, typename IndexType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& mat,
@@ -205,3 +248,14 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
+
+
+#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_ell
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/coo_kernels.hpp.inc b/common/cuda_hip/matrix/coo_kernels.cpp
similarity index 91%
rename from common/cuda_hip/matrix/coo_kernels.hpp.inc
rename to common/cuda_hip/matrix/coo_kernels.cpp
index 98332f6cd7b..103926124a2 100644
--- a/common/cuda_hip/matrix/coo_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/coo_kernels.cpp
@@ -2,6 +2,42 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/matrix/coo_kernels.hpp"
+#include "core/matrix/dense_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The Coordinate matrix format namespace.
+ *
+ * @ingroup coo
+ */
+namespace coo {
+
+
+constexpr int warps_in_block = 4;
+constexpr int spmv_block_size = warps_in_block * config::warp_size;
+
+
 namespace {
 
 
@@ -304,3 +340,9 @@ void advanced_spmv2(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_COO_ADVANCED_SPMV2_KERNEL);
+
+
+}  // namespace coo
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.cpp
similarity index 75%
rename from common/cuda_hip/matrix/dense_kernels.hpp.inc
rename to common/cuda_hip/matrix/dense_kernels.cpp
index b48d2c4ff4f..1524e0a93b0 100644
--- a/common/cuda_hip/matrix/dense_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/dense_kernels.cpp
@@ -2,6 +2,46 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/diagonal.hpp>
+#include <ginkgo/core/matrix/ell.hpp>
+#include <ginkgo/core/matrix/fbcsr.hpp>
+#include <ginkgo/core/matrix/hybrid.hpp>
+#include <ginkgo/core/matrix/sellp.hpp>
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
+
+
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/utils.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/dense_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The Dense matrix format namespace.
+ *
+ * @ingroup dense
+ */
+namespace dense {
+
+
+constexpr int default_block_size = 512;
+
+
 namespace kernel {
 
 
@@ -619,3 +659,188 @@ void convert_to_sparsity_csr(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL);
+
+
+template <typename ValueType>
+void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                          const matrix::Dense<ValueType>* x,
+                          const matrix::Dense<ValueType>* y,
+                          matrix::Dense<ValueType>* result, array<char>& tmp)
+{  // Single-column x and y use blas::dot if supported, all else compute_dot.
+    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
+        if (blas::is_supported<ValueType>::value) {
+            auto handle = exec->get_blas_handle();
+            blas::dot(handle, x->get_size()[0], x->get_const_values(),
+                      x->get_stride(), y->get_const_values(), y->get_stride(),
+                      result->get_values());
+        } else {  // no BLAS support for this ValueType
+            compute_dot(exec, x, y, result, tmp);
+        }
+    } else {  // not a single column: always use compute_dot
+        compute_dot(exec, x, y, result, tmp);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
+
+
+template <typename ValueType>
+void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                               const matrix::Dense<ValueType>* x,
+                               const matrix::Dense<ValueType>* y,
+                               matrix::Dense<ValueType>* result,
+                               array<char>& tmp)
+{  // Uses blas::conj_dot for single-column x and y if supported.
+    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
+        if (blas::is_supported<ValueType>::value) {
+            auto handle = exec->get_blas_handle();
+            blas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
+                           x->get_stride(), y->get_const_values(),
+                           y->get_stride(), result->get_values());
+        } else {  // no BLAS support for this ValueType
+            compute_conj_dot(exec, x, y, result, tmp);
+        }
+    } else {  // not a single column: always use compute_conj_dot
+        compute_conj_dot(exec, x, y, result, tmp);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
+
+
+template <typename ValueType>
+void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                            const matrix::Dense<ValueType>* x,
+                            matrix::Dense<remove_complex<ValueType>>* result,
+                            array<char>& tmp)
+{  // Uses blas::norm2 for a single-column x if supported.
+    if (x->get_size()[1] == 1) {
+        if (blas::is_supported<ValueType>::value) {
+            auto handle = exec->get_blas_handle();
+            blas::norm2(handle, x->get_size()[0], x->get_const_values(),
+                        x->get_stride(), result->get_values());
+        } else {  // no BLAS support for this ValueType
+            compute_norm2(exec, x, result, tmp);
+        }
+    } else {  // not a single column: always use compute_norm2
+        compute_norm2(exec, x, result, tmp);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
+
+
+template <typename ValueType>
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const matrix::Dense<ValueType>* a,
+                  const matrix::Dense<ValueType>* b,
+                  matrix::Dense<ValueType>* c)
+{  // c = a * b via blas::gemm (b passed first; presumably column-major BLAS)
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
+        if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
+            if (a->get_size()[1] > 0) {
+                blas::pointer_mode_guard pm_guard(handle);
+                auto alpha = one<ValueType>();
+                auto beta = zero<ValueType>();
+                blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1],
+                           c->get_size()[0], a->get_size()[1], &alpha,
+                           b->get_const_values(), b->get_stride(),
+                           a->get_const_values(), a->get_stride(), &beta,
+                           c->get_values(), c->get_stride());
+            } else {  // a has no columns: fill c with zeros instead
+                dense::fill(exec, c, zero<ValueType>());
+            }
+        }
+    } else {  // no BLAS support for this ValueType
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
+
+
+template <typename ValueType>
+void apply(std::shared_ptr<const DefaultExecutor> exec,
+           const matrix::Dense<ValueType>* alpha,
+           const matrix::Dense<ValueType>* a, const matrix::Dense<ValueType>* b,
+           const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
+{  // Applies blas::gemm with the scalars stored in alpha and beta.
+    if (blas::is_supported<ValueType>::value) {
+        if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
+            if (a->get_size()[1] > 0) {
+                blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N,
+                           c->get_size()[1], c->get_size()[0], a->get_size()[1],
+                           alpha->get_const_values(), b->get_const_values(),
+                           b->get_stride(), a->get_const_values(),
+                           a->get_stride(), beta->get_const_values(),
+                           c->get_values(), c->get_stride());
+            } else {  // a has no columns: only scale c by beta
+                dense::scale(exec, beta, c);
+            }
+        }
+    } else {  // no BLAS support for this ValueType
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
+
+
+template <typename ValueType>
+void transpose(std::shared_ptr<const DefaultExecutor> exec,
+               const matrix::Dense<ValueType>* orig,
+               matrix::Dense<ValueType>* trans)
+{  // Writes the transpose of orig into trans via blas::geam.
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
+        if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
+            blas::pointer_mode_guard pm_guard(handle);
+            auto alpha = one<ValueType>();
+            auto beta = zero<ValueType>();
+            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0],
+                       orig->get_size()[1], &alpha, orig->get_const_values(),
+                       orig->get_stride(), &beta, trans->get_const_values(),
+                       trans->get_stride(), trans->get_values(),
+                       trans->get_stride());
+        }
+    } else {  // no BLAS support for this ValueType
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
+
+
+template <typename ValueType>
+void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Dense<ValueType>* orig,
+                    matrix::Dense<ValueType>* trans)
+{  // Writes the conjugate transpose of orig into trans via blas::geam.
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
+        if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
+            blas::pointer_mode_guard pm_guard(handle);
+            auto alpha = one<ValueType>();
+            auto beta = zero<ValueType>();
+            blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0],
+                       orig->get_size()[1], &alpha, orig->get_const_values(),
+                       orig->get_stride(), &beta, trans->get_const_values(),
+                       trans->get_stride(), trans->get_values(),
+                       trans->get_stride());
+        }
+    } else {  // no BLAS support for this ValueType
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
+
+
+}  // namespace dense
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/diagonal_kernels.hpp.inc b/common/cuda_hip/matrix/diagonal_kernels.cpp
similarity index 73%
rename from common/cuda_hip/matrix/diagonal_kernels.hpp.inc
rename to common/cuda_hip/matrix/diagonal_kernels.cpp
index c3919fda079..75f07d7373e 100644
--- a/common/cuda_hip/matrix/diagonal_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/diagonal_kernels.cpp
@@ -2,6 +2,32 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/matrix/diagonal_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The Diagonal matrix format namespace.
+ *
+ * @ingroup diagonal
+ */
+namespace diagonal {
+
+
+constexpr int default_block_size = 512;
+
+
 namespace kernel {
 
 
@@ -57,3 +83,9 @@ void apply_to_csr(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DIAGONAL_APPLY_TO_CSR_KERNEL);
+
+
+}  // namespace diagonal
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/matrix/ell_kernels.cu b/common/cuda_hip/matrix/ell_kernels.cpp
similarity index 57%
rename from cuda/matrix/ell_kernels.cu
rename to common/cuda_hip/matrix/ell_kernels.cpp
index e91b03c816d..96e9dac9d78 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/common/cuda_hip/matrix/ell_kernels.cpp
@@ -2,9 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/matrix/ell_kernels.hpp"
-
-
+#include <accessor/device_helper.hpp>
 #include <array>
 
 
@@ -15,27 +13,27 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "accessor/cuda_helper.hpp"
 #include "accessor/reduced_row_major.hpp"
+#include "common/cuda_hip/base/config.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/dense_kernels.hpp"
+#include "core/matrix/ell_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/format_conversion.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
 namespace kernels {
-namespace cuda {
+namespace GKO_DEVICE_NAMESPACE {
 /**
  * @brief The ELL matrix format namespace.
  *
@@ -78,7 +76,135 @@ constexpr int max_thread_per_worker = 32;
 using compiled_kernels = syn::value_list<int, 0, 1, 2, 4, 8, 16, 32>;
 
 
-#include "common/cuda_hip/matrix/ell_kernels.hpp.inc"
+namespace kernel {
+
+
+// Sums val * b for one (row, blockIdx.y) entry of c and combines it via op.
+template <int num_thread_per_worker, bool atomic, typename b_accessor,
+          typename a_accessor, typename OutputValueType, typename IndexType,
+          typename Closure>
+__device__ void spmv_kernel(
+    const size_type num_rows, const int num_worker_per_row,
+    acc::range<a_accessor> val, const IndexType* __restrict__ col,
+    const size_type stride, const size_type num_stored_elements_per_row,
+    acc::range<b_accessor> b, OutputValueType* __restrict__ c,
+    const size_type c_stride, Closure op)
+{
+    using arithmetic_type = typename a_accessor::arithmetic_type;
+    const auto tidx = thread::get_thread_id_flat();
+    const decltype(tidx) column_id = blockIdx.y;
+    if (num_thread_per_worker == 1) {
+        // One thread per row: no shared memory, barrier or atomic_add needed
+        if (tidx < num_rows) {
+            auto temp = zero<arithmetic_type>();
+            for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
+                const auto ind = tidx + idx * stride;
+                const auto col_idx = col[ind];
+                if (col_idx == invalid_index<IndexType>()) {
+                    break;  // an invalid column index ends this row
+                }
+                temp += val(ind) * b(col_idx, column_id);
+            }
+            const auto c_ind = tidx * c_stride + column_id;
+            c[c_ind] = op(temp, c[c_ind]);
+        }
+    } else {
+        // All threads must reach both __syncthreads(); idle ones skip the work
+        const bool active = tidx < num_worker_per_row * num_rows;
+        const auto idx_in_worker = threadIdx.y;
+        __shared__ uninitialized_array<
+            arithmetic_type, default_block_size / num_thread_per_worker>
+            storage;
+        if (active && idx_in_worker == 0) {
+            storage[threadIdx.x] = 0;
+        }
+        __syncthreads();
+        if (active) {
+            const auto x = tidx % num_rows;
+            const auto worker_id = tidx / num_rows;
+            const auto step_size = num_worker_per_row * num_thread_per_worker;
+            auto temp = zero<arithmetic_type>();
+            for (size_type idx =
+                     worker_id * num_thread_per_worker + idx_in_worker;
+                 idx < num_stored_elements_per_row; idx += step_size) {
+                const auto ind = x + idx * stride;
+                const auto col_idx = col[ind];
+                if (col_idx == invalid_index<IndexType>()) {
+                    break;
+                }
+                temp += val(ind) * b(col_idx, column_id);
+            }
+            atomic_add(&storage[threadIdx.x], temp);
+        }
+        __syncthreads();
+        if (active && idx_in_worker == 0) {
+            const auto c_ind = (tidx % num_rows) * c_stride + column_id;
+            if (atomic) {
+                atomic_add(&(c[c_ind]), op(storage[threadIdx.x], c[c_ind]));
+            } else {
+                c[c_ind] = op(storage[threadIdx.x], c[c_ind]);
+            }
+        }
+    }
+}
+
+
+template <int num_thread_per_worker, bool atomic = false, typename b_accessor,
+          typename a_accessor, typename OutputValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void spmv(
+    const size_type num_rows, const int num_worker_per_row,
+    acc::range<a_accessor> val, const IndexType* __restrict__ col,
+    const size_type stride, const size_type num_stored_elements_per_row,
+    acc::range<b_accessor> b, OutputValueType* __restrict__ c,
+    const size_type c_stride)
+{
+    auto take_product = [](const auto& prod, const OutputValueType&) {
+        return static_cast<OutputValueType>(prod);  // old c value is ignored
+    };
+    spmv_kernel<num_thread_per_worker, atomic>(
+        num_rows, num_worker_per_row, val, col, stride,
+        num_stored_elements_per_row, b, c, c_stride, take_product);
+}
+
+
+// SpMV with scaling factors taken from alpha(0) and beta[0].
+template <int num_thread_per_worker, bool atomic = false, typename b_accessor,
+          typename a_accessor, typename OutputValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void spmv(
+    const size_type num_rows, const int num_worker_per_row,
+    acc::range<a_accessor> alpha, acc::range<a_accessor> val,
+    const IndexType* __restrict__ col, const size_type stride,
+    const size_type num_stored_elements_per_row, acc::range<b_accessor> b,
+    const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c,
+    const size_type c_stride)
+{
+    using arithmetic_type = typename a_accessor::arithmetic_type;
+    const auto alpha_val = alpha(0);
+    const OutputValueType beta_val = beta[0];
+    if (atomic) {
+        // Atomic updates change c while the kernel is still running, so
+        // alpha * a * b + beta * c cannot be formed here. The caller has to
+        // apply beta * c beforehand; this branch only adds alpha * a * b.
+        auto scaled = [&alpha_val](const auto& x, const OutputValueType&) {
+            return static_cast<OutputValueType>(alpha_val * x);
+        };
+        spmv_kernel<num_thread_per_worker, atomic>(
+            num_rows, num_worker_per_row, val, col, stride,
+            num_stored_elements_per_row, b, c, c_stride, scaled);
+    } else {
+        auto axpby = [&alpha_val, &beta_val](const auto& x,
+                                             const OutputValueType& y) {
+            return static_cast<OutputValueType>(
+                alpha_val * x + static_cast<arithmetic_type>(beta_val * y));
+        };
+        spmv_kernel<num_thread_per_worker, atomic>(
+            num_rows, num_worker_per_row, val, col, stride,
+            num_stored_elements_per_row, b, c, c_stride, axpby);
+    }
+}
+
+
+}  // namespace kernel
 
 
 namespace {
@@ -131,9 +257,9 @@ void abstract_spmv(syn::value_list<int, info>,
         if (grid_size.x > 0 && grid_size.y > 0) {
             kernel::spmv<num_thread_per_worker, atomic>
                 <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    nrows, num_worker_per_row, acc::as_cuda_range(a_vals),
+                    nrows, num_worker_per_row, acc::as_device_range(a_vals),
                     a->get_const_col_idxs(), stride,
-                    num_stored_elements_per_row, acc::as_cuda_range(b_vals),
+                    num_stored_elements_per_row, acc::as_device_range(b_vals),
                     as_device_type(c->get_values()), c->get_stride());
         }
     } else if (alpha != nullptr && beta != nullptr) {
@@ -142,9 +268,10 @@ void abstract_spmv(syn::value_list<int, info>,
         if (grid_size.x > 0 && grid_size.y > 0) {
             kernel::spmv<num_thread_per_worker, atomic>
                 <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    nrows, num_worker_per_row, acc::as_cuda_range(alpha_val),
-                    acc::as_cuda_range(a_vals), a->get_const_col_idxs(), stride,
-                    num_stored_elements_per_row, acc::as_cuda_range(b_vals),
+                    nrows, num_worker_per_row, acc::as_device_range(alpha_val),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                    stride, num_stored_elements_per_row,
+                    acc::as_device_range(b_vals),
                     as_device_type(beta->get_const_values()),
                     as_device_type(c->get_values()), c->get_stride());
         }
@@ -158,7 +285,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_abstract_spmv, abstract_spmv);
 
 template <typename ValueType, typename IndexType>
 std::array<int, 3> compute_thread_worker_and_atomicity(
-    std::shared_ptr<const CudaExecutor> exec,
+    std::shared_ptr<const DefaultExecutor> exec,
     const matrix::Ell<ValueType, IndexType>* a)
 {
     int num_thread_per_worker = 1;
@@ -202,7 +329,7 @@ std::array<int, 3> compute_thread_worker_and_atomicity(
 
 template <typename InputValueType, typename MatrixValueType,
           typename OutputValueType, typename IndexType>
-void spmv(std::shared_ptr<const CudaExecutor> exec,
+void spmv(std::shared_ptr<const DefaultExecutor> exec,
           const matrix::Ell<MatrixValueType, IndexType>* a,
           const matrix::Dense<InputValueType>* b,
           matrix::Dense<OutputValueType>* c)
@@ -234,7 +361,7 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
 
 template <typename InputValueType, typename MatrixValueType,
           typename OutputValueType, typename IndexType>
-void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
+void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
                    const matrix::Dense<MatrixValueType>* alpha,
                    const matrix::Ell<MatrixValueType, IndexType>* a,
                    const matrix::Dense<InputValueType>* b,
@@ -267,6 +394,6 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
 
 
 }  // namespace ell
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/common/cuda_hip/matrix/ell_kernels.hpp.inc b/common/cuda_hip/matrix/ell_kernels.hpp.inc
deleted file mode 100644
index a5fd37c1d05..00000000000
--- a/common/cuda_hip/matrix/ell_kernels.hpp.inc
+++ /dev/null
@@ -1,133 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-namespace kernel {
-
-
-template <int num_thread_per_worker, bool atomic, typename b_accessor,
-          typename a_accessor, typename OutputValueType, typename IndexType,
-          typename Closure>
-__device__ void spmv_kernel(
-    const size_type num_rows, const int num_worker_per_row,
-    acc::range<a_accessor> val, const IndexType* __restrict__ col,
-    const size_type stride, const size_type num_stored_elements_per_row,
-    acc::range<b_accessor> b, OutputValueType* __restrict__ c,
-    const size_type c_stride, Closure op)
-{
-    using arithmetic_type = typename a_accessor::arithmetic_type;
-    const auto tidx = thread::get_thread_id_flat();
-    const decltype(tidx) column_id = blockIdx.y;
-    if (num_thread_per_worker == 1) {
-        // Specialize the num_thread_per_worker = 1. It doesn't need the shared
-        // memory, __syncthreads, and atomic_add
-        if (tidx < num_rows) {
-            auto temp = zero<arithmetic_type>();
-            for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
-                const auto ind = tidx + idx * stride;
-                const auto col_idx = col[ind];
-                if (col_idx == invalid_index<IndexType>()) {
-                    break;
-                } else {
-                    temp += val(ind) * b(col_idx, column_id);
-                }
-            }
-            const auto c_ind = tidx * c_stride + column_id;
-            c[c_ind] = op(temp, c[c_ind]);
-        }
-    } else {
-        if (tidx < num_worker_per_row * num_rows) {
-            const auto idx_in_worker = threadIdx.y;
-            const auto x = tidx % num_rows;
-            const auto worker_id = tidx / num_rows;
-            const auto step_size = num_worker_per_row * num_thread_per_worker;
-            __shared__ uninitialized_array<
-                arithmetic_type, default_block_size / num_thread_per_worker>
-                storage;
-            if (idx_in_worker == 0) {
-                storage[threadIdx.x] = 0;
-            }
-            __syncthreads();
-            auto temp = zero<arithmetic_type>();
-            for (size_type idx =
-                     worker_id * num_thread_per_worker + idx_in_worker;
-                 idx < num_stored_elements_per_row; idx += step_size) {
-                const auto ind = x + idx * stride;
-                const auto col_idx = col[ind];
-                if (col_idx == invalid_index<IndexType>()) {
-                    break;
-                } else {
-                    temp += val(ind) * b(col_idx, column_id);
-                }
-            }
-            atomic_add(&storage[threadIdx.x], temp);
-            __syncthreads();
-            if (idx_in_worker == 0) {
-                const auto c_ind = x * c_stride + column_id;
-                if (atomic) {
-                    atomic_add(&(c[c_ind]), op(storage[threadIdx.x], c[c_ind]));
-                } else {
-                    c[c_ind] = op(storage[threadIdx.x], c[c_ind]);
-                }
-            }
-        }
-    }
-}
-
-
-template <int num_thread_per_worker, bool atomic = false, typename b_accessor,
-          typename a_accessor, typename OutputValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void spmv(
-    const size_type num_rows, const int num_worker_per_row,
-    acc::range<a_accessor> val, const IndexType* __restrict__ col,
-    const size_type stride, const size_type num_stored_elements_per_row,
-    acc::range<b_accessor> b, OutputValueType* __restrict__ c,
-    const size_type c_stride)
-{
-    spmv_kernel<num_thread_per_worker, atomic>(
-        num_rows, num_worker_per_row, val, col, stride,
-        num_stored_elements_per_row, b, c, c_stride,
-        [](const auto& x, const OutputValueType& y) {
-            return static_cast<OutputValueType>(x);
-        });
-}
-
-
-template <int num_thread_per_worker, bool atomic = false, typename b_accessor,
-          typename a_accessor, typename OutputValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void spmv(
-    const size_type num_rows, const int num_worker_per_row,
-    acc::range<a_accessor> alpha, acc::range<a_accessor> val,
-    const IndexType* __restrict__ col, const size_type stride,
-    const size_type num_stored_elements_per_row, acc::range<b_accessor> b,
-    const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c,
-    const size_type c_stride)
-{
-    using arithmetic_type = typename a_accessor::arithmetic_type;
-    const auto alpha_val = alpha(0);
-    const OutputValueType beta_val = beta[0];
-    if (atomic) {
-        // Because the atomic operation changes the values of c during
-        // computation, it can not directly do alpha * a * b + beta * c
-        // operation. The beta * c needs to be done before calling this kernel.
-        // Then, this kernel only adds alpha * a * b when it uses atomic
-        // operation.
-        spmv_kernel<num_thread_per_worker, atomic>(
-            num_rows, num_worker_per_row, val, col, stride,
-            num_stored_elements_per_row, b, c, c_stride,
-            [&alpha_val](const auto& x, const OutputValueType& y) {
-                return static_cast<OutputValueType>(alpha_val * x);
-            });
-    } else {
-        spmv_kernel<num_thread_per_worker, atomic>(
-            num_rows, num_worker_per_row, val, col, stride,
-            num_stored_elements_per_row, b, c, c_stride,
-            [&alpha_val, &beta_val](const auto& x, const OutputValueType& y) {
-                return static_cast<OutputValueType>(
-                    alpha_val * x + static_cast<arithmetic_type>(beta_val * y));
-            });
-    }
-}
-
-
-}  // namespace kernel
diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.cpp
similarity index 57%
rename from common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
rename to common/cuda_hip/matrix/fbcsr_kernels.cpp
index d801876adbc..9e5eed5f570 100644
--- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/fbcsr_kernels.cpp
@@ -2,6 +2,71 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <algorithm>
+#include <common/cuda_hip/base/sparselib_block_bindings.hpp>
+
+
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sort.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/unified/base/kernel_launch.hpp"
+#include "core/base/array_access.hpp"
+#include "core/base/block_sizes.hpp"
+#include "core/base/device_matrix_data_kernels.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "core/components/format_conversion_kernels.hpp"
+#include "core/matrix/csr_lookup.hpp"
+#include "core/matrix/dense_kernels.hpp"
+#include "core/matrix/fbcsr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
+/**
+ * @brief The fixed-size block compressed sparse row matrix format namespace.
+ *
+ * @ingroup fbcsr
+ */
+namespace fbcsr {
+
+
+constexpr int default_block_size{512};
+
+
+#include "common/cuda_hip/matrix/csr_common.hpp.inc"
+
 namespace kernel {
 
 
@@ -341,3 +406,235 @@ template <typename ValueType, typename IndexType>
 void extract_diagonal(std::shared_ptr<const DefaultExecutor> exec,
                       const matrix::Fbcsr<ValueType, IndexType>* orig,
                       matrix::Diagonal<ValueType>* diag) GKO_NOT_IMPLEMENTED;
+
+
+namespace {
+
+
+// Transposes the nrows x ncols matrix `orig` into `trans` with one BLAS geam
+// call (alpha = 1, beta = 0); returns without work if either extent is zero.
+template <typename ValueType>
+void dense_transpose(std::shared_ptr<const DefaultExecutor> exec,
+                     const size_type nrows, const size_type ncols,
+                     const size_type orig_stride, const ValueType* const orig,
+                     const size_type trans_stride, ValueType* const trans)
+{
+    if (nrows == 0 || ncols == 0) {
+        return;
+    }
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
+        blas::pointer_mode_guard pm_guard(handle);
+        auto alpha = one<ValueType>();
+        auto beta = zero<ValueType>();
+        blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig,
+                   orig_stride, &beta, trans, trans_stride, trans,
+                   trans_stride);
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+
+}  // namespace
+
+
+// Computes c = a * b: bsrmv for one contiguous rhs, bsrmm + transpose else.
+template <typename ValueType, typename IndexType>
+void spmv(std::shared_ptr<const DefaultExecutor> exec,
+          const matrix::Fbcsr<ValueType, IndexType>* const a,
+          const matrix::Dense<ValueType>* const b,
+          matrix::Dense<ValueType>* const c)
+{
+    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
+        // empty output: nothing to do
+        return;
+    }
+    if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) {
+        // empty input: fill output with zero
+        dense::fill(exec, c, zero<ValueType>());
+        return;
+    }
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
+        sparselib::pointer_mode_guard pm_guard(handle);
+        const auto alpha = one<ValueType>();
+        const auto beta = zero<ValueType>();
+        auto descr = sparselib::create_mat_descr();
+        const auto row_ptrs = a->get_const_row_ptrs();
+        const auto col_idxs = a->get_const_col_idxs();
+        const auto values = a->get_const_values();
+        const int bs = a->get_block_size();
+        const IndexType mb = a->get_num_block_rows();
+        const IndexType nb = a->get_num_block_cols();
+        const auto nnzb = static_cast<IndexType>(a->get_num_stored_blocks());
+        const auto nrhs = static_cast<IndexType>(b->get_size()[1]);
+        const auto nrows = a->get_size()[0];
+        const auto in_stride = b->get_stride();
+        const auto out_stride = c->get_stride();
+        if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
+            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
+                             nnzb, &alpha, descr, values, row_ptrs, col_idxs,
+                             bs, b->get_const_values(), &beta, c->get_values());
+        } else {
+            const auto trans_stride = nrows;  // leading dimension of trans_c
+            auto trans_c = array<ValueType>(exec, nrows * nrhs);
+            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
+                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
+                             &alpha, descr, values, row_ptrs, col_idxs, bs,
+                             b->get_const_values(), in_stride, &beta,
+                             trans_c.get_data(), trans_stride);
+            dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
+                            out_stride, c->get_values());
+        }
+        sparselib::destroy(descr);  // NOTE(review): skipped if a call throws
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+
+// Computes c = alpha * a * b + beta * c with bsrmv (one rhs) or bsrmm.
+template <typename ValueType, typename IndexType>
+void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
+                   const matrix::Dense<ValueType>* const alpha,
+                   const matrix::Fbcsr<ValueType, IndexType>* const a,
+                   const matrix::Dense<ValueType>* const b,
+                   const matrix::Dense<ValueType>* const beta,
+                   matrix::Dense<ValueType>* const c)
+{
+    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
+        // empty output: nothing to do
+        return;
+    }
+    if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) {
+        // empty input: scale output
+        dense::scale(exec, beta, c);
+        return;
+    }
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
+        const auto alphp = alpha->get_const_values();  // scalars stay pointers
+        const auto betap = beta->get_const_values();
+        auto descr = sparselib::create_mat_descr();
+        const auto row_ptrs = a->get_const_row_ptrs();
+        const auto col_idxs = a->get_const_col_idxs();
+        const auto values = a->get_const_values();
+        const int bs = a->get_block_size();
+        const IndexType mb = a->get_num_block_rows();
+        const IndexType nb = a->get_num_block_cols();
+        const auto nnzb = static_cast<IndexType>(a->get_num_stored_blocks());
+        const auto nrhs = static_cast<IndexType>(b->get_size()[1]);
+        const auto nrows = a->get_size()[0];
+        const auto in_stride = b->get_stride();
+        const auto out_stride = c->get_stride();
+        if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
+            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
+                             nnzb, alphp, descr, values, row_ptrs, col_idxs, bs,
+                             b->get_const_values(), betap, c->get_values());
+        } else {
+            const auto trans_stride = nrows;  // leading dimension of trans_c
+            auto trans_c = array<ValueType>(exec, nrows * nrhs);
+            dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(),
+                            trans_stride, trans_c.get_data());
+            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
+                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
+                             alphp, descr, values, row_ptrs, col_idxs, bs,
+                             b->get_const_values(), in_stride, betap,
+                             trans_c.get_data(), trans_stride);
+            dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
+                            out_stride, c->get_values());
+        }
+        sparselib::destroy(descr);  // NOTE(review): skipped if a call throws
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+
+namespace {
+
+
+template <int mat_blk_sz, typename ValueType, typename IndexType>
+void transpose_blocks_impl(syn::value_list<int, mat_blk_sz>,
+                           std::shared_ptr<const DefaultExecutor> exec,
+                           matrix::Fbcsr<ValueType, IndexType>* const mat)
+{
+    // launch config: subwarp_size threads for every stored block of mat
+    constexpr int subwarp_size = config::warp_size;
+    const auto num_blocks = mat->get_num_stored_blocks();
+    const auto grid_dim =
+        ceildiv(num_blocks * subwarp_size, default_block_size);
+    if (grid_dim > 0) {
+        kernel::transpose_blocks<mat_blk_sz, subwarp_size>
+            <<<grid_dim, default_block_size, 0, exec->get_stream()>>>(
+                num_blocks, mat->get_values());
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks,
+                                    transpose_blocks_impl);
+
+
+}  // namespace
+
+
+// Uses sparselib::bsr_transpose on supported CUDA builds, else the fallback.
+template <typename ValueType, typename IndexType>
+void transpose(const std::shared_ptr<const DefaultExecutor> exec,
+               const matrix::Fbcsr<ValueType, IndexType>* const orig,
+               matrix::Fbcsr<ValueType, IndexType>* const trans)
+{
+#ifdef GKO_COMPILING_CUDA
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        const int bs = orig->get_block_size();
+        const IndexType nnzb =
+            static_cast<IndexType>(orig->get_num_stored_blocks());
+        cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
+        cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
+        const IndexType buffer_size = sparselib::bsr_transpose_buffersize(
+            exec->get_sparselib_handle(), orig->get_num_block_rows(),
+            orig->get_num_block_cols(), nnzb, orig->get_const_values(),
+            orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs);
+        array<char> buffer_array(exec, buffer_size);  // scratch for the call
+        auto buffer = buffer_array.get_data();
+        sparselib::bsr_transpose(
+            exec->get_sparselib_handle(), orig->get_num_block_rows(),
+            orig->get_num_block_cols(), nnzb, orig->get_const_values(),
+            orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs,
+            trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(),
+            copyValues, idxBase, buffer);
+        // run transpose_blocks_impl on trans for the compiled size equal to bs
+        select_transpose_blocks(
+            fixedblock::compiled_kernels(),
+            [bs](int compiled_block_size) { return bs == compiled_block_size; },
+            syn::value_list<int>(), syn::type_list<>(), exec, trans);
+    } else
+#endif
+    {  // the only path when GKO_COMPILING_CUDA is not defined
+        fallback_transpose(exec, orig, trans);
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Fbcsr<ValueType, IndexType>* orig,
+                    matrix::Fbcsr<ValueType, IndexType>* trans)
+{
+    const auto nnz = trans->get_num_stored_elements();
+    const int grid_size = ceildiv(nnz, default_block_size);
+    // plain transpose first; complex types then get the conjugate kernel
+    transpose(exec, orig, trans);
+    if (is_complex<ValueType>() && grid_size > 0) {
+        kernel::conjugate<<<grid_size, default_block_size, 0,
+                            exec->get_stream()>>>(
+            nnz, as_device_type(trans->get_values()));
+    }
+}
+
+
+}  // namespace fbcsr
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/matrix/sellp_kernels.hpp.inc b/common/cuda_hip/matrix/sellp_kernels.cpp
similarity index 83%
rename from common/cuda_hip/matrix/sellp_kernels.hpp.inc
rename to common/cuda_hip/matrix/sellp_kernels.cpp
index f4f0035c276..af7f22ee7d5 100644
--- a/common/cuda_hip/matrix/sellp_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/sellp_kernels.cpp
@@ -2,6 +2,37 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/sellp_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The SELL-P matrix format namespace.
+ *
+ * @ingroup sellp
+ */
+namespace sellp {
+
+
+constexpr int default_block_size = 512;
+
+
 template <typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spmv_kernel(
     size_type num_rows, size_type num_right_hand_sides, size_type b_stride,
@@ -102,3 +133,9 @@ void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SELLP_ADVANCED_SPMV_KERNEL);
+
+
+}  // namespace sellp
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/common/cuda_hip/matrix/sparsity_csr_kernels.cpp
similarity index 58%
rename from hip/matrix/sparsity_csr_kernels.hip.cpp
rename to common/cuda_hip/matrix/sparsity_csr_kernels.cpp
index b662f07257e..540722d843c 100644
--- a/hip/matrix/sparsity_csr_kernels.hip.cpp
+++ b/common/cuda_hip/matrix/sparsity_csr_kernels.cpp
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/matrix/sparsity_csr_kernels.hpp"
+#include <accessor/device_helper.hpp>
 
 
 #include <thrust/sort.h>
@@ -11,27 +11,27 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
-#include "accessor/hip_helper.hpp"
 #include "accessor/reduced_row_major.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/format_conversion_kernels.hpp"
+#include "core/matrix/sparsity_csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace hip {
+namespace GKO_DEVICE_NAMESPACE {
 /**
  * @brief The Compressed sparse row matrix format namespace.
  *
@@ -54,7 +54,114 @@ using classical_kernels = syn::value_list<int, 2>;
 
 
 #include "common/cuda_hip/matrix/csr_common.hpp.inc"
-#include "common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc"
+
+namespace kernel {
+
+// Per row r: c(r, j) = scale(sum_k val[0] * b(col_idxs[k], j), c(r, j)).
+template <size_type subwarp_size, typename MatrixValueType,
+          typename input_accessor, typename output_accessor, typename IndexType,
+          typename Closure>
+__device__ void device_classical_spmv(const size_type num_rows,
+                                      const MatrixValueType* __restrict__ val,
+                                      const IndexType* __restrict__ col_idxs,
+                                      const IndexType* __restrict__ row_ptrs,
+                                      acc::range<input_accessor> b,
+                                      acc::range<output_accessor> c,
+                                      Closure scale)
+{
+    using arithmetic_type = typename output_accessor::arithmetic_type;
+    auto subwarp_tile =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    const auto subrow = thread::get_subwarp_num_flat<subwarp_size>();
+    const auto subid = subwarp_tile.thread_rank();
+    const IndexType column_id = blockIdx.y;  // column j of b and c to process
+    const arithmetic_type value = val[0];  // only val[0] is ever read
+    auto row = thread::get_subwarp_id_flat<subwarp_size>();
+    for (; row < num_rows; row += subrow) {
+        const auto ind_end = row_ptrs[row + 1];
+        arithmetic_type temp_val = zero<arithmetic_type>();
+        for (auto ind = row_ptrs[row] + subid; ind < ind_end;
+             ind += subwarp_size) {  // tile threads interleave over the row
+            temp_val += value * b(col_idxs[ind], column_id);
+        }
+        auto subwarp_result =
+            reduce(subwarp_tile, temp_val,
+                   [](const arithmetic_type& a, const arithmetic_type& b) {
+                       return a + b;
+                   });
+        if (subid == 0) {  // tile rank 0 stores the combined result
+            c(row, column_id) = scale(subwarp_result, c(row, column_id));
+        }
+    }
+}
+
+// Plain SpMV kernel: c is overwritten with the row sums (old c is ignored).
+template <size_type subwarp_size, typename MatrixValueType,
+          typename input_accessor, typename output_accessor, typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
+    const size_type num_rows, const MatrixValueType* __restrict__ val,
+    const IndexType* __restrict__ col_idxs,
+    const IndexType* __restrict__ row_ptrs, acc::range<input_accessor> b,
+    acc::range<output_accessor> c)
+{
+    using type = typename output_accessor::arithmetic_type;
+    device_classical_spmv<subwarp_size>(
+        num_rows, val, col_idxs, row_ptrs, b, c,
+        [](const type& x, const type& y) { return x; });  // y (old c) unused
+}
+
+// Advanced SpMV kernel: c = alpha[0] * (row sum) + beta[0] * (old c).
+template <size_type subwarp_size, typename MatrixValueType,
+          typename input_accessor, typename output_accessor, typename IndexType>
+__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
+    const size_type num_rows, const MatrixValueType* __restrict__ alpha,
+    const MatrixValueType* __restrict__ val,
+    const IndexType* __restrict__ col_idxs,
+    const IndexType* __restrict__ row_ptrs, acc::range<input_accessor> b,
+    const typename output_accessor::storage_type* __restrict__ beta,
+    acc::range<output_accessor> c)
+{
+    using type = typename output_accessor::arithmetic_type;
+    const type alpha_val = alpha[0];  // converted to the arithmetic type once
+    const type beta_val = beta[0];  // converted to the arithmetic type once
+    device_classical_spmv<subwarp_size>(
+        num_rows, val, col_idxs, row_ptrs, b, c,
+        [&alpha_val, &beta_val](const type& x, const type& y) {
+            return alpha_val * x + beta_val * y;  // x: row sum, y: old c
+        });
+}
+
+
+}  // namespace kernel
+
+// No implementation here: the definition is the GKO_NOT_IMPLEMENTED stub.
+template <typename ValueType, typename IndexType>
+void transpose(std::shared_ptr<const DefaultExecutor> exec,
+               const matrix::SparsityCsr<ValueType, IndexType>* orig,
+               matrix::SparsityCsr<ValueType, IndexType>* trans)
+    GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL);
+
+// Two key-value sorts: by col_idxs first, then a stable sort by row_idxs.
+template <typename ValueType, typename IndexType>
+void fallback_sort(std::shared_ptr<const DefaultExecutor> exec,
+                   matrix::SparsityCsr<ValueType, IndexType>* to_sort)
+{
+    const auto row_ptrs = to_sort->get_const_row_ptrs();
+    const auto col_idxs = to_sort->get_col_idxs();  // reordered in place
+    const auto nnz = to_sort->get_num_nonzeros();
+    const auto num_rows = to_sort->get_size()[0];
+    array<IndexType> row_idx_array(exec, nnz);  // temporary with nnz entries
+    const auto row_idxs = row_idx_array.get_data();
+    components::convert_ptrs_to_idxs(exec, row_ptrs, num_rows, row_idxs);
+    // two sorts by integer keys hopefully enable Thrust to use cub's RadixSort
+    thrust::sort_by_key(thrust_policy(exec), col_idxs, col_idxs + nnz,
+                        row_idxs);  // keys: col_idxs, values: row_idxs
+    thrust::stable_sort_by_key(thrust_policy(exec), row_idxs, row_idxs + nnz,
+                               col_idxs);  // keys: row_idxs, values: col_idxs
+}
 
 
 namespace host_kernel {
@@ -63,7 +170,7 @@ namespace host_kernel {
 template <int subwarp_size, typename MatrixValueType, typename InputValueType,
           typename OutputValueType, typename IndexType>
 void classical_spmv(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const HipExecutor> exec,
+                    std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::SparsityCsr<MatrixValueType, IndexType>* a,
                     const matrix::Dense<InputValueType>* b,
                     matrix::Dense<OutputValueType>* c,
@@ -110,16 +217,16 @@ void classical_spmv(syn::value_list<int, subwarp_size>,
                 a->get_size()[0], as_device_type(a->get_const_value()),
                 a->get_const_col_idxs(),
                 as_device_type(a->get_const_row_ptrs()),
-                acc::as_hip_range(b_vals), acc::as_hip_range(c_vals));
+                acc::as_device_range(b_vals), acc::as_device_range(c_vals));
     } else if (alpha != nullptr && beta != nullptr) {
         kernel::abstract_classical_spmv<subwarp_size>
             <<<grid, block, 0, exec->get_stream()>>>(
                 a->get_size()[0], as_device_type(alpha->get_const_values()),
                 as_device_type(a->get_const_value()), a->get_const_col_idxs(),
                 as_device_type(a->get_const_row_ptrs()),
-                acc::as_hip_range(b_vals),
+                acc::as_device_range(b_vals),
                 as_device_type(beta->get_const_values()),
-                acc::as_hip_range(c_vals));
+                acc::as_device_range(c_vals));
     } else {
         GKO_KERNEL_NOT_FOUND;
     }
@@ -132,7 +239,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv);
 
 template <typename MatrixValueType, typename InputValueType,
           typename OutputValueType, typename IndexType>
-void spmv(std::shared_ptr<const HipExecutor> exec,
+void spmv(std::shared_ptr<const DefaultExecutor> exec,
           const matrix::SparsityCsr<MatrixValueType, IndexType>* a,
           const matrix::Dense<InputValueType>* b,
           matrix::Dense<OutputValueType>* c)
@@ -148,7 +255,7 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
 
 template <typename MatrixValueType, typename InputValueType,
           typename OutputValueType, typename IndexType>
-void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
+void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
                    const matrix::Dense<MatrixValueType>* alpha,
                    const matrix::SparsityCsr<MatrixValueType, IndexType>* a,
                    const matrix::Dense<InputValueType>* b,
@@ -221,6 +328,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 }  // namespace sparsity_csr
-}  // namespace hip
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc b/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc
deleted file mode 100644
index aedf9638888..00000000000
--- a/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc
+++ /dev/null
@@ -1,111 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-namespace kernel {
-
-
-template <size_type subwarp_size, typename MatrixValueType,
-          typename input_accessor, typename output_accessor, typename IndexType,
-          typename Closure>
-__device__ void device_classical_spmv(const size_type num_rows,
-                                      const MatrixValueType* __restrict__ val,
-                                      const IndexType* __restrict__ col_idxs,
-                                      const IndexType* __restrict__ row_ptrs,
-                                      acc::range<input_accessor> b,
-                                      acc::range<output_accessor> c,
-                                      Closure scale)
-{
-    using arithmetic_type = typename output_accessor::arithmetic_type;
-    auto subwarp_tile =
-        group::tiled_partition<subwarp_size>(group::this_thread_block());
-    const auto subrow = thread::get_subwarp_num_flat<subwarp_size>();
-    const auto subid = subwarp_tile.thread_rank();
-    const IndexType column_id = blockIdx.y;
-    const arithmetic_type value = val[0];
-    auto row = thread::get_subwarp_id_flat<subwarp_size>();
-    for (; row < num_rows; row += subrow) {
-        const auto ind_end = row_ptrs[row + 1];
-        arithmetic_type temp_val = zero<arithmetic_type>();
-        for (auto ind = row_ptrs[row] + subid; ind < ind_end;
-             ind += subwarp_size) {
-            temp_val += value * b(col_idxs[ind], column_id);
-        }
-        auto subwarp_result =
-            reduce(subwarp_tile, temp_val,
-                   [](const arithmetic_type& a, const arithmetic_type& b) {
-                       return a + b;
-                   });
-        if (subid == 0) {
-            c(row, column_id) = scale(subwarp_result, c(row, column_id));
-        }
-    }
-}
-
-
-template <size_type subwarp_size, typename MatrixValueType,
-          typename input_accessor, typename output_accessor, typename IndexType>
-__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
-    const size_type num_rows, const MatrixValueType* __restrict__ val,
-    const IndexType* __restrict__ col_idxs,
-    const IndexType* __restrict__ row_ptrs, acc::range<input_accessor> b,
-    acc::range<output_accessor> c)
-{
-    using type = typename output_accessor::arithmetic_type;
-    device_classical_spmv<subwarp_size>(
-        num_rows, val, col_idxs, row_ptrs, b, c,
-        [](const type& x, const type& y) { return x; });
-}
-
-
-template <size_type subwarp_size, typename MatrixValueType,
-          typename input_accessor, typename output_accessor, typename IndexType>
-__global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
-    const size_type num_rows, const MatrixValueType* __restrict__ alpha,
-    const MatrixValueType* __restrict__ val,
-    const IndexType* __restrict__ col_idxs,
-    const IndexType* __restrict__ row_ptrs, acc::range<input_accessor> b,
-    const typename output_accessor::storage_type* __restrict__ beta,
-    acc::range<output_accessor> c)
-{
-    using type = typename output_accessor::arithmetic_type;
-    const type alpha_val = alpha[0];
-    const type beta_val = beta[0];
-    device_classical_spmv<subwarp_size>(
-        num_rows, val, col_idxs, row_ptrs, b, c,
-        [&alpha_val, &beta_val](const type& x, const type& y) {
-            return alpha_val * x + beta_val * y;
-        });
-}
-
-
-}  // namespace kernel
-
-
-template <typename ValueType, typename IndexType>
-void transpose(std::shared_ptr<const DefaultExecutor> exec,
-               const matrix::SparsityCsr<ValueType, IndexType>* orig,
-               matrix::SparsityCsr<ValueType, IndexType>* trans)
-    GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_SPARSITY_CSR_TRANSPOSE_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void fallback_sort(std::shared_ptr<const DefaultExecutor> exec,
-                   matrix::SparsityCsr<ValueType, IndexType>* to_sort)
-{
-    const auto row_ptrs = to_sort->get_const_row_ptrs();
-    const auto col_idxs = to_sort->get_col_idxs();
-    const auto nnz = to_sort->get_num_nonzeros();
-    const auto num_rows = to_sort->get_size()[0];
-    array<IndexType> row_idx_array(exec, nnz);
-    const auto row_idxs = row_idx_array.get_data();
-    components::convert_ptrs_to_idxs(exec, row_ptrs, num_rows, row_idxs);
-    // two sorts by integer keys hopefully enable Thrust to use cub's RadixSort
-    thrust::sort_by_key(thrust_policy(exec), col_idxs, col_idxs + nnz,
-                        row_idxs);
-    thrust::stable_sort_by_key(thrust_policy(exec), row_idxs, row_idxs + nnz,
-                               col_idxs);
-}
diff --git a/common/cuda_hip/multigrid/pgm_kernels.hpp.inc b/common/cuda_hip/multigrid/pgm_kernels.cpp
similarity index 77%
rename from common/cuda_hip/multigrid/pgm_kernels.hpp.inc
rename to common/cuda_hip/multigrid/pgm_kernels.cpp
index 9b2a5735c71..60dea00cc12 100644
--- a/common/cuda_hip/multigrid/pgm_kernels.hpp.inc
+++ b/common/cuda_hip/multigrid/pgm_kernels.cpp
@@ -2,6 +2,36 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <memory>
+
+
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/sort.h>
+#include <thrust/tuple.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "core/multigrid/pgm_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The PGM multigrid namespace.
+ *
+ * @ingroup pgm
+ */
+namespace pgm {
+
+
 template <typename IndexType>
 void sort_agg(std::shared_ptr<const DefaultExecutor> exec, IndexType num,
               IndexType* row_idxs, IndexType* col_idxs)
@@ -52,3 +82,9 @@ void compute_coarse_coo(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_PGM_COMPUTE_COARSE_COO);
+
+
+}  // namespace pgm
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/preconditioner/isai_kernels.hpp.inc b/common/cuda_hip/preconditioner/isai_kernels.cpp
similarity index 94%
rename from common/cuda_hip/preconditioner/isai_kernels.hpp.inc
rename to common/cuda_hip/preconditioner/isai_kernels.cpp
index 86d47680e0e..a79b8f711d3 100644
--- a/common/cuda_hip/preconditioner/isai_kernels.hpp.inc
+++ b/common/cuda_hip/preconditioner/isai_kernels.cpp
@@ -2,6 +2,42 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/preconditioner/isai_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The Isai preconditioner namespace.
+ * @ref Isai
+ * @ingroup isai
+ */
+namespace isai {
+
+
+constexpr int subwarp_size{row_size_limit};
+constexpr int subwarps_per_block{2};
+constexpr int default_block_size{subwarps_per_block * subwarp_size};
+
+
 namespace kernel {
 
 
@@ -559,3 +595,9 @@ void scatter_excess_solution(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL);
+
+
+}  // namespace isai
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc b/common/cuda_hip/preconditioner/jacobi_kernels.cpp
similarity index 91%
rename from common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc
rename to common/cuda_hip/preconditioner/jacobi_kernels.cpp
index e0d7cfef0e9..45d32493f25 100644
--- a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc
+++ b/common/cuda_hip/preconditioner/jacobi_kernels.cpp
@@ -2,6 +2,48 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "core/preconditioner/jacobi_kernels.hpp"
+
+
+#include <common/cuda_hip/preconditioner/jacobi_common.hpp>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/base/extended_float.hpp"
+#include "core/preconditioner/jacobi_utils.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The Jacobi preconditioner namespace.
+ * @ref Jacobi
+ * @ingroup jacobi
+ */
+namespace jacobi {
+
+
+// 16 warps on the HIP HCC platform, 32 warps otherwise (1024 threads total)
+#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
+constexpr int default_num_warps = 16;
+#else  // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC
+constexpr int default_num_warps = 32;
+#endif
+// with current architectures, at most 32 warps can be scheduled per SM (and
+// current GPUs have at most 84 SMs)
+constexpr int default_grid_size = 32 * 32 * 128;
+
+
 namespace {
 
 
@@ -369,3 +411,9 @@ void convert_to_dense(
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
+
+
+}  // namespace jacobi
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/reorder/rcm_kernels.hpp.inc b/common/cuda_hip/reorder/rcm_kernels.cpp
similarity index 95%
rename from common/cuda_hip/reorder/rcm_kernels.hpp.inc
rename to common/cuda_hip/reorder/rcm_kernels.cpp
index 05fe3bce07e..12f2eca9580 100644
--- a/common/cuda_hip/reorder/rcm_kernels.hpp.inc
+++ b/common/cuda_hip/reorder/rcm_kernels.cpp
@@ -2,6 +2,47 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <thrust/binary_search.h>
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/permutation_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#include <thrust/transform.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/std_extensions.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
+
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/memory.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/base/array_access.hpp"
+#include "core/reorder/rcm_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The RCM reordering namespace.
+ *
+ * @ingroup reorder
+ */
+namespace rcm {
+
+
+constexpr int default_block_size = 512;
+
+
 template <typename IndexType>
 array<IndexType> compute_node_degrees(
     std::shared_ptr<const DefaultExecutor> exec,
@@ -613,3 +654,9 @@ void compute_permutation(std::shared_ptr<const DefaultExecutor> exec,
 }
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_RCM_COMPUTE_PERMUTATION_KERNEL);
+
+
+}  // namespace rcm
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/solver/cb_gmres_kernels.hpp.inc b/common/cuda_hip/solver/cb_gmres_kernels.cpp
similarity index 50%
rename from common/cuda_hip/solver/cb_gmres_kernels.hpp.inc
rename to common/cuda_hip/solver/cb_gmres_kernels.cpp
index 2a5a6c3f7f9..9be99c094fc 100644
--- a/common/cuda_hip/solver/cb_gmres_kernels.hpp.inc
+++ b/common/cuda_hip/solver/cb_gmres_kernels.cpp
@@ -2,6 +2,52 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <accessor/device_helper.hpp>
+#include <algorithm>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/stop/stopping_status.hpp>
+
+
+#include "accessor/range.hpp"
+#include "accessor/reduced_row_major.hpp"
+#include "accessor/scaled_reduced_row_major.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/array_access.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "core/matrix/dense_kernels.hpp"
+#include "core/solver/cb_gmres_accessor.hpp"
+#include "core/solver/cb_gmres_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The CB_GMRES solver namespace.
+ *
+ * @ingroup cb_gmres
+ */
+namespace cb_gmres {
+
+
+constexpr int default_block_size = 512;
+// default_dot_dim cannot be 64 for HIP, since a 64 * 64 block would exceed the
+// maximum block size.
+constexpr int default_dot_dim = 32;
+constexpr int default_dot_size = default_dot_dim * default_dot_dim;
+
+
 #include "common/cuda_hip/solver/common_gmres_kernels.hpp.inc"
 
 
@@ -551,3 +597,457 @@ __global__ __launch_bounds__(block_size) void calculate_Qy_kernel(
         before_preconditioner[global_id] = temp;
     }
 }
+
+// Launches zero_matrix_kernel on the executor stream with at least n threads.
+template <typename ValueType>
+void zero_matrix(std::shared_ptr<const DefaultExecutor> exec, size_type m,
+                 size_type n, size_type stride, ValueType* array)
+{
+    const auto block_size = default_block_size;
+    const auto grid_size = ceildiv(n, block_size);
+    zero_matrix_kernel<<<grid_size, block_size, 0, exec->get_stream()>>>(
+        m, n, stride, as_device_type(array));
+}
+
+// Launches initialize_kernel for b, residual, givens_sin/cos and stop_status.
+template <typename ValueType>
+void initialize(std::shared_ptr<const DefaultExecutor> exec,
+                const matrix::Dense<ValueType>* b,
+                matrix::Dense<ValueType>* residual,
+                matrix::Dense<ValueType>* givens_sin,
+                matrix::Dense<ValueType>* givens_cos,
+                array<stopping_status>* stop_status, size_type krylov_dim)
+{
+    const auto num_threads = std::max(b->get_size()[0] * b->get_stride(),
+                                      krylov_dim * b->get_size()[1]);
+    const auto grid_dim = ceildiv(num_threads, default_block_size);
+    const auto block_dim = default_block_size;
+    constexpr auto block_size = default_block_size;
+    // Covers max(rows(b) * stride(b), krylov_dim * cols(b)) threads.
+    initialize_kernel<block_size>
+        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
+            b->get_size()[0], b->get_size()[1], krylov_dim,
+            as_device_type(b->get_const_values()), b->get_stride(),
+            as_device_type(residual->get_values()), residual->get_stride(),
+            as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
+            as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
+            as_device_type(stop_status->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL);
+
+
+// Restarts the Krylov cycle: restart_1_kernel, residual 2-norm, scalar setup
+// for scaled accessors, then restart_2_kernel, all on the executor's stream.
+template <typename ValueType, typename Accessor3d>
+void restart(std::shared_ptr<const DefaultExecutor> exec,
+             const matrix::Dense<ValueType>* residual,
+             matrix::Dense<remove_complex<ValueType>>* residual_norm,
+             matrix::Dense<ValueType>* residual_norm_collection,
+             matrix::Dense<remove_complex<ValueType>>* arnoldi_norm,
+             Accessor3d krylov_bases,
+             matrix::Dense<ValueType>* next_krylov_basis,
+             array<size_type>* final_iter_nums, array<char>& reduction_tmp,
+             size_type krylov_dim)
+{
+    constexpr bool use_scalar =
+        gko::cb_gmres::detail::has_3d_scaled_accessor<Accessor3d>::value;
+    const auto num_rows = residual->get_size()[0];
+    const auto num_rhs = residual->get_size()[1];
+    const auto krylov_stride =
+        gko::cb_gmres::helper_functions_accessor<Accessor3d>::get_stride(
+            krylov_bases);
+    const auto grid_dim_1 =
+        ceildiv((krylov_dim + 1) * krylov_stride[0], default_block_size);
+    const auto block_dim = default_block_size;
+    constexpr auto block_size = default_block_size;
+    const auto stride_arnoldi = arnoldi_norm->get_stride();
+
+    restart_1_kernel<block_size>
+        <<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
+            num_rows, num_rhs, krylov_dim,
+            acc::as_device_range(krylov_bases),
+            as_device_type(residual_norm_collection->get_values()),
+            residual_norm_collection->get_stride());
+    kernels::GKO_DEVICE_NAMESPACE::dense::compute_norm2_dispatch(
+        exec, residual, residual_norm, reduction_tmp);
+
+    if (use_scalar) {
+        components::fill_array(exec,
+                               arnoldi_norm->get_values() + 2 * stride_arnoldi,
+                               num_rhs, zero<remove_complex<ValueType>>());
+        const dim3 grid_size_nrm(ceildiv(num_rhs, default_dot_dim),
+                                 exec->get_num_multiprocessor() * 2);
+        const dim3 block_size_nrm(default_dot_dim, default_dot_dim);
+        multinorminf_without_stop_kernel<<<grid_size_nrm, block_size_nrm, 0,
+                                           exec->get_stream()>>>(
+            num_rows, num_rhs, as_device_type(residual->get_const_values()),
+            residual->get_stride(),
+            as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), 0);
+        // set_scalar_kernel reads residual_norm and arnoldi_norm row 2.
+        set_scalar_kernel<default_block_size>
+            <<<ceildiv(num_rhs * (krylov_dim + 1), default_block_size),
+               default_block_size, 0, exec->get_stream()>>>(
+                num_rhs, krylov_dim + 1,
+                as_device_type(residual_norm->get_const_values()),
+                residual_norm->get_stride(),
+                as_device_type(arnoldi_norm->get_const_values() +
+                               2 * stride_arnoldi),
+                stride_arnoldi, acc::as_device_range(krylov_bases));
+    }
+
+    const auto grid_dim_2 =
+        ceildiv(std::max<size_type>(num_rows, 1) * krylov_stride[1],
+                default_block_size);
+    restart_2_kernel<block_size>
+        <<<grid_dim_2, block_dim, 0, exec->get_stream()>>>(
+            num_rows, num_rhs,
+            as_device_type(residual->get_const_values()),
+            residual->get_stride(),
+            as_device_type(residual_norm->get_const_values()),
+            as_device_type(residual_norm_collection->get_values()),
+            acc::as_device_range(krylov_bases),
+            as_device_type(next_krylov_basis->get_values()),
+            next_krylov_basis->get_stride(),
+            as_device_type(final_iter_nums->get_data()));
+}
+
+GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL);
+
+
+// Classical Gram-Schmidt step of Arnoldi with up to two reorthogonalization
+// passes; updates hessenberg_iter, next_krylov_basis and krylov_bases.
+template <typename ValueType, typename Accessor3dim>
+void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
+                        matrix::Dense<ValueType>* next_krylov_basis,
+                        Accessor3dim krylov_bases,
+                        matrix::Dense<ValueType>* hessenberg_iter,
+                        matrix::Dense<ValueType>* buffer_iter,
+                        matrix::Dense<remove_complex<ValueType>>* arnoldi_norm,
+                        size_type iter, const stopping_status* stop_status,
+                        stopping_status* reorth_status,
+                        array<size_type>* num_reorth)
+{
+    const auto dim_size = next_krylov_basis->get_size();
+    if (dim_size[1] == 0) {
+        return;
+    }
+    using non_complex = remove_complex<ValueType>;
+    // optimization parameter
+    constexpr int singledot_block_size = default_dot_dim;
+    constexpr bool use_scalar =
+        gko::cb_gmres::detail::has_3d_scaled_accessor<Accessor3dim>::value;
+    const auto stride_next_krylov = next_krylov_basis->get_stride();
+    const auto stride_hessenberg = hessenberg_iter->get_stride();
+    const auto stride_buffer = buffer_iter->get_stride();
+    const auto stride_arnoldi = arnoldi_norm->get_stride();
+    const dim3 grid_size(ceildiv(dim_size[1], default_dot_dim),
+                         exec->get_num_multiprocessor() * 2);
+    const dim3 grid_size_num_iters(ceildiv(dim_size[1], default_dot_dim),
+                                   exec->get_num_multiprocessor() * 2,
+                                   iter + 1);
+    const dim3 block_size(default_dot_dim, default_dot_dim);
+    // Note: having iter first (instead of row_idx information) is likely
+    //       beneficial for avoiding atomic_add conflicts, but that needs
+    //       further investigation.
+    const dim3 grid_size_iters_single(exec->get_num_multiprocessor() * 2,
+                                      iter + 1);
+    const auto block_size_iters_single = singledot_block_size;
+
+    components::fill_array(exec, arnoldi_norm->get_values(), dim_size[1],
+                           zero<non_complex>());
+    multinorm2_kernel<<<grid_size, block_size, 0, exec->get_stream()>>>(
+        dim_size[0], dim_size[1],
+        as_device_type(next_krylov_basis->get_const_values()),
+        stride_next_krylov, as_device_type(arnoldi_norm->get_values()),
+        as_device_type(stop_status));
+    // nrmP = norm(next_krylov_basis)
+    zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg,
+                hessenberg_iter->get_values());
+    if (dim_size[1] > 1) {
+        multidot_kernel<default_dot_dim>
+            <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
+                dim_size[0], dim_size[1],
+                as_device_type(next_krylov_basis->get_const_values()),
+                stride_next_krylov, acc::as_device_range(krylov_bases),
+                as_device_type(hessenberg_iter->get_values()),
+                stride_hessenberg, as_device_type(stop_status));
+    } else {
+        singledot_kernel<singledot_block_size>
+            <<<grid_size_iters_single, block_size_iters_single, 0,
+               exec->get_stream()>>>(
+                dim_size[0],
+                as_device_type(next_krylov_basis->get_const_values()),
+                stride_next_krylov, acc::as_device_range(krylov_bases),
+                as_device_type(hessenberg_iter->get_values()),
+                stride_hessenberg, as_device_type(stop_status));
+    }
+    // for i in 1:iter
+    //     hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i)
+    // end
+    update_next_krylov_kernel<default_block_size>
+        <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
+           default_block_size, 0, exec->get_stream()>>>(
+            iter + 1, dim_size[0], dim_size[1],
+            as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
+            acc::as_device_range(krylov_bases),
+            as_device_type(hessenberg_iter->get_const_values()),
+            stride_hessenberg, as_device_type(stop_status));
+    // for i in 1:iter
+    //     next_krylov_basis  -= hessenberg(iter, i) * krylov_bases(:, i)
+    // end
+    components::fill_array(exec, arnoldi_norm->get_values() + stride_arnoldi,
+                           dim_size[1], zero<non_complex>());
+    if (use_scalar) {
+        components::fill_array(exec,
+                               arnoldi_norm->get_values() + 2 * stride_arnoldi,
+                               dim_size[1], zero<non_complex>());
+    }
+    multinorm2_inf_kernel<use_scalar>
+        <<<grid_size, block_size, 0, exec->get_stream()>>>(
+            dim_size[0], dim_size[1],
+            as_device_type(next_krylov_basis->get_const_values()),
+            stride_next_krylov,
+            as_device_type(arnoldi_norm->get_values() + stride_arnoldi),
+            as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi),
+            as_device_type(stop_status));
+    // nrmN = norm(next_krylov_basis)
+    components::fill_array(exec, num_reorth->get_data(), 1, zero<size_type>());
+    check_arnoldi_norms<default_block_size>
+        <<<ceildiv(dim_size[1], default_block_size), default_block_size, 0,
+           exec->get_stream()>>>(
+            dim_size[1], as_device_type(arnoldi_norm->get_values()),
+            stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
+            stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases),
+            as_device_type(stop_status), as_device_type(reorth_status),
+            as_device_type(num_reorth->get_data()));
+    auto num_reorth_host = get_element(*num_reorth, 0);
+    // num_reorth_host := number of next_krylov vectors to be reorthogonalized
+    for (size_type l = 1; (num_reorth_host > 0) && (l < 3); l++) {
+        zero_matrix(exec, iter + 1, dim_size[1], stride_buffer,
+                    buffer_iter->get_values());
+        if (dim_size[1] > 1) {
+            multidot_kernel<default_dot_dim>
+                <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
+                    dim_size[0], dim_size[1],
+                    as_device_type(next_krylov_basis->get_const_values()),
+                    stride_next_krylov, acc::as_device_range(krylov_bases),
+                    as_device_type(buffer_iter->get_values()), stride_buffer,
+                    as_device_type(stop_status));
+        } else {
+            singledot_kernel<singledot_block_size>
+                <<<grid_size_iters_single, block_size_iters_single, 0,
+                   exec->get_stream()>>>(
+                    dim_size[0],
+                    as_device_type(next_krylov_basis->get_const_values()),
+                    stride_next_krylov, acc::as_device_range(krylov_bases),
+                    as_device_type(buffer_iter->get_values()), stride_buffer,
+                    as_device_type(stop_status));
+        }
+        // for i in 1:iter
+        //     hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i)
+        // end
+        update_next_krylov_and_add_kernel<default_block_size>
+            <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
+               default_block_size, 0, exec->get_stream()>>>(
+                iter + 1, dim_size[0], dim_size[1],
+                as_device_type(next_krylov_basis->get_values()),
+                stride_next_krylov, acc::as_device_range(krylov_bases),
+                as_device_type(hessenberg_iter->get_values()),
+                stride_hessenberg,
+                as_device_type(buffer_iter->get_const_values()), stride_buffer,
+                as_device_type(stop_status), as_device_type(reorth_status));
+        // for i in 1:iter
+        //     next_krylov_basis  -= hessenberg(iter, i) * krylov_bases(:, i)
+        // end
+        components::fill_array(exec,
+                               arnoldi_norm->get_values() + stride_arnoldi,
+                               dim_size[1], zero<non_complex>());
+        if (use_scalar) {
+            components::fill_array(
+                exec, arnoldi_norm->get_values() + 2 * stride_arnoldi,
+                dim_size[1], zero<non_complex>());
+        }
+        multinorm2_inf_kernel<use_scalar>
+            <<<grid_size, block_size, 0, exec->get_stream()>>>(
+                dim_size[0], dim_size[1],
+                as_device_type(next_krylov_basis->get_const_values()),
+                stride_next_krylov,
+                as_device_type(arnoldi_norm->get_values() + stride_arnoldi),
+                as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi),
+                as_device_type(stop_status));
+        // nrmN = norm(next_krylov_basis)
+        components::fill_array(exec, num_reorth->get_data(), 1,
+                               zero<size_type>());
+        check_arnoldi_norms<default_block_size>
+            <<<ceildiv(dim_size[1], default_block_size), default_block_size, 0,
+               exec->get_stream()>>>(
+                dim_size[1], as_device_type(arnoldi_norm->get_values()),
+                stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
+                stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases),
+                as_device_type(stop_status), as_device_type(reorth_status),
+                as_device_type(num_reorth->get_data()));
+        num_reorth_host = get_element(*num_reorth, 0);
+        // num_reorth_host := number of next_krylov vectors to be
+        // reorthogonalized
+    }
+    update_krylov_next_krylov_kernel<default_block_size>
+        <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
+           default_block_size, 0, exec->get_stream()>>>(
+            iter, dim_size[0], dim_size[1],
+            as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
+            acc::as_device_range(krylov_bases),
+            as_device_type(hessenberg_iter->get_const_values()),
+            stride_hessenberg, as_device_type(stop_status));
+    // next_krylov_basis /= hessenberg(iter, iter + 1)
+    // krylov_bases(:, iter + 1) = next_krylov_basis
+    // End of arnoldi
+}
+
+// Launches givens_rotation_kernel, covering every column of hessenberg_iter.
+template <typename ValueType>
+void givens_rotation(std::shared_ptr<const DefaultExecutor> exec,
+                     matrix::Dense<ValueType>* givens_sin,
+                     matrix::Dense<ValueType>* givens_cos,
+                     matrix::Dense<ValueType>* hessenberg_iter,
+                     matrix::Dense<remove_complex<ValueType>>* residual_norm,
+                     matrix::Dense<ValueType>* residual_norm_collection,
+                     size_type iter, const array<stopping_status>* stop_status)
+{
+    // TODO: tune block_size for optimal performance
+    constexpr auto block_size = default_block_size;
+    const auto num_cols = hessenberg_iter->get_size()[1];
+    const auto grid_dim =
+        static_cast<unsigned int>(ceildiv(num_cols, block_size));
+
+    givens_rotation_kernel<block_size>
+        <<<grid_dim, block_size, 0, exec->get_stream()>>>(
+            hessenberg_iter->get_size()[0], num_cols, iter,
+            as_device_type(hessenberg_iter->get_values()),
+            hessenberg_iter->get_stride(),
+            as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
+            as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
+            as_device_type(residual_norm->get_values()),
+            as_device_type(residual_norm_collection->get_values()),
+            residual_norm_collection->get_stride(),
+            as_device_type(stop_status->get_const_data()));
+}
+
+// Increases final_iter_nums, then runs finish_arnoldi_CGS and givens_rotation.
+template <typename ValueType, typename Accessor3d>
+void arnoldi(std::shared_ptr<const DefaultExecutor> exec,
+             matrix::Dense<ValueType>* next_krylov_basis,
+             matrix::Dense<ValueType>* givens_sin,
+             matrix::Dense<ValueType>* givens_cos,
+             matrix::Dense<remove_complex<ValueType>>* residual_norm,
+             matrix::Dense<ValueType>* residual_norm_collection,
+             Accessor3d krylov_bases, matrix::Dense<ValueType>* hessenberg_iter,
+             matrix::Dense<ValueType>* buffer_iter,
+             matrix::Dense<remove_complex<ValueType>>* arnoldi_norm,
+             size_type iter, array<size_type>* final_iter_nums,
+             const array<stopping_status>* stop_status,
+             array<stopping_status>* reorth_status,
+             array<size_type>* num_reorth)
+{
+    increase_final_iteration_numbers_kernel<<<
+        static_cast<unsigned int>(
+            ceildiv(final_iter_nums->get_size(), default_block_size)),
+        default_block_size, 0, exec->get_stream()>>>(
+        as_device_type(final_iter_nums->get_data()),
+        stop_status->get_const_data(), final_iter_nums->get_size());
+    finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter,
+                       buffer_iter, arnoldi_norm, iter,
+                       stop_status->get_const_data(), reorth_status->get_data(),
+                       num_reorth);
+    givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter,
+                    residual_norm, residual_norm_collection, iter, stop_status);
+}
+
+GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL);
+
+// Launches solve_upper_triangular_kernel with at least one thread per rhs.
+template <typename ValueType>
+void solve_upper_triangular(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Dense<ValueType>* residual_norm_collection,
+    const matrix::Dense<ValueType>* hessenberg, matrix::Dense<ValueType>* y,
+    const array<size_type>* final_iter_nums)
+{
+    // TODO: tune block_size for optimal performance
+    constexpr auto block_size = default_block_size;
+    const auto num_rhs = residual_norm_collection->get_size()[1];
+    const auto block_dim = block_size;
+    const auto grid_dim =
+        static_cast<unsigned int>(ceildiv(num_rhs, block_size));
+
+    solve_upper_triangular_kernel<block_size>
+        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
+            hessenberg->get_size()[1], num_rhs,
+            as_device_type(residual_norm_collection->get_const_values()),
+            residual_norm_collection->get_stride(),
+            as_device_type(hessenberg->get_const_values()),
+            hessenberg->get_stride(), as_device_type(y->get_values()),
+            y->get_stride(), as_device_type(final_iter_nums->get_const_data()));
+}
+
+// Launches calculate_Qy_kernel; num_krylov_bases is not used in this function.
+template <typename ValueType, typename ConstAccessor3d>
+void calculate_qy(std::shared_ptr<const DefaultExecutor> exec,
+                  ConstAccessor3d krylov_bases, size_type num_krylov_bases,
+                  const matrix::Dense<ValueType>* y,
+                  matrix::Dense<ValueType>* before_preconditioner,
+                  const array<size_type>* final_iter_nums)
+{
+    const auto num_rows = before_preconditioner->get_size()[0];
+    const auto num_cols = before_preconditioner->get_size()[1];
+    const auto stride_before_preconditioner =
+        before_preconditioner->get_stride();
+
+    constexpr auto block_size = default_block_size;
+    const auto grid_dim = static_cast<unsigned int>(
+        ceildiv(num_rows * stride_before_preconditioner, block_size));
+    const auto block_dim = block_size;
+
+    calculate_Qy_kernel<block_size>
+        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
+            num_rows, num_cols, acc::as_device_range(krylov_bases),
+            as_device_type(y->get_const_values()), y->get_stride(),
+            as_device_type(before_preconditioner->get_values()),
+            stride_before_preconditioner,
+            as_device_type(final_iter_nums->get_const_data()));
+    // Calculate qy
+    // before_preconditioner = krylov_bases * y
+}
+
+// Calls solve_upper_triangular, then calculate_qy; no-op for zero rhs columns.
+template <typename ValueType, typename ConstAccessor3d>
+void solve_krylov(std::shared_ptr<const DefaultExecutor> exec,
+                  const matrix::Dense<ValueType>* residual_norm_collection,
+                  ConstAccessor3d krylov_bases,
+                  const matrix::Dense<ValueType>* hessenberg,
+                  matrix::Dense<ValueType>* y,
+                  matrix::Dense<ValueType>* before_preconditioner,
+                  const array<size_type>* final_iter_nums)
+{
+    if (before_preconditioner->get_size()[1] == 0) {
+        return;
+    }
+    // hessenberg has dims:  iters x (iters * num_rhs)
+    // krylov_bases has dims:  (iters + 1) x sysmtx[0] x num_rhs
+    const auto iters =
+        hessenberg->get_size()[1] / before_preconditioner->get_size()[1];
+    const auto num_krylov_bases = iters + 1;
+    solve_upper_triangular(exec, residual_norm_collection, hessenberg, y,
+                           final_iter_nums);
+    calculate_qy(exec, krylov_bases, num_krylov_bases, y, before_preconditioner,
+                 final_iter_nums);
+}
+
+GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE(
+    GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL);
+
+
+}  // namespace cb_gmres
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/solver/idr_kernels.cu b/common/cuda_hip/solver/idr_kernels.cpp
similarity index 51%
rename from cuda/solver/idr_kernels.cu
rename to common/cuda_hip/solver/idr_kernels.cpp
index fcb84920265..6b3f001af0c 100644
--- a/cuda/solver/idr_kernels.cu
+++ b/common/cuda_hip/solver/idr_kernels.cpp
@@ -2,9 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/solver/idr_kernels.hpp"
-
-
 #include <ctime>
 #include <random>
 
@@ -13,22 +10,23 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/randlib_bindings.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/fill_array_kernels.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cublas_bindings.hpp"
-#include "cuda/base/curand_bindings.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
+#include "core/solver/idr_kernels.hpp"
 
 
 namespace gko {
 namespace kernels {
-namespace cuda {
+namespace GKO_DEVICE_NAMESPACE {
 /**
  * @brief The IDR solver namespace.
  *
@@ -42,7 +40,320 @@ constexpr int default_dot_dim = 32;
 constexpr int default_dot_size = default_dot_dim * default_dot_dim;
 
 
-#include "common/cuda_hip/solver/idr_kernels.hpp.inc"
+// Resets stop_status for each rhs and fills m_values with ones and zeros.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void initialize_m_kernel(
+    size_type subspace_dim, size_type nrhs, ValueType* __restrict__ m_values,
+    size_type m_stride, stopping_status* __restrict__ stop_status)
+{
+    const auto global_id = thread::get_thread_id_flat();
+    const auto row = global_id / m_stride;
+    const auto col = global_id % m_stride;
+    if (global_id < nrhs) {
+        stop_status[global_id].reset();
+    }
+
+    if (row < subspace_dim && col < nrhs * subspace_dim) {
+        m_values[row * m_stride + col] =
+            (row == col / nrhs) ? one<ValueType>() : zero<ValueType>();
+    }
+}
+
+// Orthonormalizes the rows of `values` in place using block-wide reductions.
+template <size_type block_size, typename ValueType>
+__global__
+__launch_bounds__(block_size) void orthonormalize_subspace_vectors_kernel(
+    size_type num_rows, size_type num_cols, ValueType* __restrict__ values,
+    size_type stride)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    // NOTE(review): loops stride by block_size; presumably a one-block launch.
+    __shared__ uninitialized_array<ValueType, block_size>
+        reduction_helper_array;
+    // The two views below are never in use at the same time.
+    ValueType* reduction_helper = reduction_helper_array;
+    auto reduction_helper_real =
+        reinterpret_cast<remove_complex<ValueType>*>(reduction_helper);
+
+    for (size_type row = 0; row < num_rows; row++) {
+        for (size_type i = 0; i < row; i++) {
+            auto dot = zero<ValueType>();
+            for (size_type j = tidx; j < num_cols; j += block_size) {
+                dot += values[row * stride + j] * conj(values[i * stride + j]);
+            }
+
+            // Ensure all threads have finished reading this shared memory
+            __syncthreads();
+            reduction_helper[tidx] = dot;
+            reduce(
+                group::this_thread_block(), reduction_helper,
+                [](const ValueType& a, const ValueType& b) { return a + b; });
+            __syncthreads();
+
+            dot = reduction_helper[0];
+            for (size_type j = tidx; j < num_cols; j += block_size) {
+                values[row * stride + j] -= dot * values[i * stride + j];
+            }
+        }
+
+        auto norm = zero<remove_complex<ValueType>>();
+        for (size_type j = tidx; j < num_cols; j += block_size) {
+            norm += squared_norm(values[row * stride + j]);
+        }
+        // Ensure all threads have finished reading this shared memory
+        __syncthreads();
+        reduction_helper_real[tidx] = norm;
+        reduce(group::this_thread_block(), reduction_helper_real,
+               [](const remove_complex<ValueType>& a,
+                  const remove_complex<ValueType>& b) { return a + b; });
+        __syncthreads();
+
+        norm = sqrt(reduction_helper_real[0]);
+        for (size_type j = tidx; j < num_cols; j += block_size) {
+            values[row * stride + j] /= norm;
+        }
+    }
+}
+
+// Forward substitution with one thread per rhs; stopped columns are skipped.
+template <typename ValueType>
+__global__
+__launch_bounds__(default_block_size) void solve_lower_triangular_kernel(
+    size_type subspace_dim, size_type nrhs,
+    const ValueType* __restrict__ m_values, size_type m_stride,
+    const ValueType* __restrict__ f_values, size_type f_stride,
+    ValueType* __restrict__ c_values, size_type c_stride,
+    const stopping_status* __restrict__ stop_status)
+{
+    const auto global_id = thread::get_thread_id_flat();
+
+    if (global_id >= nrhs) {
+        return;
+    }
+
+    if (!stop_status[global_id].has_stopped()) {
+        for (size_type row = 0; row < subspace_dim; row++) {
+            auto temp = f_values[row * f_stride + global_id];
+            for (size_type col = 0; col < row; col++) {
+                temp -= m_values[row * m_stride + col * nrhs + global_id] *
+                        c_values[col * c_stride + global_id];
+            }
+            c_values[row * c_stride + global_id] =
+                temp / m_values[row * m_stride + row * nrhs + global_id];
+        }
+    }
+}
+
+// v = residual - sum_{j >= k} c(j) * g(:, j) for every non-stopped column.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void step_1_kernel(
+    size_type k, size_type num_rows, size_type subspace_dim, size_type nrhs,
+    const ValueType* __restrict__ residual_values, size_type residual_stride,
+    const ValueType* __restrict__ c_values, size_type c_stride,
+    const ValueType* __restrict__ g_values, size_type g_stride,
+    ValueType* __restrict__ v_values, size_type v_stride,
+    const stopping_status* __restrict__ stop_status)
+{
+    const auto global_id = thread::get_thread_id_flat();
+    const auto row = global_id / nrhs;
+    const auto col = global_id % nrhs;
+
+    if (row >= num_rows) {
+        return;
+    }
+
+    if (!stop_status[col].has_stopped()) {
+        auto temp = residual_values[row * residual_stride + col];
+        for (size_type j = k; j < subspace_dim; j++) {
+            temp -= c_values[j * c_stride + col] *
+                    g_values[row * g_stride + j * nrhs + col];
+        }
+        v_values[row * v_stride + col] = temp;
+    }
+}
+
+// u(:, k) = omega * v + sum_{j >= k} c(j) * u(:, j) for non-stopped columns.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void step_2_kernel(
+    size_type k, size_type num_rows, size_type subspace_dim, size_type nrhs,
+    const ValueType* __restrict__ omega_values,
+    const ValueType* __restrict__ v_values, size_type v_stride,
+    const ValueType* __restrict__ c_values, size_type c_stride,
+    ValueType* __restrict__ u_values, size_type u_stride,
+    const stopping_status* __restrict__ stop_status)
+{
+    const auto global_id = thread::get_thread_id_flat();
+    const auto row = global_id / nrhs;
+    const auto col = global_id % nrhs;
+
+    if (row >= num_rows) {
+        return;
+    }
+
+    if (!stop_status[col].has_stopped()) {
+        auto temp = omega_values[col] * v_values[row * v_stride + col];
+        for (size_type j = k; j < subspace_dim; j++) {
+            temp += c_values[j * c_stride + col] *
+                    u_values[row * u_stride + j * nrhs + col];
+        }
+        u_values[row * u_stride + k * nrhs + col] = temp;
+    }
+}
+
+// Adds sum_i p_i[i] * g_k(i, rhs) to alpha[rhs] for each non-stopped rhs.
+template <typename ValueType>
+__global__ __launch_bounds__(default_dot_size) void multidot_kernel(
+    size_type num_rows, size_type nrhs, const ValueType* __restrict__ p_i,
+    const ValueType* __restrict__ g_k, size_type g_k_stride,
+    ValueType* __restrict__ alpha,
+    const stopping_status* __restrict__ stop_status)
+{
+    const auto tidx = threadIdx.x;
+    const auto tidy = threadIdx.y;
+    const auto rhs = blockIdx.x * default_dot_dim + tidx;
+    const auto num = ceildiv(num_rows, gridDim.y);
+    const auto start_row = blockIdx.y * num;
+    const auto end_row =
+        ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num;
+    // Used that way to get around dynamic initialization warning and
+    // template error when using `reduction_helper_array` directly in `reduce`
+    __shared__
+        uninitialized_array<ValueType, default_dot_dim*(default_dot_dim + 1)>
+            reduction_helper_array;
+    ValueType* __restrict__ reduction_helper = reduction_helper_array;
+
+    ValueType local_res = zero<ValueType>();
+    if (rhs < nrhs && !stop_status[rhs].has_stopped()) {
+        for (size_type i = start_row + tidy; i < end_row;
+             i += default_dot_dim) {
+            const auto g_idx = i * g_k_stride + rhs;
+            local_res += p_i[i] * g_k[g_idx];
+        }
+    }
+    reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res;
+    __syncthreads();
+    local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx];
+    const auto tile_block =
+        group::tiled_partition<default_dot_dim>(group::this_thread_block());
+    const auto sum =
+        reduce(tile_block, local_res,
+               [](const ValueType& a, const ValueType& b) { return a + b; });
+    const auto new_rhs = blockIdx.x * default_dot_dim + tidy;
+    if (tidx == 0 && new_rhs < nrhs && !stop_status[new_rhs].has_stopped()) {
+        atomic_add(alpha + new_rhs, sum);
+    }
+}
+
+// g_k -= fact * g[:, block i] and u[:, block k] -= fact * u[:, block i].
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void update_g_k_and_u_kernel(
+    size_type k, size_type i, size_type size, size_type nrhs,
+    const ValueType* __restrict__ alpha, const ValueType* __restrict__ m_values,
+    size_type m_stride, const ValueType* __restrict__ g_values,
+    size_type g_stride, ValueType* __restrict__ g_k_values,
+    size_type g_k_stride, ValueType* __restrict__ u_values, size_type u_stride,
+    const stopping_status* __restrict__ stop_status)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    const auto row = tidx / g_k_stride;
+    const auto rhs = tidx % g_k_stride;
+    // Flat ids are split by g_k_stride; ids outside size x nrhs have no work.
+    if (row >= size || rhs >= nrhs) {
+        return;
+    }
+    // fact = alpha[rhs] / m[i, i * nrhs + rhs]; stopped rhs are left as is.
+    if (!stop_status[rhs].has_stopped()) {
+        const auto fact = alpha[rhs] / m_values[i * m_stride + i * nrhs + rhs];
+        g_k_values[row * g_k_stride + rhs] -=
+            fact * g_values[row * g_stride + i * nrhs + rhs];
+        u_values[row * u_stride + k * nrhs + rhs] -=
+            fact * u_values[row * u_stride + i * nrhs + rhs];
+    }
+}
+
+
+// Copies g_k into column block k of g (entry [row, k * nrhs + rhs]) for each
+// right-hand side that has not stopped; one thread per (row, rhs) entry.
+// NOTE(review): the index split implies a launch over size * g_k_stride flat
+// thread ids - confirm at the launch site.
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void update_g_kernel(
+    size_type k, size_type size, size_type nrhs,
+    const ValueType* __restrict__ g_k_values, size_type g_k_stride,
+    ValueType* __restrict__ g_values, size_type g_stride,
+    const stopping_status* __restrict__ stop_status)
+{
+    const auto tidx = thread::get_thread_id_flat();
+    const auto row = tidx / g_k_stride;
+    // Same divisor as `row`: `% nrhs` made the range check below vacuous.
+    const auto rhs = tidx % g_k_stride;
+    if (row < size && rhs < nrhs && !stop_status[rhs].has_stopped()) {
+        g_values[row * g_stride + k * nrhs + rhs] =
+            g_k_values[row * g_k_stride + rhs];
+    }
+}
+
+// r -= beta * g[:, block k], x += beta * u[:, block k], plus an update of f.
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void update_x_r_and_f_kernel(
+    size_type k, size_type size, size_type subspace_dim, size_type nrhs,
+    const ValueType* __restrict__ m_values, size_type m_stride,
+    const ValueType* __restrict__ g_values, size_type g_stride,
+    const ValueType* __restrict__ u_values, size_type u_stride,
+    ValueType* __restrict__ f_values, size_type f_stride,
+    ValueType* __restrict__ r_values, size_type r_stride,
+    ValueType* __restrict__ x_values, size_type x_stride,
+    const stopping_status* __restrict__ stop_status)
+{
+    const auto global_id = thread::get_thread_id_flat();
+    const auto row = global_id / x_stride;
+    const auto col = global_id % x_stride;
+    // Flat ids are split by x_stride; ids outside size x nrhs have no work.
+    if (row >= size || col >= nrhs) {
+        return;
+    }
+    // beta = f[k, col] / m[k, k * nrhs + col], recomputed by every thread.
+    if (!stop_status[col].has_stopped()) {
+        const auto beta = f_values[k * f_stride + col] /
+                          m_values[k * m_stride + k * nrhs + col];
+        r_values[row * r_stride + col] -=
+            beta * g_values[row * g_stride + k * nrhs + col];
+        x_values[row * x_stride + col] +=
+            beta * u_values[row * u_stride + k * nrhs + col];
+        // f changes only in rows k < row < subspace_dim; row k is read-only.
+        if (k < row && k + 1 < subspace_dim && row < subspace_dim) {
+            f_values[row * f_stride + col] -=
+                beta * m_values[row * m_stride + k * nrhs + col];
+        }
+    }
+}
+
+// Per rhs: omega /= tht, then omega *= kappa / absrho when absrho < kappa.
+template <typename ValueType>
+__global__ __launch_bounds__(config::warp_size) void compute_omega_kernel(
+    size_type nrhs, const remove_complex<ValueType> kappa,
+    const ValueType* __restrict__ tht,
+    const remove_complex<ValueType>* __restrict__ residual_norm,
+    ValueType* __restrict__ omega,
+    const stopping_status* __restrict__ stop_status)
+{
+    const auto global_id = thread::get_thread_id_flat();
+    // One thread per right-hand side; ids >= nrhs exit.
+    if (global_id >= nrhs) {
+        return;
+    }
+    // thr keeps the incoming omega[global_id] from before the division by tht.
+    if (!stop_status[global_id].has_stopped()) {
+        auto thr = omega[global_id];
+        omega[global_id] /= tht[global_id];
+        auto absrho =
+            abs(thr / (sqrt(real(tht[global_id])) * residual_norm[global_id]));
+        // NOTE(review): absrho == 0 would divide by zero below - confirm.
+        if (absrho < kappa) {
+            omega[global_id] *= kappa / absrho;
+        }
+    }
+}
 
 
 namespace {
@@ -338,6 +649,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
 
 
 }  // namespace idr
-}  // namespace cuda
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
diff --git a/common/cuda_hip/solver/idr_kernels.hpp.inc b/common/cuda_hip/solver/idr_kernels.hpp.inc
deleted file mode 100644
index 465417a6edb..00000000000
--- a/common/cuda_hip/solver/idr_kernels.hpp.inc
+++ /dev/null
@@ -1,318 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void initialize_m_kernel(
-    size_type subspace_dim, size_type nrhs, ValueType* __restrict__ m_values,
-    size_type m_stride, stopping_status* __restrict__ stop_status)
-{
-    const auto global_id = thread::get_thread_id_flat();
-    const auto row = global_id / m_stride;
-    const auto col = global_id % m_stride;
-
-    if (global_id < nrhs) {
-        stop_status[global_id].reset();
-    }
-
-    if (row < subspace_dim && col < nrhs * subspace_dim) {
-        m_values[row * m_stride + col] =
-            (row == col / nrhs) ? one<ValueType>() : zero<ValueType>();
-    }
-}
-
-
-template <size_type block_size, typename ValueType>
-__global__
-__launch_bounds__(block_size) void orthonormalize_subspace_vectors_kernel(
-    size_type num_rows, size_type num_cols, ValueType* __restrict__ values,
-    size_type stride)
-{
-    const auto tidx = thread::get_thread_id_flat();
-
-    __shared__ uninitialized_array<ValueType, block_size>
-        reduction_helper_array;
-    // they are not be used in the same time.
-    ValueType* reduction_helper = reduction_helper_array;
-    auto reduction_helper_real =
-        reinterpret_cast<remove_complex<ValueType>*>(reduction_helper);
-
-    for (size_type row = 0; row < num_rows; row++) {
-        for (size_type i = 0; i < row; i++) {
-            auto dot = zero<ValueType>();
-            for (size_type j = tidx; j < num_cols; j += block_size) {
-                dot += values[row * stride + j] * conj(values[i * stride + j]);
-            }
-
-            // Ensure already finish reading this shared memory
-            __syncthreads();
-            reduction_helper[tidx] = dot;
-            reduce(
-                group::this_thread_block(), reduction_helper,
-                [](const ValueType& a, const ValueType& b) { return a + b; });
-            __syncthreads();
-
-            dot = reduction_helper[0];
-            for (size_type j = tidx; j < num_cols; j += block_size) {
-                values[row * stride + j] -= dot * values[i * stride + j];
-            }
-        }
-
-        auto norm = zero<remove_complex<ValueType>>();
-        for (size_type j = tidx; j < num_cols; j += block_size) {
-            norm += squared_norm(values[row * stride + j]);
-        }
-        // Ensure already finish reading this shared memory
-        __syncthreads();
-        reduction_helper_real[tidx] = norm;
-        reduce(group::this_thread_block(), reduction_helper_real,
-               [](const remove_complex<ValueType>& a,
-                  const remove_complex<ValueType>& b) { return a + b; });
-        __syncthreads();
-
-        norm = sqrt(reduction_helper_real[0]);
-        for (size_type j = tidx; j < num_cols; j += block_size) {
-            values[row * stride + j] /= norm;
-        }
-    }
-}
-
-
-template <typename ValueType>
-__global__
-__launch_bounds__(default_block_size) void solve_lower_triangular_kernel(
-    size_type subspace_dim, size_type nrhs,
-    const ValueType* __restrict__ m_values, size_type m_stride,
-    const ValueType* __restrict__ f_values, size_type f_stride,
-    ValueType* __restrict__ c_values, size_type c_stride,
-    const stopping_status* __restrict__ stop_status)
-{
-    const auto global_id = thread::get_thread_id_flat();
-
-    if (global_id >= nrhs) {
-        return;
-    }
-
-    if (!stop_status[global_id].has_stopped()) {
-        for (size_type row = 0; row < subspace_dim; row++) {
-            auto temp = f_values[row * f_stride + global_id];
-            for (size_type col = 0; col < row; col++) {
-                temp -= m_values[row * m_stride + col * nrhs + global_id] *
-                        c_values[col * c_stride + global_id];
-            }
-            c_values[row * c_stride + global_id] =
-                temp / m_values[row * m_stride + row * nrhs + global_id];
-        }
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void step_1_kernel(
-    size_type k, size_type num_rows, size_type subspace_dim, size_type nrhs,
-    const ValueType* __restrict__ residual_values, size_type residual_stride,
-    const ValueType* __restrict__ c_values, size_type c_stride,
-    const ValueType* __restrict__ g_values, size_type g_stride,
-    ValueType* __restrict__ v_values, size_type v_stride,
-    const stopping_status* __restrict__ stop_status)
-{
-    const auto global_id = thread::get_thread_id_flat();
-    const auto row = global_id / nrhs;
-    const auto col = global_id % nrhs;
-
-    if (row >= num_rows) {
-        return;
-    }
-
-    if (!stop_status[col].has_stopped()) {
-        auto temp = residual_values[row * residual_stride + col];
-        for (size_type j = k; j < subspace_dim; j++) {
-            temp -= c_values[j * c_stride + col] *
-                    g_values[row * g_stride + j * nrhs + col];
-        }
-        v_values[row * v_stride + col] = temp;
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void step_2_kernel(
-    size_type k, size_type num_rows, size_type subspace_dim, size_type nrhs,
-    const ValueType* __restrict__ omega_values,
-    const ValueType* __restrict__ v_values, size_type v_stride,
-    const ValueType* __restrict__ c_values, size_type c_stride,
-    ValueType* __restrict__ u_values, size_type u_stride,
-    const stopping_status* __restrict__ stop_status)
-{
-    const auto global_id = thread::get_thread_id_flat();
-    const auto row = global_id / nrhs;
-    const auto col = global_id % nrhs;
-
-    if (row >= num_rows) {
-        return;
-    }
-
-    if (!stop_status[col].has_stopped()) {
-        auto temp = omega_values[col] * v_values[row * v_stride + col];
-        for (size_type j = k; j < subspace_dim; j++) {
-            temp += c_values[j * c_stride + col] *
-                    u_values[row * u_stride + j * nrhs + col];
-        }
-        u_values[row * u_stride + k * nrhs + col] = temp;
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_dot_size) void multidot_kernel(
-    size_type num_rows, size_type nrhs, const ValueType* __restrict__ p_i,
-    const ValueType* __restrict__ g_k, size_type g_k_stride,
-    ValueType* __restrict__ alpha,
-    const stopping_status* __restrict__ stop_status)
-{
-    const auto tidx = threadIdx.x;
-    const auto tidy = threadIdx.y;
-    const auto rhs = blockIdx.x * default_dot_dim + tidx;
-    const auto num = ceildiv(num_rows, gridDim.y);
-    const auto start_row = blockIdx.y * num;
-    const auto end_row =
-        ((blockIdx.y + 1) * num > num_rows) ? num_rows : (blockIdx.y + 1) * num;
-    // Used that way to get around dynamic initialization warning and
-    // template error when using `reduction_helper_array` directly in `reduce`
-    __shared__
-        uninitialized_array<ValueType, default_dot_dim*(default_dot_dim + 1)>
-            reduction_helper_array;
-    ValueType* __restrict__ reduction_helper = reduction_helper_array;
-
-    ValueType local_res = zero<ValueType>();
-    if (rhs < nrhs && !stop_status[rhs].has_stopped()) {
-        for (size_type i = start_row + tidy; i < end_row;
-             i += default_dot_dim) {
-            const auto g_idx = i * g_k_stride + rhs;
-            local_res += p_i[i] * g_k[g_idx];
-        }
-    }
-    reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_res;
-    __syncthreads();
-    local_res = reduction_helper[tidy * (default_dot_dim + 1) + tidx];
-    const auto tile_block =
-        group::tiled_partition<default_dot_dim>(group::this_thread_block());
-    const auto sum =
-        reduce(tile_block, local_res,
-               [](const ValueType& a, const ValueType& b) { return a + b; });
-    const auto new_rhs = blockIdx.x * default_dot_dim + tidy;
-    if (tidx == 0 && new_rhs < nrhs && !stop_status[new_rhs].has_stopped()) {
-        atomic_add(alpha + new_rhs, sum);
-    }
-}
-
-
-template <size_type block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void update_g_k_and_u_kernel(
-    size_type k, size_type i, size_type size, size_type nrhs,
-    const ValueType* __restrict__ alpha, const ValueType* __restrict__ m_values,
-    size_type m_stride, const ValueType* __restrict__ g_values,
-    size_type g_stride, ValueType* __restrict__ g_k_values,
-    size_type g_k_stride, ValueType* __restrict__ u_values, size_type u_stride,
-    const stopping_status* __restrict__ stop_status)
-{
-    const auto tidx = thread::get_thread_id_flat();
-    const auto row = tidx / g_k_stride;
-    const auto rhs = tidx % g_k_stride;
-
-    if (row >= size || rhs >= nrhs) {
-        return;
-    }
-
-    if (!stop_status[rhs].has_stopped()) {
-        const auto fact = alpha[rhs] / m_values[i * m_stride + i * nrhs + rhs];
-        g_k_values[row * g_k_stride + rhs] -=
-            fact * g_values[row * g_stride + i * nrhs + rhs];
-        u_values[row * u_stride + k * nrhs + rhs] -=
-            fact * u_values[row * u_stride + i * nrhs + rhs];
-    }
-}
-
-
-template <size_type block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void update_g_kernel(
-    size_type k, size_type size, size_type nrhs,
-    const ValueType* __restrict__ g_k_values, size_type g_k_stride,
-    ValueType* __restrict__ g_values, size_type g_stride,
-    const stopping_status* __restrict__ stop_status)
-{
-    const auto tidx = thread::get_thread_id_flat();
-    const auto row = tidx / g_k_stride;
-    const auto rhs = tidx % nrhs;
-
-    if (row >= size || rhs >= nrhs) {
-        return;
-    }
-
-    if (!stop_status[rhs].has_stopped()) {
-        g_values[row * g_stride + k * nrhs + rhs] =
-            g_k_values[row * g_k_stride + rhs];
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void update_x_r_and_f_kernel(
-    size_type k, size_type size, size_type subspace_dim, size_type nrhs,
-    const ValueType* __restrict__ m_values, size_type m_stride,
-    const ValueType* __restrict__ g_values, size_type g_stride,
-    const ValueType* __restrict__ u_values, size_type u_stride,
-    ValueType* __restrict__ f_values, size_type f_stride,
-    ValueType* __restrict__ r_values, size_type r_stride,
-    ValueType* __restrict__ x_values, size_type x_stride,
-    const stopping_status* __restrict__ stop_status)
-{
-    const auto global_id = thread::get_thread_id_flat();
-    const auto row = global_id / x_stride;
-    const auto col = global_id % x_stride;
-
-    if (row >= size || col >= nrhs) {
-        return;
-    }
-
-    if (!stop_status[col].has_stopped()) {
-        const auto beta = f_values[k * f_stride + col] /
-                          m_values[k * m_stride + k * nrhs + col];
-        r_values[row * r_stride + col] -=
-            beta * g_values[row * g_stride + k * nrhs + col];
-        x_values[row * x_stride + col] +=
-            beta * u_values[row * u_stride + k * nrhs + col];
-
-        if (k < row && k + 1 < subspace_dim && row < subspace_dim) {
-            f_values[row * f_stride + col] -=
-                beta * m_values[row * m_stride + k * nrhs + col];
-        }
-    }
-}
-
-
-template <typename ValueType>
-__global__ __launch_bounds__(config::warp_size) void compute_omega_kernel(
-    size_type nrhs, const remove_complex<ValueType> kappa,
-    const ValueType* __restrict__ tht,
-    const remove_complex<ValueType>* __restrict__ residual_norm,
-    ValueType* __restrict__ omega,
-    const stopping_status* __restrict__ stop_status)
-{
-    const auto global_id = thread::get_thread_id_flat();
-
-    if (global_id >= nrhs) {
-        return;
-    }
-
-    if (!stop_status[global_id].has_stopped()) {
-        auto thr = omega[global_id];
-        omega[global_id] /= tht[global_id];
-        auto absrho =
-            abs(thr / (sqrt(real(tht[global_id])) * residual_norm[global_id]));
-
-        if (absrho < kappa) {
-            omega[global_id] *= kappa / absrho;
-        }
-    }
-}
diff --git a/common/cuda_hip/solver/multigrid_kernels.hpp.inc b/common/cuda_hip/solver/multigrid_kernels.cpp
similarity index 89%
rename from common/cuda_hip/solver/multigrid_kernels.hpp.inc
rename to common/cuda_hip/solver/multigrid_kernels.cpp
index 98b1fcfeff4..e3ccc923b2c 100644
--- a/common/cuda_hip/solver/multigrid_kernels.hpp.inc
+++ b/common/cuda_hip/solver/multigrid_kernels.cpp
@@ -2,6 +2,34 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/base/array_access.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "core/solver/multigrid_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+/**
+ * @brief The MULTIGRID solver namespace.
+ *
+ * @ingroup multigrid
+ */
+namespace multigrid {
+
+
+constexpr int default_block_size = 512;
+
+
 namespace kernel {
 
 
@@ -171,3 +199,9 @@ void kcycle_check_stop(std::shared_ptr<const DefaultExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
     GKO_DECLARE_MULTIGRID_KCYCLE_CHECK_STOP_KERNEL);
+
+
+}  // namespace multigrid
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/stop/batch_criteria.hpp.inc b/common/cuda_hip/stop/batch_criteria.hpp
similarity index 75%
rename from common/cuda_hip/stop/batch_criteria.hpp.inc
rename to common/cuda_hip/stop/batch_criteria.hpp
index 38072467765..7491a143a31 100644
--- a/common/cuda_hip/stop/batch_criteria.hpp.inc
+++ b/common/cuda_hip/stop/batch_criteria.hpp
@@ -2,6 +2,19 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#ifndef GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_
+#define GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_stop {
+
+
 /**
  * @see reference/stop/batch_criteria.hpp
  */
@@ -49,3 +62,11 @@ class SimpleAbsResidual {
 private:
     const real_type abs_tol_;
 };
+
+
+}  // namespace batch_stop
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 88ae83e9005..11c00a1f8e1 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -7,9 +7,7 @@ add_instantiation_files(. matrix/fbcsr_kernels.instantiate.cu FBCSR_INSTANTIATE)
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 target_sources(ginkgo_cuda
     PRIVATE
-    base/batch_multi_vector_kernels.cu
     base/device.cpp
-    base/device_matrix_data_kernels.cu
     base/exception.cpp
     base/executor.cpp
     base/index_set_kernels.cpp
@@ -19,59 +17,32 @@ target_sources(ginkgo_cuda
     base/stream.cpp
     base/timer.cpp
     base/version.cpp
-    components/prefix_sum_kernels.cu
-    distributed/index_map_kernels.cu
-    distributed/matrix_kernels.cu
-    distributed/partition_helpers_kernels.cu
-    distributed/partition_kernels.cu
-    distributed/vector_kernels.cu
-    factorization/cholesky_kernels.cu
-    factorization/factorization_kernels.cu
     factorization/ic_kernels.cu
     factorization/ilu_kernels.cu
-    factorization/lu_kernels.cu
-    factorization/par_ic_kernels.cu
-    factorization/par_ict_kernels.cu
-    factorization/par_ilu_kernels.cu
     factorization/par_ilut_approx_filter_kernel.cu
     factorization/par_ilut_filter_kernel.cu
     factorization/par_ilut_select_common.cu
     factorization/par_ilut_select_kernel.cu
     factorization/par_ilut_spgeam_kernel.cu
     factorization/par_ilut_sweep_kernel.cu
-    matrix/batch_csr_kernels.cu
-    matrix/batch_dense_kernels.cu
-    matrix/batch_ell_kernels.cu
-    matrix/coo_kernels.cu
     ${CSR_INSTANTIATE}
-    matrix/dense_kernels.cu
-    matrix/diagonal_kernels.cu
-    matrix/ell_kernels.cu
     ${FBCSR_INSTANTIATE}
     matrix/fft_kernels.cu
-    matrix/sellp_kernels.cu
-    matrix/sparsity_csr_kernels.cu
-    multigrid/pgm_kernels.cu
     preconditioner/batch_jacobi_kernels.cu
-    preconditioner/isai_kernels.cu
     preconditioner/jacobi_advanced_apply_kernel.cu
     preconditioner/jacobi_generate_kernel.cu
-    preconditioner/jacobi_kernels.cu
     preconditioner/jacobi_simple_apply_kernel.cu
-    reorder/rcm_kernels.cu
     solver/batch_bicgstab_kernels.cu
     solver/batch_cg_kernels.cu
-    solver/cb_gmres_kernels.cu
-    solver/idr_kernels.cu
     solver/lower_trs_kernels.cu
-    solver/multigrid_kernels.cu
     solver/upper_trs_kernels.cu
     stop/criterion_kernels.cu
     stop/residual_norm_kernels.cu
     ${GKO_UNIFIED_COMMON_SOURCES}
+    ${GKO_CUDA_HIP_COMMON_SOURCES}
     )
 # override the default language mapping for the common files, set them to CUDA
-foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES)
+foreach(source_file IN LISTS GKO_UNIFIED_COMMON_SOURCES GKO_CUDA_HIP_COMMON_SOURCES)
     set_source_files_properties(${source_file} PROPERTIES LANGUAGE CUDA)
 endforeach(source_file)
 if(GINKGO_JACOBI_FULL_OPTIMIZATIONS)
diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu
deleted file mode 100644
index 0e42278740e..00000000000
--- a/cuda/base/batch_multi_vector_kernels.cu
+++ /dev/null
@@ -1,59 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/base/batch_multi_vector_kernels.hpp"
-
-
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
-
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/range_accessors.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cublas_bindings.hpp"
-#include "cuda/base/pointer_mode_guard.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The MultiVector matrix format namespace.
- *
- * @ingroup batch_multi_vector
- */
-namespace batch_multi_vector {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_multi_vector
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/base/device_matrix_data_kernels.cu b/cuda/base/device_matrix_data_kernels.cu
deleted file mode 100644
index ed5601f57a5..00000000000
--- a/cuda/base/device_matrix_data_kernels.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/base/device_matrix_data_kernels.hpp"
-
-
-#include <thrust/copy.h>
-#include <thrust/count.h>
-#include <thrust/device_ptr.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sort.h>
-#include <thrust/tuple.h>
-
-
-#include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace components {
-
-
-#include "common/cuda_hip/base/device_matrix_data_kernels.hpp.inc"
-
-
-}  // namespace components
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/base/kernel_launch.cuh b/cuda/base/kernel_launch.cuh
deleted file mode 100644
index 7b929b9ba7c..00000000000
--- a/cuda/base/kernel_launch.cuh
+++ /dev/null
@@ -1,57 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_HPP_
-#error \
-    "This file can only be used from inside common/unified/base/kernel_launch.hpp"
-#endif
-
-
-#include <thrust/tuple.h>
-
-
-#include "accessor/cuda_helper.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-template <typename AccessorType>
-struct to_device_type_impl<gko::acc::range<AccessorType>&> {
-    using type = std::decay_t<decltype(gko::acc::as_cuda_range(
-        std::declval<gko::acc::range<AccessorType>>()))>;
-    static type map_to_device(gko::acc::range<AccessorType>& range)
-    {
-        return gko::acc::as_cuda_range(range);
-    }
-};
-
-template <typename AccessorType>
-struct to_device_type_impl<const gko::acc::range<AccessorType>&> {
-    using type = std::decay_t<decltype(gko::acc::as_cuda_range(
-        std::declval<gko::acc::range<AccessorType>>()))>;
-    static type map_to_device(const gko::acc::range<AccessorType>& range)
-    {
-        return gko::acc::as_cuda_range(range);
-    }
-};
-
-
-namespace device_std = thrust;
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/base/kernel_launch.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh
deleted file mode 100644
index 6146d7248d0..00000000000
--- a/cuda/base/kernel_launch_reduction.cuh
+++ /dev/null
@@ -1,28 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
-#error \
-    "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp"
-#endif
-
-
-#include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/base/kernel_launch_reduction.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/base/kernel_launch_solver.cuh b/cuda/base/kernel_launch_solver.cuh
deleted file mode 100644
index 0d9eaeb2653..00000000000
--- a/cuda/base/kernel_launch_solver.cuh
+++ /dev/null
@@ -1,24 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_SOLVER_HPP_
-#error \
-    "This file can only be used from inside common/unified/base/kernel_launch_solver.hpp"
-#endif
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/base/kernel_launch_solver.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/base/math.hpp b/cuda/base/math.hpp
deleted file mode 100644
index d86a85a083e..00000000000
--- a/cuda/base/math.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_BASE_MATH_HPP_
-#define GKO_CUDA_BASE_MATH_HPP_
-
-
-#include <ginkgo/core/base/math.hpp>
-
-
-#include <thrust/complex.h>
-
-
-namespace gko {
-
-
-#include "common/cuda_hip/base/math.hpp.inc"
-
-
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_BASE_MATH_HPP_
diff --git a/cuda/components/atomic.cuh b/cuda/components/atomic.cuh
deleted file mode 100644
index ad76dd0e0ce..00000000000
--- a/cuda/components/atomic.cuh
+++ /dev/null
@@ -1,29 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_ATOMIC_CUH_
-#define GKO_CUDA_COMPONENTS_ATOMIC_CUH_
-
-
-#include <type_traits>
-
-
-#include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/atomic.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_ATOMIC_CUH_
diff --git a/cuda/components/diagonal_block_manipulation.cuh b/cuda/components/diagonal_block_manipulation.cuh
deleted file mode 100644
index d748fcab2e5..00000000000
--- a/cuda/components/diagonal_block_manipulation.cuh
+++ /dev/null
@@ -1,32 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_CUH_
-#define GKO_CUDA_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_CUH_
-
-
-#include <type_traits>
-
-
-#include "cuda/base/config.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace csr {
-
-
-#include "common/cuda_hip/components/diagonal_block_manipulation.hpp.inc"
-
-
-}  // namespace csr
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_CUH_
diff --git a/cuda/components/intrinsics.cuh b/cuda/components/intrinsics.cuh
deleted file mode 100644
index d35043c34ce..00000000000
--- a/cuda/components/intrinsics.cuh
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_INTRINSICS_CUH_
-#define GKO_CUDA_COMPONENTS_INTRINSICS_CUH_
-
-
-#include <ginkgo/core/base/types.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/intrinsics.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_INTRINSICS_CUH_
diff --git a/cuda/components/merging.cuh b/cuda/components/merging.cuh
deleted file mode 100644
index 3c7f5e52d47..00000000000
--- a/cuda/components/merging.cuh
+++ /dev/null
@@ -1,28 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_MERGING_CUH_
-#define GKO_CUDA_COMPONENTS_MERGING_CUH_
-
-
-#include "core/base/utils.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/searching.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/merging.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_MERGING_CUH_
diff --git a/cuda/components/prefix_sum.cuh b/cuda/components/prefix_sum.cuh
deleted file mode 100644
index 653de4e9e15..00000000000
--- a/cuda/components/prefix_sum.cuh
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_PREFIX_SUM_CUH_
-#define GKO_CUDA_COMPONENTS_PREFIX_SUM_CUH_
-
-
-#include <type_traits>
-
-
-#include "cuda/base/cublas_bindings.hpp"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/prefix_sum.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_PREFIX_SUM_CUH_
diff --git a/cuda/components/prefix_sum_kernels.cu b/cuda/components/prefix_sum_kernels.cu
deleted file mode 100644
index d330ce0a2b0..00000000000
--- a/cuda/components/prefix_sum_kernels.cu
+++ /dev/null
@@ -1,34 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/components/prefix_sum_kernels.hpp"
-
-
-#include <limits>
-
-
-#include <thrust/scan.h>
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/exception.hpp>
-#include <ginkgo/core/base/name_demangling.hpp>
-
-
-#include "cuda/base/thrust.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace components {
-
-
-#include "common/cuda_hip/components/prefix_sum_kernels.hpp.inc"
-
-
-}  // namespace components
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh
deleted file mode 100644
index e53e1451d7f..00000000000
--- a/cuda/components/reduction.cuh
+++ /dev/null
@@ -1,82 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_REDUCTION_CUH_
-#define GKO_CUDA_COMPONENTS_REDUCTION_CUH_
-
-
-#include <type_traits>
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/executor.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/array_access.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-constexpr int default_reduce_block_size = 512;
-
-
-#include "common/cuda_hip/components/reduction.hpp.inc"
-
-
-/**
- * Compute a reduction using add operation (+).
- *
- * @param exec  Executor associated to the array
- * @param size  size of the array
- * @param source  the pointer of the array
- *
- * @return the reduction result
- */
-template <typename ValueType>
-__host__ ValueType reduce_add_array(std::shared_ptr<const CudaExecutor> exec,
-                                    size_type size, const ValueType* source)
-{
-    auto block_results_val = source;
-    size_type grid_dim = size;
-    auto block_results = array<ValueType>(exec);
-    if (size > default_reduce_block_size) {
-        const auto n = ceildiv(size, default_reduce_block_size);
-        grid_dim =
-            (n <= default_reduce_block_size) ? n : default_reduce_block_size;
-
-        block_results.resize_and_reset(grid_dim);
-
-        reduce_add_array<<<grid_dim, default_reduce_block_size, 0,
-                           exec->get_stream()>>>(
-            size, as_device_type(source),
-            as_device_type(block_results.get_data()));
-
-        block_results_val = block_results.get_const_data();
-    }
-
-    auto d_result = array<ValueType>(exec, 1);
-
-    reduce_add_array<<<1, default_reduce_block_size, 0, exec->get_stream()>>>(
-        grid_dim, as_device_type(block_results_val),
-        as_device_type(d_result.get_data()));
-    auto answer = get_element(d_result, 0);
-    return answer;
-}
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_REDUCTION_CUH_
diff --git a/cuda/components/searching.cuh b/cuda/components/searching.cuh
deleted file mode 100644
index 1dc1304a82a..00000000000
--- a/cuda/components/searching.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_SEARCHING_CUH_
-#define GKO_CUDA_COMPONENTS_SEARCHING_CUH_
-
-
-#include "cuda/base/config.hpp"
-#include "cuda/components/intrinsics.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/searching.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_SEARCHING_CUH_
diff --git a/cuda/components/segment_scan.cuh b/cuda/components/segment_scan.cuh
deleted file mode 100644
index 842f1e06760..00000000000
--- a/cuda/components/segment_scan.cuh
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_SEGMENT_SCAN_CUH_
-#define GKO_CUDA_COMPONENTS_SEGMENT_SCAN_CUH_
-
-
-#include "cuda/components/cooperative_groups.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/segment_scan.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_SEGMENT_SCAN_CUH_
diff --git a/cuda/components/sorting.cuh b/cuda/components/sorting.cuh
deleted file mode 100644
index e6eb17ec8e4..00000000000
--- a/cuda/components/sorting.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_SORTING_CUH_
-#define GKO_CUDA_COMPONENTS_SORTING_CUH_
-
-
-#include "cuda/base/config.hpp"
-#include "cuda/components/cooperative_groups.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/sorting.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_SORTING_CUH_
diff --git a/cuda/components/syncfree.cuh b/cuda/components/syncfree.cuh
deleted file mode 100644
index 0d45c8db516..00000000000
--- a/cuda/components/syncfree.cuh
+++ /dev/null
@@ -1,32 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_SYNCFREE_CUH_
-#define GKO_CUDA_COMPONENTS_SYNCFREE_CUH_
-
-
-#include <ginkgo/core/base/array.hpp>
-
-
-#include "core/components/fill_array_kernels.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/memory.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/syncfree.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_SYNCFREE_CUH_
diff --git a/cuda/components/thread_ids.cuh b/cuda/components/thread_ids.cuh
deleted file mode 100644
index 965053dd3b9..00000000000
--- a/cuda/components/thread_ids.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_THREAD_IDS_CUH_
-#define GKO_CUDA_COMPONENTS_THREAD_IDS_CUH_
-
-
-#include "cuda/base/config.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace thread {
-
-
-#include "common/cuda_hip/components/thread_ids.hpp.inc"
-
-
-}  // namespace thread
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_THREAD_IDS_CUH_
diff --git a/cuda/components/uninitialized_array.hpp b/cuda/components/uninitialized_array.hpp
deleted file mode 100644
index b98c812c16d..00000000000
--- a/cuda/components/uninitialized_array.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
-#define GKO_CUDA_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
-
-
-#include <ginkgo/core/base/types.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/uninitialized_array.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
diff --git a/cuda/components/warp_blas.cuh b/cuda/components/warp_blas.cuh
deleted file mode 100644
index fa5e3d3ae3b..00000000000
--- a/cuda/components/warp_blas.cuh
+++ /dev/null
@@ -1,33 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_COMPONENTS_WARP_BLAS_CUH_
-#define GKO_CUDA_COMPONENTS_WARP_BLAS_CUH_
-
-
-#include <cassert>
-#include <type_traits>
-
-
-#include <ginkgo/config.hpp>
-
-
-#include "cuda/base/math.hpp"
-#include "cuda/components/reduction.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-#include "common/cuda_hip/components/warp_blas.hpp.inc"
-
-
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_COMPONENTS_WARP_BLAS_CUH_
diff --git a/cuda/distributed/index_map_kernels.cu b/cuda/distributed/index_map_kernels.cu
deleted file mode 100644
index a5d838e901f..00000000000
--- a/cuda/distributed/index_map_kernels.cu
+++ /dev/null
@@ -1,42 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/index_map_kernels.hpp"
-
-
-#include <thrust/binary_search.h>
-#include <thrust/copy.h>
-#include <thrust/distance.h>
-#include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/transform_output_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/transform_reduce.h>
-#include <thrust/unique.h>
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/searching.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace index_map {
-
-
-#include "common/cuda_hip/distributed/index_map_kernels.hpp.inc"
-
-
-}  // namespace index_map
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/distributed/matrix_kernels.cu b/cuda/distributed/matrix_kernels.cu
deleted file mode 100644
index 3ad815d7090..00000000000
--- a/cuda/distributed/matrix_kernels.cu
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/matrix_kernels.hpp"
-
-
-#include <thrust/binary_search.h>
-#include <thrust/copy.h>
-#include <thrust/distance.h>
-#include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/transform_reduce.h>
-#include <thrust/unique.h>
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/atomic.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace distributed_matrix {
-
-
-#include "common/cuda_hip/distributed/matrix_kernels.hpp.inc"
-
-
-}  // namespace distributed_matrix
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/distributed/partition_helpers_kernels.cu b/cuda/distributed/partition_helpers_kernels.cu
deleted file mode 100644
index b478477ce18..00000000000
--- a/cuda/distributed/partition_helpers_kernels.cu
+++ /dev/null
@@ -1,29 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/partition_helpers_kernels.hpp"
-
-
-#include <thrust/device_ptr.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sort.h>
-
-
-#include "cuda/base/thrust.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace partition_helpers {
-
-
-#include "common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc"
-
-
-}  // namespace partition_helpers
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/distributed/partition_kernels.cu b/cuda/distributed/partition_kernels.cu
deleted file mode 100644
index de6c5bc6c02..00000000000
--- a/cuda/distributed/partition_kernels.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/partition_kernels.hpp"
-
-
-#include <thrust/count.h>
-#include <thrust/device_ptr.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/scan.h>
-#include <thrust/sort.h>
-
-
-#include "common/unified/base/kernel_launch.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "cuda/base/thrust.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace partition {
-
-
-#include "common/cuda_hip/distributed/partition_kernels.hpp.inc"
-
-
-}  // namespace partition
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/distributed/vector_kernels.cu b/cuda/distributed/vector_kernels.cu
deleted file mode 100644
index 7b06ada9f0e..00000000000
--- a/cuda/distributed/vector_kernels.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/vector_kernels.hpp"
-
-
-#include <functional>
-
-
-#include <thrust/binary_search.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/scatter.h>
-#include <thrust/tuple.h>
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-
-#include "cuda/base/thrust.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace distributed_vector {
-
-
-#include "common/cuda_hip/distributed/vector_kernels.hpp.inc"
-
-
-}  // namespace distributed_vector
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/factorization/cholesky_kernels.cu b/cuda/factorization/cholesky_kernels.cu
deleted file mode 100644
index e4ff3f4d4d5..00000000000
--- a/cuda/factorization/cholesky_kernels.cu
+++ /dev/null
@@ -1,115 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/cholesky_kernels.hpp"
-
-
-#include <algorithm>
-#include <memory>
-
-
-#include <thrust/execution_policy.h>
-#include <thrust/functional.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/transform.h>
-#include <thrust/tuple.h>
-
-
-#include <ginkgo/core/matrix/csr.hpp>
-
-
-#include "core/components/fill_array_kernels.hpp"
-#include "core/components/format_conversion_kernels.hpp"
-#include "core/factorization/elimination_forest.hpp"
-#include "core/factorization/lu_kernels.hpp"
-#include "core/matrix/csr_lookup.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/syncfree.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Cholesky namespace.
- *
- * @ingroup factor
- */
-namespace cholesky {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/factorization/cholesky_kernels.hpp.inc"
-
-
-template <typename ValueType, typename IndexType>
-void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* mtx,
-                    const factorization::elimination_forest<IndexType>& forest,
-                    IndexType* row_nnz, array<IndexType>& tmp_storage)
-{
-    const auto num_rows = static_cast<IndexType>(mtx->get_size()[0]);
-    if (num_rows == 0) {
-        return;
-    }
-    const auto mtx_nnz = static_cast<IndexType>(mtx->get_num_stored_elements());
-    tmp_storage.resize_and_reset(mtx_nnz + num_rows);
-    const auto postorder_cols = tmp_storage.get_data();
-    const auto lower_ends = postorder_cols + mtx_nnz;
-    const auto row_ptrs = mtx->get_const_row_ptrs();
-    const auto cols = mtx->get_const_col_idxs();
-    const auto inv_postorder = forest.inv_postorder.get_const_data();
-    const auto postorder_parent = forest.postorder_parents.get_const_data();
-    // transform col indices to postorder indices
-    {
-        const auto num_blocks = ceildiv(num_rows, default_block_size);
-        kernel::build_postorder_cols<<<num_blocks, default_block_size, 0,
-                                       exec->get_stream()>>>(
-            num_rows, cols, row_ptrs, inv_postorder, postorder_cols,
-            lower_ends);
-    }
-    // sort postorder_cols inside rows
-    {
-        const auto handle = exec->get_sparselib_handle();
-        auto descr = sparselib::create_mat_descr();
-        array<IndexType> permutation_array(exec, mtx_nnz);
-        auto permutation = permutation_array.get_data();
-        components::fill_seq_array(exec, permutation, mtx_nnz);
-        size_type buffer_size{};
-        sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz,
-                                       row_ptrs, postorder_cols, buffer_size);
-        array<char> buffer_array{exec, buffer_size};
-        auto buffer = buffer_array.get_data();
-        sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs,
-                           postorder_cols, permutation, buffer);
-        sparselib::destroy(descr);
-    }
-    // count nonzeros per row of L
-    {
-        const auto num_blocks =
-            ceildiv(num_rows, default_block_size / config::warp_size);
-        kernel::symbolic_count<config::warp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                num_rows, row_ptrs, lower_ends, inv_postorder, postorder_cols,
-                postorder_parent, row_nnz);
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);
-
-
-}  // namespace cholesky
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/factorization/factorization_kernels.cu b/cuda/factorization/factorization_kernels.cu
deleted file mode 100644
index ac5c14481e9..00000000000
--- a/cuda/factorization/factorization_kernels.cu
+++ /dev/null
@@ -1,43 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/factorization_kernels.hpp"
-
-
-#include <ginkgo/core/base/array.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/array_access.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/searching.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The factorization namespace.
- *
- * @ingroup factor
- */
-namespace factorization {
-
-
-constexpr int default_block_size{512};
-
-
-#include "common/cuda_hip/factorization/factorization_kernels.hpp.inc"
-
-
-}  // namespace factorization
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/factorization/lu_kernels.cu b/cuda/factorization/lu_kernels.cu
deleted file mode 100644
index 583bf51fb67..00000000000
--- a/cuda/factorization/lu_kernels.cu
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/lu_kernels.hpp"
-
-
-#include <algorithm>
-#include <memory>
-
-
-#include <thrust/copy.h>
-#include <thrust/iterator/transform_output_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-
-
-#include <ginkgo/core/matrix/csr.hpp>
-
-
-#include "core/base/allocator.hpp"
-#include "core/matrix/csr_lookup.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/syncfree.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The LU namespace.
- *
- * @ingroup factor
- */
-namespace lu_factorization {
-
-
-constexpr static int default_block_size = 512;
-
-
-#include "common/cuda_hip/factorization/lu_kernels.hpp.inc"
-
-
-}  // namespace lu_factorization
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/factorization/par_ic_kernels.cu b/cuda/factorization/par_ic_kernels.cu
deleted file mode 100644
index a9de634f1f9..00000000000
--- a/cuda/factorization/par_ic_kernels.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ic_kernels.hpp"
-
-
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-
-
-#include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/memory.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The parallel ic factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ic_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/factorization/par_ic_kernels.hpp.inc"
-
-
-}  // namespace par_ic_factorization
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu
deleted file mode 100644
index 9285e786adf..00000000000
--- a/cuda/factorization/par_ict_kernels.cu
+++ /dev/null
@@ -1,189 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ict_kernels.hpp"
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/coo_builder.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/memory.cuh"
-#include "cuda/components/merging.cuh"
-#include "cuda/components/prefix_sum.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/searching.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The parallel ICT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ict_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-// subwarp sizes for all warp-parallel kernels (filter, add_candidates)
-using compiled_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
-#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void add_candidates(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* llh,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    matrix::Csr<ValueType, IndexType>* l_new)
-{
-    auto num_rows = static_cast<IndexType>(llh->get_size()[0]);
-    auto subwarps_per_block = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
-    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
-    auto llh_row_ptrs = llh->get_const_row_ptrs();
-    auto llh_col_idxs = llh->get_const_col_idxs();
-    auto llh_vals = llh->get_const_values();
-    auto a_row_ptrs = a->get_const_row_ptrs();
-    auto a_col_idxs = a->get_const_col_idxs();
-    auto a_vals = a->get_const_values();
-    auto l_row_ptrs = l->get_const_row_ptrs();
-    auto l_col_idxs = l->get_const_col_idxs();
-    auto l_vals = l->get_const_values();
-    auto l_new_row_ptrs = l_new->get_row_ptrs();
-    // count non-zeros per row
-    if (num_blocks > 0) {
-        kernel::ict_tri_spgeam_nnz<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs,
-                l_new_row_ptrs, num_rows);
-    }
-
-    // build row ptrs
-    components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);
-
-    // resize output arrays
-    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
-    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
-    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
-
-    auto l_new_col_idxs = l_new->get_col_idxs();
-    auto l_new_vals = l_new->get_values();
-
-    // fill columns and values
-    if (num_blocks > 0) {
-        kernel::ict_tri_spgeam_init<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals),
-                a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs,
-                l_col_idxs, as_device_type(l_vals), l_new_row_ptrs,
-                l_new_col_idxs, as_device_type(l_new_vals), num_rows);
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void compute_factor(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Coo<ValueType, IndexType>* l_coo)
-{
-    auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements());
-    auto block_size = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(total_nnz, block_size);
-    if (num_blocks > 0) {
-        kernel::ict_sweep<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                a->get_const_row_ptrs(), a->get_const_col_idxs(),
-                as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
-                l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
-                as_device_type(l->get_values()),
-                static_cast<IndexType>(l->get_num_stored_elements()));
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor);
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* llh,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    matrix::Csr<ValueType, IndexType>* l_new)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz =
-        llh->get_num_stored_elements() + a->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_add_candidates(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, llh, a, l, l_new);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Coo<ValueType, IndexType>* l_coo)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz = 2 * l->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_compute_factor(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
-
-
-}  // namespace par_ict_factorization
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu
deleted file mode 100644
index 11c1ab1b3e2..00000000000
--- a/cuda/factorization/par_ilu_kernels.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ilu_kernels.hpp"
-
-
-#include <ginkgo/core/base/std_extensions.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/memory.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The parallel ilu factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilu_factorization {
-
-
-constexpr int default_block_size{512};
-
-
-#include "common/cuda_hip/factorization/par_ilu_kernels.hpp.inc"
-
-
-}  // namespace par_ilu_factorization
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/factorization/par_ilut_filter_kernels.cu b/cuda/factorization/par_ilut_filter_kernels.cu
deleted file mode 100644
index ddd4b428d55..00000000000
--- a/cuda/factorization/par_ilut_filter_kernels.cu
+++ /dev/null
@@ -1,140 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/coo_builder.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The parallel ILUT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilut_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-// subwarp sizes for filter kernels
-using compiled_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
-#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void threshold_filter(syn::value_list<int, subwarp_size>,
-                      std::shared_ptr<const DefaultExecutor> exec,
-                      const matrix::Csr<ValueType, IndexType>* a,
-                      remove_complex<ValueType> threshold,
-                      matrix::Csr<ValueType, IndexType>* m_out,
-                      matrix::Coo<ValueType, IndexType>* m_out_coo, bool lower)
-{
-    auto old_row_ptrs = a->get_const_row_ptrs();
-    auto old_col_idxs = a->get_const_col_idxs();
-    auto old_vals = a->get_const_values();
-    // compute nnz for each row
-    auto num_rows = static_cast<IndexType>(a->get_size()[0]);
-    auto block_size = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(num_rows, block_size);
-    auto new_row_ptrs = m_out->get_row_ptrs();
-    if (num_blocks > 0) {
-        kernel::threshold_filter_nnz<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                old_row_ptrs, as_device_type(old_vals), num_rows,
-                as_device_type(threshold), new_row_ptrs, lower);
-    }
-
-    // build row pointers
-    components::prefix_sum_nonnegative(exec, new_row_ptrs, num_rows + 1);
-
-    // build matrix
-    auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows);
-    // resize arrays and update aliases
-    matrix::CsrBuilder<ValueType, IndexType> builder{m_out};
-    builder.get_col_idx_array().resize_and_reset(new_nnz);
-    builder.get_value_array().resize_and_reset(new_nnz);
-    auto new_col_idxs = m_out->get_col_idxs();
-    auto new_vals = m_out->get_values();
-    IndexType* new_row_idxs{};
-    if (m_out_coo) {
-        matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
-        coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
-        coo_builder.get_col_idx_array() =
-            make_array_view(exec, new_nnz, new_col_idxs);
-        coo_builder.get_value_array() =
-            make_array_view(exec, new_nnz, new_vals);
-        new_row_idxs = m_out_coo->get_row_idxs();
-    }
-    if (num_blocks > 0) {
-        kernel::threshold_filter<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                old_row_ptrs, old_col_idxs, as_device_type(old_vals), num_rows,
-                as_device_type(threshold), new_row_ptrs, new_row_idxs,
-                new_col_idxs, as_device_type(new_vals), lower);
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter, threshold_filter);
-
-
-}  // namespace
-
-template <typename ValueType, typename IndexType>
-void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
-                      const matrix::Csr<ValueType, IndexType>* a,
-                      remove_complex<ValueType> threshold,
-                      matrix::Csr<ValueType, IndexType>* m_out,
-                      matrix::Coo<ValueType, IndexType>* m_out_coo, bool lower)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz = a->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_threshold_filter(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, a, threshold, m_out,
-        m_out_coo, lower);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/factorization/par_ilut_select_kernels.cu b/cuda/factorization/par_ilut_select_kernels.cu
deleted file mode 100644
index 6a7bd53c1c4..00000000000
--- a/cuda/factorization/par_ilut_select_kernels.cu
+++ /dev/null
@@ -1,162 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
-#include <algorithm>
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/prefix_sum.cuh"
-#include "cuda/components/searching.cuh"
-#include "cuda/components/sorting.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/factorization/par_ilut_select_common.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The parallel ILUT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilut_factorization {
-
-
-#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc"
-
-
-template <typename ValueType, typename IndexType>
-void sampleselect_filter(std::shared_ptr<const DefaultExecutor> exec,
-                         const ValueType* values, IndexType size,
-                         const unsigned char* oracles,
-                         const IndexType* partial_counts, IndexType bucket,
-                         remove_complex<ValueType>* out)
-{
-    auto num_threads_total = ceildiv(size, items_per_thread);
-    auto num_blocks =
-        static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
-    if (num_blocks > 0) {
-        kernel::filter_bucket<<<num_blocks, default_block_size, 0,
-                                exec->get_stream()>>>(
-            as_device_type(values), size, bucket, oracles, partial_counts,
-            as_device_type(out), items_per_thread);
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
-                      const matrix::Csr<ValueType, IndexType>* m,
-                      IndexType rank, array<ValueType>& tmp1,
-                      array<remove_complex<ValueType>>& tmp2,
-                      remove_complex<ValueType>& threshold)
-{
-    auto values = m->get_const_values();
-    IndexType size = m->get_num_stored_elements();
-    using AbsType = remove_complex<ValueType>;
-    constexpr auto bucket_count = kernel::searchtree_width;
-    auto max_num_threads = ceildiv(size, items_per_thread);
-    auto max_num_blocks = ceildiv(max_num_threads, default_block_size);
-
-    size_type tmp_size_totals =
-        ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType));
-    size_type tmp_size_partials = ceildiv(
-        bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType));
-    size_type tmp_size_oracles =
-        ceildiv(size * sizeof(unsigned char), sizeof(ValueType));
-    size_type tmp_size_tree =
-        ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType));
-    size_type tmp_size_vals =
-        size / bucket_count * 4;  // pessimistic estimate for temporary storage
-    size_type tmp_size =
-        tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree;
-    tmp1.resize_and_reset(tmp_size);
-    tmp2.resize_and_reset(tmp_size_vals);
-
-    auto total_counts = reinterpret_cast<IndexType*>(tmp1.get_data());
-    auto partial_counts =
-        reinterpret_cast<IndexType*>(tmp1.get_data() + tmp_size_totals);
-    auto oracles = reinterpret_cast<unsigned char*>(
-        tmp1.get_data() + tmp_size_totals + tmp_size_partials);
-    auto tree =
-        reinterpret_cast<AbsType*>(tmp1.get_data() + tmp_size_totals +
-                                   tmp_size_partials + tmp_size_oracles);
-
-    sampleselect_count(exec, values, size, tree, oracles, partial_counts,
-                       total_counts);
-
-    // determine bucket with correct rank, use bucket-local rank
-    auto bucket = sampleselect_find_bucket(exec, total_counts, rank);
-    rank -= bucket.begin;
-
-    if (bucket.size * 2 > tmp_size_vals) {
-        // we need to reallocate tmp2
-        tmp2.resize_and_reset(bucket.size * 2);
-    }
-    auto tmp21 = tmp2.get_data();
-    auto tmp22 = tmp2.get_data() + bucket.size;
-    // extract target bucket
-    sampleselect_filter(exec, values, size, oracles, partial_counts, bucket.idx,
-                        tmp22);
-
-    // recursively select from smaller buckets
-    int step{};
-    while (bucket.size > kernel::basecase_size) {
-        std::swap(tmp21, tmp22);
-        const auto* tmp_in = tmp21;
-        auto tmp_out = tmp22;
-
-        sampleselect_count(exec, tmp_in, bucket.size, tree, oracles,
-                           partial_counts, total_counts);
-        auto new_bucket = sampleselect_find_bucket(exec, total_counts, rank);
-        sampleselect_filter(exec, tmp_in, bucket.size, oracles, partial_counts,
-                            bucket.idx, tmp_out);
-
-        rank -= new_bucket.begin;
-        bucket.size = new_bucket.size;
-        // we should never need more than 5 recursion steps, this would mean
-        // 256^5 = 2^40. fall back to standard library algorithm in that case.
-        ++step;
-        if (step > 5) {
-            array<AbsType> cpu_out_array{
-                exec->get_master(),
-                make_array_view(exec, bucket.size, tmp_out)};
-            auto begin = cpu_out_array.get_data();
-            auto end = begin + bucket.size;
-            auto middle = begin + rank;
-            std::nth_element(begin, middle, end);
-            threshold = *middle;
-            return;
-        }
-    }
-
-    // base case
-    auto out_ptr = reinterpret_cast<AbsType*>(tmp1.get_data());
-    kernel::basecase_select<<<1, kernel::basecase_block_size, 0,
-                              exec->get_stream()>>>(
-        as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr));
-    threshold = exec->copy_val_to_host(out_ptr);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/factorization/par_ilut_spgeam_kernels.cu b/cuda/factorization/par_ilut_spgeam_kernels.cu
deleted file mode 100644
index 7f59e4edc37..00000000000
--- a/cuda/factorization/par_ilut_spgeam_kernels.cu
+++ /dev/null
@@ -1,159 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/coo_builder.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/merging.cuh"
-#include "cuda/components/prefix_sum.cuh"
-#include "cuda/components/searching.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The parallel ILUT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilut_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-// subwarp sizes for add_candidates kernels
-using compiled_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
-#include "common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void add_candidates(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* lu,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Csr<ValueType, IndexType>* u,
-                    matrix::Csr<ValueType, IndexType>* l_new,
-                    matrix::Csr<ValueType, IndexType>* u_new)
-{
-    auto num_rows = static_cast<IndexType>(lu->get_size()[0]);
-    auto subwarps_per_block = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
-    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
-    matrix::CsrBuilder<ValueType, IndexType> u_new_builder(u_new);
-    auto lu_row_ptrs = lu->get_const_row_ptrs();
-    auto lu_col_idxs = lu->get_const_col_idxs();
-    auto lu_vals = lu->get_const_values();
-    auto a_row_ptrs = a->get_const_row_ptrs();
-    auto a_col_idxs = a->get_const_col_idxs();
-    auto a_vals = a->get_const_values();
-    auto l_row_ptrs = l->get_const_row_ptrs();
-    auto l_col_idxs = l->get_const_col_idxs();
-    auto l_vals = l->get_const_values();
-    auto u_row_ptrs = u->get_const_row_ptrs();
-    auto u_col_idxs = u->get_const_col_idxs();
-    auto u_vals = u->get_const_values();
-    auto l_new_row_ptrs = l_new->get_row_ptrs();
-    auto u_new_row_ptrs = u_new->get_row_ptrs();
-    if (num_blocks > 0) {
-        // count non-zeros per row
-        kernel::tri_spgeam_nnz<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs,
-                l_new_row_ptrs, u_new_row_ptrs, num_rows);
-    }
-
-    // build row ptrs
-    components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);
-    components::prefix_sum_nonnegative(exec, u_new_row_ptrs, num_rows + 1);
-
-    // resize output arrays
-    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
-    auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows);
-    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
-    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
-    u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz);
-    u_new_builder.get_value_array().resize_and_reset(u_new_nnz);
-
-    auto l_new_col_idxs = l_new->get_col_idxs();
-    auto l_new_vals = l_new->get_values();
-    auto u_new_col_idxs = u_new->get_col_idxs();
-    auto u_new_vals = u_new->get_values();
-
-    if (num_blocks > 0) {
-        // fill columns and values
-        kernel::tri_spgeam_init<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs,
-                a_col_idxs, as_device_type(a_vals), l_row_ptrs, l_col_idxs,
-                as_device_type(l_vals), u_row_ptrs, u_col_idxs,
-                as_device_type(u_vals), l_new_row_ptrs, l_new_col_idxs,
-                as_device_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs,
-                as_device_type(u_new_vals), num_rows);
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* lu,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Csr<ValueType, IndexType>* u,
-                    matrix::Csr<ValueType, IndexType>* l_new,
-                    matrix::Csr<ValueType, IndexType>* u_new)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz =
-        lu->get_num_stored_elements() + a->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_add_candidates(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, lu, a, l, u, l_new,
-        u_new);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/factorization/par_ilut_sweep_kernels.cu b/cuda/factorization/par_ilut_sweep_kernels.cu
deleted file mode 100644
index 5ec8dd81325..00000000000
--- a/cuda/factorization/par_ilut_sweep_kernels.cu
+++ /dev/null
@@ -1,123 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/coo_builder.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/memory.cuh"
-#include "cuda/components/merging.cuh"
-#include "cuda/components/prefix_sum.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/searching.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The parallel ILUT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilut_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-// subwarp sizes for all warp-parallel kernels (filter, add_candidates)
-using compiled_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
-#include "common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void compute_l_u_factors(syn::value_list<int, subwarp_size>,
-                         std::shared_ptr<const DefaultExecutor> exec,
-                         const matrix::Csr<ValueType, IndexType>* a,
-                         matrix::Csr<ValueType, IndexType>* l,
-                         const matrix::Coo<ValueType, IndexType>* l_coo,
-                         matrix::Csr<ValueType, IndexType>* u,
-                         const matrix::Coo<ValueType, IndexType>* u_coo,
-                         matrix::Csr<ValueType, IndexType>* u_csc)
-{
-    auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements() +
-                                            u->get_num_stored_elements());
-    auto block_size = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(total_nnz, block_size);
-    if (num_blocks > 0) {
-        kernel::sweep<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                a->get_const_row_ptrs(), a->get_const_col_idxs(),
-                as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
-                l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
-                as_device_type(l->get_values()),
-                static_cast<IndexType>(l->get_num_stored_elements()),
-                u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(),
-                as_device_type(u->get_values()), u_csc->get_const_row_ptrs(),
-                u_csc->get_const_col_idxs(),
-                as_device_type(u_csc->get_values()),
-                static_cast<IndexType>(u->get_num_stored_elements()));
-    }
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors,
-                                    compute_l_u_factors);
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
-                         const matrix::Csr<ValueType, IndexType>* a,
-                         matrix::Csr<ValueType, IndexType>* l,
-                         const matrix::Coo<ValueType, IndexType>* l_coo,
-                         matrix::Csr<ValueType, IndexType>* u,
-                         const matrix::Coo<ValueType, IndexType>* u_coo,
-                         matrix::Csr<ValueType, IndexType>* u_csc)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz =
-        l->get_num_stored_elements() + u->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_compute_l_u_factors(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo,
-        u_csc);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/log/batch_logger.cuh b/cuda/log/batch_logger.cuh
deleted file mode 100644
index 3e53d6ef0a6..00000000000
--- a/cuda/log/batch_logger.cuh
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_LOG_BATCH_LOGGER_CUH_
-#define GKO_CUDA_LOG_BATCH_LOGGER_CUH_
-
-
-#include <ginkgo/core/base/types.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace batch_log {
-
-
-#include "common/cuda_hip/log/batch_logger.hpp.inc"
-
-
-}  // namespace batch_log
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_CUDA_LOG_BATCH_LOGGER_CUH_
diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu
deleted file mode 100644
index 0d7da274ca8..00000000000
--- a/cuda/matrix/batch_csr_kernels.cu
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_csr_kernels.hpp"
-
-
-#include <thrust/functional.h>
-
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_csr.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-#include "cuda/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Csr matrix format namespace.
- * @ref Csr
- * @ingroup batch_csr
- */
-namespace batch_csr {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_csr
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu
deleted file mode 100644
index ea10d088b32..00000000000
--- a/cuda/matrix/batch_dense_kernels.cu
+++ /dev/null
@@ -1,59 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_dense_kernels.hpp"
-
-
-#include <thrust/functional.h>
-
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/batch_dense.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-#include "cuda/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Dense matrix format namespace.
- *
- * @ingroup batch_dense
- */
-namespace batch_dense {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc"
-
-
-// clang-format on
-
-
-}  // namespace batch_dense
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu
deleted file mode 100644
index 15d6d6bbd5b..00000000000
--- a/cuda/matrix/batch_ell_kernels.cu
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_ell_kernels.hpp"
-
-
-#include <thrust/functional.h>
-
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_ell.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-#include "cuda/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Ell matrix format namespace.
- * @ref Ell
- * @ingroup batch_ell
- */
-namespace batch_ell {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_ell
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/coo_kernels.cu b/cuda/matrix/coo_kernels.cu
deleted file mode 100644
index 38df6a91c9f..00000000000
--- a/cuda/matrix/coo_kernels.cu
+++ /dev/null
@@ -1,49 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/coo_kernels.hpp"
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/format_conversion.cuh"
-#include "cuda/components/segment_scan.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Coordinate matrix format namespace.
- *
- * @ingroup coo
- */
-namespace coo {
-
-
-constexpr int warps_in_block = 4;
-constexpr int spmv_block_size = warps_in_block * config::warp_size;
-
-
-#include "common/cuda_hip/matrix/coo_kernels.hpp.inc"
-
-
-}  // namespace coo
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
deleted file mode 100644
index 2d159282e31..00000000000
--- a/cuda/matrix/dense_kernels.cu
+++ /dev/null
@@ -1,232 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/dense_kernels.hpp"
-
-
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/range_accessors.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/diagonal.hpp>
-#include <ginkgo/core/matrix/ell.hpp>
-#include <ginkgo/core/matrix/fbcsr.hpp>
-#include <ginkgo/core/matrix/hybrid.hpp>
-#include <ginkgo/core/matrix/sellp.hpp>
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/utils.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cublas_bindings.hpp"
-#include "cuda/base/pointer_mode_guard.hpp"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Dense matrix format namespace.
- *
- * @ingroup dense
- */
-namespace dense {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/matrix/dense_kernels.hpp.inc"
-
-
-template <typename ValueType>
-void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
-                          const matrix::Dense<ValueType>* x,
-                          const matrix::Dense<ValueType>* y,
-                          matrix::Dense<ValueType>* result, array<char>& tmp)
-{
-    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
-        if (blas::is_supported<ValueType>::value) {
-            auto handle = exec->get_blas_handle();
-            blas::dot(handle, x->get_size()[0], x->get_const_values(),
-                      x->get_stride(), y->get_const_values(), y->get_stride(),
-                      result->get_values());
-        } else {
-            compute_dot(exec, x, y, result, tmp);
-        }
-    } else {
-        compute_dot(exec, x, y, result, tmp);
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
-
-
-template <typename ValueType>
-void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
-                               const matrix::Dense<ValueType>* x,
-                               const matrix::Dense<ValueType>* y,
-                               matrix::Dense<ValueType>* result,
-                               array<char>& tmp)
-{
-    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
-        if (blas::is_supported<ValueType>::value) {
-            auto handle = exec->get_blas_handle();
-            blas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
-                           x->get_stride(), y->get_const_values(),
-                           y->get_stride(), result->get_values());
-        } else {
-            compute_conj_dot(exec, x, y, result, tmp);
-        }
-    } else {
-        compute_conj_dot(exec, x, y, result, tmp);
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
-
-
-template <typename ValueType>
-void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
-                            const matrix::Dense<ValueType>* x,
-                            matrix::Dense<remove_complex<ValueType>>* result,
-                            array<char>& tmp)
-{
-    if (x->get_size()[1] == 1) {
-        if (blas::is_supported<ValueType>::value) {
-            auto handle = exec->get_blas_handle();
-            blas::norm2(handle, x->get_size()[0], x->get_const_values(),
-                        x->get_stride(), result->get_values());
-        } else {
-            compute_norm2(exec, x, result, tmp);
-        }
-    } else {
-        compute_norm2(exec, x, result, tmp);
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
-
-
-template <typename ValueType>
-void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
-                  const matrix::Dense<ValueType>* a,
-                  const matrix::Dense<ValueType>* b,
-                  matrix::Dense<ValueType>* c)
-{
-    if (blas::is_supported<ValueType>::value) {
-        auto handle = exec->get_blas_handle();
-        if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
-            if (a->get_size()[1] > 0) {
-                blas::pointer_mode_guard pm_guard(handle);
-                auto alpha = one<ValueType>();
-                auto beta = zero<ValueType>();
-                blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1],
-                           c->get_size()[0], a->get_size()[1], &alpha,
-                           b->get_const_values(), b->get_stride(),
-                           a->get_const_values(), a->get_stride(), &beta,
-                           c->get_values(), c->get_stride());
-            } else {
-                dense::fill(exec, c, zero<ValueType>());
-            }
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
-
-
-template <typename ValueType>
-void apply(std::shared_ptr<const DefaultExecutor> exec,
-           const matrix::Dense<ValueType>* alpha,
-           const matrix::Dense<ValueType>* a, const matrix::Dense<ValueType>* b,
-           const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
-{
-    if (blas::is_supported<ValueType>::value) {
-        if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
-            if (a->get_size()[1] > 0) {
-                blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N,
-                           c->get_size()[1], c->get_size()[0], a->get_size()[1],
-                           alpha->get_const_values(), b->get_const_values(),
-                           b->get_stride(), a->get_const_values(),
-                           a->get_stride(), beta->get_const_values(),
-                           c->get_values(), c->get_stride());
-            } else {
-                dense::scale(exec, beta, c);
-            }
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
-
-
-template <typename ValueType>
-void transpose(std::shared_ptr<const DefaultExecutor> exec,
-               const matrix::Dense<ValueType>* orig,
-               matrix::Dense<ValueType>* trans)
-{
-    if (blas::is_supported<ValueType>::value) {
-        auto handle = exec->get_blas_handle();
-        if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
-            blas::pointer_mode_guard pm_guard(handle);
-            auto alpha = one<ValueType>();
-            auto beta = zero<ValueType>();
-            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0],
-                       orig->get_size()[1], &alpha, orig->get_const_values(),
-                       orig->get_stride(), &beta, trans->get_const_values(),
-                       trans->get_stride(), trans->get_values(),
-                       trans->get_stride());
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-};
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
-
-
-template <typename ValueType>
-void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Dense<ValueType>* orig,
-                    matrix::Dense<ValueType>* trans)
-{
-    if (blas::is_supported<ValueType>::value) {
-        auto handle = exec->get_blas_handle();
-        if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
-            blas::pointer_mode_guard pm_guard(handle);
-            auto alpha = one<ValueType>();
-            auto beta = zero<ValueType>();
-            blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0],
-                       orig->get_size()[1], &alpha, orig->get_const_values(),
-                       orig->get_stride(), &beta, trans->get_const_values(),
-                       trans->get_stride(), trans->get_values(),
-                       trans->get_stride());
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
-
-
-}  // namespace dense
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/diagonal_kernels.cu b/cuda/matrix/diagonal_kernels.cu
deleted file mode 100644
index 7eaa35a638a..00000000000
--- a/cuda/matrix/diagonal_kernels.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/diagonal_kernels.hpp"
-
-
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Diagonal matrix format namespace.
- *
- * @ingroup diagonal
- */
-namespace diagonal {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/matrix/diagonal_kernels.hpp.inc"
-
-
-}  // namespace diagonal
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/fbcsr_kernels.template.cu b/cuda/matrix/fbcsr_kernels.template.cu
deleted file mode 100644
index 3ad7f106049..00000000000
--- a/cuda/matrix/fbcsr_kernels.template.cu
+++ /dev/null
@@ -1,303 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/fbcsr_kernels.hpp"
-
-
-#include <algorithm>
-
-
-#include <thrust/copy.h>
-#include <thrust/count.h>
-#include <thrust/device_ptr.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_output_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sort.h>
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/unified/base/kernel_launch.hpp"
-#include "core/base/array_access.hpp"
-#include "core/base/block_sizes.hpp"
-#include "core/base/device_matrix_data_kernels.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/components/format_conversion_kernels.hpp"
-#include "core/matrix/csr_lookup.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cublas_bindings.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
-#include "cuda/base/cusparse_block_bindings.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/base/pointer_mode_guard.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/merging.cuh"
-#include "cuda/components/prefix_sum.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-
-
-/**
- * @brief The fixed-size block compressed sparse row matrix format namespace.
- *
- * @ingroup fbcsr
- */
-namespace fbcsr {
-
-
-constexpr int default_block_size{512};
-
-
-#include "common/cuda_hip/matrix/csr_common.hpp.inc"
-#include "common/cuda_hip/matrix/fbcsr_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <typename ValueType>
-void dense_transpose(std::shared_ptr<const CudaExecutor> exec,
-                     const size_type nrows, const size_type ncols,
-                     const size_type orig_stride, const ValueType* const orig,
-                     const size_type trans_stride, ValueType* const trans)
-{
-    if (nrows == 0) {
-        return;
-    }
-    if (blas::is_supported<ValueType>::value) {
-        auto handle = exec->get_blas_handle();
-        {
-            blas::pointer_mode_guard pm_guard(handle);
-            auto alpha = one<ValueType>();
-            auto beta = zero<ValueType>();
-            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig,
-                       orig_stride, &beta, trans, trans_stride, trans,
-                       trans_stride);
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void spmv(std::shared_ptr<const CudaExecutor> exec,
-          const matrix::Fbcsr<ValueType, IndexType>* const a,
-          const matrix::Dense<ValueType>* const b,
-          matrix::Dense<ValueType>* const c)
-{
-    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
-        // empty output: nothing to do
-        return;
-    }
-    if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) {
-        // empty input: fill output with zero
-        dense::fill(exec, c, zero<ValueType>());
-        return;
-    }
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_sparselib_handle();
-        sparselib::pointer_mode_guard pm_guard(handle);
-        const auto alpha = one<ValueType>();
-        const auto beta = zero<ValueType>();
-        auto descr = sparselib::create_mat_descr();
-        const auto row_ptrs = a->get_const_row_ptrs();
-        const auto col_idxs = a->get_const_col_idxs();
-        const auto values = a->get_const_values();
-        const int bs = a->get_block_size();
-        const IndexType mb = a->get_num_block_rows();
-        const IndexType nb = a->get_num_block_cols();
-        const auto nnzb = static_cast<IndexType>(a->get_num_stored_blocks());
-        const auto nrhs = static_cast<IndexType>(b->get_size()[1]);
-        const auto nrows = a->get_size()[0];
-        const auto ncols = a->get_size()[1];
-        const auto in_stride = b->get_stride();
-        const auto out_stride = c->get_stride();
-        if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
-            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
-                             nnzb, &alpha, descr, values, row_ptrs, col_idxs,
-                             bs, b->get_const_values(), &beta, c->get_values());
-        } else {
-            const auto trans_stride = nrows;
-            auto trans_c = array<ValueType>(exec, nrows * nrhs);
-            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
-                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
-                             &alpha, descr, values, row_ptrs, col_idxs, bs,
-                             b->get_const_values(), in_stride, &beta,
-                             trans_c.get_data(), trans_stride);
-            dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
-                            out_stride, c->get_values());
-        }
-        sparselib::destroy(descr);
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
-                   const matrix::Dense<ValueType>* const alpha,
-                   const matrix::Fbcsr<ValueType, IndexType>* const a,
-                   const matrix::Dense<ValueType>* const b,
-                   const matrix::Dense<ValueType>* const beta,
-                   matrix::Dense<ValueType>* const c)
-{
-    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
-        // empty output: nothing to do
-        return;
-    }
-    if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) {
-        // empty input: scale output
-        dense::scale(exec, beta, c);
-        return;
-    }
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_sparselib_handle();
-        const auto alphp = alpha->get_const_values();
-        const auto betap = beta->get_const_values();
-        auto descr = sparselib::create_mat_descr();
-        const auto row_ptrs = a->get_const_row_ptrs();
-        const auto col_idxs = a->get_const_col_idxs();
-        const auto values = a->get_const_values();
-        const int bs = a->get_block_size();
-        const IndexType mb = a->get_num_block_rows();
-        const IndexType nb = a->get_num_block_cols();
-        const auto nnzb = static_cast<IndexType>(a->get_num_stored_blocks());
-        const auto nrhs = static_cast<IndexType>(b->get_size()[1]);
-        const auto nrows = a->get_size()[0];
-        const auto ncols = a->get_size()[1];
-        const auto in_stride = b->get_stride();
-        const auto out_stride = c->get_stride();
-        if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
-            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
-                             nnzb, alphp, descr, values, row_ptrs, col_idxs, bs,
-                             b->get_const_values(), betap, c->get_values());
-        } else {
-            const auto trans_stride = nrows;
-            auto trans_c = array<ValueType>(exec, nrows * nrhs);
-            dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(),
-                            trans_stride, trans_c.get_data());
-            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
-                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
-                             alphp, descr, values, row_ptrs, col_idxs, bs,
-                             b->get_const_values(), in_stride, betap,
-                             trans_c.get_data(), trans_stride);
-            dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
-                            out_stride, c->get_values());
-        }
-        sparselib::destroy(descr);
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-
-namespace {
-
-
-template <int mat_blk_sz, typename ValueType, typename IndexType>
-void transpose_blocks_impl(syn::value_list<int, mat_blk_sz>,
-                           std::shared_ptr<const DefaultExecutor> exec,
-                           matrix::Fbcsr<ValueType, IndexType>* const mat)
-{
-    constexpr int subwarp_size = config::warp_size;
-    const auto nbnz = mat->get_num_stored_blocks();
-    const auto numthreads = nbnz * subwarp_size;
-    const auto block_size = default_block_size;
-    const auto grid_dim = ceildiv(numthreads, block_size);
-    if (grid_dim > 0) {
-        kernel::transpose_blocks<mat_blk_sz, subwarp_size>
-            <<<grid_dim, block_size, 0, exec->get_stream()>>>(
-                nbnz, mat->get_values());
-    }
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks,
-                                    transpose_blocks_impl);
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void transpose(const std::shared_ptr<const CudaExecutor> exec,
-               const matrix::Fbcsr<ValueType, IndexType>* const orig,
-               matrix::Fbcsr<ValueType, IndexType>* const trans)
-{
-#ifdef GKO_COMPILING_CUDA
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        const int bs = orig->get_block_size();
-        const IndexType nnzb =
-            static_cast<IndexType>(orig->get_num_stored_blocks());
-        cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
-        cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
-        const IndexType buffer_size = sparselib::bsr_transpose_buffersize(
-            exec->get_sparselib_handle(), orig->get_num_block_rows(),
-            orig->get_num_block_cols(), nnzb, orig->get_const_values(),
-            orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs);
-        array<char> buffer_array(exec, buffer_size);
-        auto buffer = buffer_array.get_data();
-        sparselib::bsr_transpose(
-            exec->get_sparselib_handle(), orig->get_num_block_rows(),
-            orig->get_num_block_cols(), nnzb, orig->get_const_values(),
-            orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs,
-            trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(),
-            copyValues, idxBase, buffer);
-
-        // transpose blocks
-        select_transpose_blocks(
-            fixedblock::compiled_kernels(),
-            [bs](int compiled_block_size) { return bs == compiled_block_size; },
-            syn::value_list<int>(), syn::type_list<>(), exec, trans);
-    } else
-#endif
-    {
-        fallback_transpose(exec, orig, trans);
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
-                    const matrix::Fbcsr<ValueType, IndexType>* orig,
-                    matrix::Fbcsr<ValueType, IndexType>* trans)
-{
-    const int grid_size =
-        ceildiv(trans->get_num_stored_elements(), default_block_size);
-    transpose(exec, orig, trans);
-    if (grid_size > 0 && is_complex<ValueType>()) {
-        kernel::
-            conjugate<<<grid_size, default_block_size, 0, exec->get_stream()>>>(
-                trans->get_num_stored_elements(),
-                as_device_type(trans->get_values()));
-    }
-}
-
-
-}  // namespace fbcsr
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu
deleted file mode 100644
index 4dcc756a186..00000000000
--- a/cuda/matrix/sellp_kernels.cu
+++ /dev/null
@@ -1,44 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/sellp_kernels.hpp"
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The SELL-P matrix format namespace.
- *
- * @ingroup sellp
- */
-namespace sellp {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/matrix/sellp_kernels.hpp.inc"
-
-
-}  // namespace sellp
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu
deleted file mode 100644
index 8176581859b..00000000000
--- a/cuda/matrix/sparsity_csr_kernels.cu
+++ /dev/null
@@ -1,226 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/sparsity_csr_kernels.hpp"
-
-
-#include <thrust/sort.h>
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-
-#include "accessor/cuda_helper.hpp"
-#include "accessor/reduced_row_major.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/mixed_precision_types.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/components/format_conversion_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Compressed sparse row matrix format namespace.
- *
- * @ingroup sparsity
- */
-namespace sparsity_csr {
-
-
-constexpr int classical_oversubscription = 32;
-constexpr int default_block_size = 512;
-#ifdef GKO_COMPILING_HIP
-constexpr int spmv_block_size = 256;
-#else
-constexpr int spmv_block_size = 128;
-#endif
-constexpr int warps_in_block = 4;
-
-
-using classical_kernels = syn::value_list<int, 2>;
-
-
-#include "common/cuda_hip/matrix/csr_common.hpp.inc"
-#include "common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc"
-
-
-namespace host_kernel {
-
-
-template <int subwarp_size, typename MatrixValueType, typename InputValueType,
-          typename OutputValueType, typename IndexType>
-void classical_spmv(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const CudaExecutor> exec,
-                    const matrix::SparsityCsr<MatrixValueType, IndexType>* a,
-                    const matrix::Dense<InputValueType>* b,
-                    matrix::Dense<OutputValueType>* c,
-                    const matrix::Dense<MatrixValueType>* alpha = nullptr,
-                    const matrix::Dense<OutputValueType>* beta = nullptr)
-{
-    using arithmetic_type =
-        highest_precision<InputValueType, OutputValueType, MatrixValueType>;
-    using input_accessor =
-        gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>;
-    using output_accessor =
-        gko::acc::reduced_row_major<2, arithmetic_type, OutputValueType>;
-
-    const auto nwarps = exec->get_num_warps_per_sm() *
-                        exec->get_num_multiprocessor() *
-                        classical_oversubscription;
-    const auto gridx =
-        std::min(ceildiv(a->get_size()[0], spmv_block_size / subwarp_size),
-                 int64(nwarps / warps_in_block));
-    const dim3 grid(gridx, b->get_size()[1]);
-    const auto block = spmv_block_size;
-
-    const auto b_vals = gko::acc::range<input_accessor>(
-        std::array<acc::size_type, 2>{
-            {static_cast<acc::size_type>(b->get_size()[0]),
-             static_cast<acc::size_type>(b->get_size()[1])}},
-        b->get_const_values(),
-        std::array<acc::size_type, 1>{
-            {static_cast<acc::size_type>(b->get_stride())}});
-    auto c_vals = gko::acc::range<output_accessor>(
-        std::array<acc::size_type, 2>{
-            {static_cast<acc::size_type>(c->get_size()[0]),
-             static_cast<acc::size_type>(c->get_size()[1])}},
-        c->get_values(),
-        std::array<acc::size_type, 1>{
-            {static_cast<acc::size_type>(c->get_stride())}});
-    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
-        // empty output: nothing to do
-        return;
-    }
-    if (alpha == nullptr && beta == nullptr) {
-        kernel::abstract_classical_spmv<subwarp_size>
-            <<<grid, block, 0, exec->get_stream()>>>(
-                a->get_size()[0], as_device_type(a->get_const_value()),
-                a->get_const_col_idxs(),
-                as_device_type(a->get_const_row_ptrs()),
-                acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals));
-    } else if (alpha != nullptr && beta != nullptr) {
-        kernel::abstract_classical_spmv<subwarp_size>
-            <<<grid, block, 0, exec->get_stream()>>>(
-                a->get_size()[0], as_device_type(alpha->get_const_values()),
-                as_device_type(a->get_const_value()), a->get_const_col_idxs(),
-                as_device_type(a->get_const_row_ptrs()),
-                acc::as_cuda_range(b_vals),
-                as_device_type(beta->get_const_values()),
-                acc::as_cuda_range(c_vals));
-    } else {
-        GKO_KERNEL_NOT_FOUND;
-    }
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv);
-
-
-}  // namespace host_kernel
-
-template <typename MatrixValueType, typename InputValueType,
-          typename OutputValueType, typename IndexType>
-void spmv(std::shared_ptr<const CudaExecutor> exec,
-          const matrix::SparsityCsr<MatrixValueType, IndexType>* a,
-          const matrix::Dense<InputValueType>* b,
-          matrix::Dense<OutputValueType>* c)
-{
-    host_kernel::select_classical_spmv(
-        classical_kernels(), [](int compiled_info) { return true; },
-        syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
-}
-
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_SPARSITY_CSR_SPMV_KERNEL);
-
-
-template <typename MatrixValueType, typename InputValueType,
-          typename OutputValueType, typename IndexType>
-void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
-                   const matrix::Dense<MatrixValueType>* alpha,
-                   const matrix::SparsityCsr<MatrixValueType, IndexType>* a,
-                   const matrix::Dense<InputValueType>* b,
-                   const matrix::Dense<OutputValueType>* beta,
-                   matrix::Dense<OutputValueType>* c)
-{
-    host_kernel::select_classical_spmv(
-        classical_kernels(), [](int compiled_info) { return true; },
-        syn::value_list<int>(), syn::type_list<>(), exec, a, b, c, alpha, beta);
-}
-
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
-                          matrix::SparsityCsr<ValueType, IndexType>* to_sort)
-{
-    const auto nnz = static_cast<IndexType>(to_sort->get_num_nonzeros());
-    const auto num_rows = static_cast<IndexType>(to_sort->get_size()[0]);
-    const auto num_cols = static_cast<IndexType>(to_sort->get_size()[1]);
-    const auto row_ptrs = to_sort->get_const_row_ptrs();
-    const auto col_idxs = to_sort->get_col_idxs();
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        const auto handle = exec->get_sparselib_handle();
-        auto descr = sparselib::create_mat_descr();
-        array<IndexType> permutation_array(exec, to_sort->get_num_nonzeros());
-        auto permutation = permutation_array.get_data();
-        components::fill_seq_array(exec, permutation,
-                                   to_sort->get_num_nonzeros());
-        size_type buffer_size{};
-        sparselib::csrsort_buffer_size(handle, num_rows, num_cols, nnz,
-                                       row_ptrs, col_idxs, buffer_size);
-        array<char> buffer_array{exec, buffer_size};
-        auto buffer = buffer_array.get_data();
-        sparselib::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs,
-                           col_idxs, permutation, buffer);
-        sparselib::destroy(descr);
-    } else {
-        fallback_sort(exec, to_sort);
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
-
-
-template <typename ValueType, typename IndexType>
-void is_sorted_by_column_index(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::SparsityCsr<ValueType, IndexType>* to_check, bool* is_sorted)
-{
-    *is_sorted = true;
-    auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted);
-    auto gpu_array = array<bool>{exec, cpu_array};
-    const auto num_rows = static_cast<IndexType>(to_check->get_size()[0]);
-    auto num_blocks = ceildiv(num_rows, default_block_size);
-    if (num_blocks > 0) {
-        kernel::check_unsorted<<<num_blocks, default_block_size, 0,
-                                 exec->get_stream()>>>(
-            to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(),
-            num_rows, gpu_array.get_data());
-    }
-    cpu_array = gpu_array;
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX);
-
-
-}  // namespace sparsity_csr
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/multigrid/pgm_kernels.cu b/cuda/multigrid/pgm_kernels.cu
deleted file mode 100644
index 1b3915c82e9..00000000000
--- a/cuda/multigrid/pgm_kernels.cu
+++ /dev/null
@@ -1,43 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/multigrid/pgm_kernels.hpp"
-
-
-#include <memory>
-
-
-#include <thrust/device_ptr.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/reduce.h>
-#include <thrust/sort.h>
-#include <thrust/tuple.h>
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-
-
-#include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The PGM solver namespace.
- *
- * @ingroup pgm
- */
-namespace pgm {
-
-
-#include "common/cuda_hip/multigrid/pgm_kernels.hpp.inc"
-
-
-}  // namespace pgm
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/preconditioner/isai_kernels.cu b/cuda/preconditioner/isai_kernels.cu
deleted file mode 100644
index 0912b4c25f5..00000000000
--- a/cuda/preconditioner/isai_kernels.cu
+++ /dev/null
@@ -1,49 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/preconditioner/isai_kernels.hpp"
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/executor.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/merging.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-#include "cuda/components/warp_blas.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Isai preconditioner namespace.
- * @ref Isai
- * @ingroup isai
- */
-namespace isai {
-
-
-constexpr int subwarp_size{row_size_limit};
-constexpr int subwarps_per_block{2};
-constexpr int default_block_size{subwarps_per_block * subwarp_size};
-
-
-#include "common/cuda_hip/preconditioner/isai_kernels.hpp.inc"
-
-
-}  // namespace isai
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu
deleted file mode 100644
index dfe8d042b29..00000000000
--- a/cuda/preconditioner/jacobi_kernels.cu
+++ /dev/null
@@ -1,51 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/extended_float.hpp"
-#include "core/preconditioner/jacobi_utils.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/preconditioner/jacobi_common.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The Jacobi preconditioner namespace.
- * @ref Jacobi
- * @ingroup jacobi
- */
-namespace jacobi {
-
-
-// a total of 32/16 warps (1024 threads)
-#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
-constexpr int default_num_warps = 16;
-#else  // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC
-constexpr int default_num_warps = 32;
-#endif
-// with current architectures, at most 32 warps can be scheduled per SM (and
-// current GPUs have at most 84 SMs)
-constexpr int default_grid_size = 32 * 32 * 128;
-
-
-#include "common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc"
-
-
-}  // namespace jacobi
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/reorder/rcm_kernels.cu b/cuda/reorder/rcm_kernels.cu
deleted file mode 100644
index d699d00dfb6..00000000000
--- a/cuda/reorder/rcm_kernels.cu
+++ /dev/null
@@ -1,54 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/reorder/rcm_kernels.hpp"
-
-
-#include <thrust/binary_search.h>
-#include <thrust/copy.h>
-#include <thrust/count.h>
-#include <thrust/iterator/discard_iterator.h>
-#include <thrust/iterator/permutation_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/reduce.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/transform.h>
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/std_extensions.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/permutation.hpp>
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
-
-
-#include "core/base/array_access.hpp"
-#include "cuda/base/thrust.cuh"
-#include "cuda/components/memory.cuh"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The reordering namespace.
- *
- * @ingroup reorder
- */
-namespace rcm {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/reorder/rcm_kernels.hpp.inc"
-
-
-}  // namespace rcm
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu
deleted file mode 100644
index 3cbe036f55f..00000000000
--- a/cuda/solver/cb_gmres_kernels.cu
+++ /dev/null
@@ -1,507 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/solver/cb_gmres_kernels.hpp"
-
-
-#include <algorithm>
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/stop/stopping_status.hpp>
-
-
-#include "accessor/cuda_helper.hpp"
-#include "accessor/range.hpp"
-#include "accessor/reduced_row_major.hpp"
-#include "accessor/scaled_reduced_row_major.hpp"
-#include "core/base/array_access.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "core/solver/cb_gmres_accessor.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The CB_GMRES solver namespace.
- *
- * @ingroup cb_gmres
- */
-namespace cb_gmres {
-
-
-constexpr int default_block_size = 512;
-// default_dot_dim can not be 64 in hip because 64 * 64 exceeds their max block
-// size limit.
-constexpr int default_dot_dim = 32;
-constexpr int default_dot_size = default_dot_dim * default_dot_dim;
-
-
-#include "common/cuda_hip/solver/cb_gmres_kernels.hpp.inc"
-
-
-template <typename ValueType>
-void zero_matrix(std::shared_ptr<const DefaultExecutor> exec, size_type m,
-                 size_type n, size_type stride, ValueType* array)
-{
-    const auto block_size = default_block_size;
-    const auto grid_size = ceildiv(n, block_size);
-    zero_matrix_kernel<<<grid_size, block_size, 0, exec->get_stream()>>>(
-        m, n, stride, as_device_type(array));
-}
-
-
-template <typename ValueType>
-void initialize(std::shared_ptr<const DefaultExecutor> exec,
-                const matrix::Dense<ValueType>* b,
-                matrix::Dense<ValueType>* residual,
-                matrix::Dense<ValueType>* givens_sin,
-                matrix::Dense<ValueType>* givens_cos,
-                array<stopping_status>* stop_status, size_type krylov_dim)
-{
-    const auto num_threads = std::max(b->get_size()[0] * b->get_stride(),
-                                      krylov_dim * b->get_size()[1]);
-    const auto grid_dim = ceildiv(num_threads, default_block_size);
-    const auto block_dim = default_block_size;
-    constexpr auto block_size = default_block_size;
-
-    initialize_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            b->get_size()[0], b->get_size()[1], krylov_dim,
-            as_device_type(b->get_const_values()), b->get_stride(),
-            as_device_type(residual->get_values()), residual->get_stride(),
-            as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
-            as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
-            as_device_type(stop_status->get_data()));
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL);
-
-
-template <typename ValueType, typename Accessor3d>
-void restart(std::shared_ptr<const DefaultExecutor> exec,
-             const matrix::Dense<ValueType>* residual,
-             matrix::Dense<remove_complex<ValueType>>* residual_norm,
-             matrix::Dense<ValueType>* residual_norm_collection,
-             matrix::Dense<remove_complex<ValueType>>* arnoldi_norm,
-             Accessor3d krylov_bases,
-             matrix::Dense<ValueType>* next_krylov_basis,
-             array<size_type>* final_iter_nums, array<char>& reduction_tmp,
-             size_type krylov_dim)
-{
-    constexpr bool use_scalar =
-        gko::cb_gmres::detail::has_3d_scaled_accessor<Accessor3d>::value;
-    const auto num_rows = residual->get_size()[0];
-    const auto num_rhs = residual->get_size()[1];
-    const auto krylov_stride =
-        gko::cb_gmres::helper_functions_accessor<Accessor3d>::get_stride(
-            krylov_bases);
-    const auto grid_dim_1 =
-        ceildiv((krylov_dim + 1) * krylov_stride[0], default_block_size);
-    const auto block_dim = default_block_size;
-    constexpr auto block_size = default_block_size;
-    const auto stride_arnoldi = arnoldi_norm->get_stride();
-
-    restart_1_kernel<block_size>
-        <<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
-            residual->get_size()[0], residual->get_size()[1], krylov_dim,
-            acc::as_cuda_range(krylov_bases),
-            as_device_type(residual_norm_collection->get_values()),
-            residual_norm_collection->get_stride());
-    kernels::cuda::dense::compute_norm2_dispatch(exec, residual, residual_norm,
-                                                 reduction_tmp);
-
-    if (use_scalar) {
-        components::fill_array(exec,
-                               arnoldi_norm->get_values() + 2 * stride_arnoldi,
-                               num_rhs, zero<remove_complex<ValueType>>());
-        const dim3 grid_size_nrm(ceildiv(num_rhs, default_dot_dim),
-                                 exec->get_num_multiprocessor() * 2);
-        const dim3 block_size_nrm(default_dot_dim, default_dot_dim);
-        multinorminf_without_stop_kernel<<<grid_size_nrm, block_size_nrm, 0,
-                                           exec->get_stream()>>>(
-            num_rows, num_rhs, as_device_type(residual->get_const_values()),
-            residual->get_stride(),
-            as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), 0);
-    }
-
-    if (gko::cb_gmres::detail::has_3d_scaled_accessor<Accessor3d>::value) {
-        set_scalar_kernel<default_block_size>
-            <<<ceildiv(num_rhs * (krylov_dim + 1), default_block_size),
-               default_block_size, 0, exec->get_stream()>>>(
-                num_rhs, krylov_dim + 1,
-                as_device_type(residual_norm->get_const_values()),
-                residual_norm->get_stride(),
-                as_device_type(arnoldi_norm->get_const_values() +
-                               2 * stride_arnoldi),
-                stride_arnoldi, acc::as_cuda_range(krylov_bases));
-    }
-
-    const auto grid_dim_2 =
-        ceildiv(std::max<size_type>(num_rows, 1) * krylov_stride[1],
-                default_block_size);
-    restart_2_kernel<block_size>
-        <<<grid_dim_2, block_dim, 0, exec->get_stream()>>>(
-            residual->get_size()[0], residual->get_size()[1],
-            as_device_type(residual->get_const_values()),
-            residual->get_stride(),
-            as_device_type(residual_norm->get_const_values()),
-            as_device_type(residual_norm_collection->get_values()),
-            acc::as_cuda_range(krylov_bases),
-            as_device_type(next_krylov_basis->get_values()),
-            next_krylov_basis->get_stride(),
-            as_device_type(final_iter_nums->get_data()));
-}
-
-GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL);
-
-
-template <typename ValueType, typename Accessor3dim>
-void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
-                        matrix::Dense<ValueType>* next_krylov_basis,
-                        Accessor3dim krylov_bases,
-                        matrix::Dense<ValueType>* hessenberg_iter,
-                        matrix::Dense<ValueType>* buffer_iter,
-                        matrix::Dense<remove_complex<ValueType>>* arnoldi_norm,
-                        size_type iter, const stopping_status* stop_status,
-                        stopping_status* reorth_status,
-                        array<size_type>* num_reorth)
-{
-    const auto dim_size = next_krylov_basis->get_size();
-    if (dim_size[1] == 0) {
-        return;
-    }
-    using non_complex = remove_complex<ValueType>;
-    // optimization parameter
-    constexpr int singledot_block_size = default_dot_dim;
-    constexpr bool use_scalar =
-        gko::cb_gmres::detail::has_3d_scaled_accessor<Accessor3dim>::value;
-    const auto stride_next_krylov = next_krylov_basis->get_stride();
-    const auto stride_hessenberg = hessenberg_iter->get_stride();
-    const auto stride_buffer = buffer_iter->get_stride();
-    const auto stride_arnoldi = arnoldi_norm->get_stride();
-    const dim3 grid_size(ceildiv(dim_size[1], default_dot_dim),
-                         exec->get_num_multiprocessor() * 2);
-    const dim3 grid_size_num_iters(ceildiv(dim_size[1], default_dot_dim),
-                                   exec->get_num_multiprocessor() * 2,
-                                   iter + 1);
-    const dim3 block_size(default_dot_dim, default_dot_dim);
-    // Note: having iter first (instead of row_idx information) is likely
-    //       beneficial for avoiding atomic_add conflicts, but that needs
-    //       further investigation.
-    const dim3 grid_size_iters_single(exec->get_num_multiprocessor() * 2,
-                                      iter + 1);
-    const auto block_size_iters_single = singledot_block_size;
-    size_type num_reorth_host;
-
-    components::fill_array(exec, arnoldi_norm->get_values(), dim_size[1],
-                           zero<non_complex>());
-    multinorm2_kernel<<<grid_size, block_size, 0, exec->get_stream()>>>(
-        dim_size[0], dim_size[1],
-        as_device_type(next_krylov_basis->get_const_values()),
-        stride_next_krylov, as_device_type(arnoldi_norm->get_values()),
-        as_device_type(stop_status));
-    // nrmP = norm(next_krylov_basis)
-    zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg,
-                hessenberg_iter->get_values());
-    if (dim_size[1] > 1) {
-        multidot_kernel<default_dot_dim>
-            <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
-                dim_size[0], dim_size[1],
-                as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov, acc::as_cuda_range(krylov_bases),
-                as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg, as_device_type(stop_status));
-    } else {
-        singledot_kernel<singledot_block_size>
-            <<<grid_size_iters_single, block_size_iters_single, 0,
-               exec->get_stream()>>>(
-                dim_size[0],
-                as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov, acc::as_cuda_range(krylov_bases),
-                as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg, as_device_type(stop_status));
-    }
-    // for i in 1:iter
-    //     hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i)
-    // end
-    update_next_krylov_kernel<default_block_size>
-        <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
-           default_block_size, 0, exec->get_stream()>>>(
-            iter + 1, dim_size[0], dim_size[1],
-            as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
-            acc::as_cuda_range(krylov_bases),
-            as_device_type(hessenberg_iter->get_const_values()),
-            stride_hessenberg, as_device_type(stop_status));
-
-    // for i in 1:iter
-    //     next_krylov_basis  -= hessenberg(iter, i) * krylov_bases(:, i)
-    // end
-    components::fill_array(exec, arnoldi_norm->get_values() + stride_arnoldi,
-                           dim_size[1], zero<non_complex>());
-    if (use_scalar) {
-        components::fill_array(exec,
-                               arnoldi_norm->get_values() + 2 * stride_arnoldi,
-                               dim_size[1], zero<non_complex>());
-    }
-    multinorm2_inf_kernel<use_scalar>
-        <<<grid_size, block_size, 0, exec->get_stream()>>>(
-            dim_size[0], dim_size[1],
-            as_device_type(next_krylov_basis->get_const_values()),
-            stride_next_krylov,
-            as_device_type(arnoldi_norm->get_values() + stride_arnoldi),
-            as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi),
-            as_device_type(stop_status));
-    // nrmN = norm(next_krylov_basis)
-    components::fill_array(exec, num_reorth->get_data(), 1, zero<size_type>());
-    check_arnoldi_norms<default_block_size>
-        <<<ceildiv(dim_size[1], default_block_size), default_block_size, 0,
-           exec->get_stream()>>>(
-            dim_size[1], as_device_type(arnoldi_norm->get_values()),
-            stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
-            stride_hessenberg, iter + 1, acc::as_cuda_range(krylov_bases),
-            as_device_type(stop_status), as_device_type(reorth_status),
-            as_device_type(num_reorth->get_data()));
-    num_reorth_host = get_element(*num_reorth, 0);
-    // num_reorth_host := number of next_krylov vector to be reorthogonalization
-    for (size_type l = 1; (num_reorth_host > 0) && (l < 3); l++) {
-        zero_matrix(exec, iter + 1, dim_size[1], stride_buffer,
-                    buffer_iter->get_values());
-        if (dim_size[1] > 1) {
-            multidot_kernel<default_dot_dim>
-                <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
-                    dim_size[0], dim_size[1],
-                    as_device_type(next_krylov_basis->get_const_values()),
-                    stride_next_krylov, acc::as_cuda_range(krylov_bases),
-                    as_device_type(buffer_iter->get_values()), stride_buffer,
-                    as_device_type(stop_status));
-        } else {
-            singledot_kernel<singledot_block_size>
-                <<<grid_size_iters_single, block_size_iters_single, 0,
-                   exec->get_stream()>>>(
-                    dim_size[0],
-                    as_device_type(next_krylov_basis->get_const_values()),
-                    stride_next_krylov, acc::as_cuda_range(krylov_bases),
-                    as_device_type(buffer_iter->get_values()), stride_buffer,
-                    as_device_type(stop_status));
-        }
-        // for i in 1:iter
-        //     hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i)
-        // end
-        update_next_krylov_and_add_kernel<default_block_size>
-            <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
-               default_block_size, 0, exec->get_stream()>>>(
-                iter + 1, dim_size[0], dim_size[1],
-                as_device_type(next_krylov_basis->get_values()),
-                stride_next_krylov, acc::as_cuda_range(krylov_bases),
-                as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg,
-                as_device_type(buffer_iter->get_const_values()), stride_buffer,
-                as_device_type(stop_status), as_device_type(reorth_status));
-        // for i in 1:iter
-        //     next_krylov_basis  -= hessenberg(iter, i) * krylov_bases(:, i)
-        // end
-        components::fill_array(exec,
-                               arnoldi_norm->get_values() + stride_arnoldi,
-                               dim_size[1], zero<non_complex>());
-        if (use_scalar) {
-            components::fill_array(
-                exec, arnoldi_norm->get_values() + 2 * stride_arnoldi,
-                dim_size[1], zero<non_complex>());
-        }
-        multinorm2_inf_kernel<use_scalar>
-            <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                dim_size[0], dim_size[1],
-                as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov,
-                as_device_type(arnoldi_norm->get_values() + stride_arnoldi),
-                as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi),
-                as_device_type(stop_status));
-        // nrmN = norm(next_krylov_basis)
-        components::fill_array(exec, num_reorth->get_data(), 1,
-                               zero<size_type>());
-        check_arnoldi_norms<default_block_size>
-            <<<ceildiv(dim_size[1], default_block_size), default_block_size, 0,
-               exec->get_stream()>>>(
-                dim_size[1], as_device_type(arnoldi_norm->get_values()),
-                stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg, iter + 1, acc::as_cuda_range(krylov_bases),
-                as_device_type(stop_status), as_device_type(reorth_status),
-                num_reorth->get_data());
-        num_reorth_host = get_element(*num_reorth, 0);
-        // num_reorth_host := number of next_krylov vector to be
-        // reorthogonalization
-    }
-    update_krylov_next_krylov_kernel<default_block_size>
-        <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
-           default_block_size, 0, exec->get_stream()>>>(
-            iter, dim_size[0], dim_size[1],
-            as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
-            acc::as_cuda_range(krylov_bases),
-            as_device_type(hessenberg_iter->get_const_values()),
-            stride_hessenberg, as_device_type(stop_status));
-    // next_krylov_basis /= hessenberg(iter, iter + 1)
-    // krylov_bases(:, iter + 1) = next_krylov_basis
-    // End of arnoldi
-}
-
-template <typename ValueType>
-void givens_rotation(std::shared_ptr<const DefaultExecutor> exec,
-                     matrix::Dense<ValueType>* givens_sin,
-                     matrix::Dense<ValueType>* givens_cos,
-                     matrix::Dense<ValueType>* hessenberg_iter,
-                     matrix::Dense<remove_complex<ValueType>>* residual_norm,
-                     matrix::Dense<ValueType>* residual_norm_collection,
-                     size_type iter, const array<stopping_status>* stop_status)
-{
-    // TODO: tune block_size for optimal performance
-    constexpr auto block_size = default_block_size;
-    const auto num_cols = hessenberg_iter->get_size()[1];
-    const auto block_dim = block_size;
-    const auto grid_dim =
-        static_cast<unsigned int>(ceildiv(num_cols, block_size));
-
-    givens_rotation_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1],
-            iter, as_device_type(hessenberg_iter->get_values()),
-            hessenberg_iter->get_stride(),
-            as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
-            as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
-            as_device_type(residual_norm->get_values()),
-            as_device_type(residual_norm_collection->get_values()),
-            residual_norm_collection->get_stride(),
-            stop_status->get_const_data());
-}
-
-
-template <typename ValueType, typename Accessor3d>
-void arnoldi(std::shared_ptr<const DefaultExecutor> exec,
-             matrix::Dense<ValueType>* next_krylov_basis,
-             matrix::Dense<ValueType>* givens_sin,
-             matrix::Dense<ValueType>* givens_cos,
-             matrix::Dense<remove_complex<ValueType>>* residual_norm,
-             matrix::Dense<ValueType>* residual_norm_collection,
-             Accessor3d krylov_bases, matrix::Dense<ValueType>* hessenberg_iter,
-             matrix::Dense<ValueType>* buffer_iter,
-             matrix::Dense<remove_complex<ValueType>>* arnoldi_norm,
-             size_type iter, array<size_type>* final_iter_nums,
-             const array<stopping_status>* stop_status,
-             array<stopping_status>* reorth_status,
-             array<size_type>* num_reorth)
-{
-    increase_final_iteration_numbers_kernel<<<
-        static_cast<unsigned int>(
-            ceildiv(final_iter_nums->get_size(), default_block_size)),
-        default_block_size, 0, exec->get_stream()>>>(
-        as_device_type(final_iter_nums->get_data()),
-        stop_status->get_const_data(), final_iter_nums->get_size());
-    finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter,
-                       buffer_iter, arnoldi_norm, iter,
-                       stop_status->get_const_data(), reorth_status->get_data(),
-                       num_reorth);
-    givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter,
-                    residual_norm, residual_norm_collection, iter, stop_status);
-}
-
-GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL);
-
-
-template <typename ValueType>
-void solve_upper_triangular(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::Dense<ValueType>* residual_norm_collection,
-    const matrix::Dense<ValueType>* hessenberg, matrix::Dense<ValueType>* y,
-    const array<size_type>* final_iter_nums)
-{
-    // TODO: tune block_size for optimal performance
-    constexpr auto block_size = default_block_size;
-    const auto num_rhs = residual_norm_collection->get_size()[1];
-    const auto block_dim = block_size;
-    const auto grid_dim =
-        static_cast<unsigned int>(ceildiv(num_rhs, block_size));
-
-    solve_upper_triangular_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            hessenberg->get_size()[1], num_rhs,
-            as_device_type(residual_norm_collection->get_const_values()),
-            residual_norm_collection->get_stride(),
-            as_device_type(hessenberg->get_const_values()),
-            hessenberg->get_stride(), as_device_type(y->get_values()),
-            y->get_stride(), as_device_type(final_iter_nums->get_const_data()));
-}
-
-
-template <typename ValueType, typename ConstAccessor3d>
-void calculate_qy(std::shared_ptr<const DefaultExecutor> exec,
-                  ConstAccessor3d krylov_bases, size_type num_krylov_bases,
-                  const matrix::Dense<ValueType>* y,
-                  matrix::Dense<ValueType>* before_preconditioner,
-                  const array<size_type>* final_iter_nums)
-{
-    const auto num_rows = before_preconditioner->get_size()[0];
-    const auto num_cols = before_preconditioner->get_size()[1];
-    const auto stride_before_preconditioner =
-        before_preconditioner->get_stride();
-
-    constexpr auto block_size = default_block_size;
-    const auto grid_dim = static_cast<unsigned int>(
-        ceildiv(num_rows * stride_before_preconditioner, block_size));
-    const auto block_dim = block_size;
-
-    calculate_Qy_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            num_rows, num_cols, acc::as_cuda_range(krylov_bases),
-            as_device_type(y->get_const_values()), y->get_stride(),
-            as_device_type(before_preconditioner->get_values()),
-            stride_before_preconditioner,
-            as_device_type(final_iter_nums->get_const_data()));
-    // Calculate qy
-    // before_preconditioner = krylov_bases * y
-}
-
-
-template <typename ValueType, typename ConstAccessor3d>
-void solve_krylov(std::shared_ptr<const DefaultExecutor> exec,
-                  const matrix::Dense<ValueType>* residual_norm_collection,
-                  ConstAccessor3d krylov_bases,
-                  const matrix::Dense<ValueType>* hessenberg,
-                  matrix::Dense<ValueType>* y,
-                  matrix::Dense<ValueType>* before_preconditioner,
-                  const array<size_type>* final_iter_nums)
-{
-    if (before_preconditioner->get_size()[1] == 0) {
-        return;
-    }
-    // since hessenberg has dims:  iters x iters * num_rhs
-    // krylov_bases has dims:  (iters + 1) x sysmtx[0] x num_rhs
-    const auto iters =
-        hessenberg->get_size()[1] / before_preconditioner->get_size()[1];
-    const auto num_krylov_bases = iters + 1;
-    solve_upper_triangular(exec, residual_norm_collection, hessenberg, y,
-                           final_iter_nums);
-    calculate_qy(exec, krylov_bases, num_krylov_bases, y, before_preconditioner,
-                 final_iter_nums);
-}
-
-GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE(
-    GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL);
-
-
-}  // namespace cb_gmres
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/solver/multigrid_kernels.cu b/cuda/solver/multigrid_kernels.cu
deleted file mode 100644
index eaa913aa064..00000000000
--- a/cuda/solver/multigrid_kernels.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/solver/multigrid_kernels.hpp"
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/types.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/array_access.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/thread_ids.cuh"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The MULTIGRID solver namespace.
- *
- * @ingroup multigrid
- */
-namespace multigrid {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/solver/multigrid_kernels.hpp.inc"
-
-
-}  // namespace multigrid
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/cuda/stop/batch_criteria.cuh b/cuda/stop/batch_criteria.cuh
deleted file mode 100644
index f4f434dda11..00000000000
--- a/cuda/stop/batch_criteria.cuh
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_CUDA_STOP_BATCH_CRITERIA_CUH_
-#define GKO_CUDA_STOP_BATCH_CRITERIA_CUH_
-
-
-#include <ginkgo/core/base/math.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-namespace batch_stop {
-
-
-#include "common/cuda_hip/stop/batch_criteria.hpp.inc"
-
-
-}  // namespace batch_stop
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
-
-#endif  // GKO_CUDA_STOP_BATCH_CRITERIA_CUH_
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index de44eb20682..5bcc1de1f21 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -5,9 +5,7 @@ add_instantiation_files(. matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_INSTANT
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 set(GINKGO_HIP_SOURCES
-    base/batch_multi_vector_kernels.hip.cpp
     base/device.hip.cpp
-    base/device_matrix_data_kernels.hip.cpp
     base/exception.hip.cpp
     base/executor.hip.cpp
     base/index_set_kernels.hip.cpp
@@ -17,55 +15,28 @@ set(GINKGO_HIP_SOURCES
     base/stream.hip.cpp
     base/timer.hip.cpp
     base/version.hip.cpp
-    components/prefix_sum_kernels.hip.cpp
-    distributed/index_map_kernels.hip.cpp
-    distributed/matrix_kernels.hip.cpp
-    distributed/partition_helpers_kernels.hip.cpp
-    distributed/partition_kernels.hip.cpp
-    distributed/vector_kernels.hip.cpp
-    factorization/cholesky_kernels.hip.cpp
-    factorization/factorization_kernels.hip.cpp
     factorization/ic_kernels.hip.cpp
     factorization/ilu_kernels.hip.cpp
-    factorization/lu_kernels.hip.cpp
-    factorization/par_ic_kernels.hip.cpp
-    factorization/par_ict_kernels.hip.cpp
-    factorization/par_ilu_kernels.hip.cpp
     factorization/par_ilut_approx_filter_kernel.hip.cpp
     factorization/par_ilut_filter_kernel.hip.cpp
     factorization/par_ilut_select_common.hip.cpp
     factorization/par_ilut_select_kernel.hip.cpp
     factorization/par_ilut_spgeam_kernel.hip.cpp
     factorization/par_ilut_sweep_kernel.hip.cpp
-    matrix/batch_csr_kernels.hip.cpp
-    matrix/batch_dense_kernels.hip.cpp
-    matrix/batch_ell_kernels.hip.cpp
-    matrix/coo_kernels.hip.cpp
     ${CSR_INSTANTIATE}
-    matrix/dense_kernels.hip.cpp
-    matrix/diagonal_kernels.hip.cpp
-    matrix/ell_kernels.hip.cpp
     ${FBCSR_INSTANTIATE}
-    matrix/sellp_kernels.hip.cpp
-    matrix/sparsity_csr_kernels.hip.cpp
-    multigrid/pgm_kernels.hip.cpp
     preconditioner/batch_jacobi_kernels.hip.cpp
-    preconditioner/isai_kernels.hip.cpp
     preconditioner/jacobi_advanced_apply_kernel.hip.cpp
     preconditioner/jacobi_generate_kernel.hip.cpp
-    preconditioner/jacobi_kernels.hip.cpp
     preconditioner/jacobi_simple_apply_kernel.hip.cpp
-    reorder/rcm_kernels.hip.cpp
     solver/batch_bicgstab_kernels.hip.cpp
     solver/batch_cg_kernels.hip.cpp
-    solver/cb_gmres_kernels.hip.cpp
-    solver/idr_kernels.hip.cpp
     solver/lower_trs_kernels.hip.cpp
-    solver/multigrid_kernels.hip.cpp
     solver/upper_trs_kernels.hip.cpp
     stop/criterion_kernels.hip.cpp
     stop/residual_norm_kernels.hip.cpp
     ${GKO_UNIFIED_COMMON_SOURCES}
+    ${GKO_CUDA_HIP_COMMON_SOURCES}
     )
 
 if(hipfft_FOUND)
diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp
deleted file mode 100644
index 14a915630a5..00000000000
--- a/hip/base/batch_multi_vector_kernels.hip.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/base/batch_multi_vector_kernels.hpp"
-
-
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
-
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/range_accessors.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/batch_struct.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipblas_bindings.hip.hpp"
-#include "hip/base/pointer_mode_guard.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The MultiVector matrix format namespace.
- *
- * @ingroup batch_multi_vector
- */
-namespace batch_multi_vector {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_multi_vector
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/base/device_matrix_data_kernels.hip.cpp b/hip/base/device_matrix_data_kernels.hip.cpp
deleted file mode 100644
index 745ba955014..00000000000
--- a/hip/base/device_matrix_data_kernels.hip.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/base/device_matrix_data_kernels.hpp"
-
-
-#include <thrust/copy.h>
-#include <thrust/count.h>
-#include <thrust/device_ptr.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sort.h>
-#include <thrust/tuple.h>
-
-
-#include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace components {
-
-
-#include "common/cuda_hip/base/device_matrix_data_kernels.hpp.inc"
-
-
-}  // namespace components
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/base/kernel_launch.hip.hpp b/hip/base/kernel_launch.hip.hpp
deleted file mode 100644
index 2889314f498..00000000000
--- a/hip/base/kernel_launch.hip.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_HPP_
-#error \
-    "This file can only be used from inside common/unified/base/kernel_launch.hpp"
-#endif
-
-
-#include <thrust/tuple.h>
-
-
-#include "accessor/hip_helper.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-template <typename AccessorType>
-struct to_device_type_impl<gko::acc::range<AccessorType>&> {
-    using type = std::decay_t<decltype(gko::acc::as_hip_range(
-        std::declval<gko::acc::range<AccessorType>>()))>;
-    static type map_to_device(gko::acc::range<AccessorType>& range)
-    {
-        return gko::acc::as_hip_range(range);
-    }
-};
-
-template <typename AccessorType>
-struct to_device_type_impl<const gko::acc::range<AccessorType>&> {
-    using type = std::decay_t<decltype(gko::acc::as_hip_range(
-        std::declval<gko::acc::range<AccessorType>>()))>;
-    static type map_to_device(const gko::acc::range<AccessorType>& range)
-    {
-        return gko::acc::as_hip_range(range);
-    }
-};
-
-
-namespace device_std = thrust;
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/base/kernel_launch.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp
deleted file mode 100644
index 7c5d0c01c9c..00000000000
--- a/hip/base/kernel_launch_reduction.hip.hpp
+++ /dev/null
@@ -1,28 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
-#error \
-    "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp"
-#endif
-
-
-#include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/base/kernel_launch_reduction.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/base/kernel_launch_solver.hip.hpp b/hip/base/kernel_launch_solver.hip.hpp
deleted file mode 100644
index eda18f35eab..00000000000
--- a/hip/base/kernel_launch_solver.hip.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_SOLVER_HPP_
-#error \
-    "This file can only be used from inside common/unified/base/kernel_launch_solver.hpp"
-#endif
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/base/kernel_launch_solver.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/base/math.hip.hpp b/hip/base/math.hip.hpp
deleted file mode 100644
index f9427089126..00000000000
--- a/hip/base/math.hip.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_BASE_MATH_HIP_HPP_
-#define GKO_HIP_BASE_MATH_HIP_HPP_
-
-
-#include <ginkgo/core/base/math.hpp>
-
-
-#include <thrust/complex.h>
-
-
-namespace gko {
-
-
-#include "common/cuda_hip/base/math.hpp.inc"
-
-
-}  // namespace gko
-
-
-#endif  // GKO_HIP_BASE_MATH_HIP_HPP_
diff --git a/hip/components/atomic.hip.hpp b/hip/components/atomic.hip.hpp
deleted file mode 100644
index 6c3eaaeb82a..00000000000
--- a/hip/components/atomic.hip.hpp
+++ /dev/null
@@ -1,29 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_ATOMIC_HIP_HPP_
-#define GKO_HIP_COMPONENTS_ATOMIC_HIP_HPP_
-
-
-#include <type_traits>
-
-
-#include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/atomic.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_ATOMIC_HIP_HPP_
diff --git a/hip/components/diagonal_block_manipulation.hip.hpp b/hip/components/diagonal_block_manipulation.hip.hpp
deleted file mode 100644
index 0261c7549c5..00000000000
--- a/hip/components/diagonal_block_manipulation.hip.hpp
+++ /dev/null
@@ -1,32 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HIP_HPP_
-#define GKO_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HIP_HPP_
-
-
-#include <type_traits>
-
-
-#include "hip/base/config.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace csr {
-
-
-#include "common/cuda_hip/components/diagonal_block_manipulation.hpp.inc"
-
-
-}  // namespace csr
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_DIAGONAL_BLOCK_MANIPULATION_HIP_HPP_
diff --git a/hip/components/intrinsics.hip.hpp b/hip/components/intrinsics.hip.hpp
deleted file mode 100644
index af849d4471a..00000000000
--- a/hip/components/intrinsics.hip.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_INTRINSICS_HIP_HPP_
-#define GKO_HIP_COMPONENTS_INTRINSICS_HIP_HPP_
-
-
-#include <ginkgo/core/base/types.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/intrinsics.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_INTRINSICS_HIP_HPP_
diff --git a/hip/components/merging.hip.hpp b/hip/components/merging.hip.hpp
deleted file mode 100644
index 3f031947940..00000000000
--- a/hip/components/merging.hip.hpp
+++ /dev/null
@@ -1,28 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_MERGING_HIP_HPP_
-#define GKO_HIP_COMPONENTS_MERGING_HIP_HPP_
-
-
-#include "core/base/utils.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/searching.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/merging.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_MERGING_HIP_HPP_
diff --git a/hip/components/prefix_sum.hip.hpp b/hip/components/prefix_sum.hip.hpp
deleted file mode 100644
index b5065589d8e..00000000000
--- a/hip/components/prefix_sum.hip.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_PREFIX_SUM_HIP_HPP_
-#define GKO_HIP_COMPONENTS_PREFIX_SUM_HIP_HPP_
-
-
-#include <type_traits>
-
-
-#include "hip/base/hipblas_bindings.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/prefix_sum.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_PREFIX_SUM_HIP_HPP_
diff --git a/hip/components/prefix_sum_kernels.hip.cpp b/hip/components/prefix_sum_kernels.hip.cpp
deleted file mode 100644
index ad55c0954d1..00000000000
--- a/hip/components/prefix_sum_kernels.hip.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/components/prefix_sum_kernels.hpp"
-
-
-#include <limits>
-
-
-#include <thrust/scan.h>
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/exception.hpp>
-#include <ginkgo/core/base/name_demangling.hpp>
-
-
-#include "hip/base/thrust.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace components {
-
-
-#include "common/cuda_hip/components/prefix_sum_kernels.hpp.inc"
-
-
-}  // namespace components
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/components/reduction.hip.hpp b/hip/components/reduction.hip.hpp
deleted file mode 100644
index bcff77707ca..00000000000
--- a/hip/components/reduction.hip.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_REDUCTION_HIP_HPP_
-#define GKO_HIP_COMPONENTS_REDUCTION_HIP_HPP_
-
-
-#include <type_traits>
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/executor.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/array_access.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-constexpr int default_reduce_block_size = 512;
-
-
-#include "common/cuda_hip/components/reduction.hpp.inc"
-
-
-/**
- * Compute a reduction using add operation (+).
- *
- * @param exec  Executor associated to the array
- * @param size  size of the array
- * @param source  the pointer of the array
- *
- * @return the reduction result
- */
-template <typename ValueType>
-__host__ ValueType reduce_add_array(std::shared_ptr<const HipExecutor> exec,
-                                    size_type size, const ValueType* source)
-{
-    auto block_results_val = source;
-    size_type grid_dim = size;
-    auto block_results = array<ValueType>(exec);
-    if (size > default_reduce_block_size) {
-        const auto n = ceildiv(size, default_reduce_block_size);
-        grid_dim =
-            (n <= default_reduce_block_size) ? n : default_reduce_block_size;
-
-        block_results.resize_and_reset(grid_dim);
-
-        reduce_add_array<<<grid_dim, default_reduce_block_size, 0,
-                           exec->get_stream()>>>(
-            size, as_device_type(source),
-            as_device_type(block_results.get_data()));
-
-        block_results_val = block_results.get_const_data();
-    }
-
-    auto d_result = array<ValueType>(exec, 1);
-
-    reduce_add_array<<<1, default_reduce_block_size, 0, exec->get_stream()>>>(
-        grid_dim, as_device_type(block_results_val),
-        as_device_type(d_result.get_data()));
-    auto answer = get_element(d_result, 0);
-    return answer;
-}
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_REDUCTION_HIP_HPP_
diff --git a/hip/components/searching.hip.hpp b/hip/components/searching.hip.hpp
deleted file mode 100644
index 2a6be767c2c..00000000000
--- a/hip/components/searching.hip.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_
-#define GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_
-
-
-#include "hip/base/config.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/searching.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_
diff --git a/hip/components/segment_scan.hip.hpp b/hip/components/segment_scan.hip.hpp
deleted file mode 100644
index 7f98d08cf69..00000000000
--- a/hip/components/segment_scan.hip.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_
-#define GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_
-
-
-#include "hip/components/cooperative_groups.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/segment_scan.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_
diff --git a/hip/components/sorting.hip.hpp b/hip/components/sorting.hip.hpp
deleted file mode 100644
index 730c3c56401..00000000000
--- a/hip/components/sorting.hip.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_SORTING_HIP_HPP_
-#define GKO_HIP_COMPONENTS_SORTING_HIP_HPP_
-
-
-#include "hip/base/config.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/sorting.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_SORTING_HIP_HPP_
diff --git a/hip/components/syncfree.hip.hpp b/hip/components/syncfree.hip.hpp
deleted file mode 100644
index 9fe48944b56..00000000000
--- a/hip/components/syncfree.hip.hpp
+++ /dev/null
@@ -1,32 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_SYNCFREE_HIP_HPP_
-#define GKO_HIP_COMPONENTS_SYNCFREE_HIP_HPP_
-
-
-#include <ginkgo/core/base/array.hpp>
-
-
-#include "core/components/fill_array_kernels.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/memory.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/syncfree.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_SYNCFREE_HIP_HPP_
diff --git a/hip/components/thread_ids.hip.hpp b/hip/components/thread_ids.hip.hpp
deleted file mode 100644
index 8cd204438ae..00000000000
--- a/hip/components/thread_ids.hip.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_
-#define GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_
-
-
-#include "hip/base/config.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace thread {
-
-
-#include "common/cuda_hip/components/thread_ids.hpp.inc"
-
-
-}  // namespace thread
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_
diff --git a/hip/components/uninitialized_array.hip.hpp b/hip/components/uninitialized_array.hip.hpp
deleted file mode 100644
index e59d2c21a63..00000000000
--- a/hip/components/uninitialized_array.hip.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HIP_HPP_
-#define GKO_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HIP_HPP_
-
-
-#include <ginkgo/core/base/types.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/uninitialized_array.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_UNINITIALIZED_ARRAY_HIP_HPP_
diff --git a/hip/components/warp_blas.hip.hpp b/hip/components/warp_blas.hip.hpp
deleted file mode 100644
index 8ac59719aa7..00000000000
--- a/hip/components/warp_blas.hip.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_COMPONENTS_WARP_BLAS_HIP_HPP_
-#define GKO_HIP_COMPONENTS_WARP_BLAS_HIP_HPP_
-
-
-#include <cassert>
-#include <type_traits>
-
-
-#include <ginkgo/config.hpp>
-
-
-#include "hip/base/math.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-#include "common/cuda_hip/components/warp_blas.hpp.inc"
-
-
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_COMPONENTS_WARP_BLAS_HIP_HPP_
diff --git a/hip/distributed/index_map_kernels.hip.cpp b/hip/distributed/index_map_kernels.hip.cpp
deleted file mode 100644
index d45674a66a3..00000000000
--- a/hip/distributed/index_map_kernels.hip.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/index_map_kernels.hpp"
-
-
-#include <thrust/binary_search.h>
-#include <thrust/copy.h>
-#include <thrust/distance.h>
-#include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/transform_output_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/transform_reduce.h>
-#include <thrust/unique.h>
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/searching.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace index_map {
-
-
-#include "common/cuda_hip/distributed/index_map_kernels.hpp.inc"
-
-
-}  // namespace index_map
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/distributed/matrix_kernels.hip.cpp b/hip/distributed/matrix_kernels.hip.cpp
deleted file mode 100644
index 54cde64c429..00000000000
--- a/hip/distributed/matrix_kernels.hip.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/matrix_kernels.hpp"
-
-
-#include <thrust/binary_search.h>
-#include <thrust/copy.h>
-#include <thrust/distance.h>
-#include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/transform_reduce.h>
-#include <thrust/unique.h>
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace distributed_matrix {
-
-
-#include "common/cuda_hip/distributed/matrix_kernels.hpp.inc"
-
-
-}  // namespace distributed_matrix
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/distributed/partition_helpers_kernels.hip.cpp b/hip/distributed/partition_helpers_kernels.hip.cpp
deleted file mode 100644
index 744d8de887b..00000000000
--- a/hip/distributed/partition_helpers_kernels.hip.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/partition_helpers_kernels.hpp"
-
-
-#include <thrust/device_ptr.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sort.h>
-
-
-#include "hip/base/thrust.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace partition_helpers {
-
-
-#include "common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc"
-
-
-}  // namespace partition_helpers
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/distributed/partition_kernels.hip.cpp b/hip/distributed/partition_kernels.hip.cpp
deleted file mode 100644
index 00dc74b910f..00000000000
--- a/hip/distributed/partition_kernels.hip.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/partition_kernels.hpp"
-
-
-#include <thrust/count.h>
-#include <thrust/device_ptr.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/scan.h>
-#include <thrust/sort.h>
-
-
-#include "common/unified/base/kernel_launch.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "hip/base/thrust.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace partition {
-
-
-#include "common/cuda_hip/distributed/partition_kernels.hpp.inc"
-
-
-}  // namespace partition
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/distributed/vector_kernels.hip.cpp b/hip/distributed/vector_kernels.hip.cpp
deleted file mode 100644
index 320d847ed85..00000000000
--- a/hip/distributed/vector_kernels.hip.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/distributed/vector_kernels.hpp"
-
-
-#include <functional>
-
-
-#include <thrust/binary_search.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/scatter.h>
-#include <thrust/tuple.h>
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-
-#include "hip/base/thrust.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace distributed_vector {
-
-
-#include "common/cuda_hip/distributed/vector_kernels.hpp.inc"
-
-
-}  // namespace distributed_vector
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/factorization/cholesky_kernels.hip.cpp b/hip/factorization/cholesky_kernels.hip.cpp
deleted file mode 100644
index 04aa8da65ca..00000000000
--- a/hip/factorization/cholesky_kernels.hip.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/cholesky_kernels.hpp"
-
-
-#include <algorithm>
-#include <memory>
-
-
-#include <thrust/execution_policy.h>
-#include <thrust/functional.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/transform.h>
-#include <thrust/tuple.h>
-
-
-#include <ginkgo/core/matrix/csr.hpp>
-
-
-#include "core/components/fill_array_kernels.hpp"
-#include "core/components/format_conversion_kernels.hpp"
-#include "core/factorization/elimination_forest.hpp"
-#include "core/factorization/lu_kernels.hpp"
-#include "core/matrix/csr_lookup.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/syncfree.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Cholesky namespace.
- *
- * @ingroup factor
- */
-namespace cholesky {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/factorization/cholesky_kernels.hpp.inc"
-
-
-template <typename ValueType, typename IndexType>
-void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* mtx,
-                    const factorization::elimination_forest<IndexType>& forest,
-                    IndexType* row_nnz, array<IndexType>& tmp_storage)
-{
-    const auto num_rows = static_cast<IndexType>(mtx->get_size()[0]);
-    if (num_rows == 0) {
-        return;
-    }
-    const auto mtx_nnz = static_cast<IndexType>(mtx->get_num_stored_elements());
-    tmp_storage.resize_and_reset(mtx_nnz + num_rows);
-    const auto postorder_cols = tmp_storage.get_data();
-    const auto lower_ends = postorder_cols + mtx_nnz;
-    const auto row_ptrs = mtx->get_const_row_ptrs();
-    const auto cols = mtx->get_const_col_idxs();
-    const auto inv_postorder = forest.inv_postorder.get_const_data();
-    const auto postorder_parent = forest.postorder_parents.get_const_data();
-    // transform col indices to postorder indices
-    {
-        const auto num_blocks = ceildiv(num_rows, default_block_size);
-        kernel::build_postorder_cols<<<num_blocks, default_block_size, 0,
-                                       exec->get_stream()>>>(
-            num_rows, cols, row_ptrs, inv_postorder, postorder_cols,
-            lower_ends);
-    }
-    // sort postorder_cols inside rows
-    {
-        const auto handle = exec->get_sparselib_handle();
-        auto descr = sparselib::create_mat_descr();
-        array<IndexType> permutation_array(exec, mtx_nnz);
-        auto permutation = permutation_array.get_data();
-        components::fill_seq_array(exec, permutation, mtx_nnz);
-        size_type buffer_size{};
-        sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz,
-                                       row_ptrs, postorder_cols, buffer_size);
-        array<char> buffer_array{exec, buffer_size};
-        auto buffer = buffer_array.get_data();
-        sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs,
-                           postorder_cols, permutation, buffer);
-        sparselib::destroy(descr);
-    }
-    // count nonzeros per row of L
-    {
-        const auto num_blocks =
-            ceildiv(num_rows, default_block_size / config::warp_size);
-        kernel::symbolic_count<config::warp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                num_rows, row_ptrs, lower_ends, inv_postorder, postorder_cols,
-                postorder_parent, row_nnz);
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);
-
-
-}  // namespace cholesky
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/factorization/factorization_kernels.hip.cpp b/hip/factorization/factorization_kernels.hip.cpp
deleted file mode 100644
index 6ad176645f2..00000000000
--- a/hip/factorization/factorization_kernels.hip.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/factorization_kernels.hpp"
-
-
-#include <ginkgo/core/base/array.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/array_access.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/searching.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The factorization namespace.
- *
- * @ingroup factor
- */
-namespace factorization {
-
-
-constexpr int default_block_size{512};
-
-
-#include "common/cuda_hip/factorization/factorization_kernels.hpp.inc"
-
-
-}  // namespace factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/factorization/lu_kernels.hip.cpp b/hip/factorization/lu_kernels.hip.cpp
deleted file mode 100644
index e1c60103dd3..00000000000
--- a/hip/factorization/lu_kernels.hip.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/lu_kernels.hpp"
-
-
-#include <algorithm>
-#include <memory>
-
-
-#include <thrust/copy.h>
-#include <thrust/iterator/transform_output_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-
-
-#include <ginkgo/core/matrix/csr.hpp>
-
-
-#include "core/base/allocator.hpp"
-#include "core/matrix/csr_lookup.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/syncfree.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The LU namespace.
- *
- * @ingroup factor
- */
-namespace lu_factorization {
-
-
-constexpr static int default_block_size = 512;
-
-
-#include "common/cuda_hip/factorization/lu_kernels.hpp.inc"
-
-
-}  // namespace lu_factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/factorization/par_ic_kernels.hip.cpp b/hip/factorization/par_ic_kernels.hip.cpp
deleted file mode 100644
index dd91ac27339..00000000000
--- a/hip/factorization/par_ic_kernels.hip.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ic_kernels.hpp"
-
-
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-
-
-#include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/memory.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The parallel ic factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ic_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/factorization/par_ic_kernels.hpp.inc"
-
-
-}  // namespace par_ic_factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp
deleted file mode 100644
index 1d5e412e9dd..00000000000
--- a/hip/factorization/par_ict_kernels.hip.cpp
+++ /dev/null
@@ -1,189 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ict_kernels.hpp"
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/coo_builder.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/memory.hip.hpp"
-#include "hip/components/merging.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/searching.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The parallel ICT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ict_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-// subwarp sizes for all warp-parallel kernels (filter, add_candidates)
-using compiled_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
-#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void add_candidates(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* llh,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    matrix::Csr<ValueType, IndexType>* l_new)
-{
-    auto num_rows = static_cast<IndexType>(llh->get_size()[0]);
-    auto subwarps_per_block = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
-    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
-    auto llh_row_ptrs = llh->get_const_row_ptrs();
-    auto llh_col_idxs = llh->get_const_col_idxs();
-    auto llh_vals = llh->get_const_values();
-    auto a_row_ptrs = a->get_const_row_ptrs();
-    auto a_col_idxs = a->get_const_col_idxs();
-    auto a_vals = a->get_const_values();
-    auto l_row_ptrs = l->get_const_row_ptrs();
-    auto l_col_idxs = l->get_const_col_idxs();
-    auto l_vals = l->get_const_values();
-    auto l_new_row_ptrs = l_new->get_row_ptrs();
-    // count non-zeros per row
-    if (num_blocks > 0) {
-        kernel::ict_tri_spgeam_nnz<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs,
-                l_new_row_ptrs, num_rows);
-    }
-
-    // build row ptrs
-    components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);
-
-    // resize output arrays
-    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
-    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
-    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
-
-    auto l_new_col_idxs = l_new->get_col_idxs();
-    auto l_new_vals = l_new->get_values();
-
-    // fill columns and values
-    if (num_blocks > 0) {
-        kernel::ict_tri_spgeam_init<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals),
-                a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs,
-                l_col_idxs, as_device_type(l_vals), l_new_row_ptrs,
-                l_new_col_idxs, as_device_type(l_new_vals), num_rows);
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void compute_factor(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Coo<ValueType, IndexType>* l_coo)
-{
-    auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements());
-    auto block_size = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(total_nnz, block_size);
-    if (num_blocks > 0) {
-        kernel::ict_sweep<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                a->get_const_row_ptrs(), a->get_const_col_idxs(),
-                as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
-                l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
-                as_device_type(l->get_values()),
-                static_cast<IndexType>(l->get_num_stored_elements()));
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor);
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* llh,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    matrix::Csr<ValueType, IndexType>* l_new)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz =
-        llh->get_num_stored_elements() + a->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_add_candidates(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, llh, a, l, l_new);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Coo<ValueType, IndexType>* l_coo)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz = 2 * l->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_compute_factor(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
-
-
-}  // namespace par_ict_factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp
deleted file mode 100644
index 20537a35965..00000000000
--- a/hip/factorization/par_ilu_kernels.hip.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ilu_kernels.hpp"
-
-
-#include <ginkgo/core/base/std_extensions.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/memory.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The parallel ilu factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilu_factorization {
-
-
-constexpr int default_block_size{512};
-
-
-#include "common/cuda_hip/factorization/par_ilu_kernels.hpp.inc"
-
-
-}  // namespace par_ilu_factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/factorization/par_ilut_filter_kernels.hip.cpp b/hip/factorization/par_ilut_filter_kernels.hip.cpp
deleted file mode 100644
index 2777d218149..00000000000
--- a/hip/factorization/par_ilut_filter_kernels.hip.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/coo_builder.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The parallel ILUT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilut_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-// subwarp sizes for filter kernels
-using compiled_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
-#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void threshold_filter(syn::value_list<int, subwarp_size>,
-                      std::shared_ptr<const DefaultExecutor> exec,
-                      const matrix::Csr<ValueType, IndexType>* a,
-                      remove_complex<ValueType> threshold,
-                      matrix::Csr<ValueType, IndexType>* m_out,
-                      matrix::Coo<ValueType, IndexType>* m_out_coo, bool lower)
-{
-    auto old_row_ptrs = a->get_const_row_ptrs();
-    auto old_col_idxs = a->get_const_col_idxs();
-    auto old_vals = a->get_const_values();
-    // compute nnz for each row
-    auto num_rows = static_cast<IndexType>(a->get_size()[0]);
-    auto block_size = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(num_rows, block_size);
-    auto new_row_ptrs = m_out->get_row_ptrs();
-    if (num_blocks > 0) {
-        kernel::threshold_filter_nnz<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                old_row_ptrs, as_device_type(old_vals), num_rows,
-                as_device_type(threshold), new_row_ptrs, lower);
-    }
-
-    // build row pointers
-    components::prefix_sum_nonnegative(exec, new_row_ptrs, num_rows + 1);
-
-    // build matrix
-    auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows);
-    // resize arrays and update aliases
-    matrix::CsrBuilder<ValueType, IndexType> builder{m_out};
-    builder.get_col_idx_array().resize_and_reset(new_nnz);
-    builder.get_value_array().resize_and_reset(new_nnz);
-    auto new_col_idxs = m_out->get_col_idxs();
-    auto new_vals = m_out->get_values();
-    IndexType* new_row_idxs{};
-    if (m_out_coo) {
-        matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
-        coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
-        coo_builder.get_col_idx_array() =
-            make_array_view(exec, new_nnz, new_col_idxs);
-        coo_builder.get_value_array() =
-            make_array_view(exec, new_nnz, new_vals);
-        new_row_idxs = m_out_coo->get_row_idxs();
-    }
-    if (num_blocks > 0) {
-        kernel::threshold_filter<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                old_row_ptrs, old_col_idxs, as_device_type(old_vals), num_rows,
-                as_device_type(threshold), new_row_ptrs, new_row_idxs,
-                new_col_idxs, as_device_type(new_vals), lower);
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter, threshold_filter);
-
-
-}  // namespace
-
-template <typename ValueType, typename IndexType>
-void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
-                      const matrix::Csr<ValueType, IndexType>* a,
-                      remove_complex<ValueType> threshold,
-                      matrix::Csr<ValueType, IndexType>* m_out,
-                      matrix::Coo<ValueType, IndexType>* m_out_coo, bool lower)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz = a->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_threshold_filter(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, a, threshold, m_out,
-        m_out_coo, lower);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/factorization/par_ilut_select_kernels.hip.cpp b/hip/factorization/par_ilut_select_kernels.hip.cpp
deleted file mode 100644
index b259133b95d..00000000000
--- a/hip/factorization/par_ilut_select_kernels.hip.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
-#include <algorithm>
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
-#include "hip/components/searching.hip.hpp"
-#include "hip/components/sorting.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/factorization/par_ilut_select_common.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The parallel ILUT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilut_factorization {
-
-
-#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc"
-
-
-template <typename ValueType, typename IndexType>
-void sampleselect_filter(std::shared_ptr<const DefaultExecutor> exec,
-                         const ValueType* values, IndexType size,
-                         const unsigned char* oracles,
-                         const IndexType* partial_counts, IndexType bucket,
-                         remove_complex<ValueType>* out)
-{
-    auto num_threads_total = ceildiv(size, items_per_thread);
-    auto num_blocks =
-        static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
-    if (num_blocks > 0) {
-        kernel::filter_bucket<<<num_blocks, default_block_size, 0,
-                                exec->get_stream()>>>(
-            as_device_type(values), size, bucket, oracles, partial_counts,
-            as_device_type(out), items_per_thread);
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
-                      const matrix::Csr<ValueType, IndexType>* m,
-                      IndexType rank, array<ValueType>& tmp1,
-                      array<remove_complex<ValueType>>& tmp2,
-                      remove_complex<ValueType>& threshold)
-{
-    auto values = m->get_const_values();
-    IndexType size = m->get_num_stored_elements();
-    using AbsType = remove_complex<ValueType>;
-    constexpr auto bucket_count = kernel::searchtree_width;
-    auto max_num_threads = ceildiv(size, items_per_thread);
-    auto max_num_blocks = ceildiv(max_num_threads, default_block_size);
-
-    size_type tmp_size_totals =
-        ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType));
-    size_type tmp_size_partials = ceildiv(
-        bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType));
-    size_type tmp_size_oracles =
-        ceildiv(size * sizeof(unsigned char), sizeof(ValueType));
-    size_type tmp_size_tree =
-        ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType));
-    size_type tmp_size_vals =
-        size / bucket_count * 4;  // pessimistic estimate for temporary storage
-    size_type tmp_size =
-        tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree;
-    tmp1.resize_and_reset(tmp_size);
-    tmp2.resize_and_reset(tmp_size_vals);
-
-    auto total_counts = reinterpret_cast<IndexType*>(tmp1.get_data());
-    auto partial_counts =
-        reinterpret_cast<IndexType*>(tmp1.get_data() + tmp_size_totals);
-    auto oracles = reinterpret_cast<unsigned char*>(
-        tmp1.get_data() + tmp_size_totals + tmp_size_partials);
-    auto tree =
-        reinterpret_cast<AbsType*>(tmp1.get_data() + tmp_size_totals +
-                                   tmp_size_partials + tmp_size_oracles);
-
-    sampleselect_count(exec, values, size, tree, oracles, partial_counts,
-                       total_counts);
-
-    // determine bucket with correct rank, use bucket-local rank
-    auto bucket = sampleselect_find_bucket(exec, total_counts, rank);
-    rank -= bucket.begin;
-
-    if (bucket.size * 2 > tmp_size_vals) {
-        // we need to reallocate tmp2
-        tmp2.resize_and_reset(bucket.size * 2);
-    }
-    auto tmp21 = tmp2.get_data();
-    auto tmp22 = tmp2.get_data() + bucket.size;
-    // extract target bucket
-    sampleselect_filter(exec, values, size, oracles, partial_counts, bucket.idx,
-                        tmp22);
-
-    // recursively select from smaller buckets
-    int step{};
-    while (bucket.size > kernel::basecase_size) {
-        std::swap(tmp21, tmp22);
-        const auto* tmp_in = tmp21;
-        auto tmp_out = tmp22;
-
-        sampleselect_count(exec, tmp_in, bucket.size, tree, oracles,
-                           partial_counts, total_counts);
-        auto new_bucket = sampleselect_find_bucket(exec, total_counts, rank);
-        sampleselect_filter(exec, tmp_in, bucket.size, oracles, partial_counts,
-                            bucket.idx, tmp_out);
-
-        rank -= new_bucket.begin;
-        bucket.size = new_bucket.size;
-        // we should never need more than 5 recursion steps, this would mean
-        // 256^5 = 2^40. fall back to standard library algorithm in that case.
-        ++step;
-        if (step > 5) {
-            array<AbsType> cpu_out_array{
-                exec->get_master(),
-                make_array_view(exec, bucket.size, tmp_out)};
-            auto begin = cpu_out_array.get_data();
-            auto end = begin + bucket.size;
-            auto middle = begin + rank;
-            std::nth_element(begin, middle, end);
-            threshold = *middle;
-            return;
-        }
-    }
-
-    // base case
-    auto out_ptr = reinterpret_cast<AbsType*>(tmp1.get_data());
-    kernel::basecase_select<<<1, kernel::basecase_block_size, 0,
-                              exec->get_stream()>>>(
-        as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr));
-    threshold = exec->copy_val_to_host(out_ptr);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/factorization/par_ilut_spgeam_kernels.hip.cpp b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp
deleted file mode 100644
index cd9d7b7124a..00000000000
--- a/hip/factorization/par_ilut_spgeam_kernels.hip.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/coo_builder.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/merging.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
-#include "hip/components/searching.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The parallel ILUT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilut_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-// subwarp sizes for add_candidates kernels
-using compiled_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
-#include "common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void add_candidates(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* lu,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Csr<ValueType, IndexType>* u,
-                    matrix::Csr<ValueType, IndexType>* l_new,
-                    matrix::Csr<ValueType, IndexType>* u_new)
-{
-    auto num_rows = static_cast<IndexType>(lu->get_size()[0]);
-    auto subwarps_per_block = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
-    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
-    matrix::CsrBuilder<ValueType, IndexType> u_new_builder(u_new);
-    auto lu_row_ptrs = lu->get_const_row_ptrs();
-    auto lu_col_idxs = lu->get_const_col_idxs();
-    auto lu_vals = lu->get_const_values();
-    auto a_row_ptrs = a->get_const_row_ptrs();
-    auto a_col_idxs = a->get_const_col_idxs();
-    auto a_vals = a->get_const_values();
-    auto l_row_ptrs = l->get_const_row_ptrs();
-    auto l_col_idxs = l->get_const_col_idxs();
-    auto l_vals = l->get_const_values();
-    auto u_row_ptrs = u->get_const_row_ptrs();
-    auto u_col_idxs = u->get_const_col_idxs();
-    auto u_vals = u->get_const_values();
-    auto l_new_row_ptrs = l_new->get_row_ptrs();
-    auto u_new_row_ptrs = u_new->get_row_ptrs();
-    if (num_blocks > 0) {
-        // count non-zeros per row
-        kernel::tri_spgeam_nnz<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs,
-                l_new_row_ptrs, u_new_row_ptrs, num_rows);
-    }
-
-    // build row ptrs
-    components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);
-    components::prefix_sum_nonnegative(exec, u_new_row_ptrs, num_rows + 1);
-
-    // resize output arrays
-    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
-    auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows);
-    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
-    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
-    u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz);
-    u_new_builder.get_value_array().resize_and_reset(u_new_nnz);
-
-    auto l_new_col_idxs = l_new->get_col_idxs();
-    auto l_new_vals = l_new->get_values();
-    auto u_new_col_idxs = u_new->get_col_idxs();
-    auto u_new_vals = u_new->get_values();
-
-    if (num_blocks > 0) {
-        // fill columns and values
-        kernel::tri_spgeam_init<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs,
-                a_col_idxs, as_device_type(a_vals), l_row_ptrs, l_col_idxs,
-                as_device_type(l_vals), u_row_ptrs, u_col_idxs,
-                as_device_type(u_vals), l_new_row_ptrs, l_new_col_idxs,
-                as_device_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs,
-                as_device_type(u_new_vals), num_rows);
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* lu,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Csr<ValueType, IndexType>* u,
-                    matrix::Csr<ValueType, IndexType>* l_new,
-                    matrix::Csr<ValueType, IndexType>* u_new)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz =
-        lu->get_num_stored_elements() + a->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_add_candidates(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, lu, a, l, u, l_new,
-        u_new);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/factorization/par_ilut_sweep_kernels.hip.cpp b/hip/factorization/par_ilut_sweep_kernels.hip.cpp
deleted file mode 100644
index 26672fd2acb..00000000000
--- a/hip/factorization/par_ilut_sweep_kernels.hip.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/coo_builder.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/memory.hip.hpp"
-#include "hip/components/merging.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/searching.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The parallel ILUT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilut_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-// subwarp sizes for all warp-parallel kernels (filter, add_candidates)
-using compiled_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
-#include "common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void compute_l_u_factors(syn::value_list<int, subwarp_size>,
-                         std::shared_ptr<const DefaultExecutor> exec,
-                         const matrix::Csr<ValueType, IndexType>* a,
-                         matrix::Csr<ValueType, IndexType>* l,
-                         const matrix::Coo<ValueType, IndexType>* l_coo,
-                         matrix::Csr<ValueType, IndexType>* u,
-                         const matrix::Coo<ValueType, IndexType>* u_coo,
-                         matrix::Csr<ValueType, IndexType>* u_csc)
-{
-    auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements() +
-                                            u->get_num_stored_elements());
-    auto block_size = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(total_nnz, block_size);
-    if (num_blocks > 0) {
-        kernel::sweep<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                a->get_const_row_ptrs(), a->get_const_col_idxs(),
-                as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
-                l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
-                as_device_type(l->get_values()),
-                static_cast<IndexType>(l->get_num_stored_elements()),
-                u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(),
-                as_device_type(u->get_values()), u_csc->get_const_row_ptrs(),
-                u_csc->get_const_col_idxs(),
-                as_device_type(u_csc->get_values()),
-                static_cast<IndexType>(u->get_num_stored_elements()));
-    }
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors,
-                                    compute_l_u_factors);
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
-                         const matrix::Csr<ValueType, IndexType>* a,
-                         matrix::Csr<ValueType, IndexType>* l,
-                         const matrix::Coo<ValueType, IndexType>* l_coo,
-                         matrix::Csr<ValueType, IndexType>* u,
-                         const matrix::Coo<ValueType, IndexType>* u_coo,
-                         matrix::Csr<ValueType, IndexType>* u_csc)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz =
-        l->get_num_stored_elements() + u->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_compute_l_u_factors(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo,
-        u_csc);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/log/batch_logger.hip.hpp b/hip/log/batch_logger.hip.hpp
deleted file mode 100644
index a2540f2bd9d..00000000000
--- a/hip/log/batch_logger.hip.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_
-#define GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_
-
-
-#include <ginkgo/core/base/types.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace batch_log {
-
-#include "common/cuda_hip/log/batch_logger.hpp.inc"
-
-
-}  // namespace batch_log
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_
diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp
deleted file mode 100644
index 2b5e02a1c31..00000000000
--- a/hip/matrix/batch_csr_kernels.hip.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_csr_kernels.hpp"
-
-
-#include <thrust/functional.h>
-
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_csr.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Csr matrix format namespace.
- * @ref Csr
- * @ingroup batch_csr
- */
-namespace batch_csr {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_csr
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp
deleted file mode 100644
index c53a271598e..00000000000
--- a/hip/matrix/batch_dense_kernels.hip.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_dense_kernels.hpp"
-
-
-#include <thrust/functional.h>
-
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/batch_dense.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Dense matrix format namespace.
- *
- * @ingroup batch_dense
- */
-namespace batch_dense {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc"
-
-
-// clang-format on
-
-
-}  // namespace batch_dense
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp
deleted file mode 100644
index c6ef298803b..00000000000
--- a/hip/matrix/batch_ell_kernels.hip.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/batch_ell_kernels.hpp"
-
-
-#include <thrust/functional.h>
-
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_ell.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_struct.hpp"
-#include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-#include "hip/matrix/batch_struct.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Ell matrix format namespace.
- * @ref Ell
- * @ingroup batch_ell
- */
-namespace batch_ell {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_ell
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/coo_kernels.hip.cpp b/hip/matrix/coo_kernels.hip.cpp
deleted file mode 100644
index 35bc698a4de..00000000000
--- a/hip/matrix/coo_kernels.hip.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/coo_kernels.hpp"
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/format_conversion.hip.hpp"
-#include "hip/components/segment_scan.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Coordinate matrix format namespace.
- *
- * @ingroup coo
- */
-namespace coo {
-
-
-constexpr int warps_in_block = 4;
-constexpr int spmv_block_size = warps_in_block * config::warp_size;
-
-
-#include "common/cuda_hip/matrix/coo_kernels.hpp.inc"
-
-
-}  // namespace coo
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
deleted file mode 100644
index 50bd975411e..00000000000
--- a/hip/matrix/dense_kernels.hip.cpp
+++ /dev/null
@@ -1,232 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/dense_kernels.hpp"
-
-
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/range_accessors.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/diagonal.hpp>
-#include <ginkgo/core/matrix/ell.hpp>
-#include <ginkgo/core/matrix/fbcsr.hpp>
-#include <ginkgo/core/matrix/hybrid.hpp>
-#include <ginkgo/core/matrix/sellp.hpp>
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/utils.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipblas_bindings.hip.hpp"
-#include "hip/base/pointer_mode_guard.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Dense matrix format namespace.
- *
- * @ingroup dense
- */
-namespace dense {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/matrix/dense_kernels.hpp.inc"
-
-
-template <typename ValueType>
-void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
-                          const matrix::Dense<ValueType>* x,
-                          const matrix::Dense<ValueType>* y,
-                          matrix::Dense<ValueType>* result, array<char>& tmp)
-{
-    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
-        if (blas::is_supported<ValueType>::value) {
-            auto handle = exec->get_blas_handle();
-            blas::dot(handle, x->get_size()[0], x->get_const_values(),
-                      x->get_stride(), y->get_const_values(), y->get_stride(),
-                      result->get_values());
-        } else {
-            compute_dot(exec, x, y, result, tmp);
-        }
-    } else {
-        compute_dot(exec, x, y, result, tmp);
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
-
-
-template <typename ValueType>
-void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
-                               const matrix::Dense<ValueType>* x,
-                               const matrix::Dense<ValueType>* y,
-                               matrix::Dense<ValueType>* result,
-                               array<char>& tmp)
-{
-    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
-        if (blas::is_supported<ValueType>::value) {
-            auto handle = exec->get_blas_handle();
-            blas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
-                           x->get_stride(), y->get_const_values(),
-                           y->get_stride(), result->get_values());
-        } else {
-            compute_conj_dot(exec, x, y, result, tmp);
-        }
-    } else {
-        compute_conj_dot(exec, x, y, result, tmp);
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
-
-
-template <typename ValueType>
-void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
-                            const matrix::Dense<ValueType>* x,
-                            matrix::Dense<remove_complex<ValueType>>* result,
-                            array<char>& tmp)
-{
-    if (x->get_size()[1] == 1) {
-        if (blas::is_supported<ValueType>::value) {
-            auto handle = exec->get_blas_handle();
-            blas::norm2(handle, x->get_size()[0], x->get_const_values(),
-                        x->get_stride(), result->get_values());
-        } else {
-            compute_norm2(exec, x, result, tmp);
-        }
-    } else {
-        compute_norm2(exec, x, result, tmp);
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
-
-
-template <typename ValueType>
-void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
-                  const matrix::Dense<ValueType>* a,
-                  const matrix::Dense<ValueType>* b,
-                  matrix::Dense<ValueType>* c)
-{
-    if (blas::is_supported<ValueType>::value) {
-        auto handle = exec->get_blas_handle();
-        if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
-            if (a->get_size()[1] > 0) {
-                blas::pointer_mode_guard pm_guard(handle);
-                auto alpha = one<ValueType>();
-                auto beta = zero<ValueType>();
-                blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1],
-                           c->get_size()[0], a->get_size()[1], &alpha,
-                           b->get_const_values(), b->get_stride(),
-                           a->get_const_values(), a->get_stride(), &beta,
-                           c->get_values(), c->get_stride());
-            } else {
-                dense::fill(exec, c, zero<ValueType>());
-            }
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
-
-
-template <typename ValueType>
-void apply(std::shared_ptr<const DefaultExecutor> exec,
-           const matrix::Dense<ValueType>* alpha,
-           const matrix::Dense<ValueType>* a, const matrix::Dense<ValueType>* b,
-           const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
-{
-    if (blas::is_supported<ValueType>::value) {
-        if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
-            if (a->get_size()[1] > 0) {
-                blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N,
-                           c->get_size()[1], c->get_size()[0], a->get_size()[1],
-                           alpha->get_const_values(), b->get_const_values(),
-                           b->get_stride(), a->get_const_values(),
-                           a->get_stride(), beta->get_const_values(),
-                           c->get_values(), c->get_stride());
-            } else {
-                dense::scale(exec, beta, c);
-            }
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
-
-
-template <typename ValueType>
-void transpose(std::shared_ptr<const DefaultExecutor> exec,
-               const matrix::Dense<ValueType>* orig,
-               matrix::Dense<ValueType>* trans)
-{
-    if (blas::is_supported<ValueType>::value) {
-        auto handle = exec->get_blas_handle();
-        if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
-            blas::pointer_mode_guard pm_guard(handle);
-            auto alpha = one<ValueType>();
-            auto beta = zero<ValueType>();
-            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0],
-                       orig->get_size()[1], &alpha, orig->get_const_values(),
-                       orig->get_stride(), &beta, trans->get_const_values(),
-                       trans->get_stride(), trans->get_values(),
-                       trans->get_stride());
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-};
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
-
-
-template <typename ValueType>
-void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Dense<ValueType>* orig,
-                    matrix::Dense<ValueType>* trans)
-{
-    if (blas::is_supported<ValueType>::value) {
-        auto handle = exec->get_blas_handle();
-        if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
-            blas::pointer_mode_guard pm_guard(handle);
-            auto alpha = one<ValueType>();
-            auto beta = zero<ValueType>();
-            blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0],
-                       orig->get_size()[1], &alpha, orig->get_const_values(),
-                       orig->get_stride(), &beta, trans->get_const_values(),
-                       trans->get_stride(), trans->get_values(),
-                       trans->get_stride());
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
-
-
-}  // namespace dense
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/diagonal_kernels.hip.cpp b/hip/matrix/diagonal_kernels.hip.cpp
deleted file mode 100644
index d707fda9108..00000000000
--- a/hip/matrix/diagonal_kernels.hip.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/diagonal_kernels.hpp"
-
-
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Diagonal matrix format namespace.
- *
- * @ingroup diagonal
- */
-namespace diagonal {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/matrix/diagonal_kernels.hpp.inc"
-
-
-}  // namespace diagonal
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp
deleted file mode 100644
index 669b0934165..00000000000
--- a/hip/matrix/ell_kernels.hip.cpp
+++ /dev/null
@@ -1,272 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/ell_kernels.hpp"
-
-
-#include <array>
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "accessor/hip_helper.hpp"
-#include "accessor/reduced_row_major.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/mixed_precision_types.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/format_conversion.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The ELL matrix format namespace.
- *
- * @ingroup ell
- */
-namespace ell {
-
-
-constexpr int default_block_size = 512;
-
-
-// TODO: num_threads_per_core and ratio are parameters should be tuned
-/**
- * num_threads_per_core is the oversubscribing parameter. There are
- * `num_threads_per_core` threads assigned to each physical core.
- */
-constexpr int num_threads_per_core = 4;
-
-
-/**
- * ratio is the parameter to decide when to use threads to do reduction on each
- * row. (#cols/#rows > ratio)
- */
-constexpr double ratio = 1e-2;
-
-
-/**
- * max_thread_per_worker is the max number of thread per worker. The
- * `compiled_kernels` must be a list <0, 1, 2, ..., max_thread_per_worker>
- */
-constexpr int max_thread_per_worker = 32;
-
-
-/**
- * A compile-time list of sub-warp sizes for which the spmv kernels should be
- * compiled.
- * 0 is a special case where it uses a sub-warp size of warp_size in
- * combination with atomic_adds.
- */
-using compiled_kernels = syn::value_list<int, 0, 1, 2, 4, 8, 16, 32>;
-
-
-#include "common/cuda_hip/matrix/ell_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <int info, typename InputValueType, typename MatrixValueType,
-          typename OutputValueType, typename IndexType>
-void abstract_spmv(syn::value_list<int, info>,
-                   std::shared_ptr<const DefaultExecutor> exec,
-                   int num_worker_per_row,
-                   const matrix::Ell<MatrixValueType, IndexType>* a,
-                   const matrix::Dense<InputValueType>* b,
-                   matrix::Dense<OutputValueType>* c,
-                   const matrix::Dense<MatrixValueType>* alpha = nullptr,
-                   const matrix::Dense<OutputValueType>* beta = nullptr)
-{
-    using arithmetic_type =
-        highest_precision<InputValueType, OutputValueType, MatrixValueType>;
-    using a_accessor =
-        acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>;
-    using b_accessor =
-        acc::reduced_row_major<2, arithmetic_type, const InputValueType>;
-
-    const auto nrows = a->get_size()[0];
-    const auto stride = a->get_stride();
-    const auto num_stored_elements_per_row =
-        a->get_num_stored_elements_per_row();
-
-    constexpr int num_thread_per_worker =
-        (info == 0) ? max_thread_per_worker : info;
-    constexpr bool atomic = (info == 0);
-    const dim3 block_size(default_block_size / num_thread_per_worker,
-                          num_thread_per_worker, 1);
-    const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x),
-                         b->get_size()[1], 1);
-
-    const auto a_vals = acc::range<a_accessor>(
-        std::array<acc::size_type, 1>{{static_cast<acc::size_type>(
-            num_stored_elements_per_row * stride)}},
-        a->get_const_values());
-    const auto b_vals = acc::range<b_accessor>(
-        std::array<acc::size_type, 2>{
-            {static_cast<acc::size_type>(b->get_size()[0]),
-             static_cast<acc::size_type>(b->get_size()[1])}},
-        b->get_const_values(),
-        std::array<acc::size_type, 1>{
-            {static_cast<acc::size_type>(b->get_stride())}});
-
-    if (alpha == nullptr && beta == nullptr) {
-        if (grid_size.x > 0 && grid_size.y > 0) {
-            kernel::spmv<num_thread_per_worker, atomic>
-                <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    nrows, num_worker_per_row, acc::as_hip_range(a_vals),
-                    a->get_const_col_idxs(), stride,
-                    num_stored_elements_per_row, acc::as_hip_range(b_vals),
-                    as_device_type(c->get_values()), c->get_stride());
-        }
-    } else if (alpha != nullptr && beta != nullptr) {
-        const auto alpha_val = acc::range<a_accessor>(
-            std::array<acc::size_type, 1>{1}, alpha->get_const_values());
-        if (grid_size.x > 0 && grid_size.y > 0) {
-            kernel::spmv<num_thread_per_worker, atomic>
-                <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    nrows, num_worker_per_row, acc::as_hip_range(alpha_val),
-                    acc::as_hip_range(a_vals), a->get_const_col_idxs(), stride,
-                    num_stored_elements_per_row, acc::as_hip_range(b_vals),
-                    as_device_type(beta->get_const_values()),
-                    as_device_type(c->get_values()), c->get_stride());
-        }
-    } else {
-        GKO_KERNEL_NOT_FOUND;
-    }
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_abstract_spmv, abstract_spmv);
-
-
-template <typename ValueType, typename IndexType>
-std::array<int, 3> compute_thread_worker_and_atomicity(
-    std::shared_ptr<const HipExecutor> exec,
-    const matrix::Ell<ValueType, IndexType>* a)
-{
-    int num_thread_per_worker = 1;
-    int atomic = 0;
-    int num_worker_per_row = 1;
-
-    const auto nrows = a->get_size()[0];
-    const auto ell_ncols = a->get_num_stored_elements_per_row();
-    // TODO: num_threads_per_core should be tuned for AMD gpu
-    const auto nwarps = exec->get_num_warps_per_sm() *
-                        exec->get_num_multiprocessor() * num_threads_per_core;
-
-    // Use multithreads to perform the reduction on each row when the matrix is
-    // wide.
-    // To make every thread have computation, so pick the value which is the
-    // power of 2 less than max_thread_per_worker and is less than or equal to
-    // ell_ncols. If the num_thread_per_worker is max_thread_per_worker and
-    // allow more than one worker to work on the same row, use atomic add to
-    // handle the worker write the value into the same position. The #worker is
-    // decided according to the number of worker allowed on GPU.
-    if (static_cast<double>(ell_ncols) / nrows > ratio) {
-        while (num_thread_per_worker < max_thread_per_worker &&
-               (num_thread_per_worker << 1) <= ell_ncols) {
-            num_thread_per_worker <<= 1;
-        }
-        if (num_thread_per_worker == max_thread_per_worker) {
-            num_worker_per_row =
-                std::min(ell_ncols / max_thread_per_worker, nwarps / nrows);
-            num_worker_per_row = std::max(num_worker_per_row, 1);
-        }
-        if (num_worker_per_row > 1) {
-            atomic = 1;
-        }
-    }
-    return {num_thread_per_worker, atomic, num_worker_per_row};
-}
-
-
-}  // namespace
-
-
-template <typename InputValueType, typename MatrixValueType,
-          typename OutputValueType, typename IndexType>
-void spmv(std::shared_ptr<const HipExecutor> exec,
-          const matrix::Ell<MatrixValueType, IndexType>* a,
-          const matrix::Dense<InputValueType>* b,
-          matrix::Dense<OutputValueType>* c)
-{
-    const auto data = compute_thread_worker_and_atomicity(exec, a);
-    const int num_thread_per_worker = std::get<0>(data);
-    const int atomic = std::get<1>(data);
-    const int num_worker_per_row = std::get<2>(data);
-
-    /**
-     * info is the parameter for selecting the device kernel.
-     * for info == 0, it uses the kernel by warp_size threads with atomic
-     * operation for other value, it uses the kernel without atomic_add
-     */
-    const int info = (!atomic) * num_thread_per_worker;
-    if (atomic) {
-        dense::fill(exec, c, zero<OutputValueType>());
-    }
-    select_abstract_spmv(
-        compiled_kernels(),
-        [&info](int compiled_info) { return info == compiled_info; },
-        syn::value_list<int>(), syn::type_list<>(), exec, num_worker_per_row, a,
-        b, c);
-}
-
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_ELL_SPMV_KERNEL);
-
-
-template <typename InputValueType, typename MatrixValueType,
-          typename OutputValueType, typename IndexType>
-void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
-                   const matrix::Dense<MatrixValueType>* alpha,
-                   const matrix::Ell<MatrixValueType, IndexType>* a,
-                   const matrix::Dense<InputValueType>* b,
-                   const matrix::Dense<OutputValueType>* beta,
-                   matrix::Dense<OutputValueType>* c)
-{
-    const auto data = compute_thread_worker_and_atomicity(exec, a);
-    const int num_thread_per_worker = std::get<0>(data);
-    const int atomic = std::get<1>(data);
-    const int num_worker_per_row = std::get<2>(data);
-
-    /**
-     * info is the parameter for selecting the device kernel.
-     * for info == 0, it uses the kernel by warp_size threads with atomic
-     * operation for other value, it uses the kernel without atomic_add
-     */
-    const int info = (!atomic) * num_thread_per_worker;
-    if (atomic) {
-        dense::scale(exec, beta, c);
-    }
-    select_abstract_spmv(
-        compiled_kernels(),
-        [&info](int compiled_info) { return info == compiled_info; },
-        syn::value_list<int>(), syn::type_list<>(), exec, num_worker_per_row, a,
-        b, c, alpha, beta);
-}
-
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_ELL_ADVANCED_SPMV_KERNEL);
-
-
-}  // namespace ell
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/fbcsr_kernels.template.hip.cpp b/hip/matrix/fbcsr_kernels.template.hip.cpp
deleted file mode 100644
index 8e4519e74e5..00000000000
--- a/hip/matrix/fbcsr_kernels.template.hip.cpp
+++ /dev/null
@@ -1,303 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/fbcsr_kernels.hpp"
-
-
-#include <algorithm>
-
-
-#include <thrust/copy.h>
-#include <thrust/count.h>
-#include <thrust/device_ptr.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_output_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/sort.h>
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/unified/base/kernel_launch.hpp"
-#include "core/base/array_access.hpp"
-#include "core/base/block_sizes.hpp"
-#include "core/base/device_matrix_data_kernels.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/components/format_conversion_kernels.hpp"
-#include "core/matrix/csr_lookup.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipblas_bindings.hip.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
-#include "hip/base/hipsparse_block_bindings.hip.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/base/pointer_mode_guard.hip.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/merging.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-
-
-/**
- * @brief The fixed-size block compressed sparse row matrix format namespace.
- *
- * @ingroup fbcsr
- */
-namespace fbcsr {
-
-
-constexpr int default_block_size{512};
-
-
-#include "common/cuda_hip/matrix/csr_common.hpp.inc"
-#include "common/cuda_hip/matrix/fbcsr_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <typename ValueType>
-void dense_transpose(std::shared_ptr<const HipExecutor> exec,
-                     const size_type nrows, const size_type ncols,
-                     const size_type orig_stride, const ValueType* const orig,
-                     const size_type trans_stride, ValueType* const trans)
-{
-    if (nrows == 0) {
-        return;
-    }
-    if (blas::is_supported<ValueType>::value) {
-        auto handle = exec->get_blas_handle();
-        {
-            blas::pointer_mode_guard pm_guard(handle);
-            auto alpha = one<ValueType>();
-            auto beta = zero<ValueType>();
-            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig,
-                       orig_stride, &beta, trans, trans_stride, trans,
-                       trans_stride);
-        }
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void spmv(std::shared_ptr<const HipExecutor> exec,
-          const matrix::Fbcsr<ValueType, IndexType>* const a,
-          const matrix::Dense<ValueType>* const b,
-          matrix::Dense<ValueType>* const c)
-{
-    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
-        // empty output: nothing to do
-        return;
-    }
-    if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) {
-        // empty input: fill output with zero
-        dense::fill(exec, c, zero<ValueType>());
-        return;
-    }
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_sparselib_handle();
-        sparselib::pointer_mode_guard pm_guard(handle);
-        const auto alpha = one<ValueType>();
-        const auto beta = zero<ValueType>();
-        auto descr = sparselib::create_mat_descr();
-        const auto row_ptrs = a->get_const_row_ptrs();
-        const auto col_idxs = a->get_const_col_idxs();
-        const auto values = a->get_const_values();
-        const int bs = a->get_block_size();
-        const IndexType mb = a->get_num_block_rows();
-        const IndexType nb = a->get_num_block_cols();
-        const auto nnzb = static_cast<IndexType>(a->get_num_stored_blocks());
-        const auto nrhs = static_cast<IndexType>(b->get_size()[1]);
-        const auto nrows = a->get_size()[0];
-        const auto ncols = a->get_size()[1];
-        const auto in_stride = b->get_stride();
-        const auto out_stride = c->get_stride();
-        if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
-            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
-                             nnzb, &alpha, descr, values, row_ptrs, col_idxs,
-                             bs, b->get_const_values(), &beta, c->get_values());
-        } else {
-            const auto trans_stride = nrows;
-            auto trans_c = array<ValueType>(exec, nrows * nrhs);
-            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
-                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
-                             &alpha, descr, values, row_ptrs, col_idxs, bs,
-                             b->get_const_values(), in_stride, &beta,
-                             trans_c.get_data(), trans_stride);
-            dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
-                            out_stride, c->get_values());
-        }
-        sparselib::destroy(descr);
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
-                   const matrix::Dense<ValueType>* const alpha,
-                   const matrix::Fbcsr<ValueType, IndexType>* const a,
-                   const matrix::Dense<ValueType>* const b,
-                   const matrix::Dense<ValueType>* const beta,
-                   matrix::Dense<ValueType>* const c)
-{
-    if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
-        // empty output: nothing to do
-        return;
-    }
-    if (b->get_size()[0] == 0 || a->get_num_stored_blocks() == 0) {
-        // empty input: scale output
-        dense::scale(exec, beta, c);
-        return;
-    }
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_sparselib_handle();
-        const auto alphp = alpha->get_const_values();
-        const auto betap = beta->get_const_values();
-        auto descr = sparselib::create_mat_descr();
-        const auto row_ptrs = a->get_const_row_ptrs();
-        const auto col_idxs = a->get_const_col_idxs();
-        const auto values = a->get_const_values();
-        const int bs = a->get_block_size();
-        const IndexType mb = a->get_num_block_rows();
-        const IndexType nb = a->get_num_block_cols();
-        const auto nnzb = static_cast<IndexType>(a->get_num_stored_blocks());
-        const auto nrhs = static_cast<IndexType>(b->get_size()[1]);
-        const auto nrows = a->get_size()[0];
-        const auto ncols = a->get_size()[1];
-        const auto in_stride = b->get_stride();
-        const auto out_stride = c->get_stride();
-        if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
-            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
-                             nnzb, alphp, descr, values, row_ptrs, col_idxs, bs,
-                             b->get_const_values(), betap, c->get_values());
-        } else {
-            const auto trans_stride = nrows;
-            auto trans_c = array<ValueType>(exec, nrows * nrhs);
-            dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(),
-                            trans_stride, trans_c.get_data());
-            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
-                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
-                             alphp, descr, values, row_ptrs, col_idxs, bs,
-                             b->get_const_values(), in_stride, betap,
-                             trans_c.get_data(), trans_stride);
-            dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
-                            out_stride, c->get_values());
-        }
-        sparselib::destroy(descr);
-    } else {
-        GKO_NOT_IMPLEMENTED;
-    }
-}
-
-
-namespace {
-
-
-template <int mat_blk_sz, typename ValueType, typename IndexType>
-void transpose_blocks_impl(syn::value_list<int, mat_blk_sz>,
-                           std::shared_ptr<const DefaultExecutor> exec,
-                           matrix::Fbcsr<ValueType, IndexType>* const mat)
-{
-    constexpr int subwarp_size = config::warp_size;
-    const auto nbnz = mat->get_num_stored_blocks();
-    const auto numthreads = nbnz * subwarp_size;
-    const auto block_size = default_block_size;
-    const auto grid_dim = ceildiv(numthreads, block_size);
-    if (grid_dim > 0) {
-        kernel::transpose_blocks<mat_blk_sz, subwarp_size>
-            <<<grid_dim, block_size, 0, exec->get_stream()>>>(
-                nbnz, mat->get_values());
-    }
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks,
-                                    transpose_blocks_impl);
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void transpose(const std::shared_ptr<const DefaultExecutor> exec,
-               const matrix::Fbcsr<ValueType, IndexType>* const orig,
-               matrix::Fbcsr<ValueType, IndexType>* const trans)
-{
-#ifdef GKO_COMPILING_CUDA
-    if (sparselib::is_supported<ValueType, IndexType>::value) {
-        const int bs = orig->get_block_size();
-        const IndexType nnzb =
-            static_cast<IndexType>(orig->get_num_stored_blocks());
-        cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
-        cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
-        const IndexType buffer_size = sparselib::bsr_transpose_buffersize(
-            exec->get_sparselib_handle(), orig->get_num_block_rows(),
-            orig->get_num_block_cols(), nnzb, orig->get_const_values(),
-            orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs);
-        array<char> buffer_array(exec, buffer_size);
-        auto buffer = buffer_array.get_data();
-        sparselib::bsr_transpose(
-            exec->get_sparselib_handle(), orig->get_num_block_rows(),
-            orig->get_num_block_cols(), nnzb, orig->get_const_values(),
-            orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs,
-            trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(),
-            copyValues, idxBase, buffer);
-
-        // transpose blocks
-        select_transpose_blocks(
-            fixedblock::compiled_kernels(),
-            [bs](int compiled_block_size) { return bs == compiled_block_size; },
-            syn::value_list<int>(), syn::type_list<>(), exec, trans);
-    } else
-#endif
-    {
-        fallback_transpose(exec, orig, trans);
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void conj_transpose(std::shared_ptr<const HipExecutor> exec,
-                    const matrix::Fbcsr<ValueType, IndexType>* orig,
-                    matrix::Fbcsr<ValueType, IndexType>* trans)
-{
-    const int grid_size =
-        ceildiv(trans->get_num_stored_elements(), default_block_size);
-    transpose(exec, orig, trans);
-    if (grid_size > 0 && is_complex<ValueType>()) {
-        kernel::
-            conjugate<<<grid_size, default_block_size, 0, exec->get_stream()>>>(
-                trans->get_num_stored_elements(),
-                as_device_type(trans->get_values()));
-    }
-}
-
-
-}  // namespace fbcsr
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp
deleted file mode 100644
index 16b139987a2..00000000000
--- a/hip/matrix/sellp_kernels.hip.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/matrix/sellp_kernels.hpp"
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The SELL-P matrix format namespace.
- *
- * @ingroup sellp
- */
-namespace sellp {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/matrix/sellp_kernels.hpp.inc"
-
-
-}  // namespace sellp
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/multigrid/pgm_kernels.hip.cpp b/hip/multigrid/pgm_kernels.hip.cpp
deleted file mode 100644
index ed81d1c66dc..00000000000
--- a/hip/multigrid/pgm_kernels.hip.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/multigrid/pgm_kernels.hpp"
-
-
-#include <memory>
-
-
-#include <thrust/device_ptr.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/reduce.h>
-#include <thrust/sort.h>
-#include <thrust/tuple.h>
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-
-
-#include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The PGM solver namespace.
- *
- * @ingroup pgm
- */
-namespace pgm {
-
-
-#include "common/cuda_hip/multigrid/pgm_kernels.hpp.inc"
-
-
-}  // namespace pgm
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/preconditioner/isai_kernels.hip.cpp b/hip/preconditioner/isai_kernels.hip.cpp
deleted file mode 100644
index 11e0e229abc..00000000000
--- a/hip/preconditioner/isai_kernels.hip.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/preconditioner/isai_kernels.hpp"
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/executor.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/merging.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-#include "hip/components/warp_blas.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Isai preconditioner namespace.
- * @ref Isai
- * @ingroup isai
- */
-namespace isai {
-
-
-constexpr int subwarp_size{row_size_limit};
-constexpr int subwarps_per_block{2};
-constexpr int default_block_size{subwarps_per_block * subwarp_size};
-
-
-#include "common/cuda_hip/preconditioner/isai_kernels.hpp.inc"
-
-
-}  // namespace isai
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp
deleted file mode 100644
index 292b040ff1a..00000000000
--- a/hip/preconditioner/jacobi_kernels.hip.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/extended_float.hpp"
-#include "core/preconditioner/jacobi_utils.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/preconditioner/jacobi_common.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The Jacobi preconditioner namespace.
- * @ref Jacobi
- * @ingroup jacobi
- */
-namespace jacobi {
-
-
-// a total of 32/16 warps (1024 threads)
-#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
-constexpr int default_num_warps = 16;
-#else  // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC
-constexpr int default_num_warps = 32;
-#endif
-// with current architectures, at most 32 warps can be scheduled per SM (and
-// current GPUs have at most 84 SMs)
-constexpr int default_grid_size = 32 * 32 * 128;
-
-
-#include "common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc"
-
-
-}  // namespace jacobi
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/reorder/rcm_kernels.hip.cpp b/hip/reorder/rcm_kernels.hip.cpp
deleted file mode 100644
index 0c83c728e79..00000000000
--- a/hip/reorder/rcm_kernels.hip.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/reorder/rcm_kernels.hpp"
-
-
-#include <thrust/binary_search.h>
-#include <thrust/copy.h>
-#include <thrust/count.h>
-#include <thrust/iterator/discard_iterator.h>
-#include <thrust/iterator/permutation_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/reduce.h>
-#include <thrust/sequence.h>
-#include <thrust/sort.h>
-#include <thrust/transform.h>
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/std_extensions.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/permutation.hpp>
-#include <ginkgo/core/matrix/sparsity_csr.hpp>
-
-
-#include "core/base/array_access.hpp"
-#include "hip/base/thrust.hip.hpp"
-#include "hip/components/memory.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The reordering namespace.
- *
- * @ingroup reorder
- */
-namespace rcm {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/reorder/rcm_kernels.hpp.inc"
-
-
-}  // namespace rcm
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp
deleted file mode 100644
index 75bb5475b4c..00000000000
--- a/hip/solver/cb_gmres_kernels.hip.cpp
+++ /dev/null
@@ -1,507 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/solver/cb_gmres_kernels.hpp"
-
-
-#include <algorithm>
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-#include <ginkgo/core/stop/stopping_status.hpp>
-
-
-#include "accessor/hip_helper.hpp"
-#include "accessor/range.hpp"
-#include "accessor/reduced_row_major.hpp"
-#include "accessor/scaled_reduced_row_major.hpp"
-#include "core/base/array_access.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "core/matrix/dense_kernels.hpp"
-#include "core/solver/cb_gmres_accessor.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The CB_GMRES solver namespace.
- *
- * @ingroup cb_gmres
- */
-namespace cb_gmres {
-
-
-constexpr int default_block_size = 512;
-// default_dot_dim can not be 64 in hip because 64 * 64 exceeds their max block
-// size limit.
-constexpr int default_dot_dim = 32;
-constexpr int default_dot_size = default_dot_dim * default_dot_dim;
-
-
-#include "common/cuda_hip/solver/cb_gmres_kernels.hpp.inc"
-
-
-template <typename ValueType>
-void zero_matrix(std::shared_ptr<const DefaultExecutor> exec, size_type m,
-                 size_type n, size_type stride, ValueType* array)
-{
-    const auto block_size = default_block_size;
-    const auto grid_size = ceildiv(n, block_size);
-    zero_matrix_kernel<<<grid_size, block_size, 0, exec->get_stream()>>>(
-        m, n, stride, as_device_type(array));
-}
-
-
-template <typename ValueType>
-void initialize(std::shared_ptr<const DefaultExecutor> exec,
-                const matrix::Dense<ValueType>* b,
-                matrix::Dense<ValueType>* residual,
-                matrix::Dense<ValueType>* givens_sin,
-                matrix::Dense<ValueType>* givens_cos,
-                array<stopping_status>* stop_status, size_type krylov_dim)
-{
-    const auto num_threads = std::max(b->get_size()[0] * b->get_stride(),
-                                      krylov_dim * b->get_size()[1]);
-    const auto grid_dim = ceildiv(num_threads, default_block_size);
-    const auto block_dim = default_block_size;
-    constexpr auto block_size = default_block_size;
-
-    initialize_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            b->get_size()[0], b->get_size()[1], krylov_dim,
-            as_device_type(b->get_const_values()), b->get_stride(),
-            as_device_type(residual->get_values()), residual->get_stride(),
-            as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
-            as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
-            as_device_type(stop_status->get_data()));
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL);
-
-
-template <typename ValueType, typename Accessor3d>
-void restart(std::shared_ptr<const DefaultExecutor> exec,
-             const matrix::Dense<ValueType>* residual,
-             matrix::Dense<remove_complex<ValueType>>* residual_norm,
-             matrix::Dense<ValueType>* residual_norm_collection,
-             matrix::Dense<remove_complex<ValueType>>* arnoldi_norm,
-             Accessor3d krylov_bases,
-             matrix::Dense<ValueType>* next_krylov_basis,
-             array<size_type>* final_iter_nums, array<char>& reduction_tmp,
-             size_type krylov_dim)
-{
-    constexpr bool use_scalar =
-        gko::cb_gmres::detail::has_3d_scaled_accessor<Accessor3d>::value;
-    const auto num_rows = residual->get_size()[0];
-    const auto num_rhs = residual->get_size()[1];
-    const auto krylov_stride =
-        gko::cb_gmres::helper_functions_accessor<Accessor3d>::get_stride(
-            krylov_bases);
-    const auto grid_dim_1 =
-        ceildiv((krylov_dim + 1) * krylov_stride[0], default_block_size);
-    const auto block_dim = default_block_size;
-    constexpr auto block_size = default_block_size;
-    const auto stride_arnoldi = arnoldi_norm->get_stride();
-
-    restart_1_kernel<block_size>
-        <<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
-            residual->get_size()[0], residual->get_size()[1], krylov_dim,
-            acc::as_hip_range(krylov_bases),
-            as_device_type(residual_norm_collection->get_values()),
-            residual_norm_collection->get_stride());
-    kernels::hip::dense::compute_norm2_dispatch(exec, residual, residual_norm,
-                                                reduction_tmp);
-
-    if (use_scalar) {
-        components::fill_array(exec,
-                               arnoldi_norm->get_values() + 2 * stride_arnoldi,
-                               num_rhs, zero<remove_complex<ValueType>>());
-        const dim3 grid_size_nrm(ceildiv(num_rhs, default_dot_dim),
-                                 exec->get_num_multiprocessor() * 2);
-        const dim3 block_size_nrm(default_dot_dim, default_dot_dim);
-        multinorminf_without_stop_kernel<<<grid_size_nrm, block_size_nrm, 0,
-                                           exec->get_stream()>>>(
-            num_rows, num_rhs, as_device_type(residual->get_const_values()),
-            residual->get_stride(),
-            as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi), 0);
-    }
-
-    if (gko::cb_gmres::detail::has_3d_scaled_accessor<Accessor3d>::value) {
-        set_scalar_kernel<default_block_size>
-            <<<ceildiv(num_rhs * (krylov_dim + 1), default_block_size),
-               default_block_size, 0, exec->get_stream()>>>(
-                num_rhs, krylov_dim + 1,
-                as_device_type(residual_norm->get_const_values()),
-                residual_norm->get_stride(),
-                as_device_type(arnoldi_norm->get_const_values() +
-                               2 * stride_arnoldi),
-                stride_arnoldi, acc::as_hip_range(krylov_bases));
-    }
-
-    const auto grid_dim_2 =
-        ceildiv(std::max<size_type>(num_rows, 1) * krylov_stride[1],
-                default_block_size);
-    restart_2_kernel<block_size>
-        <<<grid_dim_2, block_dim, 0, exec->get_stream()>>>(
-            residual->get_size()[0], residual->get_size()[1],
-            as_device_type(residual->get_const_values()),
-            residual->get_stride(),
-            as_device_type(residual_norm->get_const_values()),
-            as_device_type(residual_norm_collection->get_values()),
-            acc::as_hip_range(krylov_bases),
-            as_device_type(next_krylov_basis->get_values()),
-            next_krylov_basis->get_stride(),
-            as_device_type(final_iter_nums->get_data()));
-}
-
-GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_RESTART_KERNEL);
-
-
-template <typename ValueType, typename Accessor3dim>
-void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
-                        matrix::Dense<ValueType>* next_krylov_basis,
-                        Accessor3dim krylov_bases,
-                        matrix::Dense<ValueType>* hessenberg_iter,
-                        matrix::Dense<ValueType>* buffer_iter,
-                        matrix::Dense<remove_complex<ValueType>>* arnoldi_norm,
-                        size_type iter, const stopping_status* stop_status,
-                        stopping_status* reorth_status,
-                        array<size_type>* num_reorth)
-{
-    const auto dim_size = next_krylov_basis->get_size();
-    if (dim_size[1] == 0) {
-        return;
-    }
-    using non_complex = remove_complex<ValueType>;
-    // optimization parameter
-    constexpr int singledot_block_size = default_dot_dim;
-    constexpr bool use_scalar =
-        gko::cb_gmres::detail::has_3d_scaled_accessor<Accessor3dim>::value;
-    const auto stride_next_krylov = next_krylov_basis->get_stride();
-    const auto stride_hessenberg = hessenberg_iter->get_stride();
-    const auto stride_buffer = buffer_iter->get_stride();
-    const auto stride_arnoldi = arnoldi_norm->get_stride();
-    const dim3 grid_size(ceildiv(dim_size[1], default_dot_dim),
-                         exec->get_num_multiprocessor() * 2);
-    const dim3 grid_size_num_iters(ceildiv(dim_size[1], default_dot_dim),
-                                   exec->get_num_multiprocessor() * 2,
-                                   iter + 1);
-    const dim3 block_size(default_dot_dim, default_dot_dim);
-    // Note: having iter first (instead of row_idx information) is likely
-    //       beneficial for avoiding atomic_add conflicts, but that needs
-    //       further investigation.
-    const dim3 grid_size_iters_single(exec->get_num_multiprocessor() * 2,
-                                      iter + 1);
-    const auto block_size_iters_single = singledot_block_size;
-    size_type num_reorth_host;
-
-    components::fill_array(exec, arnoldi_norm->get_values(), dim_size[1],
-                           zero<non_complex>());
-    multinorm2_kernel<<<grid_size, block_size, 0, exec->get_stream()>>>(
-        dim_size[0], dim_size[1],
-        as_device_type(next_krylov_basis->get_const_values()),
-        stride_next_krylov, as_device_type(arnoldi_norm->get_values()),
-        as_device_type(stop_status));
-    // nrmP = norm(next_krylov_basis)
-    zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg,
-                hessenberg_iter->get_values());
-    if (dim_size[1] > 1) {
-        multidot_kernel<default_dot_dim>
-            <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
-                dim_size[0], dim_size[1],
-                as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov, acc::as_hip_range(krylov_bases),
-                as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg, as_device_type(stop_status));
-    } else {
-        singledot_kernel<singledot_block_size>
-            <<<grid_size_iters_single, block_size_iters_single, 0,
-               exec->get_stream()>>>(
-                dim_size[0],
-                as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov, acc::as_hip_range(krylov_bases),
-                as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg, as_device_type(stop_status));
-    }
-    // for i in 1:iter
-    //     hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i)
-    // end
-    update_next_krylov_kernel<default_block_size>
-        <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
-           default_block_size, 0, exec->get_stream()>>>(
-            iter + 1, dim_size[0], dim_size[1],
-            as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
-            acc::as_hip_range(krylov_bases),
-            as_device_type(hessenberg_iter->get_const_values()),
-            stride_hessenberg, as_device_type(stop_status));
-
-    // for i in 1:iter
-    //     next_krylov_basis  -= hessenberg(iter, i) * krylov_bases(:, i)
-    // end
-    components::fill_array(exec, arnoldi_norm->get_values() + stride_arnoldi,
-                           dim_size[1], zero<non_complex>());
-    if (use_scalar) {
-        components::fill_array(exec,
-                               arnoldi_norm->get_values() + 2 * stride_arnoldi,
-                               dim_size[1], zero<non_complex>());
-    }
-    multinorm2_inf_kernel<use_scalar>
-        <<<grid_size, block_size, 0, exec->get_stream()>>>(
-            dim_size[0], dim_size[1],
-            as_device_type(next_krylov_basis->get_const_values()),
-            stride_next_krylov,
-            as_device_type(arnoldi_norm->get_values() + stride_arnoldi),
-            as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi),
-            as_device_type(stop_status));
-    // nrmN = norm(next_krylov_basis)
-    components::fill_array(exec, num_reorth->get_data(), 1, zero<size_type>());
-    check_arnoldi_norms<default_block_size>
-        <<<ceildiv(dim_size[1], default_block_size), default_block_size, 0,
-           exec->get_stream()>>>(
-            dim_size[1], as_device_type(arnoldi_norm->get_values()),
-            stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
-            stride_hessenberg, iter + 1, acc::as_hip_range(krylov_bases),
-            as_device_type(stop_status), as_device_type(reorth_status),
-            as_device_type(num_reorth->get_data()));
-    num_reorth_host = get_element(*num_reorth, 0);
-    // num_reorth_host := number of next_krylov vector to be reorthogonalization
-    for (size_type l = 1; (num_reorth_host > 0) && (l < 3); l++) {
-        zero_matrix(exec, iter + 1, dim_size[1], stride_buffer,
-                    buffer_iter->get_values());
-        if (dim_size[1] > 1) {
-            multidot_kernel<default_dot_dim>
-                <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
-                    dim_size[0], dim_size[1],
-                    as_device_type(next_krylov_basis->get_const_values()),
-                    stride_next_krylov, acc::as_hip_range(krylov_bases),
-                    as_device_type(buffer_iter->get_values()), stride_buffer,
-                    as_device_type(stop_status));
-        } else {
-            singledot_kernel<singledot_block_size>
-                <<<grid_size_iters_single, block_size_iters_single, 0,
-                   exec->get_stream()>>>(
-                    dim_size[0],
-                    as_device_type(next_krylov_basis->get_const_values()),
-                    stride_next_krylov, acc::as_hip_range(krylov_bases),
-                    as_device_type(buffer_iter->get_values()), stride_buffer,
-                    as_device_type(stop_status));
-        }
-        // for i in 1:iter
-        //     hessenberg(iter, i) = next_krylov_basis' * krylov_bases(:, i)
-        // end
-        update_next_krylov_and_add_kernel<default_block_size>
-            <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
-               default_block_size, 0, exec->get_stream()>>>(
-                iter + 1, dim_size[0], dim_size[1],
-                as_device_type(next_krylov_basis->get_values()),
-                stride_next_krylov, acc::as_hip_range(krylov_bases),
-                as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg,
-                as_device_type(buffer_iter->get_const_values()), stride_buffer,
-                as_device_type(stop_status), as_device_type(reorth_status));
-        // for i in 1:iter
-        //     next_krylov_basis  -= hessenberg(iter, i) * krylov_bases(:, i)
-        // end
-        components::fill_array(exec,
-                               arnoldi_norm->get_values() + stride_arnoldi,
-                               dim_size[1], zero<non_complex>());
-        if (use_scalar) {
-            components::fill_array(
-                exec, arnoldi_norm->get_values() + 2 * stride_arnoldi,
-                dim_size[1], zero<non_complex>());
-        }
-        multinorm2_inf_kernel<use_scalar>
-            <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                dim_size[0], dim_size[1],
-                as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov,
-                as_device_type(arnoldi_norm->get_values() + stride_arnoldi),
-                as_device_type(arnoldi_norm->get_values() + 2 * stride_arnoldi),
-                as_device_type(stop_status));
-        // nrmN = norm(next_krylov_basis)
-        components::fill_array(exec, num_reorth->get_data(), 1,
-                               zero<size_type>());
-        check_arnoldi_norms<default_block_size>
-            <<<ceildiv(dim_size[1], default_block_size), default_block_size, 0,
-               exec->get_stream()>>>(
-                dim_size[1], as_device_type(arnoldi_norm->get_values()),
-                stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg, iter + 1, acc::as_hip_range(krylov_bases),
-                as_device_type(stop_status), as_device_type(reorth_status),
-                num_reorth->get_data());
-        num_reorth_host = get_element(*num_reorth, 0);
-        // num_reorth_host := number of next_krylov vector to be
-        // reorthogonalization
-    }
-    update_krylov_next_krylov_kernel<default_block_size>
-        <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
-           default_block_size, 0, exec->get_stream()>>>(
-            iter, dim_size[0], dim_size[1],
-            as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
-            acc::as_hip_range(krylov_bases),
-            as_device_type(hessenberg_iter->get_const_values()),
-            stride_hessenberg, as_device_type(stop_status));
-    // next_krylov_basis /= hessenberg(iter, iter + 1)
-    // krylov_bases(:, iter + 1) = next_krylov_basis
-    // End of arnoldi
-}
-
-template <typename ValueType>
-void givens_rotation(std::shared_ptr<const DefaultExecutor> exec,
-                     matrix::Dense<ValueType>* givens_sin,
-                     matrix::Dense<ValueType>* givens_cos,
-                     matrix::Dense<ValueType>* hessenberg_iter,
-                     matrix::Dense<remove_complex<ValueType>>* residual_norm,
-                     matrix::Dense<ValueType>* residual_norm_collection,
-                     size_type iter, const array<stopping_status>* stop_status)
-{
-    // TODO: tune block_size for optimal performance
-    constexpr auto block_size = default_block_size;
-    const auto num_cols = hessenberg_iter->get_size()[1];
-    const auto block_dim = block_size;
-    const auto grid_dim =
-        static_cast<unsigned int>(ceildiv(num_cols, block_size));
-
-    givens_rotation_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            hessenberg_iter->get_size()[0], hessenberg_iter->get_size()[1],
-            iter, as_device_type(hessenberg_iter->get_values()),
-            hessenberg_iter->get_stride(),
-            as_device_type(givens_sin->get_values()), givens_sin->get_stride(),
-            as_device_type(givens_cos->get_values()), givens_cos->get_stride(),
-            as_device_type(residual_norm->get_values()),
-            as_device_type(residual_norm_collection->get_values()),
-            residual_norm_collection->get_stride(),
-            stop_status->get_const_data());
-}
-
-
-template <typename ValueType, typename Accessor3d>
-void arnoldi(std::shared_ptr<const DefaultExecutor> exec,
-             matrix::Dense<ValueType>* next_krylov_basis,
-             matrix::Dense<ValueType>* givens_sin,
-             matrix::Dense<ValueType>* givens_cos,
-             matrix::Dense<remove_complex<ValueType>>* residual_norm,
-             matrix::Dense<ValueType>* residual_norm_collection,
-             Accessor3d krylov_bases, matrix::Dense<ValueType>* hessenberg_iter,
-             matrix::Dense<ValueType>* buffer_iter,
-             matrix::Dense<remove_complex<ValueType>>* arnoldi_norm,
-             size_type iter, array<size_type>* final_iter_nums,
-             const array<stopping_status>* stop_status,
-             array<stopping_status>* reorth_status,
-             array<size_type>* num_reorth)
-{
-    increase_final_iteration_numbers_kernel<<<
-        static_cast<unsigned int>(
-            ceildiv(final_iter_nums->get_size(), default_block_size)),
-        default_block_size, 0, exec->get_stream()>>>(
-        as_device_type(final_iter_nums->get_data()),
-        stop_status->get_const_data(), final_iter_nums->get_size());
-    finish_arnoldi_CGS(exec, next_krylov_basis, krylov_bases, hessenberg_iter,
-                       buffer_iter, arnoldi_norm, iter,
-                       stop_status->get_const_data(), reorth_status->get_data(),
-                       num_reorth);
-    givens_rotation(exec, givens_sin, givens_cos, hessenberg_iter,
-                    residual_norm, residual_norm_collection, iter, stop_status);
-}
-
-GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE(GKO_DECLARE_CB_GMRES_ARNOLDI_KERNEL);
-
-
-template <typename ValueType>
-void solve_upper_triangular(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::Dense<ValueType>* residual_norm_collection,
-    const matrix::Dense<ValueType>* hessenberg, matrix::Dense<ValueType>* y,
-    const array<size_type>* final_iter_nums)
-{
-    // TODO: tune block_size for optimal performance
-    constexpr auto block_size = default_block_size;
-    const auto num_rhs = residual_norm_collection->get_size()[1];
-    const auto block_dim = block_size;
-    const auto grid_dim =
-        static_cast<unsigned int>(ceildiv(num_rhs, block_size));
-
-    solve_upper_triangular_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            hessenberg->get_size()[1], num_rhs,
-            as_device_type(residual_norm_collection->get_const_values()),
-            residual_norm_collection->get_stride(),
-            as_device_type(hessenberg->get_const_values()),
-            hessenberg->get_stride(), as_device_type(y->get_values()),
-            y->get_stride(), as_device_type(final_iter_nums->get_const_data()));
-}
-
-
-template <typename ValueType, typename ConstAccessor3d>
-void calculate_qy(std::shared_ptr<const DefaultExecutor> exec,
-                  ConstAccessor3d krylov_bases, size_type num_krylov_bases,
-                  const matrix::Dense<ValueType>* y,
-                  matrix::Dense<ValueType>* before_preconditioner,
-                  const array<size_type>* final_iter_nums)
-{
-    const auto num_rows = before_preconditioner->get_size()[0];
-    const auto num_cols = before_preconditioner->get_size()[1];
-    const auto stride_before_preconditioner =
-        before_preconditioner->get_stride();
-
-    constexpr auto block_size = default_block_size;
-    const auto grid_dim = static_cast<unsigned int>(
-        ceildiv(num_rows * stride_before_preconditioner, block_size));
-    const auto block_dim = block_size;
-
-    calculate_Qy_kernel<block_size>
-        <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            num_rows, num_cols, acc::as_hip_range(krylov_bases),
-            as_device_type(y->get_const_values()), y->get_stride(),
-            as_device_type(before_preconditioner->get_values()),
-            stride_before_preconditioner,
-            as_device_type(final_iter_nums->get_const_data()));
-    // Calculate qy
-    // before_preconditioner = krylov_bases * y
-}
-
-
-template <typename ValueType, typename ConstAccessor3d>
-void solve_krylov(std::shared_ptr<const DefaultExecutor> exec,
-                  const matrix::Dense<ValueType>* residual_norm_collection,
-                  ConstAccessor3d krylov_bases,
-                  const matrix::Dense<ValueType>* hessenberg,
-                  matrix::Dense<ValueType>* y,
-                  matrix::Dense<ValueType>* before_preconditioner,
-                  const array<size_type>* final_iter_nums)
-{
-    if (before_preconditioner->get_size()[1] == 0) {
-        return;
-    }
-    // since hessenberg has dims:  iters x iters * num_rhs
-    // krylov_bases has dims:  (iters + 1) x sysmtx[0] x num_rhs
-    const auto iters =
-        hessenberg->get_size()[1] / before_preconditioner->get_size()[1];
-    const auto num_krylov_bases = iters + 1;
-    solve_upper_triangular(exec, residual_norm_collection, hessenberg, y,
-                           final_iter_nums);
-    calculate_qy(exec, krylov_bases, num_krylov_bases, y, before_preconditioner,
-                 final_iter_nums);
-}
-
-GKO_INSTANTIATE_FOR_EACH_CB_GMRES_CONST_TYPE(
-    GKO_DECLARE_CB_GMRES_SOLVE_KRYLOV_KERNEL);
-
-
-}  // namespace cb_gmres
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp
deleted file mode 100644
index 049b05c5750..00000000000
--- a/hip/solver/idr_kernels.hip.cpp
+++ /dev/null
@@ -1,343 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/solver/idr_kernels.hpp"
-
-
-#include <ctime>
-#include <random>
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipblas_bindings.hip.hpp"
-#include "hip/base/hiprand_bindings.hip.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The IDR solver namespace.
- *
- * @ingroup idr
- */
-namespace idr {
-
-
-constexpr int default_block_size = 512;
-constexpr int default_dot_dim = 32;
-constexpr int default_dot_size = default_dot_dim * default_dot_dim;
-
-
-#include "common/cuda_hip/solver/idr_kernels.hpp.inc"
-
-
-namespace {
-
-
-template <typename ValueType>
-void initialize_m(std::shared_ptr<const DefaultExecutor> exec,
-                  const size_type nrhs, matrix::Dense<ValueType>* m,
-                  array<stopping_status>* stop_status)
-{
-    const auto subspace_dim = m->get_size()[0];
-    const auto m_stride = m->get_stride();
-
-    const auto grid_dim = ceildiv(m_stride * subspace_dim, default_block_size);
-    initialize_m_kernel<<<grid_dim, default_block_size, 0,
-                          exec->get_stream()>>>(
-        subspace_dim, nrhs, as_device_type(m->get_values()), m_stride,
-        as_device_type(stop_status->get_data()));
-}
-
-
-template <typename ValueType>
-void initialize_subspace_vectors(std::shared_ptr<const DefaultExecutor> exec,
-                                 matrix::Dense<ValueType>* subspace_vectors,
-                                 bool deterministic)
-{
-    if (!deterministic) {
-        auto gen = randlib::rand_generator(std::random_device{}(),
-                                           RANDLIB_RNG_PSEUDO_DEFAULT,
-                                           exec->get_stream());
-        randlib::rand_vector(
-            gen,
-            subspace_vectors->get_size()[0] * subspace_vectors->get_stride(),
-            0.0, 1.0, subspace_vectors->get_values());
-        randlib::destroy(gen);
-    }
-}
-
-
-template <typename ValueType>
-void orthonormalize_subspace_vectors(
-    std::shared_ptr<const DefaultExecutor> exec,
-    matrix::Dense<ValueType>* subspace_vectors)
-{
-    orthonormalize_subspace_vectors_kernel<default_block_size>
-        <<<1, default_block_size, 0, exec->get_stream()>>>(
-            subspace_vectors->get_size()[0], subspace_vectors->get_size()[1],
-            as_device_type(subspace_vectors->get_values()),
-            subspace_vectors->get_stride());
-}
-
-
-template <typename ValueType>
-void solve_lower_triangular(std::shared_ptr<const DefaultExecutor> exec,
-                            const size_type nrhs,
-                            const matrix::Dense<ValueType>* m,
-                            const matrix::Dense<ValueType>* f,
-                            matrix::Dense<ValueType>* c,
-                            const array<stopping_status>* stop_status)
-{
-    const auto subspace_dim = m->get_size()[0];
-
-    const auto grid_dim = ceildiv(nrhs, default_block_size);
-    solve_lower_triangular_kernel<<<grid_dim, default_block_size, 0,
-                                    exec->get_stream()>>>(
-        subspace_dim, nrhs, as_device_type(m->get_const_values()),
-        m->get_stride(), as_device_type(f->get_const_values()), f->get_stride(),
-        as_device_type(c->get_values()), c->get_stride(),
-        stop_status->get_const_data());
-}
-
-
-template <typename ValueType>
-void update_g_and_u(std::shared_ptr<const DefaultExecutor> exec,
-                    const size_type nrhs, const size_type k,
-                    const matrix::Dense<ValueType>* p,
-                    const matrix::Dense<ValueType>* m,
-                    matrix::Dense<ValueType>* alpha,
-                    matrix::Dense<ValueType>* g, matrix::Dense<ValueType>* g_k,
-                    matrix::Dense<ValueType>* u,
-                    const array<stopping_status>* stop_status)
-{
-    if (nrhs == 0) {
-        return;
-    }
-    const auto size = g->get_size()[0];
-    const auto p_stride = p->get_stride();
-
-    const dim3 grid_dim(ceildiv(nrhs, default_dot_dim),
-                        exec->get_num_multiprocessor() * 2);
-    const dim3 block_dim(default_dot_dim, default_dot_dim);
-
-    for (size_type i = 0; i < k; i++) {
-        const auto p_i = p->get_const_values() + i * p_stride;
-        if (nrhs > 1 || is_complex<ValueType>()) {
-            components::fill_array(exec, alpha->get_values(), nrhs,
-                                   zero<ValueType>());
-            multidot_kernel<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-                size, nrhs, as_device_type(p_i),
-                as_device_type(g_k->get_values()), g_k->get_stride(),
-                as_device_type(alpha->get_values()),
-                stop_status->get_const_data());
-        } else {
-            blas::dot(exec->get_blas_handle(), size, p_i, 1, g_k->get_values(),
-                      g_k->get_stride(), alpha->get_values());
-        }
-        update_g_k_and_u_kernel<default_block_size>
-            <<<ceildiv(size * g_k->get_stride(), default_block_size),
-               default_block_size, 0, exec->get_stream()>>>(
-                k, i, size, nrhs, as_device_type(alpha->get_const_values()),
-                as_device_type(m->get_const_values()), m->get_stride(),
-                as_device_type(g->get_const_values()), g->get_stride(),
-                as_device_type(g_k->get_values()), g_k->get_stride(),
-                as_device_type(u->get_values()), u->get_stride(),
-                stop_status->get_const_data());
-    }
-    update_g_kernel<default_block_size>
-        <<<ceildiv(size * g_k->get_stride(), default_block_size),
-           default_block_size, 0, exec->get_stream()>>>(
-            k, size, nrhs, as_device_type(g_k->get_const_values()),
-            g_k->get_stride(), as_device_type(g->get_values()), g->get_stride(),
-            stop_status->get_const_data());
-}
-
-
-template <typename ValueType>
-void update_m(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
-              const size_type k, const matrix::Dense<ValueType>* p,
-              const matrix::Dense<ValueType>* g_k, matrix::Dense<ValueType>* m,
-              const array<stopping_status>* stop_status)
-{
-    if (nrhs == 0) {
-        return;
-    }
-    const auto size = g_k->get_size()[0];
-    const auto subspace_dim = m->get_size()[0];
-    const auto p_stride = p->get_stride();
-    const auto m_stride = m->get_stride();
-
-    const dim3 grid_dim(ceildiv(nrhs, default_dot_dim),
-                        exec->get_num_multiprocessor() * 2);
-    const dim3 block_dim(default_dot_dim, default_dot_dim);
-
-    for (size_type i = k; i < subspace_dim; i++) {
-        const auto p_i = p->get_const_values() + i * p_stride;
-        auto m_i = m->get_values() + i * m_stride + k * nrhs;
-        if (nrhs > 1 || is_complex<ValueType>()) {
-            components::fill_array(exec, m_i, nrhs, zero<ValueType>());
-            multidot_kernel<<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-                size, nrhs, as_device_type(p_i),
-                as_device_type(g_k->get_const_values()), g_k->get_stride(),
-                as_device_type(m_i), stop_status->get_const_data());
-        } else {
-            blas::dot(exec->get_blas_handle(), size, p_i, 1,
-                      g_k->get_const_values(), g_k->get_stride(), m_i);
-        }
-    }
-}
-
-
-template <typename ValueType>
-void update_x_r_and_f(std::shared_ptr<const DefaultExecutor> exec,
-                      const size_type nrhs, const size_type k,
-                      const matrix::Dense<ValueType>* m,
-                      const matrix::Dense<ValueType>* g,
-                      const matrix::Dense<ValueType>* u,
-                      matrix::Dense<ValueType>* f, matrix::Dense<ValueType>* r,
-                      matrix::Dense<ValueType>* x,
-                      const array<stopping_status>* stop_status)
-{
-    const auto size = x->get_size()[0];
-    const auto subspace_dim = m->get_size()[0];
-
-    const auto grid_dim = ceildiv(size * x->get_stride(), default_block_size);
-    update_x_r_and_f_kernel<<<grid_dim, default_block_size, 0,
-                              exec->get_stream()>>>(
-        k, size, subspace_dim, nrhs, as_device_type(m->get_const_values()),
-        m->get_stride(), as_device_type(g->get_const_values()), g->get_stride(),
-        as_device_type(u->get_const_values()), u->get_stride(),
-        as_device_type(f->get_values()), f->get_stride(),
-        as_device_type(r->get_values()), r->get_stride(),
-        as_device_type(x->get_values()), x->get_stride(),
-        stop_status->get_const_data());
-    components::fill_array(exec, f->get_values() + k * f->get_stride(), nrhs,
-                           zero<ValueType>());
-}
-
-
-}  // namespace
-
-
-template <typename ValueType>
-void initialize(std::shared_ptr<const DefaultExecutor> exec,
-                const size_type nrhs, matrix::Dense<ValueType>* m,
-                matrix::Dense<ValueType>* subspace_vectors, bool deterministic,
-                array<stopping_status>* stop_status)
-{
-    initialize_m(exec, nrhs, m, stop_status);
-    initialize_subspace_vectors(exec, subspace_vectors, deterministic);
-    orthonormalize_subspace_vectors(exec, subspace_vectors);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_INITIALIZE_KERNEL);
-
-
-template <typename ValueType>
-void step_1(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
-            const size_type k, const matrix::Dense<ValueType>* m,
-            const matrix::Dense<ValueType>* f,
-            const matrix::Dense<ValueType>* residual,
-            const matrix::Dense<ValueType>* g, matrix::Dense<ValueType>* c,
-            matrix::Dense<ValueType>* v,
-            const array<stopping_status>* stop_status)
-{
-    solve_lower_triangular(exec, nrhs, m, f, c, stop_status);
-
-    const auto num_rows = v->get_size()[0];
-    const auto subspace_dim = m->get_size()[0];
-
-    const auto grid_dim = ceildiv(nrhs * num_rows, default_block_size);
-    step_1_kernel<<<grid_dim, default_block_size, 0, exec->get_stream()>>>(
-        k, num_rows, subspace_dim, nrhs,
-        as_device_type(residual->get_const_values()), residual->get_stride(),
-        as_device_type(c->get_const_values()), c->get_stride(),
-        as_device_type(g->get_const_values()), g->get_stride(),
-        as_device_type(v->get_values()), v->get_stride(),
-        stop_status->get_const_data());
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_1_KERNEL);
-
-
-template <typename ValueType>
-void step_2(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
-            const size_type k, const matrix::Dense<ValueType>* omega,
-            const matrix::Dense<ValueType>* preconditioned_vector,
-            const matrix::Dense<ValueType>* c, matrix::Dense<ValueType>* u,
-            const array<stopping_status>* stop_status)
-{
-    if (nrhs == 0) {
-        return;
-    }
-    const auto num_rows = preconditioned_vector->get_size()[0];
-    const auto subspace_dim = u->get_size()[1] / nrhs;
-
-    const auto grid_dim = ceildiv(nrhs * num_rows, default_block_size);
-    step_2_kernel<<<grid_dim, default_block_size, 0, exec->get_stream()>>>(
-        k, num_rows, subspace_dim, nrhs,
-        as_device_type(omega->get_const_values()),
-        as_device_type(preconditioned_vector->get_const_values()),
-        preconditioned_vector->get_stride(),
-        as_device_type(c->get_const_values()), c->get_stride(),
-        as_device_type(u->get_values()), u->get_stride(),
-        stop_status->get_const_data());
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_2_KERNEL);
-
-
-template <typename ValueType>
-void step_3(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
-            const size_type k, const matrix::Dense<ValueType>* p,
-            matrix::Dense<ValueType>* g, matrix::Dense<ValueType>* g_k,
-            matrix::Dense<ValueType>* u, matrix::Dense<ValueType>* m,
-            matrix::Dense<ValueType>* f, matrix::Dense<ValueType>* alpha,
-            matrix::Dense<ValueType>* residual, matrix::Dense<ValueType>* x,
-            const array<stopping_status>* stop_status)
-{
-    update_g_and_u(exec, nrhs, k, p, m, alpha, g, g_k, u, stop_status);
-    update_m(exec, nrhs, k, p, g_k, m, stop_status);
-    update_x_r_and_f(exec, nrhs, k, m, g, u, f, residual, x, stop_status);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_STEP_3_KERNEL);
-
-
-template <typename ValueType>
-void compute_omega(
-    std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
-    const remove_complex<ValueType> kappa, const matrix::Dense<ValueType>* tht,
-    const matrix::Dense<remove_complex<ValueType>>* residual_norm,
-    matrix::Dense<ValueType>* omega, const array<stopping_status>* stop_status)
-{
-    const auto grid_dim = ceildiv(nrhs, config::warp_size);
-    compute_omega_kernel<<<grid_dim, config::warp_size, 0,
-                           exec->get_stream()>>>(
-        nrhs, as_device_type(kappa), as_device_type(tht->get_const_values()),
-        as_device_type(residual_norm->get_const_values()),
-        as_device_type(omega->get_values()), stop_status->get_const_data());
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_IDR_COMPUTE_OMEGA_KERNEL);
-
-
-}  // namespace idr
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/solver/multigrid_kernels.hip.cpp b/hip/solver/multigrid_kernels.hip.cpp
deleted file mode 100644
index d09bb2d0a21..00000000000
--- a/hip/solver/multigrid_kernels.hip.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/solver/multigrid_kernels.hpp"
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/types.hpp>
-
-
-#include "common/cuda_hip/base/runtime.hpp"
-#include "core/base/array_access.hpp"
-#include "core/components/fill_array_kernels.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-/**
- * @brief The MULTIGRID solver namespace.
- *
- * @ingroup multigrid
- */
-namespace multigrid {
-
-
-constexpr int default_block_size = 512;
-
-
-#include "common/cuda_hip/solver/multigrid_kernels.hpp.inc"
-
-
-}  // namespace multigrid
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/stop/batch_criteria.hip.hpp b/hip/stop/batch_criteria.hip.hpp
deleted file mode 100644
index 1f721e36aaf..00000000000
--- a/hip/stop/batch_criteria.hip.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#ifndef GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_
-#define GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_
-
-
-#include <ginkgo/core/base/math.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace hip {
-namespace batch_stop {
-
-
-#include "common/cuda_hip/stop/batch_criteria.hpp.inc"
-
-
-}  // namespace batch_stop
-}  // namespace hip
-}  // namespace kernels
-}  // namespace gko
-
-#endif  // GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_