From 88702f5d253a69cb1d59aad703e88246c4a82727 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Sun, 19 May 2024 22:12:45 +0200
Subject: [PATCH] fix compilation issues

---
 common/CMakeLists.txt                         |   1 -
 common/cuda_hip/CMakeLists.txt                |  60 ----
 .../base/batch_multi_vector_kernels.hpp.inc   | 301 ++++++++++++++++++
 common/cuda_hip/base/pointer_mode_guard.hpp   |   5 +-
 common/cuda_hip/base/randlib_bindings.hpp     |   2 +-
 common/cuda_hip/base/sparselib_bindings.hpp   |   2 +-
 .../base/sparselib_block_bindings.hpp         |  16 +
 .../cuda_hip/components/format_conversion.hpp |   2 +-
 common/cuda_hip/components/memory.hpp         |   2 +-
 ...ct_kernels.cpp => par_ict_kernels.hpp.inc} | 184 -----------
 ...ls.cpp => par_ilut_filter_kernels.hpp.inc} | 135 --------
 ...ls.cpp => par_ilut_select_kernels.hpp.inc} | 157 ---------
 ...ls.cpp => par_ilut_spgeam_kernels.hpp.inc} | 154 ---------
 .../par_ilut_sweep_kernels.hpp.inc            |  94 ++++++
 ...{batch_logger.hpp => batch_logger.hpp.inc} |  22 --
 ..._kernels.cpp => batch_csr_kernels.hpp.inc} |  54 ----
 ...ernels.cpp => batch_dense_kernels.hpp.inc} |  55 ----
 ..._kernels.cpp => batch_ell_kernels.hpp.inc} |  54 ----
 .../matrix/fbcsr_kernels.template.cpp         |   3 +-
 ...obi_kernels.cpp => jacobi_kernels.hpp.inc} |  48 ---
 common/cuda_hip/sources.cmake                 |  31 ++
 ...ch_criteria.hpp => batch_criteria.hpp.inc} |  21 --
 common/unified/base/kernel_launch.hpp         |   4 +-
 .../unified/base/kernel_launch_reduction.hpp  |   4 +-
 common/unified/base/kernel_launch_solver.hpp  |   4 +-
 cuda/CMakeLists.txt                           |  11 +
 cuda/base/batch_multi_vector_kernels.cu       |  59 ++++
 cuda/factorization/par_ict_kernels.cu         | 189 +++++++++++
 .../par_ilut_approx_filter_kernels.cu         |  12 +-
 cuda/factorization/par_ilut_filter_kernels.cu | 140 ++++++++
 cuda/factorization/par_ilut_select_common.cu  |  14 +-
 cuda/factorization/par_ilut_select_kernels.cu | 162 ++++++++++
 cuda/factorization/par_ilut_spgeam_kernels.cu | 159 +++++++++
 .../factorization/par_ilut_sweep_kernels.cu   |  97 +-----
 cuda/log/batch_logger.cuh                     |  27 ++
 cuda/matrix/batch_csr_kernels.cu              |  58 ++++
 cuda/matrix/batch_dense_kernels.cu            |  59 ++++
 cuda/matrix/batch_ell_kernels.cu              |  58 ++++
 cuda/preconditioner/batch_preconditioners.cuh |   2 +-
 ...cobi_advanced_apply_kernels.instantiate.cu |   8 +-
 .../jacobi_generate_kernels.instantiate.cu    |  12 +-
 cuda/preconditioner/jacobi_kernels.cu         |  51 +++
 ...jacobi_simple_apply_kernels.instantiate.cu |   8 +-
 cuda/solver/batch_bicgstab_kernels.cu         |   7 +-
 cuda/solver/batch_cg_kernels.cu               |   7 +-
 cuda/stop/batch_criteria.cuh                  |  26 ++
 hip/CMakeLists.txt                            |  19 +-
 hip/base/batch_multi_vector_kernels.hip.cpp   |  59 ++++
 hip/base/config.hip.hpp                       |   2 +-
 hip/base/hipblas_bindings.hip.hpp             |   2 +-
 hip/base/hiprand_bindings.hip.hpp             |   2 +-
 hip/components/format_conversion.hip.hpp      |   2 +-
 hip/factorization/par_ict_kernels.hip.cpp     | 189 +++++++++++
 .../par_ilut_approx_filter_kernels.hip.cpp    |  12 +-
 .../par_ilut_filter_kernels.hip.cpp           | 140 ++++++++
 .../par_ilut_select_common.hip.cpp            |  14 +-
 .../par_ilut_select_kernels.hip.cpp           | 162 ++++++++++
 .../par_ilut_spgeam_kernels.hip.cpp           | 159 +++++++++
 .../par_ilut_sweep_kernels.hip.cpp            | 123 +++++++
 hip/log/batch_logger.hip.hpp                  |  26 ++
 hip/matrix/batch_csr_kernels.hip.cpp          |  58 ++++
 hip/matrix/batch_dense_kernels.hip.cpp        |  59 ++++
 hip/matrix/batch_ell_kernels.hip.cpp          |  58 ++++
 hip/matrix/csr_kernels.template.hip.cpp       |  18 +-
 .../batch_preconditioners.hip.hpp             |   2 +-
 ...obi_advanced_apply_instantiate.inc.hip.cpp |   8 +-
 .../jacobi_generate_instantiate.inc.hip.cpp   |  12 +-
 .../jacobi_generate_kernel.hip.cpp            |  12 +-
 hip/preconditioner/jacobi_kernels.hip.cpp     |  51 +++
 ...acobi_simple_apply_instantiate.inc.hip.cpp |   8 +-
 .../jacobi_simple_apply_kernel.hip.cpp        |   8 +-
 hip/solver/batch_bicgstab_kernels.hip.cpp     |   9 +-
 hip/solver/batch_cg_kernels.hip.cpp           |   9 +-
 hip/solver/common_trs_kernels.hip.hpp         |   2 +-
 hip/solver/lower_trs_kernels.hip.cpp          |   2 +-
 hip/solver/upper_trs_kernels.hip.cpp          |   2 +-
 hip/stop/batch_criteria.hip.hpp               |  26 ++
 hip/stop/criterion_kernels.hip.cpp            |   4 +-
 hip/stop/residual_norm_kernels.hip.cpp        |   4 +-
 hip/test/base/math.hip.cpp                    |   2 +-
 hip/test/components/merging.hip.cpp           |   4 +-
 hip/test/components/searching.hip.cpp         |   4 +-
 hip/test/components/sorting.hip.cpp           |   4 +-
 83 files changed, 2681 insertions(+), 1177 deletions(-)
 delete mode 100644 common/cuda_hip/CMakeLists.txt
 create mode 100644 common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
 create mode 100644 common/cuda_hip/base/sparselib_block_bindings.hpp
 rename common/cuda_hip/factorization/{par_ict_kernels.cpp => par_ict_kernels.hpp.inc} (62%)
 rename common/cuda_hip/factorization/{par_ilut_filter_kernels.cpp => par_ilut_filter_kernels.hpp.inc} (57%)
 rename common/cuda_hip/factorization/{par_ilut_select_kernels.cpp => par_ilut_select_kernels.hpp.inc} (63%)
 rename common/cuda_hip/factorization/{par_ilut_spgeam_kernels.cpp => par_ilut_spgeam_kernels.hpp.inc} (63%)
 create mode 100644 common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc
 rename common/cuda_hip/log/{batch_logger.hpp => batch_logger.hpp.inc} (68%)
 rename common/cuda_hip/matrix/{batch_csr_kernels.cpp => batch_csr_kernels.hpp.inc} (87%)
 rename common/cuda_hip/matrix/{batch_dense_kernels.cpp => batch_dense_kernels.hpp.inc} (89%)
 rename common/cuda_hip/matrix/{batch_ell_kernels.cpp => batch_ell_kernels.hpp.inc} (87%)
 rename common/cuda_hip/preconditioner/{jacobi_kernels.cpp => jacobi_kernels.hpp.inc} (91%)
 create mode 100644 common/cuda_hip/sources.cmake
 rename common/cuda_hip/stop/{batch_criteria.hpp => batch_criteria.hpp.inc} (75%)
 create mode 100644 cuda/base/batch_multi_vector_kernels.cu
 create mode 100644 cuda/factorization/par_ict_kernels.cu
 create mode 100644 cuda/factorization/par_ilut_filter_kernels.cu
 create mode 100644 cuda/factorization/par_ilut_select_kernels.cu
 create mode 100644 cuda/factorization/par_ilut_spgeam_kernels.cu
 rename common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp => cuda/factorization/par_ilut_sweep_kernels.cu (53%)
 create mode 100644 cuda/log/batch_logger.cuh
 create mode 100644 cuda/matrix/batch_csr_kernels.cu
 create mode 100644 cuda/matrix/batch_dense_kernels.cu
 create mode 100644 cuda/matrix/batch_ell_kernels.cu
 create mode 100644 cuda/preconditioner/jacobi_kernels.cu
 create mode 100644 cuda/stop/batch_criteria.cuh
 create mode 100644 hip/base/batch_multi_vector_kernels.hip.cpp
 create mode 100644 hip/factorization/par_ict_kernels.hip.cpp
 create mode 100644 hip/factorization/par_ilut_filter_kernels.hip.cpp
 create mode 100644 hip/factorization/par_ilut_select_kernels.hip.cpp
 create mode 100644 hip/factorization/par_ilut_spgeam_kernels.hip.cpp
 create mode 100644 hip/factorization/par_ilut_sweep_kernels.hip.cpp
 create mode 100644 hip/log/batch_logger.hip.hpp
 create mode 100644 hip/matrix/batch_csr_kernels.hip.cpp
 create mode 100644 hip/matrix/batch_dense_kernels.hip.cpp
 create mode 100644 hip/matrix/batch_ell_kernels.hip.cpp
 create mode 100644 hip/preconditioner/jacobi_kernels.hip.cpp
 create mode 100644 hip/stop/batch_criteria.hip.hpp

diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index e7c665640b3..77bdd7230b9 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -1,3 +1,2 @@
 add_subdirectory(unified)
 set(GKO_UNIFIED_COMMON_SOURCES ${GKO_UNIFIED_COMMON_SOURCES} PARENT_SCOPE)
-set(GKO_CUDA_HIP_COMMON_SOURCES ${GKO_CUDA_HIP_COMMON_SOURCES} PARENT_SCOPE)
diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
deleted file mode 100644
index 8014d5dad6d..00000000000
--- a/common/cuda_hip/CMakeLists.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
-add_instantiation_files(${CMAKE_CURRENT_SOURCE_DIR} matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE)
-set(CUDA_HIP_SOURCES
-    base/batch_multi_vector_kernels.cpp
-    base/device_matrix_data_kernels.cpp
-    base/kernel_launch.hpp
-    base/kernel_launch_reduction.hpp
-    base/kernel_launch_solver.hpp
-    base/math.hpp
-    components/atomic.hpp
-    components/diagonal_block_manipulation.hpp
-    components/intrinsics.hpp
-    components/merging.hpp
-    components/prefix_sum.hpp
-    components/prefix_sum_kernels.cpp
-    components/reduction.hpp
-    components/searching.hpp
-    components/segment_scan.hpp
-    components/sorting.hpp
-    components/syncfree.hpp
-    components/thread_ids.hpp
-    components/uninitialized_array.hpp
-    components/warp_blas.hpp
-    distributed/index_map_kernels.cpp
-    distributed/matrix_kernels.cpp
-    distributed/partition_helpers_kernels.cpp
-    distributed/partition_kernels.cpp
-    distributed/vector_kernels.cpp
-    factorization/cholesky_kernels.cpp
-    factorization/factorization_kernels.cpp
-    factorization/lu_kernels.cpp
-    factorization/par_ic_kernels.cpp
-    factorization/par_ict_kernels.cpp
-    factorization/par_ilu_kernels.cpp
-    factorization/par_ilut_filter_kernels.cpp
-    factorization/par_ilut_select_kernels.cpp
-    factorization/par_ilut_spgeam_kernels.cpp
-    factorization/par_ilut_sweep_kernels.cpp
-    log/batch_logger.hpp
-    matrix/batch_csr_kernels.cpp
-    matrix/batch_dense_kernels.cpp
-    matrix/batch_ell_kernels.cpp
-    matrix/coo_kernels.cpp
-    matrix/dense_kernels.cpp
-    matrix/diagonal_kernels.cpp
-    matrix/ell_kernels.cpp
-    matrix/sellp_kernels.cpp
-    matrix/sparsity_csr_kernels.cpp
-    multigrid/pgm_kernels.cpp
-    preconditioner/isai_kernels.cpp
-    preconditioner/jacobi_kernels.cpp
-    reorder/rcm_kernels.cpp
-    solver/cb_gmres_kernels.cpp
-    solver/idr_kernels.cpp
-    solver/multigrid_kernels.cpp
-    stop/batch_criteria.hpp
-    )
-list(TRANSFORM CUDA_HIP_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/)
-list(APPEND CUDA_HIP_SOURCES ${FBCSR_INSTANTIATE})
-set(GKO_CUDA_HIP_COMMON_SOURCES ${CUDA_HIP_SOURCES} PARENT_SCOPE)
\ No newline at end of file
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
new file mode 100644
index 00000000000..9b6301674be
--- /dev/null
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
@@ -0,0 +1,301 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+template <typename ValueType, typename Mapping>
+__device__ __forceinline__ void scale(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+    const gko::batch::multi_vector::batch_item<ValueType>& x, Mapping map)
+{
+    const int max_li = x.num_rows * x.num_rhs;
+    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
+        const int row = li / x.num_rhs;
+        const int col = li % x.num_rhs;
+
+        x.values[row * x.stride + col] =
+            alpha.values[map(row, col, alpha.stride)] *
+            x.values[row * x.stride + col];
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__global__
+__launch_bounds__(default_block_size, sm_oversubscription) void scale_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x, Mapping map)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        scale(alpha_b, x_b, map);
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__device__ __forceinline__ void add_scaled(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<ValueType>& y, Mapping map)
+{
+    const int max_li = x.num_rows * x.num_rhs;
+    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
+        const int row = li / x.num_rhs;
+        const int col = li % x.num_rhs;
+
+        y.values[row * y.stride + col] +=
+            alpha.values[map(col)] * x.values[row * x.stride + col];
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__global__ __launch_bounds__(
+    default_block_size,
+    sm_oversubscription) void add_scaled_kernel(const gko::batch::multi_vector::
+                                                    uniform_batch<
+                                                        const ValueType>
+                                                        alpha,
+                                                const gko::batch::multi_vector::
+                                                    uniform_batch<
+                                                        const ValueType>
+                                                        x,
+                                                const gko::batch::multi_vector::
+                                                    uniform_batch<ValueType>
+                                                        y,
+                                                Mapping map)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
+        add_scaled(alpha_b, x_b, y_b, map);
+    }
+}
+
+
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void single_rhs_compute_conj_dot(Group subgroup,
+                                                            const int num_rows,
+                                                            const ValueType* x,
+                                                            const ValueType* y,
+                                                            ValueType& result)
+
+{
+    ValueType val = zero<ValueType>();
+    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
+        val += conj(x[r]) * y[r];
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<ValueType>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result = val;
+    }
+}
+
+
+template <typename Group, typename ValueType, typename Mapping>
+__device__ __forceinline__ void gen_one_dot(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<const ValueType>& y,
+    const int rhs_index,
+    const gko::batch::multi_vector::batch_item<ValueType>& result,
+    Group subgroup, Mapping conj_map)
+{
+    ValueType val = zero<ValueType>();
+
+    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
+        val += conj_map(x.values[r * x.stride + rhs_index]) *
+               y.values[r * y.stride + rhs_index];
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<ValueType>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result.values[rhs_index] = val;
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__device__ __forceinline__ void compute_gen_dot_product(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<const ValueType>& y,
+    const gko::batch::multi_vector::batch_item<ValueType>& result,
+    Mapping conj_map)
+{
+    constexpr auto tile_size = config::warp_size;
+    auto thread_block = group::this_thread_block();
+    auto subgroup = group::tiled_partition<tile_size>(thread_block);
+    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
+    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
+
+    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
+         rhs_index += num_subgroups_per_block) {
+        gen_one_dot(x, y, rhs_index, result, subgroup, conj_map);
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__global__
+__launch_bounds__(default_block_size, sm_oversubscription) void compute_gen_dot_product_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> y,
+    const gko::batch::multi_vector::uniform_batch<ValueType> result,
+    Mapping map)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
+        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
+        compute_gen_dot_product(x_b, y_b, r_b, map);
+    }
+}
+
+
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void single_rhs_compute_norm2(
+    Group subgroup, const int num_rows, const ValueType* x,
+    remove_complex<ValueType>& result)
+{
+    using real_type = typename gko::remove_complex<ValueType>;
+    real_type val = zero<real_type>();
+
+    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
+        val += squared_norm(x[r]);
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result = sqrt(val);
+    }
+}
+
+
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void one_norm2(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const int rhs_index,
+    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
+        result,
+    Group subgroup)
+{
+    using real_type = typename gko::remove_complex<ValueType>;
+    real_type val = zero<real_type>();
+
+    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
+        val += squared_norm(x.values[r * x.stride + rhs_index]);
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result.values[rhs_index] = sqrt(val);
+    }
+}
+
+
+/**
+ * Computes the 2-norms of some column vectors in global or shared memory.
+ *
+ * @param x  A row-major multivector with nrhs columns.
+ * @param result  Holds norm value for each vector in x.
+ */
+template <typename ValueType>
+__device__ __forceinline__ void compute_norm2(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
+        result)
+{
+    constexpr auto tile_size = config::warp_size;
+    auto thread_block = group::this_thread_block();
+    auto subgroup = group::tiled_partition<tile_size>(thread_block);
+    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
+    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
+
+    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
+         rhs_index += num_subgroups_per_block) {
+        one_norm2(x, rhs_index, result, subgroup);
+    }
+}
+
+
+template <typename ValueType>
+__global__ __launch_bounds__(
+    default_block_size,
+    sm_oversubscription) void compute_norm2_kernel(const gko::batch::
+                                                       multi_vector::
+                                                           uniform_batch<
+                                                               const ValueType>
+                                                               x,
+                                                   const gko::batch::
+                                                       multi_vector::
+                                                           uniform_batch<
+                                                               remove_complex<
+                                                                   ValueType>>
+                                                               result)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
+        compute_norm2(x_b, r_b);
+    }
+}
+
+
+template <typename ValueType>
+__device__ __forceinline__ void single_rhs_copy(const int num_rows,
+                                                const ValueType* in,
+                                                ValueType* out)
+{
+    for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) {
+        out[iz] = in[iz];
+    }
+}
+
+
+/**
+ * Copies the values of one multi-vector into another.
+ *
+ * Note that the output multi-vector should already have memory allocated
+ * and stride set.
+ */
+template <typename ValueType>
+__device__ __forceinline__ void copy(
+    const gko::batch::multi_vector::batch_item<const ValueType>& in,
+    const gko::batch::multi_vector::batch_item<ValueType>& out)
+{
+    for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs;
+         iz += blockDim.x) {
+        const int i = iz / in.num_rhs;
+        const int j = iz % in.num_rhs;
+        out.values[i * out.stride + j] = in.values[i * in.stride + j];
+    }
+}
+
+
+template <typename ValueType>
+__global__
+__launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> src,
+    const gko::batch::multi_vector::uniform_batch<ValueType> dst)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto dst_b = gko::batch::extract_batch_item(dst, batch_id);
+        const auto src_b = gko::batch::extract_batch_item(src, batch_id);
+        copy(src_b, dst_b);
+    }
+}
diff --git a/common/cuda_hip/base/pointer_mode_guard.hpp b/common/cuda_hip/base/pointer_mode_guard.hpp
index 382fd85ca76..41ff6242e49 100644
--- a/common/cuda_hip/base/pointer_mode_guard.hpp
+++ b/common/cuda_hip/base/pointer_mode_guard.hpp
@@ -6,13 +6,10 @@
 #define GKO_COMMON_CUDA_HIP_BASE_POINTER_MODE_GUARD_HPP_
 
 
-#include "common/unified/base/config.hpp"
-
-
 #ifdef GKO_COMPILING_HIP
 #include "hip/base/pointer_mode_guard.hip.hpp"
 #else  // GKO_COMPILING_CUDA
-#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "cuda/base/pointer_mode_guard.hpp"
 #endif
 
 
diff --git a/common/cuda_hip/base/randlib_bindings.hpp b/common/cuda_hip/base/randlib_bindings.hpp
index 6207723b806..05a616a40f3 100644
--- a/common/cuda_hip/base/randlib_bindings.hpp
+++ b/common/cuda_hip/base/randlib_bindings.hpp
@@ -11,7 +11,7 @@
 
 #define RANDLIB_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT
 #else  // GKO_COMPILING_CUDA
-#include "common/cuda_hip/base/curand_bindings.hpp"
+#include "cuda/base/curand_bindings.hpp"
 
 #define RANDLIB_RNG_PSEUDO_DEFAULT CURAND_RNG_PSEUDO_DEFAULT
 #endif
diff --git a/common/cuda_hip/base/sparselib_bindings.hpp b/common/cuda_hip/base/sparselib_bindings.hpp
index b2d6cf2d398..bc565f9190a 100644
--- a/common/cuda_hip/base/sparselib_bindings.hpp
+++ b/common/cuda_hip/base/sparselib_bindings.hpp
@@ -9,7 +9,7 @@
 #ifdef GKO_COMPILING_HIP
 #include "hip/base/hipsparse_bindings.hip.hpp"
 #else  // GKO_COMPILING_CUDA
-#include "common/cuda_hip/base/cusparse_bindings.hpp"
+#include "cuda/base/cusparse_bindings.hpp"
 #endif
 
 
diff --git a/common/cuda_hip/base/sparselib_block_bindings.hpp b/common/cuda_hip/base/sparselib_block_bindings.hpp
new file mode 100644
index 00000000000..3b0f03444d7
--- /dev/null
+++ b/common/cuda_hip/base/sparselib_block_bindings.hpp
@@ -0,0 +1,16 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BLOCK_BINDINGS_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BLOCK_BINDINGS_HPP_
+
+
+#ifdef GKO_COMPILING_HIP
+#include "hip/base/hipsparse_block_bindings.hip.hpp"
+#else  // GKO_COMPILING_CUDA
+#include "cuda/base/cusparse_block_bindings.hpp"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BLOCK_BINDINGS_HPP_
diff --git a/common/cuda_hip/components/format_conversion.hpp b/common/cuda_hip/components/format_conversion.hpp
index f0a78a68d45..a16d09b2e3a 100644
--- a/common/cuda_hip/components/format_conversion.hpp
+++ b/common/cuda_hip/components/format_conversion.hpp
@@ -9,7 +9,7 @@
 #ifdef GKO_COMPILING_HIP
 #include "hip/components/format_conversion.hip.hpp"
 #else  // GKO_COMPILING_CUDA
-#include "common/cuda_hip/components/format_conversion.hpp"
+#include "cuda/components/format_conversion.cuh"
 #endif
 
 
diff --git a/common/cuda_hip/components/memory.hpp b/common/cuda_hip/components/memory.hpp
index d1d855f461d..974431e2fb8 100644
--- a/common/cuda_hip/components/memory.hpp
+++ b/common/cuda_hip/components/memory.hpp
@@ -9,7 +9,7 @@
 #ifdef GKO_COMPILING_HIP
 #include "hip/components/memory.hip.hpp"
 #else  // GKO_COMPILING_CUDA
-#include "common/cuda_hip/components/memory.hpp"
+#include "cuda/components/memory.cuh"
 #endif
 
 
diff --git a/common/cuda_hip/factorization/par_ict_kernels.cpp b/common/cuda_hip/factorization/par_ict_kernels.hpp.inc
similarity index 62%
rename from common/cuda_hip/factorization/par_ict_kernels.cpp
rename to common/cuda_hip/factorization/par_ict_kernels.hpp.inc
index 523f89082af..87aa8297345 100644
--- a/common/cuda_hip/factorization/par_ict_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ict_kernels.hpp.inc
@@ -2,51 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ict_kernels.hpp"
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/intrinsics.hpp"
-#include "common/cuda_hip/components/memory.hpp"
-#include "common/cuda_hip/components/merging.hpp"
-#include "common/cuda_hip/components/prefix_sum.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/searching.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/coo_builder.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-/**
- * @brief The parallel ICT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ict_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-// subwarp sizes for all warp-parallel kernels (filter, add_candidates)
-using compiled_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
 namespace kernel {
 
 
@@ -320,142 +275,3 @@ __global__ __launch_bounds__(default_block_size) void ict_sweep(
 
 
 }  // namespace kernel
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void add_candidates(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* llh,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    matrix::Csr<ValueType, IndexType>* l_new)
-{
-    auto num_rows = static_cast<IndexType>(llh->get_size()[0]);
-    auto subwarps_per_block = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
-    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
-    auto llh_row_ptrs = llh->get_const_row_ptrs();
-    auto llh_col_idxs = llh->get_const_col_idxs();
-    auto llh_vals = llh->get_const_values();
-    auto a_row_ptrs = a->get_const_row_ptrs();
-    auto a_col_idxs = a->get_const_col_idxs();
-    auto a_vals = a->get_const_values();
-    auto l_row_ptrs = l->get_const_row_ptrs();
-    auto l_col_idxs = l->get_const_col_idxs();
-    auto l_vals = l->get_const_values();
-    auto l_new_row_ptrs = l_new->get_row_ptrs();
-    // count non-zeros per row
-    if (num_blocks > 0) {
-        kernel::ict_tri_spgeam_nnz<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs,
-                l_new_row_ptrs, num_rows);
-    }
-
-    // build row ptrs
-    components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);
-
-    // resize output arrays
-    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
-    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
-    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
-
-    auto l_new_col_idxs = l_new->get_col_idxs();
-    auto l_new_vals = l_new->get_values();
-
-    // fill columns and values
-    if (num_blocks > 0) {
-        kernel::ict_tri_spgeam_init<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals),
-                a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs,
-                l_col_idxs, as_device_type(l_vals), l_new_row_ptrs,
-                l_new_col_idxs, as_device_type(l_new_vals), num_rows);
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void compute_factor(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Coo<ValueType, IndexType>* l_coo)
-{
-    auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements());
-    auto block_size = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(total_nnz, block_size);
-    if (num_blocks > 0) {
-        kernel::ict_sweep<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                a->get_const_row_ptrs(), a->get_const_col_idxs(),
-                as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
-                l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
-                as_device_type(l->get_values()),
-                static_cast<IndexType>(l->get_num_stored_elements()));
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor);
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* llh,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    matrix::Csr<ValueType, IndexType>* l_new)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz =
-        llh->get_num_stored_elements() + a->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_add_candidates(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, llh, a, l, l_new);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Coo<ValueType, IndexType>* l_coo)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz = 2 * l->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_compute_factor(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
-
-
-}  // namespace par_ict_factorization
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp b/common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc
similarity index 57%
rename from common/cuda_hip/factorization/par_ilut_filter_kernels.cpp
rename to common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc
index 3622f971878..68794bfc8d1 100644
--- a/common/cuda_hip/factorization/par_ilut_filter_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc
@@ -2,49 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/intrinsics.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/coo_builder.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-/**
- * @brief The parallel ILUT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilut_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-// subwarp sizes for filter kernels
-using compiled_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
 namespace kernel {
 
 
@@ -205,95 +162,3 @@ __global__ __launch_bounds__(default_block_size) void bucket_filter(
 
 
 }  // namespace kernel
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void threshold_filter(syn::value_list<int, subwarp_size>,
-                      std::shared_ptr<const DefaultExecutor> exec,
-                      const matrix::Csr<ValueType, IndexType>* a,
-                      remove_complex<ValueType> threshold,
-                      matrix::Csr<ValueType, IndexType>* m_out,
-                      matrix::Coo<ValueType, IndexType>* m_out_coo, bool lower)
-{
-    auto old_row_ptrs = a->get_const_row_ptrs();
-    auto old_col_idxs = a->get_const_col_idxs();
-    auto old_vals = a->get_const_values();
-    // compute nnz for each row
-    auto num_rows = static_cast<IndexType>(a->get_size()[0]);
-    auto block_size = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(num_rows, block_size);
-    auto new_row_ptrs = m_out->get_row_ptrs();
-    if (num_blocks > 0) {
-        kernel::threshold_filter_nnz<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                old_row_ptrs, as_device_type(old_vals), num_rows,
-                as_device_type(threshold), new_row_ptrs, lower);
-    }
-
-    // build row pointers
-    components::prefix_sum_nonnegative(exec, new_row_ptrs, num_rows + 1);
-
-    // build matrix
-    auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows);
-    // resize arrays and update aliases
-    matrix::CsrBuilder<ValueType, IndexType> builder{m_out};
-    builder.get_col_idx_array().resize_and_reset(new_nnz);
-    builder.get_value_array().resize_and_reset(new_nnz);
-    auto new_col_idxs = m_out->get_col_idxs();
-    auto new_vals = m_out->get_values();
-    IndexType* new_row_idxs{};
-    if (m_out_coo) {
-        matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
-        coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
-        coo_builder.get_col_idx_array() =
-            make_array_view(exec, new_nnz, new_col_idxs);
-        coo_builder.get_value_array() =
-            make_array_view(exec, new_nnz, new_vals);
-        new_row_idxs = m_out_coo->get_row_idxs();
-    }
-    if (num_blocks > 0) {
-        kernel::threshold_filter<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                old_row_ptrs, old_col_idxs, as_device_type(old_vals), num_rows,
-                as_device_type(threshold), new_row_ptrs, new_row_idxs,
-                new_col_idxs, as_device_type(new_vals), lower);
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter, threshold_filter);
-
-
-}  // namespace
-
-template <typename ValueType, typename IndexType>
-void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
-                      const matrix::Csr<ValueType, IndexType>* a,
-                      remove_complex<ValueType> threshold,
-                      matrix::Csr<ValueType, IndexType>* m_out,
-                      matrix::Coo<ValueType, IndexType>* m_out_coo, bool lower)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz = a->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_threshold_filter(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, a, threshold, m_out,
-        m_out_coo, lower);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ilut_select_kernels.cpp b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc
similarity index 63%
rename from common/cuda_hip/factorization/par_ilut_select_kernels.cpp
rename to common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc
index 5c00503923a..2ee5061d4c5 100644
--- a/common/cuda_hip/factorization/par_ilut_select_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc
@@ -2,40 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
-#include <algorithm>
-#include <common/cuda_hip/factorization/par_ilut_select_common.hpp>
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-
-
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/atomic.hpp"
-#include "common/cuda_hip/components/intrinsics.hpp"
-#include "common/cuda_hip/components/prefix_sum.hpp"
-#include "common/cuda_hip/components/searching.hpp"
-#include "common/cuda_hip/components/sorting.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-/**
- * @brief The parallel ILUT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilut_factorization {
-
-
 namespace kernel {
 
 
@@ -312,126 +278,3 @@ __global__ __launch_bounds__(config::warp_size) void find_bucket(
 
 
 }  // namespace kernel
-
-
-template <typename ValueType, typename IndexType>
-void sampleselect_filter(std::shared_ptr<const DefaultExecutor> exec,
-                         const ValueType* values, IndexType size,
-                         const unsigned char* oracles,
-                         const IndexType* partial_counts, IndexType bucket,
-                         remove_complex<ValueType>* out)
-{
-    auto num_threads_total = ceildiv(size, items_per_thread);
-    auto num_blocks =
-        static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
-    if (num_blocks > 0) {
-        kernel::filter_bucket<<<num_blocks, default_block_size, 0,
-                                exec->get_stream()>>>(
-            as_device_type(values), size, bucket, oracles, partial_counts,
-            as_device_type(out), items_per_thread);
-    }
-}
-
-
-template <typename ValueType, typename IndexType>
-void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
-                      const matrix::Csr<ValueType, IndexType>* m,
-                      IndexType rank, array<ValueType>& tmp1,
-                      array<remove_complex<ValueType>>& tmp2,
-                      remove_complex<ValueType>& threshold)
-{
-    auto values = m->get_const_values();
-    IndexType size = m->get_num_stored_elements();
-    using AbsType = remove_complex<ValueType>;
-    constexpr auto bucket_count = kernel::searchtree_width;
-    auto max_num_threads = ceildiv(size, items_per_thread);
-    auto max_num_blocks = ceildiv(max_num_threads, default_block_size);
-
-    size_type tmp_size_totals =
-        ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType));
-    size_type tmp_size_partials = ceildiv(
-        bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType));
-    size_type tmp_size_oracles =
-        ceildiv(size * sizeof(unsigned char), sizeof(ValueType));
-    size_type tmp_size_tree =
-        ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType));
-    size_type tmp_size_vals =
-        size / bucket_count * 4;  // pessimistic estimate for temporary storage
-    size_type tmp_size =
-        tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree;
-    tmp1.resize_and_reset(tmp_size);
-    tmp2.resize_and_reset(tmp_size_vals);
-
-    auto total_counts = reinterpret_cast<IndexType*>(tmp1.get_data());
-    auto partial_counts =
-        reinterpret_cast<IndexType*>(tmp1.get_data() + tmp_size_totals);
-    auto oracles = reinterpret_cast<unsigned char*>(
-        tmp1.get_data() + tmp_size_totals + tmp_size_partials);
-    auto tree =
-        reinterpret_cast<AbsType*>(tmp1.get_data() + tmp_size_totals +
-                                   tmp_size_partials + tmp_size_oracles);
-
-    sampleselect_count(exec, values, size, tree, oracles, partial_counts,
-                       total_counts);
-
-    // determine bucket with correct rank, use bucket-local rank
-    auto bucket = sampleselect_find_bucket(exec, total_counts, rank);
-    rank -= bucket.begin;
-
-    if (bucket.size * 2 > tmp_size_vals) {
-        // we need to reallocate tmp2
-        tmp2.resize_and_reset(bucket.size * 2);
-    }
-    auto tmp21 = tmp2.get_data();
-    auto tmp22 = tmp2.get_data() + bucket.size;
-    // extract target bucket
-    sampleselect_filter(exec, values, size, oracles, partial_counts, bucket.idx,
-                        tmp22);
-
-    // recursively select from smaller buckets
-    int step{};
-    while (bucket.size > kernel::basecase_size) {
-        std::swap(tmp21, tmp22);
-        const auto* tmp_in = tmp21;
-        auto tmp_out = tmp22;
-
-        sampleselect_count(exec, tmp_in, bucket.size, tree, oracles,
-                           partial_counts, total_counts);
-        auto new_bucket = sampleselect_find_bucket(exec, total_counts, rank);
-        sampleselect_filter(exec, tmp_in, bucket.size, oracles, partial_counts,
-                            bucket.idx, tmp_out);
-
-        rank -= new_bucket.begin;
-        bucket.size = new_bucket.size;
-        // we should never need more than 5 recursion steps, this would mean
-        // 256^5 = 2^40. fall back to standard library algorithm in that case.
-        ++step;
-        if (step > 5) {
-            array<AbsType> cpu_out_array{
-                exec->get_master(),
-                make_array_view(exec, bucket.size, tmp_out)};
-            auto begin = cpu_out_array.get_data();
-            auto end = begin + bucket.size;
-            auto middle = begin + rank;
-            std::nth_element(begin, middle, end);
-            threshold = *middle;
-            return;
-        }
-    }
-
-    // base case
-    auto out_ptr = reinterpret_cast<AbsType*>(tmp1.get_data());
-    kernel::basecase_select<<<1, kernel::basecase_block_size, 0,
-                              exec->get_stream()>>>(
-        as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr));
-    threshold = exec->copy_val_to_host(out_ptr);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc
similarity index 63%
rename from common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
rename to common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc
index b9658f69f70..a97f0f08937 100644
--- a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
+++ b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc
@@ -2,50 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/factorization/par_ilut_kernels.hpp"
-
-
-#include <ginkgo/core/base/array.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/coo.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-#include <ginkgo/core/matrix/dense.hpp>
-
-
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/intrinsics.hpp"
-#include "common/cuda_hip/components/merging.hpp"
-#include "common/cuda_hip/components/prefix_sum.hpp"
-#include "common/cuda_hip/components/searching.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "core/components/prefix_sum_kernels.hpp"
-#include "core/matrix/coo_builder.hpp"
-#include "core/matrix/csr_builder.hpp"
-#include "core/matrix/csr_kernels.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-/**
- * @brief The parallel ILUT factorization namespace.
- *
- * @ingroup factor
- */
-namespace par_ilut_factorization {
-
-
-constexpr int default_block_size = 512;
-
-
-// subwarp sizes for add_candidates kernels
-using compiled_kernels =
-    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
-
-
 namespace kernel {
 
 
@@ -290,113 +246,3 @@ __global__ __launch_bounds__(default_block_size) void tri_spgeam_init(
 
 
 }  // namespace kernel
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void add_candidates(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* lu,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Csr<ValueType, IndexType>* u,
-                    matrix::Csr<ValueType, IndexType>* l_new,
-                    matrix::Csr<ValueType, IndexType>* u_new)
-{
-    auto num_rows = static_cast<IndexType>(lu->get_size()[0]);
-    auto subwarps_per_block = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
-    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
-    matrix::CsrBuilder<ValueType, IndexType> u_new_builder(u_new);
-    auto lu_row_ptrs = lu->get_const_row_ptrs();
-    auto lu_col_idxs = lu->get_const_col_idxs();
-    auto lu_vals = lu->get_const_values();
-    auto a_row_ptrs = a->get_const_row_ptrs();
-    auto a_col_idxs = a->get_const_col_idxs();
-    auto a_vals = a->get_const_values();
-    auto l_row_ptrs = l->get_const_row_ptrs();
-    auto l_col_idxs = l->get_const_col_idxs();
-    auto l_vals = l->get_const_values();
-    auto u_row_ptrs = u->get_const_row_ptrs();
-    auto u_col_idxs = u->get_const_col_idxs();
-    auto u_vals = u->get_const_values();
-    auto l_new_row_ptrs = l_new->get_row_ptrs();
-    auto u_new_row_ptrs = u_new->get_row_ptrs();
-    if (num_blocks > 0) {
-        // count non-zeros per row
-        kernel::tri_spgeam_nnz<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs,
-                l_new_row_ptrs, u_new_row_ptrs, num_rows);
-    }
-
-    // build row ptrs
-    components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);
-    components::prefix_sum_nonnegative(exec, u_new_row_ptrs, num_rows + 1);
-
-    // resize output arrays
-    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
-    auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows);
-    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
-    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
-    u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz);
-    u_new_builder.get_value_array().resize_and_reset(u_new_nnz);
-
-    auto l_new_col_idxs = l_new->get_col_idxs();
-    auto l_new_vals = l_new->get_values();
-    auto u_new_col_idxs = u_new->get_col_idxs();
-    auto u_new_vals = u_new->get_values();
-
-    if (num_blocks > 0) {
-        // fill columns and values
-        kernel::tri_spgeam_init<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs,
-                a_col_idxs, as_device_type(a_vals), l_row_ptrs, l_col_idxs,
-                as_device_type(l_vals), u_row_ptrs, u_col_idxs,
-                as_device_type(u_vals), l_new_row_ptrs, l_new_col_idxs,
-                as_device_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs,
-                as_device_type(u_new_vals), num_rows);
-    }
-}
-
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
-
-
-}  // namespace
-
-
-template <typename ValueType, typename IndexType>
-void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
-                    const matrix::Csr<ValueType, IndexType>* lu,
-                    const matrix::Csr<ValueType, IndexType>* a,
-                    const matrix::Csr<ValueType, IndexType>* l,
-                    const matrix::Csr<ValueType, IndexType>* u,
-                    matrix::Csr<ValueType, IndexType>* l_new,
-                    matrix::Csr<ValueType, IndexType>* u_new)
-{
-    auto num_rows = a->get_size()[0];
-    auto total_nnz =
-        lu->get_num_stored_elements() + a->get_num_stored_elements();
-    auto total_nnz_per_row = total_nnz / num_rows;
-    select_add_candidates(
-        compiled_kernels(),
-        [&](int compiled_subwarp_size) {
-            return total_nnz_per_row <= compiled_subwarp_size ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec, lu, a, l, u, l_new,
-        u_new);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
-
-
-}  // namespace par_ilut_factorization
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc
new file mode 100644
index 00000000000..9da94a878b3
--- /dev/null
+++ b/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc
@@ -0,0 +1,94 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+namespace kernel {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void sweep(
+    const IndexType* __restrict__ a_row_ptrs,
+    const IndexType* __restrict__ a_col_idxs,
+    const ValueType* __restrict__ a_vals,
+    const IndexType* __restrict__ l_row_ptrs,
+    const IndexType* __restrict__ l_row_idxs,
+    const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals,
+    IndexType l_nnz, const IndexType* __restrict__ u_row_idxs,
+    const IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_vals,
+    const IndexType* __restrict__ ut_col_ptrs,
+    const IndexType* __restrict__ ut_row_idxs, ValueType* __restrict__ ut_vals,
+    IndexType u_nnz)
+{
+    auto tidx = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+    if (tidx >= l_nnz + u_nnz) {
+        return;
+    }
+    // split the subwarps into two halves for lower and upper triangle
+    auto l_nz = tidx;
+    auto u_nz = l_nz - l_nnz;
+    auto lower = u_nz < 0;
+    auto row = lower ? l_row_idxs[l_nz] : u_row_idxs[u_nz];
+    auto col = lower ? l_col_idxs[l_nz] : u_col_idxs[u_nz];
+    if (lower && row == col) {
+        // don't update the diagonal twice
+        return;
+    }
+    auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    // find entry of A at (row, col)
+    auto a_row_begin = a_row_ptrs[row];
+    auto a_row_end = a_row_ptrs[row + 1];
+    auto a_row_size = a_row_end - a_row_begin;
+    auto a_idx =
+        group_wide_search(a_row_begin, a_row_size, subwarp,
+                          [&](IndexType i) { return a_col_idxs[i] >= col; });
+    bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col;
+    auto a_val = has_a ? a_vals[a_idx] : zero<ValueType>();
+    auto l_row_begin = l_row_ptrs[row];
+    auto l_row_size = l_row_ptrs[row + 1] - l_row_begin;
+    auto ut_col_begin = ut_col_ptrs[col];
+    auto ut_col_size = ut_col_ptrs[col + 1] - ut_col_begin;
+    ValueType sum{};
+    IndexType ut_nz{};
+    auto last_entry = min(row, col);
+    group_merge<subwarp_size>(
+        l_col_idxs + l_row_begin, l_row_size, ut_row_idxs + ut_col_begin,
+        ut_col_size, subwarp,
+        [&](IndexType l_idx, IndexType l_col, IndexType ut_idx,
+            IndexType ut_row, IndexType, bool) {
+            // we don't need to use the `bool valid` because last_entry is
+            // already a smaller sentinel value than the one used in group_merge
+            if (l_col == ut_row && l_col < last_entry) {
+                sum += load_relaxed(l_vals + (l_idx + l_row_begin)) *
+                       load_relaxed(ut_vals + (ut_idx + ut_col_begin));
+            }
+            // remember the transposed element
+            auto found_transp = subwarp.ballot(ut_row == row);
+            if (found_transp) {
+                ut_nz =
+                    subwarp.shfl(ut_idx + ut_col_begin, ffs(found_transp) - 1);
+            }
+            return true;
+        });
+    // accumulate result from all threads
+    sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; });
+
+    if (subwarp.thread_rank() == 0) {
+        if (lower) {
+            auto to_write = (a_val - sum) /
+                            load_relaxed(ut_vals + (ut_col_ptrs[col + 1] - 1));
+            if (is_finite(to_write)) {
+                store_relaxed(l_vals + l_nz, to_write);
+            }
+        } else {
+            auto to_write = a_val - sum;
+            if (is_finite(to_write)) {
+                store_relaxed(u_vals + u_nz, to_write);
+                store_relaxed(ut_vals + ut_nz, to_write);
+            }
+        }
+    }
+}
+
+
+}  // namespace kernel
diff --git a/common/cuda_hip/log/batch_logger.hpp b/common/cuda_hip/log/batch_logger.hpp.inc
similarity index 68%
rename from common/cuda_hip/log/batch_logger.hpp
rename to common/cuda_hip/log/batch_logger.hpp.inc
index 77ec84fb7bd..04b614b50f9 100644
--- a/common/cuda_hip/log/batch_logger.hpp
+++ b/common/cuda_hip/log/batch_logger.hpp.inc
@@ -2,19 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_
-#define GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_
-
-
-#include <ginkgo/core/base/types.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-namespace batch_log {
-
-
 /**
  * @see reference/log/batch_logger.hpp
  */
@@ -41,12 +28,3 @@ class SimpleFinalLogger final {
     real_type* const final_residuals_;
     idx_type* const final_iters_;
 };
-
-
-}  // namespace batch_log
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko
-
-
-#endif  // GKO_COMMON_CUDA_HIP_LOG_BATCH_LOGGER_HPP_
diff --git a/common/cuda_hip/matrix/batch_csr_kernels.cpp b/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc
similarity index 87%
rename from common/cuda_hip/matrix/batch_csr_kernels.cpp
rename to common/cuda_hip/matrix/batch_csr_kernels.hpp.inc
index a07074e29e8..e041dadaa3e 100644
--- a/common/cuda_hip/matrix/batch_csr_kernels.cpp
+++ b/common/cuda_hip/matrix/batch_csr_kernels.hpp.inc
@@ -2,49 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <common/cuda_hip/base/batch_struct.hpp>
-#include <common/cuda_hip/matrix/batch_struct.hpp>
-
-
-#include <thrust/functional.h>
-
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_csr.hpp>
-
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_csr_kernels.hpp"
-#include "core/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-/**
- * @brief The Csr matrix format namespace.
- * @ref Csr
- * @ingroup batch_csr
- */
-namespace batch_csr {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-
 template <typename ValueType, typename IndexType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& mat,
@@ -239,14 +196,3 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
-
-
-#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_csr
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko
diff --git a/common/cuda_hip/matrix/batch_dense_kernels.cpp b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc
similarity index 89%
rename from common/cuda_hip/matrix/batch_dense_kernels.cpp
rename to common/cuda_hip/matrix/batch_dense_kernels.hpp.inc
index b5c2dbe1d5d..f8abf9131a1 100644
--- a/common/cuda_hip/matrix/batch_dense_kernels.cpp
+++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc
@@ -2,49 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <common/cuda_hip/base/batch_struct.hpp>
-#include <common/cuda_hip/matrix/batch_struct.hpp>
-
-
-#include <thrust/functional.h>
-
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/batch_dense.hpp>
-
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_dense_kernels.hpp"
-#include "core/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-/**
- * @brief The Dense matrix format namespace.
- *
- * @ingroup batch_dense
- */
-namespace batch_dense {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-
 template <typename ValueType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::dense::batch_item<const ValueType>& mat,
@@ -286,15 +243,3 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
-
-
-#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc"
-
-
-// clang-format on
-
-
-}  // namespace batch_dense
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko
diff --git a/common/cuda_hip/matrix/batch_ell_kernels.cpp b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
similarity index 87%
rename from common/cuda_hip/matrix/batch_ell_kernels.cpp
rename to common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
index c3bf21c7744..0a6d1927c96 100644
--- a/common/cuda_hip/matrix/batch_ell_kernels.cpp
+++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
@@ -2,49 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <common/cuda_hip/base/batch_struct.hpp>
-#include <common/cuda_hip/matrix/batch_struct.hpp>
-
-
-#include <thrust/functional.h>
-
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/matrix/batch_ell.hpp>
-
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/uninitialized_array.hpp"
-#include "core/base/batch_struct.hpp"
-#include "core/matrix/batch_ell_kernels.hpp"
-#include "core/matrix/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-/**
- * @brief The Ell matrix format namespace.
- * @ref Ell
- * @ingroup batch_ell
- */
-namespace batch_ell {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-
 template <typename ValueType, typename IndexType>
 __device__ __forceinline__ void simple_apply(
     const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& mat,
@@ -248,14 +205,3 @@ __global__ void add_scaled_identity_kernel(
         add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
     }
 }
-
-
-#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_ell
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko
diff --git a/common/cuda_hip/matrix/fbcsr_kernels.template.cpp b/common/cuda_hip/matrix/fbcsr_kernels.template.cpp
index 9e5eed5f570..7e0b38c015b 100644
--- a/common/cuda_hip/matrix/fbcsr_kernels.template.cpp
+++ b/common/cuda_hip/matrix/fbcsr_kernels.template.cpp
@@ -3,7 +3,6 @@
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include <algorithm>
-#include <common/cuda_hip/base/sparselib_block_bindings.hpp>
 
 
 #include <thrust/copy.h>
@@ -28,6 +27,7 @@
 #include "common/cuda_hip/base/pointer_mode_guard.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/sparselib_block_bindings.hpp"
 #include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/base/types.hpp"
 #include "common/cuda_hip/components/atomic.hpp"
@@ -67,6 +67,7 @@ constexpr int default_block_size{512};
 
 #include "common/cuda_hip/matrix/csr_common.hpp.inc"
 
+
 namespace kernel {
 
 
diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc
similarity index 91%
rename from common/cuda_hip/preconditioner/jacobi_kernels.cpp
rename to common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc
index 45d32493f25..e0d7cfef0e9 100644
--- a/common/cuda_hip/preconditioner/jacobi_kernels.cpp
+++ b/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc
@@ -2,48 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "core/preconditioner/jacobi_kernels.hpp"
-
-
-#include <common/cuda_hip/preconditioner/jacobi_common.hpp>
-
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-
-
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/math.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/types.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "core/base/extended_float.hpp"
-#include "core/preconditioner/jacobi_utils.hpp"
-#include "core/synthesizer/implementation_selection.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-/**
- * @brief The Jacobi preconditioner namespace.
- * @ref Jacobi
- * @ingroup jacobi
- */
-namespace jacobi {
-
-
-// a total of 32/16 warps (1024 threads)
-#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
-constexpr int default_num_warps = 16;
-#else  // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC
-constexpr int default_num_warps = 32;
-#endif
-// with current architectures, at most 32 warps can be scheduled per SM (and
-// current GPUs have at most 84 SMs)
-constexpr int default_grid_size = 32 * 32 * 128;
-
-
 namespace {
 
 
@@ -411,9 +369,3 @@ void convert_to_dense(
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_JACOBI_CONVERT_TO_DENSE_KERNEL);
-
-
-}  // namespace jacobi
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko
diff --git a/common/cuda_hip/sources.cmake b/common/cuda_hip/sources.cmake
new file mode 100644
index 00000000000..aac602cbbd3
--- /dev/null
+++ b/common/cuda_hip/sources.cmake
@@ -0,0 +1,31 @@
+# this file needs to be include(...)'d directly because of scoping issues
+include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
+add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip/ matrix/fbcsr_kernels.instantiate.cpp FBCSR_INSTANTIATE)
+set(GKO_CUDA_HIP_COMMON_SOURCES
+    base/device_matrix_data_kernels.cpp
+    components/prefix_sum_kernels.cpp
+    distributed/index_map_kernels.cpp
+    distributed/matrix_kernels.cpp
+    distributed/partition_helpers_kernels.cpp
+    distributed/partition_kernels.cpp
+    distributed/vector_kernels.cpp
+    factorization/cholesky_kernels.cpp
+    factorization/factorization_kernels.cpp
+    factorization/lu_kernels.cpp
+    factorization/par_ic_kernels.cpp
+    factorization/par_ilu_kernels.cpp
+    matrix/coo_kernels.cpp
+    matrix/dense_kernels.cpp
+    matrix/diagonal_kernels.cpp
+    matrix/ell_kernels.cpp
+    matrix/sellp_kernels.cpp
+    matrix/sparsity_csr_kernels.cpp
+    multigrid/pgm_kernels.cpp
+    preconditioner/isai_kernels.cpp
+    reorder/rcm_kernels.cpp
+    solver/cb_gmres_kernels.cpp
+    solver/idr_kernels.cpp
+    solver/multigrid_kernels.cpp
+    )
+list(TRANSFORM GKO_CUDA_HIP_COMMON_SOURCES PREPEND ${PROJECT_SOURCE_DIR}/common/cuda_hip/)
+list(APPEND GKO_CUDA_HIP_COMMON_SOURCES ${FBCSR_INSTANTIATE})
\ No newline at end of file
diff --git a/common/cuda_hip/stop/batch_criteria.hpp b/common/cuda_hip/stop/batch_criteria.hpp.inc
similarity index 75%
rename from common/cuda_hip/stop/batch_criteria.hpp
rename to common/cuda_hip/stop/batch_criteria.hpp.inc
index 7491a143a31..38072467765 100644
--- a/common/cuda_hip/stop/batch_criteria.hpp
+++ b/common/cuda_hip/stop/batch_criteria.hpp.inc
@@ -2,19 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#ifndef GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_
-#define GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_
-
-
-#include <ginkgo/core/base/math.hpp>
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-namespace batch_stop {
-
-
 /**
  * @see reference/stop/batch_criteria.hpp
  */
@@ -62,11 +49,3 @@ class SimpleAbsResidual {
 private:
     const real_type abs_tol_;
 };
-
-
-}  // namespace batch_stop
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko
-
-#endif  // GKO_COMMON_CUDA_HIP_STOP_BATCH_CRITERIA_HPP_
diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp
index 04db86c27e5..f4ff02cee2f 100644
--- a/common/unified/base/kernel_launch.hpp
+++ b/common/unified/base/kernel_launch.hpp
@@ -270,10 +270,8 @@ typename to_device_type_impl<T>::type map_to_device(T&& param)
 }  // namespace gko
 
 
-#if defined(GKO_COMPILING_CUDA)
+#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)
 #include "common/cuda_hip/base/kernel_launch.hpp"
-#elif defined(GKO_COMPILING_HIP)
-#include "hip/base/kernel_launch.hip.hpp"
 #elif defined(GKO_COMPILING_DPCPP)
 #include "dpcpp/base/kernel_launch.dp.hpp"
 #elif defined(GKO_COMPILING_OMP)
diff --git a/common/unified/base/kernel_launch_reduction.hpp b/common/unified/base/kernel_launch_reduction.hpp
index a9b738f2ca9..b7b3e258dd4 100644
--- a/common/unified/base/kernel_launch_reduction.hpp
+++ b/common/unified/base/kernel_launch_reduction.hpp
@@ -19,10 +19,8 @@
     {}
 
 
-#if defined(GKO_COMPILING_CUDA)
+#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)
 #include "common/cuda_hip/base/kernel_launch_reduction.hpp"
-#elif defined(GKO_COMPILING_HIP)
-#include "hip/base/kernel_launch_reduction.hip.hpp"
 #elif defined(GKO_COMPILING_DPCPP)
 #include "dpcpp/base/kernel_launch_reduction.dp.hpp"
 #elif defined(GKO_COMPILING_OMP)
diff --git a/common/unified/base/kernel_launch_solver.hpp b/common/unified/base/kernel_launch_solver.hpp
index 11f282e5c09..14f2cbfeacf 100644
--- a/common/unified/base/kernel_launch_solver.hpp
+++ b/common/unified/base/kernel_launch_solver.hpp
@@ -107,10 +107,8 @@ const device_type<ValueType>* row_vector(const matrix::Dense<ValueType>* mtx)
 }  // namespace gko
 
 
-#if defined(GKO_COMPILING_CUDA)
+#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)
 #include "common/cuda_hip/base/kernel_launch_solver.hpp"
-#elif defined(GKO_COMPILING_HIP)
-#include "hip/base/kernel_launch_solver.hip.hpp"
 #elif defined(GKO_COMPILING_DPCPP)
 #include "dpcpp/base/kernel_launch_solver.dp.hpp"
 #elif defined(GKO_COMPILING_OMP)
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 8abdd7ba3f5..8c7a5309dcc 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -1,11 +1,13 @@
 cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 add_library(ginkgo_cuda $<TARGET_OBJECTS:ginkgo_cuda_device> "")
+include(${PROJECT_SOURCE_DIR}/common/cuda_hip/sources.cmake)
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 add_instantiation_files(. matrix/csr_kernels.instantiate.cu CSR_INSTANTIATE)
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 target_sources(ginkgo_cuda
     PRIVATE
+    base/batch_multi_vector_kernels.cu
     base/device.cpp
     base/exception.cpp
     base/executor.cpp
@@ -18,14 +20,23 @@ target_sources(ginkgo_cuda
     base/version.cpp
     factorization/ic_kernels.cu
     factorization/ilu_kernels.cu
+    factorization/par_ict_kernels.cu
     factorization/par_ilut_approx_filter_kernels.cu
+    factorization/par_ilut_filter_kernels.cu
     factorization/par_ilut_select_common.cu
+    factorization/par_ilut_select_kernels.cu
+    factorization/par_ilut_spgeam_kernels.cu
+    factorization/par_ilut_sweep_kernels.cu
+    matrix/batch_csr_kernels.cu
+    matrix/batch_dense_kernels.cu
+    matrix/batch_ell_kernels.cu
     ${CSR_INSTANTIATE}
     matrix/fft_kernels.cu
     preconditioner/batch_jacobi_kernels.cu
     preconditioner/jacobi_advanced_apply_kernels.cu
     preconditioner/jacobi_generate_kernels.cu
     preconditioner/jacobi_simple_apply_kernels.cu
+    preconditioner/jacobi_kernels.cu
     solver/batch_bicgstab_kernels.cu
     solver/batch_cg_kernels.cu
     solver/lower_trs_kernels.cu
diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu
new file mode 100644
index 00000000000..61a7ad92e52
--- /dev/null
+++ b/cuda/base/batch_multi_vector_kernels.cu
@@ -0,0 +1,59 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/base/batch_multi_vector_kernels.hpp"
+
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+
+
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/batch_struct.hpp"
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/base/config.hpp"
+#include "cuda/base/cublas_bindings.hpp"
+#include "cuda/base/pointer_mode_guard.hpp"
+#include "cuda/base/thrust.cuh"
+#include "cuda/components/cooperative_groups.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The MultiVector matrix format namespace.
+ *
+ * @ingroup batch_multi_vector
+ */
+namespace batch_multi_vector {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_multi_vector
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu
new file mode 100644
index 00000000000..d4bec43a844
--- /dev/null
+++ b/cuda/factorization/par_ict_kernels.cu
@@ -0,0 +1,189 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/factorization/par_ict_kernels.hpp"
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "cuda/components/memory.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The parallel ICT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ict_factorization {
+
+
+constexpr int default_block_size = 512;
+
+
+// subwarp sizes for all warp-parallel kernels (filter, add_candidates)
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc"
+
+
+namespace {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void add_candidates(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* llh,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    const matrix::Csr<ValueType, IndexType>* l,
+                    matrix::Csr<ValueType, IndexType>* l_new)
+{
+    auto num_rows = static_cast<IndexType>(llh->get_size()[0]);
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
+    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
+    auto llh_row_ptrs = llh->get_const_row_ptrs();
+    auto llh_col_idxs = llh->get_const_col_idxs();
+    auto llh_vals = llh->get_const_values();
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto a_vals = a->get_const_values();
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_const_values();
+    auto l_new_row_ptrs = l_new->get_row_ptrs();
+    // count non-zeros per row
+    if (num_blocks > 0) {
+        kernel::ict_tri_spgeam_nnz<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs,
+                l_new_row_ptrs, num_rows);
+    }
+
+    // build row ptrs
+    components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);
+
+    // resize output arrays
+    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
+    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
+    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
+
+    auto l_new_col_idxs = l_new->get_col_idxs();
+    auto l_new_vals = l_new->get_values();
+
+    // fill columns and values
+    if (num_blocks > 0) {
+        kernel::ict_tri_spgeam_init<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals),
+                a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs,
+                l_col_idxs, as_device_type(l_vals), l_new_row_ptrs,
+                l_new_col_idxs, as_device_type(l_new_vals), num_rows);
+    }
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void compute_factor(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    matrix::Csr<ValueType, IndexType>* l,
+                    const matrix::Coo<ValueType, IndexType>* l_coo)
+{
+    auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements());
+    auto block_size = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(total_nnz, block_size);
+    if (num_blocks > 0) {
+        kernel::ict_sweep<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                a->get_const_row_ptrs(), a->get_const_col_idxs(),
+                as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
+                l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
+                as_device_type(l->get_values()),
+                static_cast<IndexType>(l->get_num_stored_elements()));
+    }
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor);
+
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* llh,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    const matrix::Csr<ValueType, IndexType>* l,
+                    matrix::Csr<ValueType, IndexType>* l_new)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz =
+        llh->get_num_stored_elements() + a->get_num_stored_elements();
+    auto total_nnz_per_row = total_nnz / num_rows;
+    select_add_candidates(
+        compiled_kernels(),
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, llh, a, l, l_new);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    matrix::Csr<ValueType, IndexType>* l,
+                    const matrix::Coo<ValueType, IndexType>* l_coo)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz = 2 * l->get_num_stored_elements();
+    auto total_nnz_per_row = total_nnz / num_rows;
+    select_compute_factor(
+        compiled_kernels(),
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
+
+
+}  // namespace par_ict_factorization
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/factorization/par_ilut_approx_filter_kernels.cu b/cuda/factorization/par_ilut_approx_filter_kernels.cu
index 853519cd36b..2932647231a 100644
--- a/cuda/factorization/par_ilut_approx_filter_kernels.cu
+++ b/cuda/factorization/par_ilut_approx_filter_kernels.cu
@@ -15,20 +15,20 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/sorting.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/config.hpp"
-#include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
-#include "cuda/components/atomic.cuh"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/prefix_sum.cuh"
-#include "cuda/components/sorting.cuh"
-#include "cuda/components/thread_ids.cuh"
 #include "cuda/factorization/par_ilut_select_common.cuh"
 
 
diff --git a/cuda/factorization/par_ilut_filter_kernels.cu b/cuda/factorization/par_ilut_filter_kernels.cu
new file mode 100644
index 00000000000..ceb471bc642
--- /dev/null
+++ b/cuda/factorization/par_ilut_filter_kernels.cu
@@ -0,0 +1,140 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "cuda/base/config.hpp"
+#include "cuda/base/types.hpp"
+#include "cuda/components/cooperative_groups.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+constexpr int default_block_size = 512;
+
+
+// subwarp sizes for filter kernels
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc"
+
+
+namespace {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void threshold_filter(syn::value_list<int, subwarp_size>,
+                      std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType>* a,
+                      remove_complex<ValueType> threshold,
+                      matrix::Csr<ValueType, IndexType>* m_out,
+                      matrix::Coo<ValueType, IndexType>* m_out_coo, bool lower)
+{
+    auto old_row_ptrs = a->get_const_row_ptrs();
+    auto old_col_idxs = a->get_const_col_idxs();
+    auto old_vals = a->get_const_values();
+    // compute nnz for each row
+    auto num_rows = static_cast<IndexType>(a->get_size()[0]);
+    auto block_size = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, block_size);
+    auto new_row_ptrs = m_out->get_row_ptrs();
+    if (num_blocks > 0) {
+        kernel::threshold_filter_nnz<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                old_row_ptrs, as_device_type(old_vals), num_rows,
+                as_device_type(threshold), new_row_ptrs, lower);
+    }
+
+    // build row pointers
+    components::prefix_sum_nonnegative(exec, new_row_ptrs, num_rows + 1);
+
+    // build matrix
+    auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows);
+    // resize arrays and update aliases
+    matrix::CsrBuilder<ValueType, IndexType> builder{m_out};
+    builder.get_col_idx_array().resize_and_reset(new_nnz);
+    builder.get_value_array().resize_and_reset(new_nnz);
+    auto new_col_idxs = m_out->get_col_idxs();
+    auto new_vals = m_out->get_values();
+    IndexType* new_row_idxs{};
+    if (m_out_coo) {
+        matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
+        coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
+        coo_builder.get_col_idx_array() =
+            make_array_view(exec, new_nnz, new_col_idxs);
+        coo_builder.get_value_array() =
+            make_array_view(exec, new_nnz, new_vals);
+        new_row_idxs = m_out_coo->get_row_idxs();
+    }
+    if (num_blocks > 0) {
+        kernel::threshold_filter<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                old_row_ptrs, old_col_idxs, as_device_type(old_vals), num_rows,
+                as_device_type(threshold), new_row_ptrs, new_row_idxs,
+                new_col_idxs, as_device_type(new_vals), lower);
+    }
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter, threshold_filter);
+
+
+}  // namespace
+
+template <typename ValueType, typename IndexType>
+void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType>* a,
+                      remove_complex<ValueType> threshold,
+                      matrix::Csr<ValueType, IndexType>* m_out,
+                      matrix::Coo<ValueType, IndexType>* m_out_coo, bool lower)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz = a->get_num_stored_elements();
+    auto total_nnz_per_row = total_nnz / num_rows;
+    select_threshold_filter(
+        compiled_kernels(),
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, a, threshold, m_out,
+        m_out_coo, lower);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/factorization/par_ilut_select_common.cu b/cuda/factorization/par_ilut_select_common.cu
index bbba93595c8..7aa8c43259d 100644
--- a/cuda/factorization/par_ilut_select_common.cu
+++ b/cuda/factorization/par_ilut_select_common.cu
@@ -5,15 +5,15 @@
 #include "cuda/factorization/par_ilut_select_common.cuh"
 
 
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/sorting.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
-#include "cuda/base/math.hpp"
-#include "cuda/components/atomic.cuh"
-#include "cuda/components/intrinsics.cuh"
-#include "cuda/components/prefix_sum.cuh"
-#include "cuda/components/searching.cuh"
-#include "cuda/components/sorting.cuh"
-#include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
diff --git a/cuda/factorization/par_ilut_select_kernels.cu b/cuda/factorization/par_ilut_select_kernels.cu
new file mode 100644
index 00000000000..13869f90784
--- /dev/null
+++ b/cuda/factorization/par_ilut_select_kernels.cu
@@ -0,0 +1,162 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/sorting.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "cuda/factorization/par_ilut_select_common.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc"
+
+
+template <typename ValueType, typename IndexType>
+void sampleselect_filter(std::shared_ptr<const DefaultExecutor> exec,
+                         const ValueType* values, IndexType size,
+                         const unsigned char* oracles,
+                         const IndexType* partial_counts, IndexType bucket,
+                         remove_complex<ValueType>* out)
+{
+    auto num_threads_total = ceildiv(size, items_per_thread);
+    auto num_blocks =
+        static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
+    if (num_blocks > 0) {
+        kernel::filter_bucket<<<num_blocks, default_block_size, 0,
+                                exec->get_stream()>>>(
+            as_device_type(values), size, bucket, oracles, partial_counts,
+            as_device_type(out), items_per_thread);
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType>* m,
+                      IndexType rank, array<ValueType>& tmp1,
+                      array<remove_complex<ValueType>>& tmp2,
+                      remove_complex<ValueType>& threshold)
+{
+    auto values = m->get_const_values();
+    IndexType size = m->get_num_stored_elements();
+    using AbsType = remove_complex<ValueType>;
+    constexpr auto bucket_count = kernel::searchtree_width;
+    auto max_num_threads = ceildiv(size, items_per_thread);
+    auto max_num_blocks = ceildiv(max_num_threads, default_block_size);
+
+    size_type tmp_size_totals =
+        ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType));
+    size_type tmp_size_partials = ceildiv(
+        bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType));
+    size_type tmp_size_oracles =
+        ceildiv(size * sizeof(unsigned char), sizeof(ValueType));
+    size_type tmp_size_tree =
+        ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType));
+    size_type tmp_size_vals =
+        size / bucket_count * 4;  // pessimistic estimate for temporary storage
+    size_type tmp_size =
+        tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree;
+    tmp1.resize_and_reset(tmp_size);
+    tmp2.resize_and_reset(tmp_size_vals);
+
+    auto total_counts = reinterpret_cast<IndexType*>(tmp1.get_data());
+    auto partial_counts =
+        reinterpret_cast<IndexType*>(tmp1.get_data() + tmp_size_totals);
+    auto oracles = reinterpret_cast<unsigned char*>(
+        tmp1.get_data() + tmp_size_totals + tmp_size_partials);
+    auto tree =
+        reinterpret_cast<AbsType*>(tmp1.get_data() + tmp_size_totals +
+                                   tmp_size_partials + tmp_size_oracles);
+
+    sampleselect_count(exec, values, size, tree, oracles, partial_counts,
+                       total_counts);
+
+    // determine bucket with correct rank, use bucket-local rank
+    auto bucket = sampleselect_find_bucket(exec, total_counts, rank);
+    rank -= bucket.begin;
+
+    if (bucket.size * 2 > tmp_size_vals) {
+        // we need to reallocate tmp2
+        tmp2.resize_and_reset(bucket.size * 2);
+    }
+    auto tmp21 = tmp2.get_data();
+    auto tmp22 = tmp2.get_data() + bucket.size;
+    // extract target bucket
+    sampleselect_filter(exec, values, size, oracles, partial_counts, bucket.idx,
+                        tmp22);
+
+    // recursively select from smaller buckets
+    int step{};
+    while (bucket.size > kernel::basecase_size) {
+        std::swap(tmp21, tmp22);
+        const auto* tmp_in = tmp21;
+        auto tmp_out = tmp22;
+
+        sampleselect_count(exec, tmp_in, bucket.size, tree, oracles,
+                           partial_counts, total_counts);
+        auto new_bucket = sampleselect_find_bucket(exec, total_counts, rank);
+        sampleselect_filter(exec, tmp_in, bucket.size, oracles, partial_counts,
+                            bucket.idx, tmp_out);
+
+        rank -= new_bucket.begin;
+        bucket.size = new_bucket.size;
+        // we should never need more than 5 recursion steps, this would mean
+        // 256^5 = 2^40. fall back to standard library algorithm in that case.
+        ++step;
+        if (step > 5) {
+            array<AbsType> cpu_out_array{
+                exec->get_master(),
+                make_array_view(exec, bucket.size, tmp_out)};
+            auto begin = cpu_out_array.get_data();
+            auto end = begin + bucket.size;
+            auto middle = begin + rank;
+            std::nth_element(begin, middle, end);
+            threshold = *middle;
+            return;
+        }
+    }
+
+    // base case
+    auto out_ptr = reinterpret_cast<AbsType*>(tmp1.get_data());
+    kernel::basecase_select<<<1, kernel::basecase_block_size, 0,
+                              exec->get_stream()>>>(
+        as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr));
+    threshold = exec->copy_val_to_host(out_ptr);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/factorization/par_ilut_spgeam_kernels.cu b/cuda/factorization/par_ilut_spgeam_kernels.cu
new file mode 100644
index 00000000000..218e4bddee4
--- /dev/null
+++ b/cuda/factorization/par_ilut_spgeam_kernels.cu
@@ -0,0 +1,159 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "cuda/components/cooperative_groups.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+constexpr int default_block_size = 512;
+
+
+// subwarp sizes for add_candidates kernels
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc"
+
+
+namespace {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void add_candidates(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* lu,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    const matrix::Csr<ValueType, IndexType>* l,
+                    const matrix::Csr<ValueType, IndexType>* u,
+                    matrix::Csr<ValueType, IndexType>* l_new,
+                    matrix::Csr<ValueType, IndexType>* u_new)
+{
+    auto num_rows = static_cast<IndexType>(lu->get_size()[0]);
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
+    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
+    matrix::CsrBuilder<ValueType, IndexType> u_new_builder(u_new);
+    auto lu_row_ptrs = lu->get_const_row_ptrs();
+    auto lu_col_idxs = lu->get_const_col_idxs();
+    auto lu_vals = lu->get_const_values();
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto a_vals = a->get_const_values();
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_const_values();
+    auto u_row_ptrs = u->get_const_row_ptrs();
+    auto u_col_idxs = u->get_const_col_idxs();
+    auto u_vals = u->get_const_values();
+    auto l_new_row_ptrs = l_new->get_row_ptrs();
+    auto u_new_row_ptrs = u_new->get_row_ptrs();
+    if (num_blocks > 0) {
+        // count non-zeros per row
+        kernel::tri_spgeam_nnz<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs,
+                l_new_row_ptrs, u_new_row_ptrs, num_rows);
+    }
+
+    // build row ptrs
+    components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);
+    components::prefix_sum_nonnegative(exec, u_new_row_ptrs, num_rows + 1);
+
+    // resize output arrays
+    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
+    auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows);
+    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
+    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
+    u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz);
+    u_new_builder.get_value_array().resize_and_reset(u_new_nnz);
+
+    auto l_new_col_idxs = l_new->get_col_idxs();
+    auto l_new_vals = l_new->get_values();
+    auto u_new_col_idxs = u_new->get_col_idxs();
+    auto u_new_vals = u_new->get_values();
+
+    if (num_blocks > 0) {
+        // fill columns and values
+        kernel::tri_spgeam_init<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs,
+                a_col_idxs, as_device_type(a_vals), l_row_ptrs, l_col_idxs,
+                as_device_type(l_vals), u_row_ptrs, u_col_idxs,
+                as_device_type(u_vals), l_new_row_ptrs, l_new_col_idxs,
+                as_device_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs,
+                as_device_type(u_new_vals), num_rows);
+    }
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
+
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* lu,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    const matrix::Csr<ValueType, IndexType>* l,
+                    const matrix::Csr<ValueType, IndexType>* u,
+                    matrix::Csr<ValueType, IndexType>* l_new,
+                    matrix::Csr<ValueType, IndexType>* u_new)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz =
+        lu->get_num_stored_elements() + a->get_num_stored_elements();
+    auto total_nnz_per_row = total_nnz / num_rows;
+    select_add_candidates(
+        compiled_kernels(),
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, lu, a, l, u, l_new,
+        u_new);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp b/cuda/factorization/par_ilut_sweep_kernels.cu
similarity index 53%
rename from common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp
rename to cuda/factorization/par_ilut_sweep_kernels.cu
index 6ae783133e5..416f492ead2 100644
--- a/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp
+++ b/cuda/factorization/par_ilut_sweep_kernels.cu
@@ -15,7 +15,6 @@
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/components/intrinsics.hpp"
-#include "common/cuda_hip/components/memory.hpp"
 #include "common/cuda_hip/components/merging.hpp"
 #include "common/cuda_hip/components/prefix_sum.hpp"
 #include "common/cuda_hip/components/reduction.hpp"
@@ -26,11 +25,12 @@
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
+#include "cuda/components/memory.cuh"
 
 
 namespace gko {
 namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
+namespace cuda {
 /**
  * @brief The parallel ILUT factorization namespace.
  *
@@ -47,96 +47,7 @@ using compiled_kernels =
     syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
 
 
-namespace kernel {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void sweep(
-    const IndexType* __restrict__ a_row_ptrs,
-    const IndexType* __restrict__ a_col_idxs,
-    const ValueType* __restrict__ a_vals,
-    const IndexType* __restrict__ l_row_ptrs,
-    const IndexType* __restrict__ l_row_idxs,
-    const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals,
-    IndexType l_nnz, const IndexType* __restrict__ u_row_idxs,
-    const IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_vals,
-    const IndexType* __restrict__ ut_col_ptrs,
-    const IndexType* __restrict__ ut_row_idxs, ValueType* __restrict__ ut_vals,
-    IndexType u_nnz)
-{
-    auto tidx = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
-    if (tidx >= l_nnz + u_nnz) {
-        return;
-    }
-    // split the subwarps into two halves for lower and upper triangle
-    auto l_nz = tidx;
-    auto u_nz = l_nz - l_nnz;
-    auto lower = u_nz < 0;
-    auto row = lower ? l_row_idxs[l_nz] : u_row_idxs[u_nz];
-    auto col = lower ? l_col_idxs[l_nz] : u_col_idxs[u_nz];
-    if (lower && row == col) {
-        // don't update the diagonal twice
-        return;
-    }
-    auto subwarp =
-        group::tiled_partition<subwarp_size>(group::this_thread_block());
-    // find entry of A at (row, col)
-    auto a_row_begin = a_row_ptrs[row];
-    auto a_row_end = a_row_ptrs[row + 1];
-    auto a_row_size = a_row_end - a_row_begin;
-    auto a_idx =
-        group_wide_search(a_row_begin, a_row_size, subwarp,
-                          [&](IndexType i) { return a_col_idxs[i] >= col; });
-    bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col;
-    auto a_val = has_a ? a_vals[a_idx] : zero<ValueType>();
-    auto l_row_begin = l_row_ptrs[row];
-    auto l_row_size = l_row_ptrs[row + 1] - l_row_begin;
-    auto ut_col_begin = ut_col_ptrs[col];
-    auto ut_col_size = ut_col_ptrs[col + 1] - ut_col_begin;
-    ValueType sum{};
-    IndexType ut_nz{};
-    auto last_entry = min(row, col);
-    group_merge<subwarp_size>(
-        l_col_idxs + l_row_begin, l_row_size, ut_row_idxs + ut_col_begin,
-        ut_col_size, subwarp,
-        [&](IndexType l_idx, IndexType l_col, IndexType ut_idx,
-            IndexType ut_row, IndexType, bool) {
-            // we don't need to use the `bool valid` because last_entry is
-            // already a smaller sentinel value than the one used in group_merge
-            if (l_col == ut_row && l_col < last_entry) {
-                sum += load_relaxed(l_vals + (l_idx + l_row_begin)) *
-                       load_relaxed(ut_vals + (ut_idx + ut_col_begin));
-            }
-            // remember the transposed element
-            auto found_transp = subwarp.ballot(ut_row == row);
-            if (found_transp) {
-                ut_nz =
-                    subwarp.shfl(ut_idx + ut_col_begin, ffs(found_transp) - 1);
-            }
-            return true;
-        });
-    // accumulate result from all threads
-    sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; });
-
-    if (subwarp.thread_rank() == 0) {
-        if (lower) {
-            auto to_write = (a_val - sum) /
-                            load_relaxed(ut_vals + (ut_col_ptrs[col + 1] - 1));
-            if (is_finite(to_write)) {
-                store_relaxed(l_vals + l_nz, to_write);
-            }
-        } else {
-            auto to_write = a_val - sum;
-            if (is_finite(to_write)) {
-                store_relaxed(u_vals + u_nz, to_write);
-                store_relaxed(ut_vals + ut_nz, to_write);
-            }
-        }
-    }
-}
-
-
-}  // namespace kernel
+#include "common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc"
 
 
 namespace {
@@ -207,6 +118,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 }  // namespace par_ilut_factorization
-}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/log/batch_logger.cuh b/cuda/log/batch_logger.cuh
new file mode 100644
index 00000000000..3e53d6ef0a6
--- /dev/null
+++ b/cuda/log/batch_logger.cuh
@@ -0,0 +1,27 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_CUDA_LOG_BATCH_LOGGER_CUH_
+#define GKO_CUDA_LOG_BATCH_LOGGER_CUH_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+namespace batch_log {
+
+
+#include "common/cuda_hip/log/batch_logger.hpp.inc"
+
+
+}  // namespace batch_log
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CUDA_LOG_BATCH_LOGGER_CUH_
diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu
new file mode 100644
index 00000000000..29e209f2f3e
--- /dev/null
+++ b/cuda/matrix/batch_csr_kernels.cu
@@ -0,0 +1,58 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/matrix/batch_csr_kernels.hpp"
+
+
+#include <thrust/functional.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+
+
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/base/config.hpp"
+#include "cuda/base/thrust.cuh"
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The Csr matrix format namespace.
+ * @ref Csr
+ * @ingroup batch_csr
+ */
+namespace batch_csr {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_csr
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu
new file mode 100644
index 00000000000..65e5b1c93ac
--- /dev/null
+++ b/cuda/matrix/batch_dense_kernels.cu
@@ -0,0 +1,59 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/matrix/batch_dense_kernels.hpp"
+
+
+#include <thrust/functional.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/base/config.hpp"
+#include "cuda/base/thrust.cuh"
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The Dense matrix format namespace.
+ *
+ * @ingroup batch_dense
+ */
+namespace batch_dense {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc"
+
+
+// clang-format on
+
+
+}  // namespace batch_dense
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu
new file mode 100644
index 00000000000..4847cd8a8ff
--- /dev/null
+++ b/cuda/matrix/batch_ell_kernels.cu
@@ -0,0 +1,58 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/matrix/batch_ell_kernels.hpp"
+
+
+#include <thrust/functional.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/base/config.hpp"
+#include "cuda/base/thrust.cuh"
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The Ell matrix format namespace.
+ * @ref Ell
+ * @ingroup batch_ell
+ */
+namespace batch_ell {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_ell
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/preconditioner/batch_preconditioners.cuh b/cuda/preconditioner/batch_preconditioners.cuh
index 373f68bb376..966add291ff 100644
--- a/cuda/preconditioner/batch_preconditioners.cuh
+++ b/cuda/preconditioner/batch_preconditioners.cuh
@@ -9,9 +9,9 @@
 #include <ginkgo/core/matrix/batch_identity.hpp>
 
 
+#include "common/cuda_hip/components/reduction.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/reduction.cuh"
 
 
 namespace gko {
diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
index ca7bf20372c..ae873ccaa29 100644
--- a/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
+++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
@@ -8,16 +8,16 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/config.hpp"
-#include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/warp_blas.cuh"
 #include "cuda/preconditioner/jacobi_common.hpp"
 
 
@@ -32,7 +32,7 @@ namespace cuda {
 namespace jacobi {
 
 
-#include <common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc>
+#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc"
 
 
 // clang-format off
diff --git a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
index a3ad8890042..da10af38f0d 100644
--- a/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
+++ b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
@@ -9,18 +9,18 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/diagonal_block_manipulation.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/config.hpp"
-#include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/diagonal_block_manipulation.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
-#include "cuda/components/warp_blas.cuh"
 #include "cuda/preconditioner/jacobi_common.hpp"
 
 
@@ -35,7 +35,7 @@ namespace cuda {
 namespace jacobi {
 
 
-#include <common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc>
+#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc"
 
 
 // clang-format off
diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu
new file mode 100644
index 00000000000..c35f62b7364
--- /dev/null
+++ b/cuda/preconditioner/jacobi_kernels.cu
@@ -0,0 +1,51 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/preconditioner/jacobi_kernels.hpp"
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/base/extended_float.hpp"
+#include "core/preconditioner/jacobi_utils.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "cuda/preconditioner/jacobi_common.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The Jacobi preconditioner namespace.
+ * @ref Jacobi
+ * @ingroup jacobi
+ */
+namespace jacobi {
+
+
+// a total of 32/16 warps (1024 threads)
+#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
+constexpr int default_num_warps = 16;
+#else  // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC
+constexpr int default_num_warps = 32;
+#endif
+// with current architectures, at most 32 warps can be scheduled per SM (and
+// current GPUs have at most 84 SMs)
+constexpr int default_grid_size = 32 * 32 * 128;
+
+
+#include "common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc"
+
+
+}  // namespace jacobi
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
index a227adb701b..3997eeeb43a 100644
--- a/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
+++ b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
@@ -8,16 +8,16 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/config.hpp"
-#include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/warp_blas.cuh"
 #include "cuda/preconditioner/jacobi_common.hpp"
 
 
@@ -32,7 +32,7 @@ namespace cuda {
 namespace jacobi {
 
 
-#include <common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc>
+#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc"
 
 
 // clang-format off
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index b1114f05d20..1d3e1a4c68f 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -14,6 +14,9 @@
 
 
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
@@ -23,9 +26,6 @@
 #include "cuda/base/thrust.cuh"
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
 #include "cuda/matrix/batch_struct.hpp"
 
 
@@ -48,7 +48,6 @@ namespace batch_bicgstab {
 
 
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-#include "common/cuda_hip/components/uninitialized_array.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index 57c3612df69..a3360d31737 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -13,6 +13,9 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
@@ -22,9 +25,6 @@
 #include "cuda/base/thrust.cuh"
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/reduction.cuh"
-#include "cuda/components/thread_ids.cuh"
-#include "cuda/components/uninitialized_array.hpp"
 #include "cuda/matrix/batch_struct.hpp"
 
 
@@ -47,7 +47,6 @@ namespace batch_cg {
 
 
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-#include "common/cuda_hip/components/uninitialized_array.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/cuda/stop/batch_criteria.cuh b/cuda/stop/batch_criteria.cuh
new file mode 100644
index 00000000000..f4f434dda11
--- /dev/null
+++ b/cuda/stop/batch_criteria.cuh
@@ -0,0 +1,26 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_CUDA_STOP_BATCH_CRITERIA_CUH_
+#define GKO_CUDA_STOP_BATCH_CRITERIA_CUH_
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+namespace batch_stop {
+
+
+#include "common/cuda_hip/stop/batch_criteria.hpp.inc"
+
+
+}  // namespace batch_stop
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_CUDA_STOP_BATCH_CRITERIA_CUH_
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 5bcc1de1f21..54d8dfb0d09 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -1,10 +1,11 @@
 cmake_minimum_required(VERSION 3.21)
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
+include(${PROJECT_SOURCE_DIR}/common/cuda_hip/sources.cmake)
 add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_INSTANTIATE)
-add_instantiation_files(. matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_INSTANTIATE)
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 set(GINKGO_HIP_SOURCES
+    base/batch_multi_vector_kernels.hip.cpp
     base/device.hip.cpp
     base/exception.hip.cpp
     base/executor.hip.cpp
@@ -17,17 +18,21 @@ set(GINKGO_HIP_SOURCES
     base/version.hip.cpp
     factorization/ic_kernels.hip.cpp
     factorization/ilu_kernels.hip.cpp
-    factorization/par_ilut_approx_filter_kernel.hip.cpp
-    factorization/par_ilut_filter_kernel.hip.cpp
+    factorization/par_ict_kernels.hip.cpp
+    factorization/par_ilut_approx_filter_kernels.hip.cpp
+    factorization/par_ilut_filter_kernels.hip.cpp
     factorization/par_ilut_select_common.hip.cpp
-    factorization/par_ilut_select_kernel.hip.cpp
-    factorization/par_ilut_spgeam_kernel.hip.cpp
-    factorization/par_ilut_sweep_kernel.hip.cpp
+    factorization/par_ilut_select_kernels.hip.cpp
+    factorization/par_ilut_spgeam_kernels.hip.cpp
+    factorization/par_ilut_sweep_kernels.hip.cpp
+    matrix/batch_csr_kernels.hip.cpp
+    matrix/batch_dense_kernels.hip.cpp
+    matrix/batch_ell_kernels.hip.cpp
     ${CSR_INSTANTIATE}
-    ${FBCSR_INSTANTIATE}
     preconditioner/batch_jacobi_kernels.hip.cpp
     preconditioner/jacobi_advanced_apply_kernel.hip.cpp
     preconditioner/jacobi_generate_kernel.hip.cpp
+    preconditioner/jacobi_kernels.hip.cpp
     preconditioner/jacobi_simple_apply_kernel.hip.cpp
     solver/batch_bicgstab_kernels.hip.cpp
     solver/batch_cg_kernels.hip.cpp
diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp
new file mode 100644
index 00000000000..4483fe77214
--- /dev/null
+++ b/hip/base/batch_multi_vector_kernels.hip.cpp
@@ -0,0 +1,59 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/base/batch_multi_vector_kernels.hpp"
+
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+
+
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/batch_struct.hpp"
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/hipblas_bindings.hip.hpp"
+#include "hip/base/pointer_mode_guard.hip.hpp"
+#include "hip/base/thrust.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The MultiVector matrix format namespace.
+ *
+ * @ingroup batch_multi_vector
+ */
+namespace batch_multi_vector {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_multi_vector
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp
index fbad841fd0f..3c4b67d52a9 100644
--- a/hip/base/config.hip.hpp
+++ b/hip/base/config.hip.hpp
@@ -15,7 +15,7 @@
 #include <ginkgo/core/base/types.hpp>
 
 
-#include "hip/base/math.hip.hpp"
+#include "common/cuda_hip/base/math.hpp"
 
 
 namespace gko {
diff --git a/hip/base/hipblas_bindings.hip.hpp b/hip/base/hipblas_bindings.hip.hpp
index 380f051d53b..a4b4431382b 100644
--- a/hip/base/hipblas_bindings.hip.hpp
+++ b/hip/base/hipblas_bindings.hip.hpp
@@ -18,7 +18,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "hip/base/math.hip.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "hip/base/types.hip.hpp"
 
 
diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp
index b78fa2780eb..f2748eb59df 100644
--- a/hip/base/hiprand_bindings.hip.hpp
+++ b/hip/base/hiprand_bindings.hip.hpp
@@ -17,7 +17,7 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
-#include "hip/base/math.hip.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "hip/base/types.hip.hpp"
 
 
diff --git a/hip/components/format_conversion.hip.hpp b/hip/components/format_conversion.hip.hpp
index 59c0405a874..e0199ddd374 100644
--- a/hip/components/format_conversion.hip.hpp
+++ b/hip/components/format_conversion.hip.hpp
@@ -13,8 +13,8 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
 
 
 #ifdef GINKGO_BENCHMARK_ENABLE_TUNING
diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp
new file mode 100644
index 00000000000..1970ac49634
--- /dev/null
+++ b/hip/factorization/par_ict_kernels.hip.cpp
@@ -0,0 +1,189 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/factorization/par_ict_kernels.hpp"
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "hip/components/memory.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The parallel ICT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ict_factorization {
+
+
+constexpr int default_block_size = 512;
+
+
+// subwarp sizes for all warp-parallel kernels (filter, add_candidates)
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc"
+
+
+namespace {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void add_candidates(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* llh,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    const matrix::Csr<ValueType, IndexType>* l,
+                    matrix::Csr<ValueType, IndexType>* l_new)
+{
+    auto num_rows = static_cast<IndexType>(llh->get_size()[0]);
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
+    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
+    auto llh_row_ptrs = llh->get_const_row_ptrs();
+    auto llh_col_idxs = llh->get_const_col_idxs();
+    auto llh_vals = llh->get_const_values();
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto a_vals = a->get_const_values();
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_const_values();
+    auto l_new_row_ptrs = l_new->get_row_ptrs();
+    // count non-zeros per row
+    if (num_blocks > 0) {
+        kernel::ict_tri_spgeam_nnz<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                llh_row_ptrs, llh_col_idxs, a_row_ptrs, a_col_idxs,
+                l_new_row_ptrs, num_rows);
+    }
+
+    // build row ptrs
+    components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);
+
+    // resize output arrays
+    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
+    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
+    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
+
+    auto l_new_col_idxs = l_new->get_col_idxs();
+    auto l_new_vals = l_new->get_values();
+
+    // fill columns and values
+    if (num_blocks > 0) {
+        kernel::ict_tri_spgeam_init<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                llh_row_ptrs, llh_col_idxs, as_device_type(llh_vals),
+                a_row_ptrs, a_col_idxs, as_device_type(a_vals), l_row_ptrs,
+                l_col_idxs, as_device_type(l_vals), l_new_row_ptrs,
+                l_new_col_idxs, as_device_type(l_new_vals), num_rows);
+    }
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void compute_factor(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    matrix::Csr<ValueType, IndexType>* l,
+                    const matrix::Coo<ValueType, IndexType>* l_coo)
+{
+    auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements());
+    auto block_size = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(total_nnz, block_size);
+    if (num_blocks > 0) {
+        kernel::ict_sweep<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                a->get_const_row_ptrs(), a->get_const_col_idxs(),
+                as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
+                l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
+                as_device_type(l->get_values()),
+                static_cast<IndexType>(l->get_num_stored_elements()));
+    }
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_factor, compute_factor);
+
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* llh,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    const matrix::Csr<ValueType, IndexType>* l,
+                    matrix::Csr<ValueType, IndexType>* l_new)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz =
+        llh->get_num_stored_elements() + a->get_num_stored_elements();
+    auto total_nnz_per_row = total_nnz / num_rows;
+    select_add_candidates(
+        compiled_kernels(),
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, llh, a, l, l_new);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    matrix::Csr<ValueType, IndexType>* l,
+                    const matrix::Coo<ValueType, IndexType>* l_coo)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz = 2 * l->get_num_stored_elements();
+    auto total_nnz_per_row = total_nnz / num_rows;
+    select_compute_factor(
+        compiled_kernels(),
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);
+
+
+}  // namespace par_ict_factorization
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp
index d730e33e418..f682cb9efad 100644
--- a/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp
+++ b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp
@@ -18,20 +18,20 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/sorting.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/config.hip.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/base/types.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
-#include "hip/components/sorting.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
 #include "hip/factorization/par_ilut_select_common.hip.hpp"
 
 
diff --git a/hip/factorization/par_ilut_filter_kernels.hip.cpp b/hip/factorization/par_ilut_filter_kernels.hip.cpp
new file mode 100644
index 00000000000..36cd0873903
--- /dev/null
+++ b/hip/factorization/par_ilut_filter_kernels.hip.cpp
@@ -0,0 +1,140 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+constexpr int default_block_size = 512;
+
+
+// subwarp sizes for filter kernels
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc"
+
+
+namespace {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void threshold_filter(syn::value_list<int, subwarp_size>,
+                      std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType>* a,
+                      remove_complex<ValueType> threshold,
+                      matrix::Csr<ValueType, IndexType>* m_out,
+                      matrix::Coo<ValueType, IndexType>* m_out_coo, bool lower)
+{
+    auto old_row_ptrs = a->get_const_row_ptrs();
+    auto old_col_idxs = a->get_const_col_idxs();
+    auto old_vals = a->get_const_values();
+    // compute nnz for each row
+    auto num_rows = static_cast<IndexType>(a->get_size()[0]);
+    auto block_size = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, block_size);
+    auto new_row_ptrs = m_out->get_row_ptrs();
+    if (num_blocks > 0) {
+        kernel::threshold_filter_nnz<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                old_row_ptrs, as_device_type(old_vals), num_rows,
+                as_device_type(threshold), new_row_ptrs, lower);
+    }
+
+    // build row pointers
+    components::prefix_sum_nonnegative(exec, new_row_ptrs, num_rows + 1);
+
+    // build matrix
+    auto new_nnz = exec->copy_val_to_host(new_row_ptrs + num_rows);
+    // resize arrays and update aliases
+    matrix::CsrBuilder<ValueType, IndexType> builder{m_out};
+    builder.get_col_idx_array().resize_and_reset(new_nnz);
+    builder.get_value_array().resize_and_reset(new_nnz);
+    auto new_col_idxs = m_out->get_col_idxs();
+    auto new_vals = m_out->get_values();
+    IndexType* new_row_idxs{};
+    if (m_out_coo) {
+        matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo};
+        coo_builder.get_row_idx_array().resize_and_reset(new_nnz);
+        coo_builder.get_col_idx_array() =
+            make_array_view(exec, new_nnz, new_col_idxs);
+        coo_builder.get_value_array() =
+            make_array_view(exec, new_nnz, new_vals);
+        new_row_idxs = m_out_coo->get_row_idxs();
+    }
+    if (num_blocks > 0) {
+        kernel::threshold_filter<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                old_row_ptrs, old_col_idxs, as_device_type(old_vals), num_rows,
+                as_device_type(threshold), new_row_ptrs, new_row_idxs,
+                new_col_idxs, as_device_type(new_vals), lower);
+    }
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter, threshold_filter);
+
+
+}  // namespace
+
+template <typename ValueType, typename IndexType>
+void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType>* a,
+                      remove_complex<ValueType> threshold,
+                      matrix::Csr<ValueType, IndexType>* m_out,
+                      matrix::Coo<ValueType, IndexType>* m_out_coo, bool lower)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz = a->get_num_stored_elements();
+    auto total_nnz_per_row = total_nnz / num_rows;
+    select_threshold_filter(
+        compiled_kernels(),
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, a, threshold, m_out,
+        m_out_coo, lower);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/hip/factorization/par_ilut_select_common.hip.cpp
index 85c2eaa7036..e10f2298463 100644
--- a/hip/factorization/par_ilut_select_common.hip.cpp
+++ b/hip/factorization/par_ilut_select_common.hip.cpp
@@ -11,15 +11,15 @@
 #include "hip/factorization/par_ilut_select_common.hip.hpp"
 
 
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/sorting.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
-#include "hip/base/math.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
-#include "hip/components/searching.hip.hpp"
-#include "hip/components/sorting.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/factorization/par_ilut_select_kernels.hip.cpp b/hip/factorization/par_ilut_select_kernels.hip.cpp
new file mode 100644
index 00000000000..1e778c3951d
--- /dev/null
+++ b/hip/factorization/par_ilut_select_kernels.hip.cpp
@@ -0,0 +1,162 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/sorting.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "hip/factorization/par_ilut_select_common.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+#include "common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc"
+
+
+template <typename ValueType, typename IndexType>
+void sampleselect_filter(std::shared_ptr<const DefaultExecutor> exec,
+                         const ValueType* values, IndexType size,
+                         const unsigned char* oracles,
+                         const IndexType* partial_counts, IndexType bucket,
+                         remove_complex<ValueType>* out)
+{
+    auto num_threads_total = ceildiv(size, items_per_thread);
+    auto num_blocks =
+        static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
+    if (num_blocks > 0) {
+        kernel::filter_bucket<<<num_blocks, default_block_size, 0,
+                                exec->get_stream()>>>(
+            as_device_type(values), size, bucket, oracles, partial_counts,
+            as_device_type(out), items_per_thread);
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType>* m,
+                      IndexType rank, array<ValueType>& tmp1,
+                      array<remove_complex<ValueType>>& tmp2,
+                      remove_complex<ValueType>& threshold)
+{
+    auto values = m->get_const_values();
+    IndexType size = m->get_num_stored_elements();
+    using AbsType = remove_complex<ValueType>;
+    constexpr auto bucket_count = kernel::searchtree_width;
+    auto max_num_threads = ceildiv(size, items_per_thread);
+    auto max_num_blocks = ceildiv(max_num_threads, default_block_size);
+
+    size_type tmp_size_totals =
+        ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType));
+    size_type tmp_size_partials = ceildiv(
+        bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType));
+    size_type tmp_size_oracles =
+        ceildiv(size * sizeof(unsigned char), sizeof(ValueType));
+    size_type tmp_size_tree =
+        ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType));
+    size_type tmp_size_vals =
+        size / bucket_count * 4;  // pessimistic estimate for temporary storage
+    size_type tmp_size =
+        tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree;
+    tmp1.resize_and_reset(tmp_size);
+    tmp2.resize_and_reset(tmp_size_vals);
+
+    auto total_counts = reinterpret_cast<IndexType*>(tmp1.get_data());
+    auto partial_counts =
+        reinterpret_cast<IndexType*>(tmp1.get_data() + tmp_size_totals);
+    auto oracles = reinterpret_cast<unsigned char*>(
+        tmp1.get_data() + tmp_size_totals + tmp_size_partials);
+    auto tree =
+        reinterpret_cast<AbsType*>(tmp1.get_data() + tmp_size_totals +
+                                   tmp_size_partials + tmp_size_oracles);
+
+    sampleselect_count(exec, values, size, tree, oracles, partial_counts,
+                       total_counts);
+
+    // determine bucket with correct rank, use bucket-local rank
+    auto bucket = sampleselect_find_bucket(exec, total_counts, rank);
+    rank -= bucket.begin;
+
+    if (bucket.size * 2 > tmp_size_vals) {
+        // we need to reallocate tmp2
+        tmp2.resize_and_reset(bucket.size * 2);
+    }
+    auto tmp21 = tmp2.get_data();
+    auto tmp22 = tmp2.get_data() + bucket.size;
+    // extract target bucket
+    sampleselect_filter(exec, values, size, oracles, partial_counts, bucket.idx,
+                        tmp22);
+
+    // recursively select from smaller buckets
+    int step{};
+    while (bucket.size > kernel::basecase_size) {
+        std::swap(tmp21, tmp22);
+        const auto* tmp_in = tmp21;
+        auto tmp_out = tmp22;
+
+        sampleselect_count(exec, tmp_in, bucket.size, tree, oracles,
+                           partial_counts, total_counts);
+        auto new_bucket = sampleselect_find_bucket(exec, total_counts, rank);
+        sampleselect_filter(exec, tmp_in, bucket.size, oracles, partial_counts,
+                            bucket.idx, tmp_out);
+
+        rank -= new_bucket.begin;
+        bucket.size = new_bucket.size;
+        // we should never need more than 5 recursion steps, this would mean
+        // 256^5 = 2^40. fall back to standard library algorithm in that case.
+        ++step;
+        if (step > 5) {
+            array<AbsType> cpu_out_array{
+                exec->get_master(),
+                make_array_view(exec, bucket.size, tmp_out)};
+            auto begin = cpu_out_array.get_data();
+            auto end = begin + bucket.size;
+            auto middle = begin + rank;
+            std::nth_element(begin, middle, end);
+            threshold = *middle;
+            return;
+        }
+    }
+
+    // base case
+    auto out_ptr = reinterpret_cast<AbsType*>(tmp1.get_data());
+    kernel::basecase_select<<<1, kernel::basecase_block_size, 0,
+                              exec->get_stream()>>>(
+        as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr));
+    threshold = exec->copy_val_to_host(out_ptr);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/factorization/par_ilut_spgeam_kernels.hip.cpp b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp
new file mode 100644
index 00000000000..1c8ea4bd27c
--- /dev/null
+++ b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp
@@ -0,0 +1,159 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+constexpr int default_block_size = 512;
+
+
+// subwarp sizes for add_candidates kernels
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc"
+
+
+namespace {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void add_candidates(syn::value_list<int, subwarp_size>,
+                    std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* lu,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    const matrix::Csr<ValueType, IndexType>* l,
+                    const matrix::Csr<ValueType, IndexType>* u,
+                    matrix::Csr<ValueType, IndexType>* l_new,
+                    matrix::Csr<ValueType, IndexType>* u_new)
+{
+    auto num_rows = static_cast<IndexType>(lu->get_size()[0]);
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(num_rows, subwarps_per_block);
+    matrix::CsrBuilder<ValueType, IndexType> l_new_builder(l_new);
+    matrix::CsrBuilder<ValueType, IndexType> u_new_builder(u_new);
+    auto lu_row_ptrs = lu->get_const_row_ptrs();
+    auto lu_col_idxs = lu->get_const_col_idxs();
+    auto lu_vals = lu->get_const_values();
+    auto a_row_ptrs = a->get_const_row_ptrs();
+    auto a_col_idxs = a->get_const_col_idxs();
+    auto a_vals = a->get_const_values();
+    auto l_row_ptrs = l->get_const_row_ptrs();
+    auto l_col_idxs = l->get_const_col_idxs();
+    auto l_vals = l->get_const_values();
+    auto u_row_ptrs = u->get_const_row_ptrs();
+    auto u_col_idxs = u->get_const_col_idxs();
+    auto u_vals = u->get_const_values();
+    auto l_new_row_ptrs = l_new->get_row_ptrs();
+    auto u_new_row_ptrs = u_new->get_row_ptrs();
+    if (num_blocks > 0) {
+        // count non-zeros per row
+        kernel::tri_spgeam_nnz<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs,
+                l_new_row_ptrs, u_new_row_ptrs, num_rows);
+    }
+
+    // build row ptrs
+    components::prefix_sum_nonnegative(exec, l_new_row_ptrs, num_rows + 1);
+    components::prefix_sum_nonnegative(exec, u_new_row_ptrs, num_rows + 1);
+
+    // resize output arrays
+    auto l_new_nnz = exec->copy_val_to_host(l_new_row_ptrs + num_rows);
+    auto u_new_nnz = exec->copy_val_to_host(u_new_row_ptrs + num_rows);
+    l_new_builder.get_col_idx_array().resize_and_reset(l_new_nnz);
+    l_new_builder.get_value_array().resize_and_reset(l_new_nnz);
+    u_new_builder.get_col_idx_array().resize_and_reset(u_new_nnz);
+    u_new_builder.get_value_array().resize_and_reset(u_new_nnz);
+
+    auto l_new_col_idxs = l_new->get_col_idxs();
+    auto l_new_vals = l_new->get_values();
+    auto u_new_col_idxs = u_new->get_col_idxs();
+    auto u_new_vals = u_new->get_values();
+
+    if (num_blocks > 0) {
+        // fill columns and values
+        kernel::tri_spgeam_init<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs,
+                a_col_idxs, as_device_type(a_vals), l_row_ptrs, l_col_idxs,
+                as_device_type(l_vals), u_row_ptrs, u_col_idxs,
+                as_device_type(u_vals), l_new_row_ptrs, l_new_col_idxs,
+                as_device_type(l_new_vals), u_new_row_ptrs, u_new_col_idxs,
+                as_device_type(u_new_vals), num_rows);
+    }
+}
+
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates);
+
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
+                    const matrix::Csr<ValueType, IndexType>* lu,
+                    const matrix::Csr<ValueType, IndexType>* a,
+                    const matrix::Csr<ValueType, IndexType>* l,
+                    const matrix::Csr<ValueType, IndexType>* u,
+                    matrix::Csr<ValueType, IndexType>* l_new,
+                    matrix::Csr<ValueType, IndexType>* u_new)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz =
+        lu->get_num_stored_elements() + a->get_num_stored_elements();
+    auto total_nnz_per_row = total_nnz / num_rows;
+    select_add_candidates(
+        compiled_kernels(),
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, lu, a, l, u, l_new,
+        u_new);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/factorization/par_ilut_sweep_kernels.hip.cpp b/hip/factorization/par_ilut_sweep_kernels.hip.cpp
new file mode 100644
index 00000000000..71cc1166179
--- /dev/null
+++ b/hip/factorization/par_ilut_sweep_kernels.hip.cpp
@@ -0,0 +1,123 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/factorization/par_ilut_kernels.hpp"
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/searching.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/coo_builder.hpp"
+#include "core/matrix/csr_builder.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "hip/components/memory.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The parallel ILUT factorization namespace.
+ *
+ * @ingroup factor
+ */
+namespace par_ilut_factorization {
+
+
+constexpr int default_block_size = 512;
+
+
+// subwarp sizes for all warp-parallel kernels (filter, add_candidates)
+using compiled_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
+
+#include "common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc"
+
+
+namespace {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void compute_l_u_factors(syn::value_list<int, subwarp_size>,
+                         std::shared_ptr<const DefaultExecutor> exec,
+                         const matrix::Csr<ValueType, IndexType>* a,
+                         matrix::Csr<ValueType, IndexType>* l,
+                         const matrix::Coo<ValueType, IndexType>* l_coo,
+                         matrix::Csr<ValueType, IndexType>* u,
+                         const matrix::Coo<ValueType, IndexType>* u_coo,
+                         matrix::Csr<ValueType, IndexType>* u_csc)
+{
+    auto total_nnz = static_cast<IndexType>(l->get_num_stored_elements() +
+                                            u->get_num_stored_elements());
+    auto block_size = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(total_nnz, block_size);
+    if (num_blocks > 0) {
+        kernel::sweep<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                a->get_const_row_ptrs(), a->get_const_col_idxs(),
+                as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
+                l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
+                as_device_type(l->get_values()),
+                static_cast<IndexType>(l->get_num_stored_elements()),
+                u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(),
+                as_device_type(u->get_values()), u_csc->get_const_row_ptrs(),
+                u_csc->get_const_col_idxs(),
+                as_device_type(u_csc->get_values()),
+                static_cast<IndexType>(u->get_num_stored_elements()));
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors,
+                                    compute_l_u_factors);
+
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
+                         const matrix::Csr<ValueType, IndexType>* a,
+                         matrix::Csr<ValueType, IndexType>* l,
+                         const matrix::Coo<ValueType, IndexType>* l_coo,
+                         matrix::Csr<ValueType, IndexType>* u,
+                         const matrix::Coo<ValueType, IndexType>* u_coo,
+                         matrix::Csr<ValueType, IndexType>* u_csc)
+{
+    auto num_rows = a->get_size()[0];
+    auto total_nnz =
+        l->get_num_stored_elements() + u->get_num_stored_elements();
+    auto total_nnz_per_row = total_nnz / num_rows;
+    select_compute_l_u_factors(
+        compiled_kernels(),
+        [&](int compiled_subwarp_size) {
+            return total_nnz_per_row <= compiled_subwarp_size ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo,
+        u_csc);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILUT_COMPUTE_LU_FACTORS_KERNEL);
+
+
+}  // namespace par_ilut_factorization
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/log/batch_logger.hip.hpp b/hip/log/batch_logger.hip.hpp
new file mode 100644
index 00000000000..a2540f2bd9d
--- /dev/null
+++ b/hip/log/batch_logger.hip.hpp
@@ -0,0 +1,26 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_
+#define GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace batch_log {
+
+#include "common/cuda_hip/log/batch_logger.hpp.inc"
+
+
+}  // namespace batch_log
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_
diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp
new file mode 100644
index 00000000000..8540a542138
--- /dev/null
+++ b/hip/matrix/batch_csr_kernels.hip.cpp
@@ -0,0 +1,58 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/matrix/batch_csr_kernels.hpp"
+
+
+#include <thrust/functional.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+
+
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/thrust.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/matrix/batch_struct.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Csr matrix format namespace.
+ * @ref Csr
+ * @ingroup batch_csr
+ */
+namespace batch_csr {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/matrix/batch_csr_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_csr
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp
new file mode 100644
index 00000000000..981490d560a
--- /dev/null
+++ b/hip/matrix/batch_dense_kernels.hip.cpp
@@ -0,0 +1,59 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/matrix/batch_dense_kernels.hpp"
+
+
+#include <thrust/functional.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/thrust.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/matrix/batch_struct.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Dense matrix format namespace.
+ *
+ * @ingroup batch_dense
+ */
+namespace batch_dense {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc"
+
+
+// clang-format on
+
+
+}  // namespace batch_dense
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp
new file mode 100644
index 00000000000..373f87f1f81
--- /dev/null
+++ b/hip/matrix/batch_ell_kernels.hip.cpp
@@ -0,0 +1,58 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/matrix/batch_ell_kernels.hpp"
+
+
+#include <thrust/functional.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/thrust.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/matrix/batch_struct.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Ell matrix format namespace.
+ * @ref Ell
+ * @ingroup batch_ell
+ */
+namespace batch_ell {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_ell
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp
index 8371bc45b52..f80d9d07285 100644
--- a/hip/matrix/csr_kernels.template.hip.cpp
+++ b/hip/matrix/csr_kernels.template.hip.cpp
@@ -29,6 +29,15 @@
 
 
 #include "accessor/hip_helper.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+#include "common/cuda_hip/components/intrinsics.hpp"
+#include "common/cuda_hip/components/merging.hpp"
+#include "common/cuda_hip/components/prefix_sum.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/array_access.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
@@ -41,19 +50,10 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/config.hip.hpp"
 #include "hip/base/hipsparse_bindings.hip.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/base/pointer_mode_guard.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
 #include "hip/base/types.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/merging.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/segment_scan.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/batch_preconditioners.hip.hpp b/hip/preconditioner/batch_preconditioners.hip.hpp
index 5d2358378d2..71549260dfb 100644
--- a/hip/preconditioner/batch_preconditioners.hip.hpp
+++ b/hip/preconditioner/batch_preconditioners.hip.hpp
@@ -9,9 +9,9 @@
 #include <ginkgo/core/matrix/batch_identity.hpp>
 
 
+#include "common/cuda_hip/components/reduction.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp
index 326b9f6b720..f5c53022564 100644
--- a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp
+++ b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp
@@ -11,16 +11,16 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/config.hip.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/base/types.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/warp_blas.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
 
 
@@ -35,7 +35,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc"
+#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc"
 
 
 // clang-format off
diff --git a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp
index 86a3b799590..a78e7b7cd45 100644
--- a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp
+++ b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp
@@ -9,18 +9,18 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/diagonal_block_manipulation.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/config.hip.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/base/types.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/diagonal_block_manipulation.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-#include "hip/components/warp_blas.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
 
 
@@ -35,7 +35,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc"
+#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc"
 
 
 // clang-format off
diff --git a/hip/preconditioner/jacobi_generate_kernel.hip.cpp b/hip/preconditioner/jacobi_generate_kernel.hip.cpp
index 713be193250..6da1b964595 100644
--- a/hip/preconditioner/jacobi_generate_kernel.hip.cpp
+++ b/hip/preconditioner/jacobi_generate_kernel.hip.cpp
@@ -12,18 +12,18 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/diagonal_block_manipulation.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/config.hip.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/base/types.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/diagonal_block_manipulation.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
-#include "hip/components/warp_blas.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
 
 
@@ -38,7 +38,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc"
+#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc"
 
 
 template <int warps_per_block, int max_block_size, typename ValueType,
diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp
new file mode 100644
index 00000000000..cf464ae32f1
--- /dev/null
+++ b/hip/preconditioner/jacobi_kernels.hip.cpp
@@ -0,0 +1,51 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/preconditioner/jacobi_kernels.hpp"
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/base/extended_float.hpp"
+#include "core/preconditioner/jacobi_utils.hpp"
+#include "core/synthesizer/implementation_selection.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/preconditioner/jacobi_common.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Jacobi preconditioner namespace.
+ * @ref Jacobi
+ * @ingroup jacobi
+ */
+namespace jacobi {
+
+
+// a total of 32/16 warps (1024 threads)
+#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
+constexpr int default_num_warps = 16;
+#else  // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC
+constexpr int default_num_warps = 32;
+#endif
+// with current architectures, at most 32 warps can be scheduled per SM (and
+// current GPUs have at most 84 SMs)
+constexpr int default_grid_size = 32 * 32 * 128;
+
+
+#include "common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc"
+
+
+}  // namespace jacobi
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp
index be485af5730..b191d7dbf2e 100644
--- a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp
+++ b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp
@@ -8,16 +8,16 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/config.hip.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/base/types.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/warp_blas.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
 
 
@@ -32,7 +32,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc"
+#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc"
 
 
 // clang-format off
diff --git a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp
index 0763e986d41..10e3a580b3c 100644
--- a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp
+++ b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp
@@ -11,16 +11,16 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/config.hip.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/base/types.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/warp_blas.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
 
 
@@ -35,7 +35,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc"
+#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc"
 
 
 template <int warps_per_block, int max_block_size, typename ValueType,
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index 2477af587cc..b40984612ca 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -14,19 +14,19 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "hip/base/batch_struct.hip.hpp"
 #include "hip/base/config.hip.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
 #include "hip/base/types.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
 #include "hip/matrix/batch_struct.hip.hpp"
 
 
@@ -47,7 +47,6 @@ namespace batch_bicgstab {
 
 
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-#include "common/cuda_hip/components/uninitialized_array.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index 1288c15b4c0..b5232aea8b2 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -14,18 +14,18 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/uninitialized_array.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "hip/base/batch_struct.hip.hpp"
 #include "hip/base/config.hip.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
 #include "hip/base/types.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/reduction.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
-#include "hip/components/uninitialized_array.hip.hpp"
 #include "hip/matrix/batch_struct.hip.hpp"
 
 
@@ -46,7 +46,6 @@ namespace batch_cg {
 
 
 #include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-#include "common/cuda_hip/components/uninitialized_array.hpp.inc"
 #include "common/cuda_hip/matrix/batch_csr_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
 #include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
diff --git a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp
index 8bba503ba0a..72683f7d8ec 100644
--- a/hip/solver/common_trs_kernels.hip.hpp
+++ b/hip/solver/common_trs_kernels.hip.hpp
@@ -22,10 +22,10 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/math.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/hipsparse_bindings.hip.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/base/pointer_mode_guard.hip.hpp"
 #include "hip/base/types.hip.hpp"
 
diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp
index 08f35d3d674..6a3ace8231a 100644
--- a/hip/solver/lower_trs_kernels.hip.cpp
+++ b/hip/solver/lower_trs_kernels.hip.cpp
@@ -21,8 +21,8 @@
 #include <ginkgo/core/solver/triangular.hpp>
 
 
+#include "common/cuda_hip/base/math.hpp"
 #include "hip/base/hipsparse_bindings.hip.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/base/types.hip.hpp"
 #include "hip/solver/common_trs_kernels.hip.hpp"
 
diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp
index cd6b0719844..613d2f33577 100644
--- a/hip/solver/upper_trs_kernels.hip.cpp
+++ b/hip/solver/upper_trs_kernels.hip.cpp
@@ -21,8 +21,8 @@
 #include <ginkgo/core/solver/triangular.hpp>
 
 
+#include "common/cuda_hip/base/math.hpp"
 #include "hip/base/hipsparse_bindings.hip.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/base/types.hip.hpp"
 #include "hip/solver/common_trs_kernels.hip.hpp"
 
diff --git a/hip/stop/batch_criteria.hip.hpp b/hip/stop/batch_criteria.hip.hpp
new file mode 100644
index 00000000000..1f721e36aaf
--- /dev/null
+++ b/hip/stop/batch_criteria.hip.hpp
@@ -0,0 +1,26 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_
+#define GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace batch_stop {
+
+
+#include "common/cuda_hip/stop/batch_criteria.hpp.inc"
+
+
+}  // namespace batch_stop
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_
diff --git a/hip/stop/criterion_kernels.hip.cpp b/hip/stop/criterion_kernels.hip.cpp
index 8c7caeb32b8..6ce3fe81ab3 100644
--- a/hip/stop/criterion_kernels.hip.cpp
+++ b/hip/stop/criterion_kernels.hip.cpp
@@ -10,9 +10,9 @@
 #include <ginkgo/core/stop/stopping_status.hpp>
 
 
-#include "hip/base/math.hip.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "hip/base/types.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp
index d790dd652f0..efb6c31a7eb 100644
--- a/hip/stop/residual_norm_kernels.hip.cpp
+++ b/hip/stop/residual_norm_kernels.hip.cpp
@@ -13,10 +13,10 @@
 #include <ginkgo/core/stop/residual_norm.hpp>
 
 
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
 #include "core/base/array_access.hpp"
-#include "hip/base/math.hip.hpp"
 #include "hip/base/types.hip.hpp"
-#include "hip/components/thread_ids.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp
index 2c25f5b3a7a..77a20eb09cd 100644
--- a/hip/test/base/math.hip.cpp
+++ b/hip/test/base/math.hip.cpp
@@ -23,7 +23,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "hip/base/math.hip.hpp"
+#include "common/cuda_hip/base/math.hpp"
 #include "hip/base/types.hip.hpp"
 #include "hip/test/utils.hip.hpp"
 
diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp
index 7bfab76f795..244d81c2cc1 100644
--- a/hip/test/components/merging.hip.cpp
+++ b/hip/test/components/merging.hip.cpp
@@ -8,9 +8,6 @@
 // force-top: off
 
 
-#include "hip/components/merging.hip.hpp"
-
-
 #include <algorithm>
 #include <memory>
 #include <random>
@@ -24,6 +21,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "common/cuda_hip/components/merging.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/test/utils.hip.hpp"
 
diff --git a/hip/test/components/searching.hip.cpp b/hip/test/components/searching.hip.cpp
index 1db0c6e9562..250611b509a 100644
--- a/hip/test/components/searching.hip.cpp
+++ b/hip/test/components/searching.hip.cpp
@@ -8,9 +8,6 @@
 // force-top: off
 
 
-#include "hip/components/searching.hip.hpp"
-
-
 #include <memory>
 #include <numeric>
 #include <vector>
@@ -23,6 +20,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "common/cuda_hip/components/searching.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/test/utils.hip.hpp"
 
diff --git a/hip/test/components/sorting.hip.cpp b/hip/test/components/sorting.hip.cpp
index 5cab0048a4b..2a7c244615d 100644
--- a/hip/test/components/sorting.hip.cpp
+++ b/hip/test/components/sorting.hip.cpp
@@ -2,9 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "hip/components/sorting.hip.hpp"
-
-
 #include <memory>
 #include <random>
 
@@ -16,6 +13,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "common/cuda_hip/components/sorting.hpp"
 #include "hip/test/utils.hip.hpp"