diff --git a/accessor/cuda_hip_helper.hpp b/accessor/cuda_hip_helper.hpp
new file mode 100644
index 00000000000..225fdfe1b15
--- /dev/null
+++ b/accessor/cuda_hip_helper.hpp
@@ -0,0 +1,52 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_ACCESSOR_CUDA_HIP_HELPER_HPP_
+#define GKO_ACCESSOR_CUDA_HIP_HELPER_HPP_
+
+
+#include <utility>
+
+
+#ifdef GKO_COMPILING_HIP
+#include "accessor/hip_helper.hpp"
+#else  // GKO_COMPILING_CUDA
+#include "accessor/cuda_helper.hpp"
+#endif
+
+
+namespace gko {
+namespace acc {
+
+
+/**
+ * Backend-independent wrapper around as_hip_range / as_cuda_range.
+ *
+ * The argument is perfectly forwarded to as_hip_range when GKO_COMPILING_HIP
+ * is defined and to as_cuda_range otherwise, so the CUDA variant is also what
+ * is selected when neither backend macro is defined.
+ *
+ * @tparam AccType  deduced type of the forwarded argument
+ *
+ * @param acc  the object handed on to the backend-specific conversion
+ *
+ * @return whatever the selected backend function returns (by value, since the
+ *         return type is a plain `auto`)
+ */
+template <typename AccType>
+GKO_ACC_INLINE auto as_device_range(AccType&& acc)
+{
+#ifdef GKO_COMPILING_HIP
+    return as_hip_range(std::forward<AccType>(acc));
+#else  // GKO_COMPILING_CUDA
+    return as_cuda_range(std::forward<AccType>(acc));
+#endif
+}
+
+
+}  // namespace acc
+}  // namespace gko
+
+
+#endif  // GKO_ACCESSOR_CUDA_HIP_HELPER_HPP_
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index ca209e65057..306655d2315 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -20,6 +20,7 @@ function(ginkgo_benchmark_cusparse_linops type def)
     endif()
     # make the dependency public to catch issues
     target_compile_definitions(cusparse_linops_${type} PUBLIC ${def})
+    target_compile_definitions(cusparse_linops_${type} PRIVATE GKO_COMPILING_CUDA)
     target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse)
 endfunction()
 
@@ -27,6 +28,7 @@ function(ginkgo_benchmark_hipsparse_linops type def)
     add_library(hipsparse_linops_${type} utils/hip_linops.hip.cpp)
     set_source_files_properties(utils/hip_linops.hip.cpp PROPERTIES LANGUAGE HIP)
     target_compile_definitions(hipsparse_linops_${type} PUBLIC ${def})
+    target_compile_definitions(hipsparse_linops_${type} PRIVATE GKO_COMPILING_HIP)
     target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS})
     target_link_libraries(hipsparse_linops_${type} Ginkgo::ginkgo ${HIPSPARSE_LIBRARIES})
 endfunction()
diff --git a/benchmark/utils/cuda_linops.cpp b/benchmark/utils/cuda_linops.cpp
index f239740d655..a404f9151ea 100644
--- a/benchmark/utils/cuda_linops.cpp
+++ b/benchmark/utils/cuda_linops.cpp
@@ -139,7 +139,7 @@ class CusparseCsrmp
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         gko::kernels::cuda::cusparse::spmv_mp(
-            this->get_gpu_exec()->get_cusparse_handle(), trans_,
+            this->get_gpu_exec()->get_sparselib_handle(), trans_,
             this->get_size()[0], this->get_size()[1],
             csr_->get_num_stored_elements(), &scalars.get_const_data()[0],
             this->get_descr(), csr_->get_const_values(),
@@ -156,7 +156,7 @@ class CusparseCsrmp
         : gko::EnableLinOp<CusparseCsrmp, CusparseBase>(exec, size),
           csr_(std::move(
               csr::create(exec, std::make_shared<typename csr::classical>()))),
-          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
     {}
 
 private:
@@ -213,7 +213,7 @@ class CusparseCsr
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         gko::kernels::cuda::cusparse::spmv(
-            this->get_gpu_exec()->get_cusparse_handle(), trans_,
+            this->get_gpu_exec()->get_sparselib_handle(), trans_,
             this->get_size()[0], this->get_size()[1],
             csr_->get_num_stored_elements(), &scalars.get_const_data()[0],
             this->get_descr(), csr_->get_const_values(),
@@ -230,7 +230,7 @@ class CusparseCsr
         : gko::EnableLinOp<CusparseCsr, CusparseBase>(exec, size),
           csr_(std::move(
               csr::create(exec, std::make_shared<typename csr::classical>()))),
-          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
     {}
 
 private:
@@ -288,7 +288,7 @@ class CusparseCsrmm
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         gko::kernels::cuda::cusparse::spmm(
-            this->get_gpu_exec()->get_cusparse_handle(), trans_,
+            this->get_gpu_exec()->get_sparselib_handle(), trans_,
             this->get_size()[0], dense_b->get_size()[1], this->get_size()[1],
             csr_->get_num_stored_elements(), &scalars.get_const_data()[0],
             this->get_descr(), csr_->get_const_values(),
@@ -306,7 +306,7 @@ class CusparseCsrmm
         : gko::EnableLinOp<CusparseCsrmm, CusparseBase>(exec, size),
           csr_(std::move(
               csr::create(exec, std::make_shared<typename csr::classical>()))),
-          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
     {}
 
 private:
@@ -376,7 +376,7 @@ class CusparseCsrEx
         gko::size_type buffer_size = 0;
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
-        auto handle = this->get_gpu_exec()->get_cusparse_handle();
+        auto handle = this->get_gpu_exec()->get_sparselib_handle();
         // This function seems to require the pointer mode to be set to HOST.
         // Ginkgo use pointer mode DEVICE by default, so we change this
         // temporarily.
@@ -407,7 +407,7 @@ class CusparseCsrEx
         : gko::EnableLinOp<CusparseCsrEx, CusparseBase>(exec, size),
           csr_(std::move(
               csr::create(exec, std::make_shared<typename csr::classical>()))),
-          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE),
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE),
           buffer_(exec)
     {
         algmode_ = CUSPARSE_ALG_MERGE_PATH;
@@ -465,7 +465,7 @@ class CusparseHybrid
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         gko::kernels::cuda::cusparse::csr2hyb(
-            this->get_gpu_exec()->get_cusparse_handle(), this->get_size()[0],
+            this->get_gpu_exec()->get_sparselib_handle(), this->get_size()[0],
             this->get_size()[1], this->get_descr(), t_csr->get_const_values(),
             t_csr->get_const_row_ptrs(), t_csr->get_const_col_idxs(), hyb_,
             Threshold, Partition);
@@ -496,7 +496,7 @@ class CusparseHybrid
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         gko::kernels::cuda::cusparse::spmv(
-            this->get_gpu_exec()->get_cusparse_handle(), trans_,
+            this->get_gpu_exec()->get_sparselib_handle(), trans_,
             &scalars.get_const_data()[0], this->get_descr(), hyb_, db,
             &scalars.get_const_data()[1], dx);
     }
@@ -508,7 +508,7 @@ class CusparseHybrid
     CusparseHybrid(std::shared_ptr<const gko::Executor> exec,
                    const gko::dim<2>& size = gko::dim<2>{})
         : gko::EnableLinOp<CusparseHybrid, CusparseBase>(exec, size),
-          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
     {
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateHybMat(&hyb_));
@@ -555,13 +555,13 @@ void cusparse_generic_spmv(std::shared_ptr<const gko::CudaExecutor> gpu_exec,
 
     gko::size_type buffer_size = 0;
     GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV_bufferSize(
-        gpu_exec->get_cusparse_handle(), trans, &scalars.get_const_data()[0],
+        gpu_exec->get_sparselib_handle(), trans, &scalars.get_const_data()[0],
         mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg,
         &buffer_size));
     gko::array<char> buffer_array(gpu_exec, buffer_size);
     auto dbuffer = buffer_array.get_data();
     GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV(
-        gpu_exec->get_cusparse_handle(), trans, &scalars.get_const_data()[0],
+        gpu_exec->get_sparselib_handle(), trans, &scalars.get_const_data()[0],
         mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg, dbuffer));
     GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecx));
     GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecb));
@@ -654,7 +654,7 @@ class CusparseGenericCsr
         : gko::EnableLinOp<CusparseGenericCsr, CusparseBase>(exec, size),
           csr_(std::move(
               csr::create(exec, std::make_shared<typename csr::classical>()))),
-          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
     {}
 
 private:
@@ -745,7 +745,7 @@ class CusparseGenericCoo
                        const gko::dim<2>& size = gko::dim<2>{})
         : gko::EnableLinOp<CusparseGenericCoo, CusparseBase>(exec, size),
           coo_(std::move(coo::create(exec))),
-          trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
     {}
 
 private:
diff --git a/benchmark/utils/hip_linops.hip.cpp b/benchmark/utils/hip_linops.hip.cpp
index 2d952ce60e9..f0d7edb45c3 100644
--- a/benchmark/utils/hip_linops.hip.cpp
+++ b/benchmark/utils/hip_linops.hip.cpp
@@ -126,7 +126,7 @@ class HipsparseCsr
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         gko::kernels::hip::hipsparse::spmv(
-            this->get_gpu_exec()->get_hipsparse_handle(), trans_,
+            this->get_gpu_exec()->get_sparselib_handle(), trans_,
             this->get_size()[0], this->get_size()[1],
             csr_->get_num_stored_elements(), &scalars.get_const_data()[0],
             this->get_descr(), csr_->get_const_values(),
@@ -143,7 +143,7 @@ class HipsparseCsr
         : gko::EnableLinOp<HipsparseCsr, HipsparseBase>(exec, size),
           csr_(std::move(
               csr::create(exec, std::make_shared<typename csr::classical>()))),
-          trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE)
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
     {}
 
 private:
@@ -201,7 +201,7 @@ class HipsparseCsrmm
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         gko::kernels::hip::hipsparse::spmm(
-            this->get_gpu_exec()->get_hipsparse_handle(), trans_,
+            this->get_gpu_exec()->get_sparselib_handle(), trans_,
             this->get_size()[0], dense_b->get_size()[1], this->get_size()[1],
             csr_->get_num_stored_elements(), &scalars.get_const_data()[0],
             this->get_descr(), csr_->get_const_values(),
@@ -219,7 +219,7 @@ class HipsparseCsrmm
         : gko::EnableLinOp<HipsparseCsrmm, HipsparseBase>(exec, size),
           csr_(std::move(
               csr::create(exec, std::make_shared<typename csr::classical>()))),
-          trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE)
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
     {}
 
 private:
@@ -269,7 +269,7 @@ class HipsparseHybrid
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         gko::kernels::hip::hipsparse::csr2hyb(
-            this->get_gpu_exec()->get_hipsparse_handle(), this->get_size()[0],
+            this->get_gpu_exec()->get_sparselib_handle(), this->get_size()[0],
             this->get_size()[1], this->get_descr(), t_csr->get_const_values(),
             t_csr->get_const_row_ptrs(), t_csr->get_const_col_idxs(), hyb_,
             Threshold, Partition);
@@ -300,7 +300,7 @@ class HipsparseHybrid
 
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         gko::kernels::hip::hipsparse::spmv(
-            this->get_gpu_exec()->get_hipsparse_handle(), trans_,
+            this->get_gpu_exec()->get_sparselib_handle(), trans_,
             &scalars.get_const_data()[0], this->get_descr(), hyb_, db,
             &scalars.get_const_data()[1], dx);
     }
@@ -312,7 +312,7 @@ class HipsparseHybrid
     HipsparseHybrid(std::shared_ptr<const gko::Executor> exec,
                     const gko::dim<2>& size = gko::dim<2>{})
         : gko::EnableLinOp<HipsparseHybrid, HipsparseBase>(exec, size),
-          trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE)
+          trans_(SPARSELIB_OPERATION_NON_TRANSPOSE)
     {
         auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
         GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateHybMat(&hyb_));
diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
index 0aa93a3b141..9f7079f60a3 100644
--- a/cmake/create_test.cmake
+++ b/cmake/create_test.cmake
@@ -160,7 +160,7 @@ endfunction(ginkgo_create_cuda_test)
 ## Internal function allowing separate test name, filename and target name
 function(ginkgo_create_cuda_test_internal test_name filename test_target_name)
     add_executable(${test_target_name} ${filename})
-    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_CUDA)
+    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda)
     if(MSVC)
         target_compile_options(${test_target_name}
             PRIVATE
@@ -188,7 +188,7 @@ endfunction(ginkgo_create_hip_test)
 function(ginkgo_create_hip_test_internal test_name filename test_target_name)
     set_source_files_properties(${filename} PROPERTIES LANGUAGE HIP)
     add_executable(${test_target_name} ${filename})
-    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_HIP)
+    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip)
     ginkgo_set_test_target_properties(${test_target_name} "_hip" ${ARGN})
     ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE hipgpu)
 endfunction(ginkgo_create_hip_test_internal)
@@ -203,7 +203,7 @@ endfunction()
 function(ginkgo_create_omp_test_internal test_name filename test_target_name)
     ginkgo_build_test_name(${test_name} test_target_name)
     add_executable(${test_target_name} ${test_name}.cpp)
-    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_OMP)
+    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_OMP GKO_DEVICE_NAMESPACE=omp)
     target_link_libraries(${test_target_name} PRIVATE OpenMP::OpenMP_CXX)
     ginkgo_set_test_target_properties(${test_target_name} "_omp" ${ARGN})
     ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE cpu)
@@ -253,7 +253,7 @@ function(ginkgo_create_common_test_internal test_name exec_type exec)
         target_link_libraries(${test_target_name} PRIVATE OpenMP::OpenMP_CXX)
     endif ()
 
-    target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=${exec_type} EXEC_NAMESPACE=${exec} GKO_COMPILING_${exec_upper})
+    target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=${exec_type} GKO_DEVICE_NAMESPACE=${exec} GKO_COMPILING_${exec_upper})
     target_link_libraries(${test_target_name} PRIVATE ${common_test_ADDITIONAL_LIBRARIES})
     # use float for DPC++ if necessary
     if((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE)
@@ -285,13 +285,13 @@ function(ginkgo_create_common_device_test test_name)
         # need to make a separate file for this, since we can't set conflicting properties on the same file
         configure_file(${test_name}.cpp ${test_name}.cu COPYONLY)
         ginkgo_create_cuda_test_internal(${test_name}_cuda ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.cu ${test_target_name}_cuda ${ARGN})
-        target_compile_definitions(${test_target_name}_cuda PRIVATE EXEC_TYPE=CudaExecutor EXEC_NAMESPACE=cuda)
+        target_compile_definitions(${test_target_name}_cuda PRIVATE EXEC_TYPE=CudaExecutor GKO_DEVICE_NAMESPACE=cuda)
     endif()
     if(GINKGO_BUILD_HIP)
         # need to make a separate file for this, since we can't set conflicting properties on the same file
         configure_file(${test_name}.cpp ${test_name}.hip.cpp COPYONLY)
         ginkgo_create_hip_test_internal(${test_name}_hip ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.hip.cpp ${test_target_name}_hip ${ARGN})
-        target_compile_definitions(${test_target_name}_hip PRIVATE EXEC_TYPE=HipExecutor EXEC_NAMESPACE=hip)
+        target_compile_definitions(${test_target_name}_hip PRIVATE EXEC_TYPE=HipExecutor GKO_DEVICE_NAMESPACE=hip)
     endif()
 endfunction(ginkgo_create_common_device_test)
 
diff --git a/common/cuda_hip/base/blas_bindings.hpp b/common/cuda_hip/base/blas_bindings.hpp
new file mode 100644
index 00000000000..e59bbf0d7a0
--- /dev/null
+++ b/common/cuda_hip/base/blas_bindings.hpp
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_BASE_BLAS_BINDINGS_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_BLAS_BINDINGS_HPP_
+
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/base/cublas_bindings.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/hipblas_bindings.hip.hpp"
+#else
+#error "Executor definition missing"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_BLAS_BINDINGS_HPP_
diff --git a/common/cuda_hip/base/config.hpp b/common/cuda_hip/base/config.hpp
new file mode 100644
index 00000000000..00825fe8b72
--- /dev/null
+++ b/common/cuda_hip/base/config.hpp
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_BASE_CONFIG_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_CONFIG_HPP_
+
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/base/config.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/config.hip.hpp"
+#else
+#error "Executor definition missing"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_CONFIG_HPP_
diff --git a/common/cuda_hip/base/pointer_mode_guard.hpp b/common/cuda_hip/base/pointer_mode_guard.hpp
new file mode 100644
index 00000000000..40bf694ef73
--- /dev/null
+++ b/common/cuda_hip/base/pointer_mode_guard.hpp
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_BASE_POINTER_MODE_GUARD_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_POINTER_MODE_GUARD_HPP_
+
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/base/pointer_mode_guard.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/pointer_mode_guard.hip.hpp"
+#else
+#error "Executor definition missing"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_POINTER_MODE_GUARD_HPP_
diff --git a/common/cuda_hip/base/randlib_bindings.hpp b/common/cuda_hip/base/randlib_bindings.hpp
new file mode 100644
index 00000000000..7797ad38c64
--- /dev/null
+++ b/common/cuda_hip/base/randlib_bindings.hpp
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_BASE_RANDLIB_BINDINGS_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_RANDLIB_BINDINGS_HPP_
+
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/base/curand_bindings.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/hiprand_bindings.hip.hpp"
+#else
+#error "Executor definition missing"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_RANDLIB_BINDINGS_HPP_
diff --git a/common/cuda_hip/base/runtime.hpp b/common/cuda_hip/base/runtime.hpp
new file mode 100644
index 00000000000..6a7a7a3c4a2
--- /dev/null
+++ b/common/cuda_hip/base/runtime.hpp
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_BASE_RUNTIME_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_RUNTIME_HPP_
+
+
+#if defined(GKO_COMPILING_CUDA)
+// no explicit runtime header is included for CUDA builds
+#elif defined(GKO_COMPILING_HIP)
+#include <hip/hip_runtime.h>
+#else
+#error "Executor definition missing"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_RUNTIME_HPP_
diff --git a/common/cuda_hip/base/sparselib_bindings.hpp b/common/cuda_hip/base/sparselib_bindings.hpp
new file mode 100644
index 00000000000..26c0bda236d
--- /dev/null
+++ b/common/cuda_hip/base/sparselib_bindings.hpp
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BINDINGS_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BINDINGS_HPP_
+
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/base/cusparse_bindings.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/hipsparse_bindings.hip.hpp"
+#else
+#error "Executor definition missing"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_SPARSELIB_BINDINGS_HPP_
diff --git a/common/cuda_hip/base/thrust.hpp b/common/cuda_hip/base/thrust.hpp
new file mode 100644
index 00000000000..02aaebc9f3d
--- /dev/null
+++ b/common/cuda_hip/base/thrust.hpp
@@ -0,0 +1,68 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_BASE_THRUST_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_THRUST_HPP_
+
+
+#include <memory>
+
+
+#include <thrust/execution_policy.h>
+
+
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#if defined(GKO_COMPILING_CUDA) || \
+    (defined(GKO_COMPILING_HIP) && !GINKGO_HIP_PLATFORM_HCC)
+#include <thrust/system/cuda/detail/execution_policy.h>
+#else
+#include <thrust/system/hip/detail/execution_policy.h>
+#endif
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+
+
+/**
+ * Returns a Thrust execution policy bound to the stream of the given
+ * executor (`exec->get_stream()`).
+ *
+ * For HIP builds, the HIP policy is used when GINKGO_HIP_PLATFORM_HCC is
+ * non-zero and the CUDA policy otherwise.
+ *
+ * @param exec  the executor providing the stream
+ *
+ * @return the `par` policy of the selected Thrust backend, bound to the
+ *         stream via `.on()`
+ */
+#if defined(GKO_COMPILING_CUDA)
+inline auto thrust_policy(std::shared_ptr<const CudaExecutor> exec)
+{
+    return thrust::cuda::par.on(exec->get_stream());
+}
+#elif defined(GKO_COMPILING_HIP)
+inline auto thrust_policy(std::shared_ptr<const HipExecutor> exec)
+{
+#if GINKGO_HIP_PLATFORM_HCC
+    return thrust::hip::par.on(exec->get_stream());
+#else
+    return thrust::cuda::par.on(exec->get_stream());
+#endif
+}
+#else
+#error "Executor definition missing"
+#endif
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_THRUST_HPP_
diff --git a/common/cuda_hip/base/types.hpp b/common/cuda_hip/base/types.hpp
new file mode 100644
index 00000000000..08f0516d691
--- /dev/null
+++ b/common/cuda_hip/base/types.hpp
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_
+#define GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_
+
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/base/types.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/types.hip.hpp"
+#else
+#error "Executor definition missing"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_
diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp.inc
index 3d76cfdcb79..60eaf5a9dd9 100644
--- a/common/cuda_hip/components/atomic.hpp.inc
+++ b/common/cuda_hip/components/atomic.hpp.inc
@@ -196,3 +196,35 @@ GKO_BIND_ATOMIC_MAX(unsigned long long int);
 
 
 #undef GKO_BIND_ATOMIC_MAX
+
+
+/**
+ * @internal
+ * Component-wise atomic add for thrust::complex<float>.
+ * @note The real and imaginary part are updated by two separate atomics.
+ */
+__forceinline__ __device__ thrust::complex<float> atomic_add(
+    thrust::complex<float>* __restrict__ address, thrust::complex<float> val)
+{
+    // view the complex number as two consecutive floats: real, then imag
+    auto components = reinterpret_cast<float*>(address);
+    auto real_result = atomic_add(components, val.real());
+    auto imag_result = atomic_add(components + 1, val.imag());
+    return thrust::complex<float>(real_result, imag_result);
+}
+
+
+/**
+ * @internal
+ * Component-wise atomic add for thrust::complex<double>.
+ * @note The real and imaginary part are updated by two separate atomics.
+ */
+__forceinline__ __device__ thrust::complex<double> atomic_add(
+    thrust::complex<double>* __restrict__ address, thrust::complex<double> val)
+{
+    // view the complex number as two consecutive doubles: real, then imag
+    auto components = reinterpret_cast<double*>(address);
+    auto real_result = atomic_add(components, val.real());
+    auto imag_result = atomic_add(components + 1, val.imag());
+    return thrust::complex<double>(real_result, imag_result);
+}
diff --git a/common/cuda_hip/components/cooperative_groups.hpp b/common/cuda_hip/components/cooperative_groups.hpp
new file mode 100644
index 00000000000..a57440f6d30
--- /dev/null
+++ b/common/cuda_hip/components/cooperative_groups.hpp
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_COOPERATIVE_GROUPS_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_COOPERATIVE_GROUPS_HPP_
+
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/components/cooperative_groups.cuh"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/components/cooperative_groups.hip.hpp"
+#else
+#error "Executor definition missing"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_COOPERATIVE_GROUPS_HPP_
diff --git a/common/cuda_hip/components/format_conversion.hpp b/common/cuda_hip/components/format_conversion.hpp
new file mode 100644
index 00000000000..9faf7a58c25
--- /dev/null
+++ b/common/cuda_hip/components/format_conversion.hpp
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_FORMAT_CONVERSION_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_FORMAT_CONVERSION_HPP_
+
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/components/format_conversion.cuh"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/components/format_conversion.hip.hpp"
+#else
+#error "Executor definition missing"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_FORMAT_CONVERSION_HPP_
diff --git a/common/cuda_hip/components/memory.hpp b/common/cuda_hip/components/memory.hpp
new file mode 100644
index 00000000000..9bfd9cba1e0
--- /dev/null
+++ b/common/cuda_hip/components/memory.hpp
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_COMMON_CUDA_HIP_COMPONENTS_MEMORY_HPP_
+#define GKO_COMMON_CUDA_HIP_COMPONENTS_MEMORY_HPP_
+
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/components/memory.cuh"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/components/memory.hip.hpp"
+#else
+#error "Executor definition missing"
+#endif
+
+
+#endif  // GKO_COMMON_CUDA_HIP_COMPONENTS_MEMORY_HPP_
diff --git a/common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc b/common/cuda_hip/factorization/par_ict_kernels.hpp.inc
similarity index 75%
rename from common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc
rename to common/cuda_hip/factorization/par_ict_kernels.hpp.inc
index 93a49e56d21..87aa8297345 100644
--- a/common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/par_ict_kernels.hpp.inc
@@ -206,4 +206,99 @@ __global__ __launch_bounds__(default_block_size) void ict_tri_spgeam_init(
 }
 
 
+/**
+ * Updates one stored entry of L per subwarp.
+ *
+ * The subwarp with flat id l_nz handles the entry (row, col) stored at
+ * position l_nz of L (subwarps with l_nz >= l_nnz exit immediately):
+ * 1. A(row, col) is searched in row `row` of A; zero is used if it is absent.
+ * 2. Rows `row` and `col` of L are merged to accumulate
+ *    sum = sum_k L(row, k) * conj(L(col, k)) over the shared columns k < col.
+ * 3. Thread 0 of the subwarp computes sqrt(A(row, col) - sum) if row == col,
+ *    and (A(row, col) - sum) divided by the last stored value of row `col`
+ *    of L otherwise, and stores it to l_vals[l_nz] only if it is finite.
+ *
+ * NOTE(review): the divisor is presumably the diagonal entry L(col, col),
+ * i.e. the diagonal is expected to be stored last in each row - confirm.
+ * NOTE(review): lh_nz is assigned during the merge but never read afterwards.
+ *
+ * @tparam subwarp_size  size of the thread group cooperating on one entry
+ *
+ * @param a_row_ptrs  row pointers of A
+ * @param a_col_idxs  column indices of A
+ * @param a_vals  values of A
+ * @param l_row_ptrs  row pointers of L
+ * @param l_row_idxs  row index of each stored entry of L
+ * @param l_col_idxs  column index of each stored entry of L
+ * @param l_vals  values of L, accessed via load_relaxed / store_relaxed
+ * @param l_nnz  number of stored entries of L
+ */
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void ict_sweep(
+    const IndexType* __restrict__ a_row_ptrs,
+    const IndexType* __restrict__ a_col_idxs,
+    const ValueType* __restrict__ a_vals,
+    const IndexType* __restrict__ l_row_ptrs,
+    const IndexType* __restrict__ l_row_idxs,
+    const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals,
+    IndexType l_nnz)
+{
+    auto l_nz = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+    if (l_nz >= l_nnz) {
+        return;
+    }
+    auto row = l_row_idxs[l_nz];
+    auto col = l_col_idxs[l_nz];
+    auto subwarp =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    // find entry of A at (row, col)
+    auto a_row_begin = a_row_ptrs[row];
+    auto a_row_end = a_row_ptrs[row + 1];
+    auto a_row_size = a_row_end - a_row_begin;
+    auto a_idx =
+        group_wide_search(a_row_begin, a_row_size, subwarp,
+                          [&](IndexType i) { return a_col_idxs[i] >= col; });
+    bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col;
+    auto a_val = has_a ? a_vals[a_idx] : zero<ValueType>();
+    auto l_row_begin = l_row_ptrs[row];
+    auto l_row_size = l_row_ptrs[row + 1] - l_row_begin;
+    auto lh_col_begin = l_row_ptrs[col];
+    auto lh_col_size = l_row_ptrs[col + 1] - lh_col_begin;
+    ValueType sum{};
+    IndexType lh_nz{};
+    auto last_entry = col;
+    group_merge<subwarp_size>(
+        l_col_idxs + l_row_begin, l_row_size, l_col_idxs + lh_col_begin,
+        lh_col_size, subwarp,
+        [&](IndexType l_idx, IndexType l_col, IndexType lh_idx,
+            IndexType lh_row, IndexType, bool) {
+            // we don't need to use the `bool valid` because last_entry is
+            // already a smaller sentinel value than the one used in group_merge
+            if (l_col == lh_row && l_col < last_entry) {
+                sum += load_relaxed(l_vals + (l_idx + l_row_begin)) *
+                       conj(load_relaxed(l_vals + (lh_idx + lh_col_begin)));
+            }
+            // remember the transposed element
+            auto found_transp = subwarp.ballot(lh_row == row);
+            if (found_transp) {
+                lh_nz =
+                    subwarp.shfl(lh_idx + lh_col_begin, ffs(found_transp) - 1);
+            }
+            return true;
+        });
+    // accumulate result from all threads
+    sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; });
+
+    if (subwarp.thread_rank() == 0) {
+        auto to_write =
+            row == col ? sqrt(a_val - sum)
+                       : (a_val - sum) /
+                             load_relaxed(l_vals + (l_row_ptrs[col + 1] - 1));
+        if (is_finite(to_write)) {
+            store_relaxed(l_vals + l_nz, to_write);
+        }
+    }
+}
+
+
 }  // namespace kernel
diff --git a/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc b/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc
deleted file mode 100644
index bc58f0a9799..00000000000
--- a/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc
+++ /dev/null
@@ -1,76 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-namespace kernel {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void ict_sweep(
-    const IndexType* __restrict__ a_row_ptrs,
-    const IndexType* __restrict__ a_col_idxs,
-    const ValueType* __restrict__ a_vals,
-    const IndexType* __restrict__ l_row_ptrs,
-    const IndexType* __restrict__ l_row_idxs,
-    const IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_vals,
-    IndexType l_nnz)
-{
-    auto l_nz = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
-    if (l_nz >= l_nnz) {
-        return;
-    }
-    auto row = l_row_idxs[l_nz];
-    auto col = l_col_idxs[l_nz];
-    auto subwarp =
-        group::tiled_partition<subwarp_size>(group::this_thread_block());
-    // find entry of A at (row, col)
-    auto a_row_begin = a_row_ptrs[row];
-    auto a_row_end = a_row_ptrs[row + 1];
-    auto a_row_size = a_row_end - a_row_begin;
-    auto a_idx =
-        group_wide_search(a_row_begin, a_row_size, subwarp,
-                          [&](IndexType i) { return a_col_idxs[i] >= col; });
-    bool has_a = a_idx < a_row_end && a_col_idxs[a_idx] == col;
-    auto a_val = has_a ? a_vals[a_idx] : zero<ValueType>();
-    auto l_row_begin = l_row_ptrs[row];
-    auto l_row_size = l_row_ptrs[row + 1] - l_row_begin;
-    auto lh_col_begin = l_row_ptrs[col];
-    auto lh_col_size = l_row_ptrs[col + 1] - lh_col_begin;
-    ValueType sum{};
-    IndexType lh_nz{};
-    auto last_entry = col;
-    group_merge<subwarp_size>(
-        l_col_idxs + l_row_begin, l_row_size, l_col_idxs + lh_col_begin,
-        lh_col_size, subwarp,
-        [&](IndexType l_idx, IndexType l_col, IndexType lh_idx,
-            IndexType lh_row, IndexType, bool) {
-            // we don't need to use the `bool valid` because last_entry is
-            // already a smaller sentinel value than the one used in group_merge
-            if (l_col == lh_row && l_col < last_entry) {
-                sum += load_relaxed(l_vals + (l_idx + l_row_begin)) *
-                       conj(load_relaxed(l_vals + (lh_idx + lh_col_begin)));
-            }
-            // remember the transposed element
-            auto found_transp = subwarp.ballot(lh_row == row);
-            if (found_transp) {
-                lh_nz =
-                    subwarp.shfl(lh_idx + lh_col_begin, ffs(found_transp) - 1);
-            }
-            return true;
-        });
-    // accumulate result from all threads
-    sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; });
-
-    if (subwarp.thread_rank() == 0) {
-        auto to_write =
-            row == col ? sqrt(a_val - sum)
-                       : (a_val - sum) /
-                             load_relaxed(l_vals + (l_row_ptrs[col + 1] - 1));
-        if (is_finite(to_write)) {
-            store_relaxed(l_vals + l_nz, to_write);
-        }
-    }
-}
-
-
-}  // namespace kernel
diff --git a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc
similarity index 100%
rename from common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc
rename to common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc
diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc b/common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc
similarity index 100%
rename from common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc
rename to common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc
diff --git a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc
similarity index 100%
rename from common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc
rename to common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc
diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp
index b32572546f0..5ca25ecb1e3 100644
--- a/common/unified/base/kernel_launch.hpp
+++ b/common/unified/base/kernel_launch.hpp
@@ -19,7 +19,7 @@
 
 #define GKO_DEVICE_NAMESPACE cuda
 #define GKO_KERNEL __device__
-#include "cuda/base/types.hpp"
+#include "common/cuda_hip/base/types.hpp"
 
 
 namespace gko {
@@ -46,7 +46,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type<T> unpack_member(T value)
 
 #define GKO_DEVICE_NAMESPACE hip
 #define GKO_KERNEL __device__
-#include "hip/base/types.hip.hpp"
+#include "common/cuda_hip/base/types.hpp"
 
 
 namespace gko {
diff --git a/core/test/gtest/CMakeLists.txt b/core/test/gtest/CMakeLists.txt
index 56f83181375..f500ddb6ae5 100644
--- a/core/test/gtest/CMakeLists.txt
+++ b/core/test/gtest/CMakeLists.txt
@@ -25,14 +25,14 @@ if (GINKGO_BUILD_MPI)
     add_library(ginkgo_gtest_main_mpi_cpu ALIAS ginkgo_gtest_main_mpi)
 endif()
 if (GINKGO_BUILD_OMP)
-    add_gtest_main("_omp" "GKO_COMPILING_OMP")
+    add_gtest_main("_omp" "GKO_COMPILING_OMP;GKO_DEVICE_NAMESPACE=omp")
 endif()
 if (GINKGO_BUILD_CUDA)
-    add_gtest_main("_cuda" "GKO_COMPILING_CUDA")
+    add_gtest_main("_cuda" "GKO_COMPILING_CUDA;GKO_DEVICE_NAMESPACE=cuda")
 endif()
 if (GINKGO_BUILD_HIP)
-    add_gtest_main("_hip" "GKO_COMPILING_HIP")
+    add_gtest_main("_hip" "GKO_COMPILING_HIP;GKO_DEVICE_NAMESPACE=hip")
 endif()
 if (GINKGO_BUILD_SYCL)
-    add_gtest_main("_dpcpp" "GKO_COMPILING_DPCPP")
+    add_gtest_main("_dpcpp" "GKO_COMPILING_DPCPP;GKO_DEVICE_NAMESPACE=dpcpp")
 endif()
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index bd214691a2e..3d251ecfa82 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -33,12 +33,12 @@ target_sources(ginkgo_cuda
     factorization/par_ic_kernels.cu
     factorization/par_ict_kernels.cu
     factorization/par_ilu_kernels.cu
-    factorization/par_ilut_approx_filter_kernel.cu
-    factorization/par_ilut_filter_kernel.cu
+    factorization/par_ilut_approx_filter_kernels.cu
+    factorization/par_ilut_filter_kernels.cu
     factorization/par_ilut_select_common.cu
-    factorization/par_ilut_select_kernel.cu
-    factorization/par_ilut_spgeam_kernel.cu
-    factorization/par_ilut_sweep_kernel.cu
+    factorization/par_ilut_select_kernels.cu
+    factorization/par_ilut_spgeam_kernels.cu
+    factorization/par_ilut_sweep_kernels.cu
     matrix/batch_csr_kernels.cu
     matrix/batch_dense_kernels.cu
     matrix/batch_ell_kernels.cu
@@ -54,10 +54,10 @@ target_sources(ginkgo_cuda
     multigrid/pgm_kernels.cu
     preconditioner/batch_jacobi_kernels.cu
     preconditioner/isai_kernels.cu
-    preconditioner/jacobi_advanced_apply_kernel.cu
-    preconditioner/jacobi_generate_kernel.cu
+    preconditioner/jacobi_advanced_apply_kernels.cu
+    preconditioner/jacobi_generate_kernels.cu
     preconditioner/jacobi_kernels.cu
-    preconditioner/jacobi_simple_apply_kernel.cu
+    preconditioner/jacobi_simple_apply_kernels.cu
     reorder/rcm_kernels.cu
     solver/batch_bicgstab_kernels.cu
     solver/batch_cg_kernels.cu
@@ -85,18 +85,18 @@ endif()
 set(GKO_CUDA_JACOBI_SOURCES)
 foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_CUDA_JACOBI_BLOCK_SIZES)
     configure_file(
-        preconditioner/jacobi_generate_instantiate.inc.cu
-        preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
+        preconditioner/jacobi_generate_kernels.instantiate.cu
+        preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
     configure_file(
-        preconditioner/jacobi_simple_apply_instantiate.inc.cu
-        preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
+        preconditioner/jacobi_simple_apply_kernels.instantiate.cu
+        preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
     configure_file(
-        preconditioner/jacobi_advanced_apply_instantiate.inc.cu
-        preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
+        preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
+        preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
     list(APPEND GKO_CUDA_JACOBI_SOURCES
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu)
 endforeach()
 target_sources(ginkgo_cuda PRIVATE ${GKO_CUDA_JACOBI_SOURCES})
 string(REPLACE ";" "," GKO_CUDA_JACOBI_BLOCK_SIZES_CODE "${GKO_CUDA_JACOBI_BLOCK_SIZES}")
@@ -120,7 +120,7 @@ if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
 endif()
 
 ginkgo_compile_features(ginkgo_cuda)
-target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA)
+target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda)
 
 # include path for generated headers like jacobi_common.hpp
 target_include_directories(ginkgo_cuda
@@ -133,7 +133,7 @@ ginkgo_default_includes(ginkgo_cuda)
 ginkgo_install_library(ginkgo_cuda)
 
 if (GINKGO_CHECK_CIRCULAR_DEPS)
-    ginkgo_check_headers(ginkgo_cuda GKO_COMPILING_CUDA)
+    ginkgo_check_headers(ginkgo_cuda "GKO_COMPILING_CUDA;GKO_DEVICE_NAMESPACE=cuda")
 endif()
 
 if(GINKGO_BUILD_TESTS)
diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu
index 5bc899c11ed..dcaafd5a46c 100644
--- a/cuda/base/batch_multi_vector_kernels.cu
+++ b/cuda/base/batch_multi_vector_kernels.cu
@@ -13,13 +13,14 @@
 #include <ginkgo/core/base/range_accessors.hpp>
 
 
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cublas_bindings.hpp"
-#include "cuda/base/pointer_mode_guard.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
@@ -39,6 +40,7 @@ namespace batch_multi_vector {
 constexpr auto default_block_size = 256;
 constexpr int sm_oversubscription = 4;
 
+
 // clang-format off
 
 // NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
diff --git a/cuda/base/batch_struct.hpp b/cuda/base/batch_struct.hpp
index 7c968ec2c6e..5251c594d42 100644
--- a/cuda/base/batch_struct.hpp
+++ b/cuda/base/batch_struct.hpp
@@ -10,9 +10,9 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "core/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/types.hpp"
 
 
 namespace gko {
diff --git a/cuda/base/cublas_bindings.hpp b/cuda/base/cublas_bindings.hpp
index 485249b7665..c1cdf1f996e 100644
--- a/cuda/base/cublas_bindings.hpp
+++ b/cuda/base/cublas_bindings.hpp
@@ -12,8 +12,8 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 
 
 namespace gko {
@@ -249,6 +249,20 @@ inline void destroy(cublasHandle_t handle)
 
 
 }  // namespace cublas
+
+
+namespace blas {
+
+// Makes every name declared in cublas reachable as blas::<name>.
+using namespace cublas;
+
+// NOTE: preprocessor macros ignore the enclosing namespace; these are global.
+#define BLAS_OP_N CUBLAS_OP_N
+#define BLAS_OP_T CUBLAS_OP_T
+#define BLAS_OP_C CUBLAS_OP_C
+
+
+}  // namespace blas
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/base/curand_bindings.hpp b/cuda/base/curand_bindings.hpp
index b0ae52c5f00..10e09f4a356 100644
--- a/cuda/base/curand_bindings.hpp
+++ b/cuda/base/curand_bindings.hpp
@@ -12,8 +12,8 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 
 
 namespace gko {
@@ -85,6 +85,18 @@ GKO_BIND_CURAND_RANDOM_VECTOR(std::complex<double>, curandGenerateNormalDouble);
 
 
 }  // namespace curand
+
+
+namespace randlib {
+
+// Makes every name declared in curand reachable as randlib::<name>.
+using namespace curand;
+
+// NOTE: a macro is not scoped by namespace randlib; it is visible globally.
+#define RANDLIB_RNG_PSEUDO_DEFAULT CURAND_RNG_PSEUDO_DEFAULT
+
+
+}  // namespace randlib
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp
index 87737e8865e..c18e1d7e9a6 100644
--- a/cuda/base/cusparse_bindings.hpp
+++ b/cuda/base/cusparse_bindings.hpp
@@ -13,7 +13,7 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
-#include "cuda/base/types.hpp"
+#include "common/cuda_hip/base/types.hpp"
 
 
 namespace gko {
@@ -940,6 +940,7 @@ inline void destroy(csrsm2Info_t info)
 #endif  // defined(CUDA_VERSION) && (CUDA_VERSION < 11031)
 
 
+GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
 inline csrilu02Info_t create_ilu0_info()
 {
     csrilu02Info_t info{};
@@ -948,7 +949,7 @@ inline csrilu02Info_t create_ilu0_info()
 }
 
 
-inline void destroy(csrilu02Info_t info)
+inline void destroy_ilu0_info(csrilu02Info_t info)
 {
     GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyCsrilu02Info(info));
 }
@@ -962,10 +963,11 @@ inline csric02Info_t create_ic0_info()
 }
 
 
-inline void destroy(csric02Info_t info)
+inline void destroy_ic0_info(csric02Info_t info)
 {
     GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyCsric02Info(info));
 }
+GKO_END_DISABLE_DEPRECATION_WARNINGS
 
 
 #if (defined(CUDA_VERSION) && (CUDA_VERSION < 11031))
@@ -1174,19 +1176,6 @@ void spsm_solve(cusparseHandle_t handle, cusparseOperation_t op_a,
 #endif  // (defined(CUDA_VERSION) && (CUDA_VERSION >= 11031))
 
 
-template <typename IndexType>
-void create_identity_permutation(cusparseHandle_t handle, IndexType size,
-                                 IndexType* permutation) GKO_NOT_IMPLEMENTED;
-
-template <>
-inline void create_identity_permutation<int32>(cusparseHandle_t handle,
-                                               int32 size, int32* permutation)
-{
-    GKO_ASSERT_NO_CUSPARSE_ERRORS(
-        cusparseCreateIdentityPermutation(handle, size, permutation));
-}
-
-
 template <typename IndexType>
 void csrsort_buffer_size(cusparseHandle_t handle, IndexType m, IndexType n,
                          IndexType nnz, const IndexType* row_ptrs,
@@ -1264,6 +1253,7 @@ inline void gather(cusparseHandle_t handle, cusparseDnVecDescr_t in,
 #endif
 
 
+GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
 template <typename ValueType, typename IndexType>
 void ilu0_buffer_size(cusparseHandle_t handle, IndexType m, IndexType nnz,
                       const cusparseMatDescr_t descr, const ValueType* vals,
@@ -1458,11 +1448,26 @@ GKO_BIND_CUSPARSE_IC0(float, cusparseScsric02);
 GKO_BIND_CUSPARSE_IC0(double, cusparseDcsric02);
 GKO_BIND_CUSPARSE_IC0(std::complex<float>, cusparseCcsric02);
 GKO_BIND_CUSPARSE_IC0(std::complex<double>, cusparseZcsric02);
+GKO_END_DISABLE_DEPRECATION_WARNINGS
 
 #undef GKO_BIND_CUSPARSE_IC0
 
 
 }  // namespace cusparse
+
+
+namespace sparselib {
+
+// Makes every name declared in cusparse reachable as sparselib::<name>.
+using namespace cusparse;
+
+// NOTE: macros are not scoped by namespace sparselib; use them unqualified.
+#define SPARSELIB_OPERATION_TRANSPOSE CUSPARSE_OPERATION_TRANSPOSE
+#define SPARSELIB_OPERATION_NON_TRANSPOSE CUSPARSE_OPERATION_NON_TRANSPOSE
+#define SPARSELIB_SOLVE_POLICY_USE_LEVEL CUSPARSE_SOLVE_POLICY_USE_LEVEL
+
+
+}  // namespace sparselib
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/base/cusparse_block_bindings.hpp b/cuda/base/cusparse_block_bindings.hpp
index eddf249a22b..c3db763f0da 100644
--- a/cuda/base/cusparse_block_bindings.hpp
+++ b/cuda/base/cusparse_block_bindings.hpp
@@ -13,8 +13,8 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/cusparse_bindings.hpp"
-#include "cuda/base/types.hpp"
 
 
 namespace gko {
@@ -190,6 +190,7 @@ GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32(std::complex<double>, cusparseZgebsr2gebsc);
 #undef GKO_BIND_CUSPARSE_BLOCK_TRANSPOSE32
 
 
+GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
 inline std::unique_ptr<std::remove_pointer_t<bsrsm2Info_t>,
                        std::function<void(bsrsm2Info_t)>>
 create_bsr_trsm_info()
@@ -457,6 +458,7 @@ GKO_BIND_CUSPARSE_BILU0(std::complex<float>, cusparseCbsrilu02);
 GKO_BIND_CUSPARSE_BILU0(std::complex<double>, cusparseZbsrilu02);
 
 #undef GKO_BIND_CUSPARSE_BILU0
+GKO_END_DISABLE_DEPRECATION_WARNINGS
 
 
 }  // namespace cusparse
diff --git a/cuda/base/device_matrix_data_kernels.cu b/cuda/base/device_matrix_data_kernels.cu
index ed5601f57a5..554abe8bc37 100644
--- a/cuda/base/device_matrix_data_kernels.cu
+++ b/cuda/base/device_matrix_data_kernels.cu
@@ -14,8 +14,8 @@
 #include <thrust/tuple.h>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
 
 
 namespace gko {
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index 52a92132689..3d1dbf7c92c 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -20,7 +20,7 @@
 #include <ginkgo/core/base/memory.hpp>
 
 
-#include "cuda/base/config.hpp"
+#include "common/cuda_hip/base/config.hpp"
 #include "cuda/base/cublas_bindings.hpp"
 #include "cuda/base/cusparse_handle.hpp"
 #include "cuda/base/scoped_device_id.hpp"
diff --git a/cuda/base/kernel_launch.cuh b/cuda/base/kernel_launch.cuh
index ec8d31ba747..0d4bc4eebd5 100644
--- a/cuda/base/kernel_launch.cuh
+++ b/cuda/base/kernel_launch.cuh
@@ -11,8 +11,9 @@
 #include <thrust/tuple.h>
 
 
-#include "accessor/cuda_helper.hpp"
-#include "cuda/base/types.hpp"
+#include "accessor/cuda_hip_helper.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/components/thread_ids.cuh"
 
 
@@ -23,21 +24,21 @@ namespace cuda {
 
 template <typename AccessorType>
 struct to_device_type_impl<gko::acc::range<AccessorType>&> {
-    using type = std::decay_t<decltype(gko::acc::as_cuda_range(
+    using type = std::decay_t<decltype(gko::acc::as_device_range(
         std::declval<gko::acc::range<AccessorType>>()))>;
     static type map_to_device(gko::acc::range<AccessorType>& range)
     {
-        return gko::acc::as_cuda_range(range);
+        return gko::acc::as_device_range(range);
     }
 };
 
 template <typename AccessorType>
 struct to_device_type_impl<const gko::acc::range<AccessorType>&> {
-    using type = std::decay_t<decltype(gko::acc::as_cuda_range(
+    using type = std::decay_t<decltype(gko::acc::as_device_range(
         std::declval<gko::acc::range<AccessorType>>()))>;
     static type map_to_device(const gko::acc::range<AccessorType>& range)
     {
-        return gko::acc::as_cuda_range(range);
+        return gko::acc::as_device_range(range);
     }
 };
 
diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh
index 6146d7248d0..817d19006bc 100644
--- a/cuda/base/kernel_launch_reduction.cuh
+++ b/cuda/base/kernel_launch_reduction.cuh
@@ -8,9 +8,9 @@
 #endif
 
 
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 
diff --git a/cuda/base/kernel_launch_solver.cuh b/cuda/base/kernel_launch_solver.cuh
index 17988755517..0d9eaeb2653 100644
--- a/cuda/base/kernel_launch_solver.cuh
+++ b/cuda/base/kernel_launch_solver.cuh
@@ -8,6 +8,9 @@
 #endif
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+
+
 namespace gko {
 namespace kernels {
 namespace cuda {
diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp
index 88e9eb17a35..561612f2869 100644
--- a/cuda/base/types.hpp
+++ b/cuda/base/types.hpp
@@ -394,6 +394,10 @@ GKO_INLINE GKO_ATTRIBUTES constexpr
 }
 
 
+// Generic device* names for the cuComplex / cuDoubleComplex types.
+using deviceComplex = cuComplex;
+using deviceDoubleComplex = cuDoubleComplex;
+
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/components/atomic.cuh b/cuda/components/atomic.cuh
index 6dbed0b0d25..1964f0ae196 100644
--- a/cuda/components/atomic.cuh
+++ b/cuda/components/atomic.cuh
@@ -9,8 +9,8 @@
 #include <type_traits>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 
 
 namespace gko {
@@ -21,38 +21,6 @@ namespace cuda {
 #include "common/cuda_hip/components/atomic.hpp.inc"
 
 
-/**
- * @internal
- *
- * @note It is not 'real' complex<float> atomic add operation
- */
-__forceinline__ __device__ thrust::complex<float> atomic_add(
-    thrust::complex<float>* __restrict__ address, thrust::complex<float> val)
-{
-    cuComplex* addr = reinterpret_cast<cuComplex*>(address);
-    // Separate to real part and imag part
-    auto real = atomic_add(&(addr->x), val.real());
-    auto imag = atomic_add(&(addr->y), val.imag());
-    return {real, imag};
-}
-
-
-/**
- * @internal
- *
- * @note It is not 'real' complex<double> atomic add operation
- */
-__forceinline__ __device__ thrust::complex<double> atomic_add(
-    thrust::complex<double>* __restrict__ address, thrust::complex<double> val)
-{
-    cuDoubleComplex* addr = reinterpret_cast<cuDoubleComplex*>(address);
-    // Separate to real part and imag part
-    auto real = atomic_add(&(addr->x), val.real());
-    auto imag = atomic_add(&(addr->y), val.imag());
-    return {real, imag};
-}
-
-
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh
index eae0c957f21..70643a3b16a 100644
--- a/cuda/components/cooperative_groups.cuh
+++ b/cuda/components/cooperative_groups.cuh
@@ -13,7 +13,7 @@
 #include <cuda.h>
 
 
-#include "cuda/base/config.hpp"
+#include "common/cuda_hip/base/config.hpp"
 
 
 namespace gko {
diff --git a/cuda/components/diagonal_block_manipulation.cuh b/cuda/components/diagonal_block_manipulation.cuh
index d748fcab2e5..a8f27d3a81f 100644
--- a/cuda/components/diagonal_block_manipulation.cuh
+++ b/cuda/components/diagonal_block_manipulation.cuh
@@ -9,9 +9,9 @@
 #include <type_traits>
 
 
-#include "cuda/base/config.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 
 
 namespace gko {
diff --git a/cuda/components/format_conversion.cuh b/cuda/components/format_conversion.cuh
index bccc927c9cd..f0ef007c53c 100644
--- a/cuda/components/format_conversion.cuh
+++ b/cuda/components/format_conversion.cuh
@@ -10,7 +10,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "cuda/components/cooperative_groups.cuh"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "cuda/components/thread_ids.cuh"
 
 
diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh
index 22bedca9699..97e5d67c23a 100644
--- a/cuda/components/memory.cuh
+++ b/cuda/components/memory.cuh
@@ -12,7 +12,7 @@
 #include <ginkgo/core/base/math.hpp>
 
 
-#include "cuda/base/types.hpp"
+#include "common/cuda_hip/base/types.hpp"
 
 
 namespace gko {
diff --git a/cuda/components/prefix_sum.cuh b/cuda/components/prefix_sum.cuh
index 653de4e9e15..2f6f145e304 100644
--- a/cuda/components/prefix_sum.cuh
+++ b/cuda/components/prefix_sum.cuh
@@ -9,8 +9,8 @@
 #include <type_traits>
 
 
-#include "cuda/base/cublas_bindings.hpp"
-#include "cuda/components/cooperative_groups.cuh"
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 
diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh
index ded80fae40a..250c560d44b 100644
--- a/cuda/components/reduction.cuh
+++ b/cuda/components/reduction.cuh
@@ -13,10 +13,11 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/array_access.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
 
diff --git a/cuda/components/searching.cuh b/cuda/components/searching.cuh
index 1dc1304a82a..5472ac46ed1 100644
--- a/cuda/components/searching.cuh
+++ b/cuda/components/searching.cuh
@@ -6,7 +6,7 @@
 #define GKO_CUDA_COMPONENTS_SEARCHING_CUH_
 
 
-#include "cuda/base/config.hpp"
+#include "common/cuda_hip/base/config.hpp"
 #include "cuda/components/intrinsics.cuh"
 
 
diff --git a/cuda/components/segment_scan.cuh b/cuda/components/segment_scan.cuh
index 842f1e06760..6ffb8028334 100644
--- a/cuda/components/segment_scan.cuh
+++ b/cuda/components/segment_scan.cuh
@@ -6,7 +6,7 @@
 #define GKO_CUDA_COMPONENTS_SEGMENT_SCAN_CUH_
 
 
-#include "cuda/components/cooperative_groups.cuh"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 
 
 namespace gko {
diff --git a/cuda/components/sorting.cuh b/cuda/components/sorting.cuh
index e6eb17ec8e4..59e44d1bb82 100644
--- a/cuda/components/sorting.cuh
+++ b/cuda/components/sorting.cuh
@@ -6,8 +6,8 @@
 #define GKO_CUDA_COMPONENTS_SORTING_CUH_
 
 
-#include "cuda/base/config.hpp"
-#include "cuda/components/cooperative_groups.cuh"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 
 
 namespace gko {
diff --git a/cuda/components/syncfree.cuh b/cuda/components/syncfree.cuh
index 0d45c8db516..0d5c0d11f43 100644
--- a/cuda/components/syncfree.cuh
+++ b/cuda/components/syncfree.cuh
@@ -9,11 +9,11 @@
 #include <ginkgo/core/base/array.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "core/components/fill_array_kernels.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/memory.cuh"
 
 
 namespace gko {
diff --git a/cuda/components/thread_ids.cuh b/cuda/components/thread_ids.cuh
index c3e517e0f9d..1113ea75fc6 100644
--- a/cuda/components/thread_ids.cuh
+++ b/cuda/components/thread_ids.cuh
@@ -6,17 +6,12 @@
 #define GKO_CUDA_COMPONENTS_THREAD_IDS_CUH_
 
 
-#include "cuda/base/config.hpp"
+#include "common/cuda_hip/base/config.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace cuda {
-/**
- * @brief The CUDA thread namespace.
- *
- * @ingroup cuda_thread
- */
 namespace thread {
 
 
diff --git a/cuda/factorization/cholesky_kernels.cu b/cuda/factorization/cholesky_kernels.cu
index 79779f2f54b..e05b0803dc2 100644
--- a/cuda/factorization/cholesky_kernels.cu
+++ b/cuda/factorization/cholesky_kernels.cu
@@ -20,15 +20,15 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/format_conversion_kernels.hpp"
 #include "core/factorization/elimination_forest.hpp"
 #include "core/factorization/lu_kernels.hpp"
 #include "core/matrix/csr_lookup.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/intrinsics.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/syncfree.cuh"
@@ -80,19 +80,19 @@ void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
     }
     // sort postorder_cols inside rows
     {
-        const auto handle = exec->get_cusparse_handle();
-        auto descr = cusparse::create_mat_descr();
+        const auto handle = exec->get_sparselib_handle();
+        auto descr = sparselib::create_mat_descr();
         array<IndexType> permutation_array(exec, mtx_nnz);
         auto permutation = permutation_array.get_data();
         components::fill_seq_array(exec, permutation, mtx_nnz);
         size_type buffer_size{};
-        cusparse::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz,
-                                      row_ptrs, postorder_cols, buffer_size);
+        sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz,
+                                       row_ptrs, postorder_cols, buffer_size);
         array<char> buffer_array{exec, buffer_size};
         auto buffer = buffer_array.get_data();
-        cusparse::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs,
-                          postorder_cols, permutation, buffer);
-        cusparse::destroy(descr);
+        sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs,
+                           postorder_cols, permutation, buffer);
+        sparselib::destroy(descr);
     }
     // count nonzeros per row of L
     {
diff --git a/cuda/factorization/factorization_kernels.cu b/cuda/factorization/factorization_kernels.cu
index 4ea03981a15..309ded37d34 100644
--- a/cuda/factorization/factorization_kernels.cu
+++ b/cuda/factorization/factorization_kernels.cu
@@ -8,12 +8,13 @@
 #include <ginkgo/core/base/array.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/array_access.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/intrinsics.cuh"
 #include "cuda/components/searching.cuh"
 #include "cuda/components/thread_ids.cuh"
diff --git a/cuda/factorization/ic_kernels.cu b/cuda/factorization/ic_kernels.cu
index 1afb10ce57a..9d55856f139 100644
--- a/cuda/factorization/ic_kernels.cu
+++ b/cuda/factorization/ic_kernels.cu
@@ -8,7 +8,7 @@
 #include <ginkgo/core/base/array.hpp>
 
 
-#include "cuda/base/cusparse_bindings.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
 
 
 namespace gko {
@@ -27,37 +27,37 @@ void compute(std::shared_ptr<const DefaultExecutor> exec,
              matrix::Csr<ValueType, IndexType>* m)
 {
     const auto id = exec->get_device_id();
-    auto handle = exec->get_cusparse_handle();
-    auto desc = cusparse::create_mat_descr();
-    auto info = cusparse::create_ic0_info();
+    auto handle = exec->get_sparselib_handle();
+    auto desc = sparselib::create_mat_descr();
+    auto info = sparselib::create_ic0_info();
 
     // get buffer size for IC
     IndexType num_rows = m->get_size()[0];
     IndexType nnz = m->get_num_stored_elements();
     size_type buffer_size{};
-    cusparse::ic0_buffer_size(handle, num_rows, nnz, desc,
-                              m->get_const_values(), m->get_const_row_ptrs(),
-                              m->get_const_col_idxs(), info, buffer_size);
+    sparselib::ic0_buffer_size(handle, num_rows, nnz, desc,
+                               m->get_const_values(), m->get_const_row_ptrs(),
+                               m->get_const_col_idxs(), info, buffer_size);
 
     array<char> buffer{exec, buffer_size};
 
     // set up IC(0)
-    cusparse::ic0_analysis(handle, num_rows, nnz, desc, m->get_const_values(),
-                           m->get_const_row_ptrs(), m->get_const_col_idxs(),
-                           info, CUSPARSE_SOLVE_POLICY_USE_LEVEL,
-                           buffer.get_data());
+    sparselib::ic0_analysis(handle, num_rows, nnz, desc, m->get_const_values(),
+                            m->get_const_row_ptrs(), m->get_const_col_idxs(),
+                            info, SPARSELIB_SOLVE_POLICY_USE_LEVEL,
+                            buffer.get_data());
 
-    cusparse::ic0(handle, num_rows, nnz, desc, m->get_values(),
-                  m->get_const_row_ptrs(), m->get_const_col_idxs(), info,
-                  CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
+    sparselib::ic0(handle, num_rows, nnz, desc, m->get_values(),
+                   m->get_const_row_ptrs(), m->get_const_col_idxs(), info,
+                   SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
 
     // CUDA 11.4 has a use-after-free bug on Turing
 #if (CUDA_VERSION >= 11040)
     exec->synchronize();
 #endif
 
-    cusparse::destroy(info);
-    cusparse::destroy(desc);
+    sparselib::destroy_ic0_info(info);
+    sparselib::destroy(desc);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);
diff --git a/cuda/factorization/ilu_kernels.cu b/cuda/factorization/ilu_kernels.cu
index 33e59bb54c9..acebec6e94c 100644
--- a/cuda/factorization/ilu_kernels.cu
+++ b/cuda/factorization/ilu_kernels.cu
@@ -8,7 +8,7 @@
 #include <ginkgo/core/base/array.hpp>
 
 
-#include "cuda/base/cusparse_bindings.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
 
 
 namespace gko {
@@ -27,37 +27,37 @@ void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
                 matrix::Csr<ValueType, IndexType>* m)
 {
     const auto id = exec->get_device_id();
-    auto handle = exec->get_cusparse_handle();
-    auto desc = cusparse::create_mat_descr();
-    auto info = cusparse::create_ilu0_info();
+    auto handle = exec->get_sparselib_handle();
+    auto desc = sparselib::create_mat_descr();
+    auto info = sparselib::create_ilu0_info();
 
     // get buffer size for ILU
     IndexType num_rows = m->get_size()[0];
     IndexType nnz = m->get_num_stored_elements();
     size_type buffer_size{};
-    cusparse::ilu0_buffer_size(handle, num_rows, nnz, desc,
-                               m->get_const_values(), m->get_const_row_ptrs(),
-                               m->get_const_col_idxs(), info, buffer_size);
+    sparselib::ilu0_buffer_size(handle, num_rows, nnz, desc,
+                                m->get_const_values(), m->get_const_row_ptrs(),
+                                m->get_const_col_idxs(), info, buffer_size);
 
     array<char> buffer{exec, buffer_size};
 
     // set up ILU(0)
-    cusparse::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(),
-                            m->get_const_row_ptrs(), m->get_const_col_idxs(),
-                            info, CUSPARSE_SOLVE_POLICY_USE_LEVEL,
-                            buffer.get_data());
+    sparselib::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(),
+                             m->get_const_row_ptrs(), m->get_const_col_idxs(),
+                             info, SPARSELIB_SOLVE_POLICY_USE_LEVEL,
+                             buffer.get_data());
 
-    cusparse::ilu0(handle, num_rows, nnz, desc, m->get_values(),
-                   m->get_const_row_ptrs(), m->get_const_col_idxs(), info,
-                   CUSPARSE_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
+    sparselib::ilu0(handle, num_rows, nnz, desc, m->get_values(),
+                    m->get_const_row_ptrs(), m->get_const_col_idxs(), info,
+                    SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
 
     // CUDA 11.4 has a use-after-free bug on Turing
 #if (CUDA_VERSION >= 11040)
     exec->synchronize();
 #endif
 
-    cusparse::destroy(info);
-    cusparse::destroy(desc);
+    sparselib::destroy_ilu0_info(info);
+    sparselib::destroy(desc);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/cuda/factorization/lu_kernels.cu b/cuda/factorization/lu_kernels.cu
index 583bf51fb67..9c3069f62cf 100644
--- a/cuda/factorization/lu_kernels.cu
+++ b/cuda/factorization/lu_kernels.cu
@@ -17,11 +17,11 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/allocator.hpp"
 #include "core/matrix/csr_lookup.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/syncfree.cuh"
 #include "cuda/components/thread_ids.cuh"
diff --git a/cuda/factorization/par_ic_kernels.cu b/cuda/factorization/par_ic_kernels.cu
index a9de634f1f9..f493cb11fd1 100644
--- a/cuda/factorization/par_ic_kernels.cu
+++ b/cuda/factorization/par_ic_kernels.cu
@@ -10,9 +10,9 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/memory.cuh"
 #include "cuda/components/thread_ids.cuh"
 
 
diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu
index 5f48ceef2f8..d958f81d2f4 100644
--- a/cuda/factorization/par_ict_kernels.cu
+++ b/cuda/factorization/par_ict_kernels.cu
@@ -12,6 +12,8 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
@@ -19,7 +21,6 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/components/intrinsics.cuh"
-#include "cuda/components/memory.cuh"
 #include "cuda/components/merging.cuh"
 #include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
@@ -46,8 +47,7 @@ using compiled_kernels =
     syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
 
 
-#include "common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc"
-#include "common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc"
+#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc"
 
 
 namespace {
diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu
index 7a770a39353..755723e7d4c 100644
--- a/cuda/factorization/par_ilu_kernels.cu
+++ b/cuda/factorization/par_ilu_kernels.cu
@@ -8,9 +8,10 @@
 #include <ginkgo/core/matrix/coo.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/memory.cuh"
 #include "cuda/components/thread_ids.cuh"
 
 
diff --git a/cuda/factorization/par_ilut_approx_filter_kernel.cu b/cuda/factorization/par_ilut_approx_filter_kernels.cu
similarity index 97%
rename from cuda/factorization/par_ilut_approx_filter_kernel.cu
rename to cuda/factorization/par_ilut_approx_filter_kernels.cu
index 853519cd36b..ae544939e17 100644
--- a/cuda/factorization/par_ilut_approx_filter_kernel.cu
+++ b/cuda/factorization/par_ilut_approx_filter_kernels.cu
@@ -15,16 +15,16 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/intrinsics.cuh"
 #include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/sorting.cuh"
diff --git a/cuda/factorization/par_ilut_filter_kernel.cu b/cuda/factorization/par_ilut_filter_kernels.cu
similarity index 96%
rename from cuda/factorization/par_ilut_filter_kernel.cu
rename to cuda/factorization/par_ilut_filter_kernels.cu
index 0e63f102b72..4a24c5f305b 100644
--- a/cuda/factorization/par_ilut_filter_kernel.cu
+++ b/cuda/factorization/par_ilut_filter_kernels.cu
@@ -12,15 +12,16 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/intrinsics.cuh"
 #include "cuda/components/thread_ids.cuh"
 
diff --git a/cuda/factorization/par_ilut_select_kernel.cu b/cuda/factorization/par_ilut_select_kernels.cu
similarity index 98%
rename from cuda/factorization/par_ilut_select_kernel.cu
rename to cuda/factorization/par_ilut_select_kernels.cu
index ca8b55e504b..6a7bd53c1c4 100644
--- a/cuda/factorization/par_ilut_select_kernel.cu
+++ b/cuda/factorization/par_ilut_select_kernels.cu
@@ -13,6 +13,7 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/components/atomic.cuh"
@@ -147,7 +148,7 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
     auto out_ptr = reinterpret_cast<AbsType*>(tmp1.get_data());
     kernel::basecase_select<<<1, kernel::basecase_block_size, 0,
                               exec->get_stream()>>>(
-        as_cuda_type(tmp22), bucket.size, rank, as_cuda_type(out_ptr));
+        as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr));
     threshold = exec->copy_val_to_host(out_ptr);
 }
 
diff --git a/cuda/factorization/par_ilut_spgeam_kernel.cu b/cuda/factorization/par_ilut_spgeam_kernels.cu
similarity index 97%
rename from cuda/factorization/par_ilut_spgeam_kernel.cu
rename to cuda/factorization/par_ilut_spgeam_kernels.cu
index c4372f66219..0a751c2f48f 100644
--- a/cuda/factorization/par_ilut_spgeam_kernel.cu
+++ b/cuda/factorization/par_ilut_spgeam_kernels.cu
@@ -12,13 +12,14 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/intrinsics.cuh"
 #include "cuda/components/merging.cuh"
 #include "cuda/components/prefix_sum.cuh"
@@ -80,8 +81,8 @@ void add_candidates(syn::value_list<int, subwarp_size>,
     auto u_vals = u->get_const_values();
     auto l_new_row_ptrs = l_new->get_row_ptrs();
     auto u_new_row_ptrs = u_new->get_row_ptrs();
-    // count non-zeros per row
     if (num_blocks > 0) {
+        // count non-zeros per row
         kernel::tri_spgeam_nnz<subwarp_size>
             <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
                 lu_row_ptrs, lu_col_idxs, a_row_ptrs, a_col_idxs,
@@ -105,8 +106,8 @@ void add_candidates(syn::value_list<int, subwarp_size>,
     auto u_new_col_idxs = u_new->get_col_idxs();
     auto u_new_vals = u_new->get_values();
 
-    // fill columns and values
     if (num_blocks > 0) {
+        // fill columns and values
         kernel::tri_spgeam_init<subwarp_size>
             <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
                 lu_row_ptrs, lu_col_idxs, as_device_type(lu_vals), a_row_ptrs,
diff --git a/cuda/factorization/par_ilut_sweep_kernel.cu b/cuda/factorization/par_ilut_sweep_kernels.cu
similarity index 97%
rename from cuda/factorization/par_ilut_sweep_kernel.cu
rename to cuda/factorization/par_ilut_sweep_kernels.cu
index 85fb3f26e21..5924ebe328d 100644
--- a/cuda/factorization/par_ilut_sweep_kernel.cu
+++ b/cuda/factorization/par_ilut_sweep_kernels.cu
@@ -12,6 +12,8 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
@@ -19,7 +21,6 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/components/intrinsics.cuh"
-#include "cuda/components/memory.cuh"
 #include "cuda/components/merging.cuh"
 #include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
diff --git a/cuda/log/batch_logger.cuh b/cuda/log/batch_logger.cuh
index 26c60ae78eb..3e53d6ef0a6 100644
--- a/cuda/log/batch_logger.cuh
+++ b/cuda/log/batch_logger.cuh
@@ -23,4 +23,5 @@ namespace batch_log {
 }  // namespace kernels
 }  // namespace gko
 
+
 #endif  // GKO_CUDA_LOG_BATCH_LOGGER_CUH_
diff --git a/cuda/matrix/batch_csr_kernels.cu b/cuda/matrix/batch_csr_kernels.cu
index 6be0a2cab3b..6ec20480405 100644
--- a/cuda/matrix/batch_csr_kernels.cu
+++ b/cuda/matrix/batch_csr_kernels.cu
@@ -13,12 +13,13 @@
 #include <ginkgo/core/matrix/batch_csr.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu
index 56268d8d6b4..673b08e5db1 100644
--- a/cuda/matrix/batch_dense_kernels.cu
+++ b/cuda/matrix/batch_dense_kernels.cu
@@ -9,15 +9,17 @@
 
 
 #include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/batch_dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu
index 3c824cf8da4..8f0160bd154 100644
--- a/cuda/matrix/batch_ell_kernels.cu
+++ b/cuda/matrix/batch_ell_kernels.cu
@@ -13,12 +13,13 @@
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp
index 1c17aea3bfe..5e9c803c9f6 100644
--- a/cuda/matrix/batch_struct.hpp
+++ b/cuda/matrix/batch_struct.hpp
@@ -13,8 +13,8 @@
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "core/base/batch_struct.hpp"
-#include "cuda/base/types.hpp"
 
 
 namespace gko {
diff --git a/cuda/matrix/coo_kernels.cu b/cuda/matrix/coo_kernels.cu
index 3d67144c9ec..f138d0b934e 100644
--- a/cuda/matrix/coo_kernels.cu
+++ b/cuda/matrix/coo_kernels.cu
@@ -12,25 +12,21 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
 #include "core/matrix/dense_kernels.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/format_conversion.cuh"
 #include "cuda/components/segment_scan.cuh"
 #include "cuda/components/thread_ids.cuh"
 
 
 namespace gko {
 namespace kernels {
-/**
- * @brief The CUDA namespace.
- *
- * @ingroup cuda
- */
 namespace cuda {
 /**
  * @brief The Coordinate matrix format namespace.
diff --git a/cuda/matrix/csr_kernels.template.cu b/cuda/matrix/csr_kernels.template.cu
index 4a779775670..a0a7e4e97b8 100644
--- a/cuda/matrix/csr_kernels.template.cu
+++ b/cuda/matrix/csr_kernels.template.cu
@@ -27,7 +27,13 @@
 #include <ginkgo/core/matrix/sellp.hpp>
 
 
-#include "accessor/cuda_helper.hpp"
+#include "accessor/cuda_hip_helper.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
 #include "core/base/array_access.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
@@ -38,15 +44,9 @@
 #include "core/matrix/csr_lookup.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/pointer_mode_guard.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/format_conversion.cuh"
 #include "cuda/components/intrinsics.cuh"
 #include "cuda/components/merging.cuh"
 #include "cuda/components/prefix_sum.cuh"
@@ -133,10 +133,11 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
                 kernel::abstract_merge_path_spmv<items_per_thread>
                     <<<grid, block, 0, exec->get_stream()>>>(
                         static_cast<IndexType>(a->get_size()[0]),
-                        acc::as_cuda_range(a_vals), a->get_const_col_idxs(),
+                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
                         as_device_type(a->get_const_row_ptrs()),
                         as_device_type(a->get_const_srow()),
-                        acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals),
+                        acc::as_device_range(b_vals),
+                        acc::as_device_range(c_vals),
                         as_device_type(row_out.get_data()),
                         as_device_type(val_out.get_data()));
             }
@@ -144,7 +145,7 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
                 abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>(
                     grid_num, as_device_type(val_out.get_data()),
                     as_device_type(row_out.get_data()),
-                    acc::as_cuda_range(c_vals));
+                    acc::as_device_range(c_vals));
 
         } else if (alpha != nullptr && beta != nullptr) {
             if (grid_num > 0) {
@@ -152,12 +153,12 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
                     <<<grid, block, 0, exec->get_stream()>>>(
                         static_cast<IndexType>(a->get_size()[0]),
                         as_device_type(alpha->get_const_values()),
-                        acc::as_cuda_range(a_vals), a->get_const_col_idxs(),
+                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
                         as_device_type(a->get_const_row_ptrs()),
                         as_device_type(a->get_const_srow()),
-                        acc::as_cuda_range(b_vals),
+                        acc::as_device_range(b_vals),
                         as_device_type(beta->get_const_values()),
-                        acc::as_cuda_range(c_vals),
+                        acc::as_device_range(c_vals),
                         as_device_type(row_out.get_data()),
                         as_device_type(val_out.get_data()));
             }
@@ -166,7 +167,7 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
                     grid_num, as_device_type(val_out.get_data()),
                     as_device_type(row_out.get_data()),
                     as_device_type(alpha->get_const_values()),
-                    acc::as_cuda_range(c_vals));
+                    acc::as_device_range(c_vals));
         } else {
             GKO_KERNEL_NOT_FOUND;
         }
@@ -245,21 +246,21 @@ void classical_spmv(syn::value_list<int, subwarp_size>,
         if (grid.x > 0 && grid.y > 0) {
             kernel::abstract_classical_spmv<subwarp_size>
                 <<<grid, block, 0, exec->get_stream()>>>(
-                    a->get_size()[0], acc::as_cuda_range(a_vals),
+                    a->get_size()[0], acc::as_device_range(a_vals),
                     a->get_const_col_idxs(),
                     as_device_type(a->get_const_row_ptrs()),
-                    acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals));
+                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
         }
     } else if (alpha != nullptr && beta != nullptr) {
         if (grid.x > 0 && grid.y > 0) {
             kernel::abstract_classical_spmv<subwarp_size>
                 <<<grid, block, 0, exec->get_stream()>>>(
                     a->get_size()[0], as_device_type(alpha->get_const_values()),
-                    acc::as_cuda_range(a_vals), a->get_const_col_idxs(),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
                     as_device_type(a->get_const_row_ptrs()),
-                    acc::as_cuda_range(b_vals),
+                    acc::as_device_range(b_vals),
                     as_device_type(beta->get_const_values()),
-                    acc::as_cuda_range(c_vals));
+                    acc::as_device_range(c_vals));
         }
     } else {
         GKO_KERNEL_NOT_FOUND;
@@ -301,20 +302,20 @@ void load_balance_spmv(std::shared_ptr<const DefaultExecutor> exec,
                                         exec->get_stream()>>>(
                     nwarps, static_cast<IndexType>(a->get_size()[0]),
                     as_device_type(alpha->get_const_values()),
-                    acc::as_cuda_range(a_vals), a->get_const_col_idxs(),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
                     as_device_type(a->get_const_row_ptrs()),
                     as_device_type(a->get_const_srow()),
-                    acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals));
+                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
             }
         } else {
             if (csr_grid.x > 0 && csr_grid.y > 0) {
                 kernel::abstract_spmv<<<csr_grid, csr_block, 0,
                                         exec->get_stream()>>>(
                     nwarps, static_cast<IndexType>(a->get_size()[0]),
-                    acc::as_cuda_range(a_vals), a->get_const_col_idxs(),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
                     as_device_type(a->get_const_row_ptrs()),
                     as_device_type(a->get_const_srow()),
-                    acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals));
+                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
             }
         }
     }
@@ -329,55 +330,55 @@ bool try_general_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
                                 const ValueType* beta,
                                 matrix::Dense<ValueType>* c)
 {
-    auto handle = exec->get_cusparse_handle();
+    auto handle = exec->get_sparselib_handle();
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-    if (!cusparse::is_supported<ValueType, IndexType>::value ||
+    if (!sparselib::is_supported<ValueType, IndexType>::value ||
         b->get_stride() != 1 || c->get_stride() != 1 || b->get_size()[0] == 0 ||
         c->get_size()[0] == 0) {
         return false;
     }
 
-    auto descr = cusparse::create_mat_descr();
+    auto descr = sparselib::create_mat_descr();
     auto row_ptrs = a->get_const_row_ptrs();
     auto col_idxs = a->get_const_col_idxs();
-    cusparse::spmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, a->get_size()[0],
-                   a->get_size()[1], a->get_num_stored_elements(), alpha, descr,
-                   a->get_const_values(), row_ptrs, col_idxs,
-                   b->get_const_values(), beta, c->get_values());
+    sparselib::spmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, a->get_size()[0],
+                    a->get_size()[1], a->get_num_stored_elements(), alpha,
+                    descr, a->get_const_values(), row_ptrs, col_idxs,
+                    b->get_const_values(), beta, c->get_values());
 
-    cusparse::destroy(descr);
+    sparselib::destroy(descr);
 #else  // CUDA_VERSION >= 11000
     // workaround for a division by zero in cuSPARSE 11.?
     if (a->get_size()[1] == 0) {
         return false;
     }
-    cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE;
+    cusparseOperation_t trans = SPARSELIB_OPERATION_NON_TRANSPOSE;
     auto row_ptrs = const_cast<IndexType*>(a->get_const_row_ptrs());
     auto col_idxs = const_cast<IndexType*>(a->get_const_col_idxs());
     auto values = const_cast<ValueType*>(a->get_const_values());
-    auto mat = cusparse::create_csr(a->get_size()[0], a->get_size()[1],
-                                    a->get_num_stored_elements(), row_ptrs,
-                                    col_idxs, values);
+    auto mat = sparselib::create_csr(a->get_size()[0], a->get_size()[1],
+                                     a->get_num_stored_elements(), row_ptrs,
+                                     col_idxs, values);
     auto b_val = const_cast<ValueType*>(b->get_const_values());
     auto c_val = c->get_values();
     if (b->get_stride() == 1 && c->get_stride() == 1) {
-        auto vecb = cusparse::create_dnvec(b->get_size()[0], b_val);
-        auto vecc = cusparse::create_dnvec(c->get_size()[0], c_val);
+        auto vecb = sparselib::create_dnvec(b->get_size()[0], b_val);
+        auto vecc = sparselib::create_dnvec(c->get_size()[0], c_val);
 #if CUDA_VERSION >= 11021
         constexpr auto alg = CUSPARSE_SPMV_CSR_ALG1;
 #else
         constexpr auto alg = CUSPARSE_CSRMV_ALG1;
 #endif
         size_type buffer_size = 0;
-        cusparse::spmv_buffersize<ValueType>(handle, trans, alpha, mat, vecb,
-                                             beta, vecc, alg, &buffer_size);
+        sparselib::spmv_buffersize<ValueType>(handle, trans, alpha, mat, vecb,
+                                              beta, vecc, alg, &buffer_size);
 
         array<char> buffer_array(exec, buffer_size);
         auto buffer = buffer_array.get_data();
-        cusparse::spmv<ValueType>(handle, trans, alpha, mat, vecb, beta, vecc,
-                                  alg, buffer);
-        cusparse::destroy(vecb);
-        cusparse::destroy(vecc);
+        sparselib::spmv<ValueType>(handle, trans, alpha, mat, vecb, beta, vecc,
+                                   alg, buffer);
+        sparselib::destroy(vecb);
+        sparselib::destroy(vecc);
     } else {
 #if CUDA_VERSION >= 11060
         if (b->get_size()[1] == 1) {
@@ -388,22 +389,22 @@ bool try_general_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
 #endif  // CUDA_VERSION >= 11060
         cusparseSpMMAlg_t alg = CUSPARSE_SPMM_CSR_ALG2;
         auto vecb =
-            cusparse::create_dnmat(b->get_size(), b->get_stride(), b_val);
+            sparselib::create_dnmat(b->get_size(), b->get_stride(), b_val);
         auto vecc =
-            cusparse::create_dnmat(c->get_size(), c->get_stride(), c_val);
+            sparselib::create_dnmat(c->get_size(), c->get_stride(), c_val);
         size_type buffer_size = 0;
-        cusparse::spmm_buffersize<ValueType>(handle, trans, trans, alpha, mat,
-                                             vecb, beta, vecc, alg,
-                                             &buffer_size);
+        sparselib::spmm_buffersize<ValueType>(handle, trans, trans, alpha, mat,
+                                              vecb, beta, vecc, alg,
+                                              &buffer_size);
 
         array<char> buffer_array(exec, buffer_size);
         auto buffer = buffer_array.get_data();
-        cusparse::spmm<ValueType>(handle, trans, trans, alpha, mat, vecb, beta,
-                                  vecc, alg, buffer);
-        cusparse::destroy(vecb);
-        cusparse::destroy(vecc);
+        sparselib::spmm<ValueType>(handle, trans, trans, alpha, mat, vecb, beta,
+                                   vecc, alg, buffer);
+        sparselib::destroy(vecb);
+        sparselib::destroy(vecc);
     }
-    cusparse::destroy(mat);
+    sparselib::destroy(mat);
 #endif
     return true;
 }
@@ -437,8 +438,8 @@ bool try_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
         return try_general_sparselib_spmv(exec, alpha->get_const_values(), a, b,
                                           beta->get_const_values(), c);
     } else {
-        auto handle = exec->get_cusparse_handle();
-        cusparse::pointer_mode_guard pm_guard(handle);
+        auto handle = exec->get_sparselib_handle();
+        sparselib::pointer_mode_guard pm_guard(handle);
         const auto valpha = one<ValueType>();
         const auto vbeta = zero<ValueType>();
         return try_general_sparselib_spmv(exec, &valpha, a, b, &vbeta, c);
@@ -583,8 +584,8 @@ void spgemm(std::shared_ptr<const DefaultExecutor> exec,
     auto b_col_idxs = b->get_const_col_idxs();
     auto c_row_ptrs = c->get_row_ptrs();
 
-    auto handle = exec->get_cusparse_handle();
-    cusparse::pointer_mode_guard pm_guard(handle);
+    auto handle = exec->get_sparselib_handle();
+    sparselib::pointer_mode_guard pm_guard(handle);
 
     auto alpha = one<ValueType>();
     auto a_nnz = static_cast<IndexType>(a->get_num_stored_elements());
@@ -600,18 +601,18 @@ void spgemm(std::shared_ptr<const DefaultExecutor> exec,
     auto& c_vals_array = c_builder.get_value_array();
 
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-    if (!cusparse::is_supported<ValueType, IndexType>::value) {
+    if (!sparselib::is_supported<ValueType, IndexType>::value) {
         GKO_NOT_IMPLEMENTED;
     }
 
-    auto a_descr = cusparse::create_mat_descr();
-    auto b_descr = cusparse::create_mat_descr();
-    auto c_descr = cusparse::create_mat_descr();
-    auto d_descr = cusparse::create_mat_descr();
-    auto info = cusparse::create_spgemm_info();
+    auto a_descr = sparselib::create_mat_descr();
+    auto b_descr = sparselib::create_mat_descr();
+    auto c_descr = sparselib::create_mat_descr();
+    auto d_descr = sparselib::create_mat_descr();
+    auto info = sparselib::create_spgemm_info();
     // allocate buffer
     size_type buffer_size{};
-    cusparse::spgemm_buffer_size(
+    sparselib::spgemm_buffer_size(
         handle, m, n, k, &alpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
         b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz,
         null_index, null_index, info, buffer_size);
@@ -620,74 +621,75 @@ void spgemm(std::shared_ptr<const DefaultExecutor> exec,
 
     // count nnz
     IndexType c_nnz{};
-    cusparse::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs,
-                         a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs,
-                         d_descr, zero_nnz, null_index, null_index, c_descr,
-                         c_row_ptrs, &c_nnz, info, buffer);
+    sparselib::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs,
+                          a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs,
+                          d_descr, zero_nnz, null_index, null_index, c_descr,
+                          c_row_ptrs, &c_nnz, info, buffer);
 
     // accumulate non-zeros
     c_col_idxs_array.resize_and_reset(c_nnz);
     c_vals_array.resize_and_reset(c_nnz);
     auto c_col_idxs = c_col_idxs_array.get_data();
     auto c_vals = c_vals_array.get_data();
-    cusparse::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals,
-                     a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, b_row_ptrs,
-                     b_col_idxs, null_value, d_descr, zero_nnz, null_value,
-                     null_index, null_index, c_descr, c_vals, c_row_ptrs,
-                     c_col_idxs, info, buffer);
-
-    cusparse::destroy(info);
-    cusparse::destroy(d_descr);
-    cusparse::destroy(c_descr);
-    cusparse::destroy(b_descr);
-    cusparse::destroy(a_descr);
+    sparselib::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals,
+                      a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
+                      b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz,
+                      null_value, null_index, null_index, c_descr, c_vals,
+                      c_row_ptrs, c_col_idxs, info, buffer);
+
+    sparselib::destroy(info);
+    sparselib::destroy(d_descr);
+    sparselib::destroy(c_descr);
+    sparselib::destroy(b_descr);
+    sparselib::destroy(a_descr);
 
 #else   // CUDA_VERSION >= 11000
     const auto beta = zero<ValueType>();
-    auto spgemm_descr = cusparse::create_spgemm_descr();
-    auto a_descr = cusparse::create_csr(
+    auto spgemm_descr = sparselib::create_spgemm_descr();
+    auto a_descr = sparselib::create_csr(
         m, k, a_nnz, const_cast<IndexType*>(a_row_ptrs),
         const_cast<IndexType*>(a_col_idxs), const_cast<ValueType*>(a_vals));
-    auto b_descr = cusparse::create_csr(
+    auto b_descr = sparselib::create_csr(
         k, n, b_nnz, const_cast<IndexType*>(b_row_ptrs),
         const_cast<IndexType*>(b_col_idxs), const_cast<ValueType*>(b_vals));
-    auto c_descr = cusparse::create_csr(m, n, zero_nnz, null_index, null_index,
-                                        null_value);
+    auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index,
+                                         null_value);
 
     // estimate work
     size_type buffer1_size{};
-    cusparse::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta,
-                                     c_descr, spgemm_descr, buffer1_size,
-                                     nullptr);
+    sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta,
+                                      c_descr, spgemm_descr, buffer1_size,
+                                      nullptr);
     array<char> buffer1{exec, buffer1_size};
-    cusparse::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta,
-                                     c_descr, spgemm_descr, buffer1_size,
-                                     buffer1.get_data());
+    sparselib::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, &beta,
+                                      c_descr, spgemm_descr, buffer1_size,
+                                      buffer1.get_data());
 
     // compute spgemm
     size_type buffer2_size{};
-    cusparse::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr,
-                             spgemm_descr, buffer1.get_data(), buffer2_size,
-                             nullptr);
+    sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr,
+                              spgemm_descr, buffer1.get_data(), buffer2_size,
+                              nullptr);
     array<char> buffer2{exec, buffer2_size};
-    cusparse::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr,
-                             spgemm_descr, buffer1.get_data(), buffer2_size,
-                             buffer2.get_data());
+    sparselib::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, c_descr,
+                              spgemm_descr, buffer1.get_data(), buffer2_size,
+                              buffer2.get_data());
 
     // copy data to result
-    auto c_nnz = cusparse::sparse_matrix_nnz(c_descr);
+    auto c_nnz = sparselib::sparse_matrix_nnz(c_descr);
     c_col_idxs_array.resize_and_reset(c_nnz);
     c_vals_array.resize_and_reset(c_nnz);
-    cusparse::csr_set_pointers(c_descr, c_row_ptrs, c_col_idxs_array.get_data(),
-                               c_vals_array.get_data());
+    sparselib::csr_set_pointers(c_descr, c_row_ptrs,
+                                c_col_idxs_array.get_data(),
+                                c_vals_array.get_data());
 
-    cusparse::spgemm_copy(handle, &alpha, a_descr, b_descr, &beta, c_descr,
-                          spgemm_descr);
+    sparselib::spgemm_copy(handle, &alpha, a_descr, b_descr, &beta, c_descr,
+                           spgemm_descr);
 
-    cusparse::destroy(c_descr);
-    cusparse::destroy(b_descr);
-    cusparse::destroy(a_descr);
-    cusparse::destroy(spgemm_descr);
+    sparselib::destroy(c_descr);
+    sparselib::destroy(b_descr);
+    sparselib::destroy(a_descr);
+    sparselib::destroy(spgemm_descr);
 #endif  // CUDA_VERSION >= 11000
 }
 
@@ -701,8 +703,8 @@ void advanced_spgemm(std::shared_ptr<const DefaultExecutor> exec,
                      const matrix::Csr<ValueType, IndexType>* d,
                      matrix::Csr<ValueType, IndexType>* c)
 {
-    auto handle = exec->get_cusparse_handle();
-    cusparse::pointer_mode_guard pm_guard(handle);
+    auto handle = exec->get_sparselib_handle();
+    sparselib::pointer_mode_guard pm_guard(handle);
 
     auto valpha = exec->copy_val_to_host(alpha->get_const_values());
     auto a_nnz = IndexType(a->get_num_stored_elements());
@@ -724,102 +726,102 @@ void advanced_spgemm(std::shared_ptr<const DefaultExecutor> exec,
     auto c_row_ptrs = c->get_row_ptrs();
 
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-    if (!cusparse::is_supported<ValueType, IndexType>::value) {
+    if (!sparselib::is_supported<ValueType, IndexType>::value) {
         GKO_NOT_IMPLEMENTED;
     }
 
     matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
     auto& c_col_idxs_array = c_builder.get_col_idx_array();
     auto& c_vals_array = c_builder.get_value_array();
-    auto a_descr = cusparse::create_mat_descr();
-    auto b_descr = cusparse::create_mat_descr();
-    auto c_descr = cusparse::create_mat_descr();
-    auto d_descr = cusparse::create_mat_descr();
-    auto info = cusparse::create_spgemm_info();
+    auto a_descr = sparselib::create_mat_descr();
+    auto b_descr = sparselib::create_mat_descr();
+    auto c_descr = sparselib::create_mat_descr();
+    auto d_descr = sparselib::create_mat_descr();
+    auto info = sparselib::create_spgemm_info();
     // allocate buffer
     size_type buffer_size{};
-    cusparse::spgemm_buffer_size(handle, m, n, k, &valpha, a_descr, a_nnz,
-                                 a_row_ptrs, a_col_idxs, b_descr, b_nnz,
-                                 b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz,
-                                 d_row_ptrs, d_col_idxs, info, buffer_size);
+    sparselib::spgemm_buffer_size(
+        handle, m, n, k, &valpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
+        b_descr, b_nnz, b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz,
+        d_row_ptrs, d_col_idxs, info, buffer_size);
     array<char> buffer_array(exec, buffer_size);
     auto buffer = buffer_array.get_data();
 
     // count nnz
     IndexType c_nnz{};
-    cusparse::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs,
-                         a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs,
-                         d_descr, d_nnz, d_row_ptrs, d_col_idxs, c_descr,
-                         c_row_ptrs, &c_nnz, info, buffer);
+    sparselib::spgemm_nnz(handle, m, n, k, a_descr, a_nnz, a_row_ptrs,
+                          a_col_idxs, b_descr, b_nnz, b_row_ptrs, b_col_idxs,
+                          d_descr, d_nnz, d_row_ptrs, d_col_idxs, c_descr,
+                          c_row_ptrs, &c_nnz, info, buffer);
 
     // accumulate non-zeros
     c_col_idxs_array.resize_and_reset(c_nnz);
     c_vals_array.resize_and_reset(c_nnz);
     auto c_col_idxs = c_col_idxs_array.get_data();
     auto c_vals = c_vals_array.get_data();
-    cusparse::spgemm(handle, m, n, k, &valpha, a_descr, a_nnz, a_vals,
-                     a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals, b_row_ptrs,
-                     b_col_idxs, &vbeta, d_descr, d_nnz, d_vals, d_row_ptrs,
-                     d_col_idxs, c_descr, c_vals, c_row_ptrs, c_col_idxs, info,
-                     buffer);
-
-    cusparse::destroy(info);
-    cusparse::destroy(d_descr);
-    cusparse::destroy(c_descr);
-    cusparse::destroy(b_descr);
-    cusparse::destroy(a_descr);
+    sparselib::spgemm(handle, m, n, k, &valpha, a_descr, a_nnz, a_vals,
+                      a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
+                      b_row_ptrs, b_col_idxs, &vbeta, d_descr, d_nnz, d_vals,
+                      d_row_ptrs, d_col_idxs, c_descr, c_vals, c_row_ptrs,
+                      c_col_idxs, info, buffer);
+
+    sparselib::destroy(info);
+    sparselib::destroy(d_descr);
+    sparselib::destroy(c_descr);
+    sparselib::destroy(b_descr);
+    sparselib::destroy(a_descr);
 #else   // CUDA_VERSION >= 11000
     auto null_value = static_cast<ValueType*>(nullptr);
     auto null_index = static_cast<IndexType*>(nullptr);
     auto one_val = one<ValueType>();
     auto zero_val = zero<ValueType>();
     auto zero_nnz = IndexType{};
-    auto spgemm_descr = cusparse::create_spgemm_descr();
-    auto a_descr = cusparse::create_csr(
+    auto spgemm_descr = sparselib::create_spgemm_descr();
+    auto a_descr = sparselib::create_csr(
         m, k, a_nnz, const_cast<IndexType*>(a_row_ptrs),
         const_cast<IndexType*>(a_col_idxs), const_cast<ValueType*>(a_vals));
-    auto b_descr = cusparse::create_csr(
+    auto b_descr = sparselib::create_csr(
         k, n, b_nnz, const_cast<IndexType*>(b_row_ptrs),
         const_cast<IndexType*>(b_col_idxs), const_cast<ValueType*>(b_vals));
-    auto c_descr = cusparse::create_csr(m, n, zero_nnz, null_index, null_index,
-                                        null_value);
+    auto c_descr = sparselib::create_csr(m, n, zero_nnz, null_index, null_index,
+                                         null_value);
 
     // estimate work
     size_type buffer1_size{};
-    cusparse::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
-                                     &zero_val, c_descr, spgemm_descr,
-                                     buffer1_size, nullptr);
+    sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
+                                      &zero_val, c_descr, spgemm_descr,
+                                      buffer1_size, nullptr);
     array<char> buffer1{exec, buffer1_size};
-    cusparse::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
-                                     &zero_val, c_descr, spgemm_descr,
-                                     buffer1_size, buffer1.get_data());
+    sparselib::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
+                                      &zero_val, c_descr, spgemm_descr,
+                                      buffer1_size, buffer1.get_data());
 
     // compute spgemm
     size_type buffer2_size{};
-    cusparse::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
-                             c_descr, spgemm_descr, buffer1.get_data(),
-                             buffer2_size, nullptr);
+    sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
+                              c_descr, spgemm_descr, buffer1.get_data(),
+                              buffer2_size, nullptr);
     array<char> buffer2{exec, buffer2_size};
-    cusparse::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
-                             c_descr, spgemm_descr, buffer1.get_data(),
-                             buffer2_size, buffer2.get_data());
+    sparselib::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
+                              c_descr, spgemm_descr, buffer1.get_data(),
+                              buffer2_size, buffer2.get_data());
 
     // write result to temporary storage
-    auto c_tmp_nnz = cusparse::sparse_matrix_nnz(c_descr);
+    auto c_tmp_nnz = sparselib::sparse_matrix_nnz(c_descr);
     array<IndexType> c_tmp_row_ptrs_array(exec, m + 1);
     array<IndexType> c_tmp_col_idxs_array(exec, c_tmp_nnz);
     array<ValueType> c_tmp_vals_array(exec, c_tmp_nnz);
-    cusparse::csr_set_pointers(c_descr, c_tmp_row_ptrs_array.get_data(),
-                               c_tmp_col_idxs_array.get_data(),
-                               c_tmp_vals_array.get_data());
+    sparselib::csr_set_pointers(c_descr, c_tmp_row_ptrs_array.get_data(),
+                                c_tmp_col_idxs_array.get_data(),
+                                c_tmp_vals_array.get_data());
 
-    cusparse::spgemm_copy(handle, &one_val, a_descr, b_descr, &zero_val,
-                          c_descr, spgemm_descr);
+    sparselib::spgemm_copy(handle, &one_val, a_descr, b_descr, &zero_val,
+                           c_descr, spgemm_descr);
 
-    cusparse::destroy(c_descr);
-    cusparse::destroy(b_descr);
-    cusparse::destroy(a_descr);
-    cusparse::destroy(spgemm_descr);
+    sparselib::destroy(c_descr);
+    sparselib::destroy(b_descr);
+    sparselib::destroy(a_descr);
+    sparselib::destroy(spgemm_descr);
 
     auto spgeam_total_nnz = c_tmp_nnz + d->get_num_stored_elements();
     auto nnz_per_row = spgeam_total_nnz / m;
@@ -846,13 +848,13 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
     if (orig->get_size()[0] == 0) {
         return;
     }
-    if (cusparse::is_supported<ValueType, IndexType>::value) {
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
         cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
         cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
 
-        cusparse::transpose(
-            exec->get_cusparse_handle(), orig->get_size()[0],
+        sparselib::transpose(
+            exec->get_sparselib_handle(), orig->get_size()[0],
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
@@ -864,8 +866,8 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
         cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
         cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG1;
         size_type buffer_size = 0;
-        cusparse::transpose_buffersize(
-            exec->get_cusparse_handle(), orig->get_size()[0],
+        sparselib::transpose_buffersize(
+            exec->get_sparselib_handle(), orig->get_size()[0],
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
@@ -873,8 +875,8 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
             idxBase, alg, &buffer_size);
         array<char> buffer_array(exec, buffer_size);
         auto buffer = buffer_array.get_data();
-        cusparse::transpose(
-            exec->get_cusparse_handle(), orig->get_size()[0],
+        sparselib::transpose(
+            exec->get_sparselib_handle(), orig->get_size()[0],
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
@@ -898,13 +900,13 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
     const auto block_size = default_block_size;
     const auto grid_size =
         ceildiv(trans->get_num_stored_elements(), block_size);
-    if (cusparse::is_supported<ValueType, IndexType>::value) {
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
         cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
         cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
 
-        cusparse::transpose(
-            exec->get_cusparse_handle(), orig->get_size()[0],
+        sparselib::transpose(
+            exec->get_sparselib_handle(), orig->get_size()[0],
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
@@ -916,8 +918,8 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
         cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
         cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG1;
         size_type buffer_size = 0;
-        cusparse::transpose_buffersize(
-            exec->get_cusparse_handle(), orig->get_size()[0],
+        sparselib::transpose_buffersize(
+            exec->get_sparselib_handle(), orig->get_size()[0],
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
@@ -925,8 +927,8 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
             idxBase, alg, &buffer_size);
         array<char> buffer_array(exec, buffer_size);
         auto buffer = buffer_array.get_data();
-        cusparse::transpose(
-            exec->get_cusparse_handle(), orig->get_size()[0],
+        sparselib::transpose(
+            exec->get_sparselib_handle(), orig->get_size()[0],
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
@@ -948,9 +950,9 @@ template <typename ValueType, typename IndexType>
 void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
                           matrix::Csr<ValueType, IndexType>* to_sort)
 {
-    if (cusparse::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_cusparse_handle();
-        auto descr = cusparse::create_mat_descr();
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
+        auto descr = sparselib::create_mat_descr();
         auto m = IndexType(to_sort->get_size()[0]);
         auto n = IndexType(to_sort->get_size()[1]);
         auto nnz = IndexType(to_sort->get_num_stored_elements());
@@ -966,30 +968,30 @@ void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
         // init identity permutation
         array<IndexType> permutation_array(exec, nnz);
         auto permutation = permutation_array.get_data();
-        cusparse::create_identity_permutation(handle, nnz, permutation);
+        components::fill_seq_array(exec, permutation, nnz);
 
         // allocate buffer
         size_type buffer_size{};
-        cusparse::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs,
-                                      buffer_size);
+        sparselib::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs,
+                                       buffer_size);
         array<char> buffer_array{exec, buffer_size};
         auto buffer = buffer_array.get_data();
 
         // sort column indices
-        cusparse::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs,
-                          permutation, buffer);
+        sparselib::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs,
+                           permutation, buffer);
 
         // sort values
 #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
-        cusparse::gather(handle, nnz, tmp_vals, vals, permutation);
+        sparselib::gather(handle, nnz, tmp_vals, vals, permutation);
 #else  // CUDA_VERSION >= 11000
-        auto val_vec = cusparse::create_spvec(nnz, nnz, permutation, vals);
+        auto val_vec = sparselib::create_spvec(nnz, nnz, permutation, vals);
         auto tmp_vec =
-            cusparse::create_dnvec(nnz, const_cast<ValueType*>(tmp_vals));
-        cusparse::gather(handle, tmp_vec, val_vec);
+            sparselib::create_dnvec(nnz, const_cast<ValueType*>(tmp_vals));
+        sparselib::gather(handle, tmp_vec, val_vec);
 #endif
 
-        cusparse::destroy(descr);
+        sparselib::destroy(descr);
     } else {
         fallback_sort(exec, to_sort);
     }
diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
index 04b34953c6a..b117c39107b 100644
--- a/cuda/matrix/dense_kernels.cu
+++ b/cuda/matrix/dense_kernels.cu
@@ -17,12 +17,13 @@
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cublas_bindings.hpp"
-#include "cuda/base/pointer_mode_guard.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/intrinsics.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
@@ -53,11 +54,11 @@ void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                           matrix::Dense<ValueType>* result, array<char>& tmp)
 {
     if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
-        if (cublas::is_supported<ValueType>::value) {
-            auto handle = exec->get_cublas_handle();
-            cublas::dot(handle, x->get_size()[0], x->get_const_values(),
-                        x->get_stride(), y->get_const_values(), y->get_stride(),
-                        result->get_values());
+        if (blas::is_supported<ValueType>::value) {
+            auto handle = exec->get_blas_handle();
+            blas::dot(handle, x->get_size()[0], x->get_const_values(),
+                      x->get_stride(), y->get_const_values(), y->get_stride(),
+                      result->get_values());
         } else {
             compute_dot(exec, x, y, result, tmp);
         }
@@ -78,11 +79,11 @@ void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                                array<char>& tmp)
 {
     if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
-        if (cublas::is_supported<ValueType>::value) {
-            auto handle = exec->get_cublas_handle();
-            cublas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
-                             x->get_stride(), y->get_const_values(),
-                             y->get_stride(), result->get_values());
+        if (blas::is_supported<ValueType>::value) {
+            auto handle = exec->get_blas_handle();
+            blas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
+                           x->get_stride(), y->get_const_values(),
+                           y->get_stride(), result->get_values());
         } else {
             compute_conj_dot(exec, x, y, result, tmp);
         }
@@ -102,10 +103,10 @@ void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                             array<char>& tmp)
 {
     if (x->get_size()[1] == 1) {
-        if (cublas::is_supported<ValueType>::value) {
-            auto handle = exec->get_cublas_handle();
-            cublas::norm2(handle, x->get_size()[0], x->get_const_values(),
-                          x->get_stride(), result->get_values());
+        if (blas::is_supported<ValueType>::value) {
+            auto handle = exec->get_blas_handle();
+            blas::norm2(handle, x->get_size()[0], x->get_const_values(),
+                        x->get_stride(), result->get_values());
         } else {
             compute_norm2(exec, x, result, tmp);
         }
@@ -124,18 +125,18 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* b,
                   matrix::Dense<ValueType>* c)
 {
-    if (cublas::is_supported<ValueType>::value) {
-        auto handle = exec->get_cublas_handle();
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
         if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
             if (a->get_size()[1] > 0) {
-                cublas::pointer_mode_guard pm_guard(handle);
+                blas::pointer_mode_guard pm_guard(handle);
                 auto alpha = one<ValueType>();
                 auto beta = zero<ValueType>();
-                cublas::gemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, c->get_size()[1],
-                             c->get_size()[0], a->get_size()[1], &alpha,
-                             b->get_const_values(), b->get_stride(),
-                             a->get_const_values(), a->get_stride(), &beta,
-                             c->get_values(), c->get_stride());
+                blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1],
+                           c->get_size()[0], a->get_size()[1], &alpha,
+                           b->get_const_values(), b->get_stride(),
+                           a->get_const_values(), a->get_stride(), &beta,
+                           c->get_values(), c->get_stride());
             } else {
                 dense::fill(exec, c, zero<ValueType>());
             }
@@ -154,15 +155,15 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
            const matrix::Dense<ValueType>* a, const matrix::Dense<ValueType>* b,
            const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
 {
-    if (cublas::is_supported<ValueType>::value) {
+    if (blas::is_supported<ValueType>::value) {
         if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
             if (a->get_size()[1] > 0) {
-                cublas::gemm(
-                    exec->get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N,
-                    c->get_size()[1], c->get_size()[0], a->get_size()[1],
-                    alpha->get_const_values(), b->get_const_values(),
-                    b->get_stride(), a->get_const_values(), a->get_stride(),
-                    beta->get_const_values(), c->get_values(), c->get_stride());
+                blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N,
+                           c->get_size()[1], c->get_size()[0], a->get_size()[1],
+                           alpha->get_const_values(), b->get_const_values(),
+                           b->get_stride(), a->get_const_values(),
+                           a->get_stride(), beta->get_const_values(),
+                           c->get_values(), c->get_stride());
             } else {
                 dense::scale(exec, beta, c);
             }
@@ -180,17 +181,17 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
                const matrix::Dense<ValueType>* orig,
                matrix::Dense<ValueType>* trans)
 {
-    if (cublas::is_supported<ValueType>::value) {
-        auto handle = exec->get_cublas_handle();
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
         if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
-            cublas::pointer_mode_guard pm_guard(handle);
+            blas::pointer_mode_guard pm_guard(handle);
             auto alpha = one<ValueType>();
             auto beta = zero<ValueType>();
-            cublas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, orig->get_size()[0],
-                         orig->get_size()[1], &alpha, orig->get_const_values(),
-                         orig->get_stride(), &beta, trans->get_values(),
-                         trans->get_stride(), trans->get_values(),
-                         trans->get_stride());
+            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0],
+                       orig->get_size()[1], &alpha, orig->get_const_values(),
+                       orig->get_stride(), &beta, trans->get_const_values(),
+                       trans->get_stride(), trans->get_values(),
+                       trans->get_stride());
         }
     } else {
         GKO_NOT_IMPLEMENTED;
@@ -205,17 +206,17 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* orig,
                     matrix::Dense<ValueType>* trans)
 {
-    if (cublas::is_supported<ValueType>::value) {
-        auto handle = exec->get_cublas_handle();
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
         if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
-            cublas::pointer_mode_guard pm_guard(handle);
+            blas::pointer_mode_guard pm_guard(handle);
             auto alpha = one<ValueType>();
             auto beta = zero<ValueType>();
-            cublas::geam(handle, CUBLAS_OP_C, CUBLAS_OP_N, orig->get_size()[0],
-                         orig->get_size()[1], &alpha, orig->get_const_values(),
-                         orig->get_stride(), &beta, trans->get_values(),
-                         trans->get_stride(), trans->get_values(),
-                         trans->get_stride());
+            blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0],
+                       orig->get_size()[1], &alpha, orig->get_const_values(),
+                       orig->get_stride(), &beta, trans->get_const_values(),
+                       trans->get_stride(), trans->get_values(),
+                       trans->get_stride());
         }
     } else {
         GKO_NOT_IMPLEMENTED;
diff --git a/cuda/matrix/diagonal_kernels.cu b/cuda/matrix/diagonal_kernels.cu
index b81329e29a0..e362ff0462b 100644
--- a/cuda/matrix/diagonal_kernels.cu
+++ b/cuda/matrix/diagonal_kernels.cu
@@ -9,9 +9,10 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "cuda/base/config.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "cuda/components/thread_ids.cuh"
 
 
diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu
index 9c23abc9dc4..105122ec4a9 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/cuda/matrix/ell_kernels.cu
@@ -15,19 +15,20 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "accessor/cuda_helper.hpp"
+#include "accessor/cuda_hip_helper.hpp"
 #include "accessor/reduced_row_major.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/format_conversion.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 
@@ -97,9 +98,9 @@ void abstract_spmv(syn::value_list<int, info>,
     using arithmetic_type =
         highest_precision<InputValueType, OutputValueType, MatrixValueType>;
     using a_accessor =
-        gko::acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>;
+        acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>;
     using b_accessor =
-        gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>;
+        acc::reduced_row_major<2, arithmetic_type, const InputValueType>;
 
     const auto nrows = a->get_size()[0];
     const auto stride = a->get_stride();
@@ -114,11 +115,11 @@ void abstract_spmv(syn::value_list<int, info>,
     const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x),
                          b->get_size()[1], 1);
 
-    const auto a_vals = gko::acc::range<a_accessor>(
+    const auto a_vals = acc::range<a_accessor>(
         std::array<acc::size_type, 1>{{static_cast<acc::size_type>(
             num_stored_elements_per_row * stride)}},
         a->get_const_values());
-    const auto b_vals = gko::acc::range<b_accessor>(
+    const auto b_vals = acc::range<b_accessor>(
         std::array<acc::size_type, 2>{
             {static_cast<acc::size_type>(b->get_size()[0]),
              static_cast<acc::size_type>(b->get_size()[1])}},
@@ -130,20 +131,21 @@ void abstract_spmv(syn::value_list<int, info>,
         if (grid_size.x > 0 && grid_size.y > 0) {
             kernel::spmv<num_thread_per_worker, atomic>
                 <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    nrows, num_worker_per_row, acc::as_cuda_range(a_vals),
+                    nrows, num_worker_per_row, acc::as_device_range(a_vals),
                     a->get_const_col_idxs(), stride,
-                    num_stored_elements_per_row, acc::as_cuda_range(b_vals),
+                    num_stored_elements_per_row, acc::as_device_range(b_vals),
                     as_device_type(c->get_values()), c->get_stride());
         }
     } else if (alpha != nullptr && beta != nullptr) {
-        const auto alpha_val = gko::acc::range<a_accessor>(
+        const auto alpha_val = acc::range<a_accessor>(
             std::array<acc::size_type, 1>{1}, alpha->get_const_values());
         if (grid_size.x > 0 && grid_size.y > 0) {
             kernel::spmv<num_thread_per_worker, atomic>
                 <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    nrows, num_worker_per_row, acc::as_cuda_range(alpha_val),
-                    acc::as_cuda_range(a_vals), a->get_const_col_idxs(), stride,
-                    num_stored_elements_per_row, acc::as_cuda_range(b_vals),
+                    nrows, num_worker_per_row, acc::as_device_range(alpha_val),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                    stride, num_stored_elements_per_row,
+                    acc::as_device_range(b_vals),
                     as_device_type(beta->get_const_values()),
                     as_device_type(c->get_values()), c->get_stride());
         }
@@ -212,7 +214,7 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
     const int num_worker_per_row = std::get<2>(data);
 
     /**
-     * info is the parameter for selecting the cuda kernel.
+     * info is the parameter for selecting the device kernel.
      * for info == 0, it uses the kernel by warp_size threads with atomic
-     * operation for other value, it uses the kernel without atomic_add
+     * operation; for other values, it uses the kernel without atomic_add.
      */
@@ -246,7 +248,7 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
     const int num_worker_per_row = std::get<2>(data);
 
     /**
-     * info is the parameter for selecting the cuda kernel.
+     * info is the parameter for selecting the device kernel.
      * for info == 0, it uses the kernel by warp_size threads with atomic
-     * operation for other value, it uses the kernel without atomic_add
+     * operation; for other values, it uses the kernel without atomic_add.
      */
diff --git a/cuda/matrix/fbcsr_kernels.template.cu b/cuda/matrix/fbcsr_kernels.template.cu
index 8b835c6fd7d..ad36c84216e 100644
--- a/cuda/matrix/fbcsr_kernels.template.cu
+++ b/cuda/matrix/fbcsr_kernels.template.cu
@@ -24,6 +24,13 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "common/unified/base/kernel_launch.hpp"
 #include "core/base/array_access.hpp"
 #include "core/base/block_sizes.hpp"
@@ -33,16 +40,10 @@
 #include "core/matrix/csr_lookup.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cublas_bindings.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/cusparse_block_bindings.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/pointer_mode_guard.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/merging.cuh"
 #include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
@@ -72,6 +73,7 @@ constexpr int default_block_size{512};
 
 namespace {
 
+
 template <typename ValueType>
 void dense_transpose(std::shared_ptr<const CudaExecutor> exec,
                      const size_type nrows, const size_type ncols,
@@ -81,21 +83,22 @@ void dense_transpose(std::shared_ptr<const CudaExecutor> exec,
     if (nrows == 0) {
         return;
     }
-    if (cublas::is_supported<ValueType>::value) {
-        auto handle = exec->get_cublas_handle();
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
         {
-            cublas::pointer_mode_guard pm_guard(handle);
+            blas::pointer_mode_guard pm_guard(handle);
             auto alpha = one<ValueType>();
             auto beta = zero<ValueType>();
-            cublas::geam(handle, CUBLAS_OP_T, CUBLAS_OP_N, nrows, ncols, &alpha,
-                         orig, orig_stride, &beta, trans, trans_stride, trans,
-                         trans_stride);
+            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig,
+                       orig_stride, &beta, trans, trans_stride, trans,
+                       trans_stride);
         }
     } else {
         GKO_NOT_IMPLEMENTED;
     }
 }
 
+
 }  // namespace
 
 
@@ -114,12 +117,12 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
         dense::fill(exec, c, zero<ValueType>());
         return;
     }
-    if (cusparse::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_cusparse_handle();
-        cusparse::pointer_mode_guard pm_guard(handle);
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
+        sparselib::pointer_mode_guard pm_guard(handle);
         const auto alpha = one<ValueType>();
         const auto beta = zero<ValueType>();
-        auto descr = cusparse::create_mat_descr();
+        auto descr = sparselib::create_mat_descr();
         const auto row_ptrs = a->get_const_row_ptrs();
         const auto col_idxs = a->get_const_col_idxs();
         const auto values = a->get_const_values();
@@ -133,21 +136,21 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
         const auto in_stride = b->get_stride();
         const auto out_stride = c->get_stride();
         if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
-            cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb,
-                            nnzb, &alpha, descr, values, row_ptrs, col_idxs, bs,
-                            b->get_const_values(), &beta, c->get_values());
+            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
+                             nnzb, &alpha, descr, values, row_ptrs, col_idxs,
+                             bs, b->get_const_values(), &beta, c->get_values());
         } else {
             const auto trans_stride = nrows;
             auto trans_c = array<ValueType>(exec, nrows * nrhs);
-            cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                            CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
-                            &alpha, descr, values, row_ptrs, col_idxs, bs,
-                            b->get_const_values(), in_stride, &beta,
-                            trans_c.get_data(), trans_stride);
+            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
+                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
+                             &alpha, descr, values, row_ptrs, col_idxs, bs,
+                             b->get_const_values(), in_stride, &beta,
+                             trans_c.get_data(), trans_stride);
             dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
                             out_stride, c->get_values());
         }
-        cusparse::destroy(descr);
+        sparselib::destroy(descr);
     } else {
         GKO_NOT_IMPLEMENTED;
     }
@@ -171,11 +174,11 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
         dense::scale(exec, beta, c);
         return;
     }
-    if (cusparse::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_cusparse_handle();
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
         const auto alphp = alpha->get_const_values();
         const auto betap = beta->get_const_values();
-        auto descr = cusparse::create_mat_descr();
+        auto descr = sparselib::create_mat_descr();
         const auto row_ptrs = a->get_const_row_ptrs();
         const auto col_idxs = a->get_const_col_idxs();
         const auto values = a->get_const_values();
@@ -189,23 +192,23 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
         const auto in_stride = b->get_stride();
         const auto out_stride = c->get_stride();
         if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
-            cusparse::bsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb,
-                            nnzb, alphp, descr, values, row_ptrs, col_idxs, bs,
-                            b->get_const_values(), betap, c->get_values());
+            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
+                             nnzb, alphp, descr, values, row_ptrs, col_idxs, bs,
+                             b->get_const_values(), betap, c->get_values());
         } else {
             const auto trans_stride = nrows;
             auto trans_c = array<ValueType>(exec, nrows * nrhs);
             dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(),
                             trans_stride, trans_c.get_data());
-            cusparse::bsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                            CUSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
-                            alphp, descr, values, row_ptrs, col_idxs, bs,
-                            b->get_const_values(), in_stride, betap,
-                            trans_c.get_data(), trans_stride);
+            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
+                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
+                             alphp, descr, values, row_ptrs, col_idxs, bs,
+                             b->get_const_values(), in_stride, betap,
+                             trans_c.get_data(), trans_stride);
             dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
                             out_stride, c->get_values());
         }
-        cusparse::destroy(descr);
+        sparselib::destroy(descr);
     } else {
         GKO_NOT_IMPLEMENTED;
     }
@@ -244,20 +247,21 @@ void transpose(const std::shared_ptr<const CudaExecutor> exec,
                const matrix::Fbcsr<ValueType, IndexType>* const orig,
                matrix::Fbcsr<ValueType, IndexType>* const trans)
 {
-    if (cusparse::is_supported<ValueType, IndexType>::value) {
+#ifdef GKO_COMPILING_CUDA
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
         const int bs = orig->get_block_size();
         const IndexType nnzb =
             static_cast<IndexType>(orig->get_num_stored_blocks());
         cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
         cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
-        const IndexType buffer_size = cusparse::bsr_transpose_buffersize(
-            exec->get_cusparse_handle(), orig->get_num_block_rows(),
+        const IndexType buffer_size = sparselib::bsr_transpose_buffersize(
+            exec->get_sparselib_handle(), orig->get_num_block_rows(),
             orig->get_num_block_cols(), nnzb, orig->get_const_values(),
             orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs);
         array<char> buffer_array(exec, buffer_size);
         auto buffer = buffer_array.get_data();
-        cusparse::bsr_transpose(
-            exec->get_cusparse_handle(), orig->get_num_block_rows(),
+        sparselib::bsr_transpose(
+            exec->get_sparselib_handle(), orig->get_num_block_rows(),
             orig->get_num_block_cols(), nnzb, orig->get_const_values(),
             orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs,
             trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(),
@@ -268,7 +272,9 @@ void transpose(const std::shared_ptr<const CudaExecutor> exec,
             fixedblock::compiled_kernels(),
             [bs](int compiled_block_size) { return bs == compiled_block_size; },
             syn::value_list<int>(), syn::type_list<>(), exec, trans);
-    } else {
+    } else
+#endif
+    {
         fallback_transpose(exec, orig, trans);
     }
 }
diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu
index 5eadf0d3858..d6c20075ef4 100644
--- a/cuda/matrix/sellp_kernels.cu
+++ b/cuda/matrix/sellp_kernels.cu
@@ -12,10 +12,11 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 
diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu
index 3a3d60b24e0..311e4d3782c 100644
--- a/cuda/matrix/sparsity_csr_kernels.cu
+++ b/cuda/matrix/sparsity_csr_kernels.cu
@@ -11,18 +11,19 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
-#include "accessor/cuda_helper.hpp"
+#include "accessor/cuda_hip_helper.hpp"
 #include "accessor/reduced_row_major.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/format_conversion_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
@@ -41,7 +42,11 @@ namespace sparsity_csr {
 
 constexpr int classical_oversubscription = 32;
 constexpr int default_block_size = 512;
+#ifdef GKO_COMPILING_HIP
+constexpr int spmv_block_size = 256;
+#else
 constexpr int spmv_block_size = 128;
+#endif
 constexpr int warps_in_block = 4;
 
 
@@ -105,16 +110,16 @@ void classical_spmv(syn::value_list<int, subwarp_size>,
                 a->get_size()[0], as_device_type(a->get_const_value()),
                 a->get_const_col_idxs(),
                 as_device_type(a->get_const_row_ptrs()),
-                acc::as_cuda_range(b_vals), acc::as_cuda_range(c_vals));
+                acc::as_device_range(b_vals), acc::as_device_range(c_vals));
     } else if (alpha != nullptr && beta != nullptr) {
         kernel::abstract_classical_spmv<subwarp_size>
             <<<grid, block, 0, exec->get_stream()>>>(
                 a->get_size()[0], as_device_type(alpha->get_const_values()),
                 as_device_type(a->get_const_value()), a->get_const_col_idxs(),
                 as_device_type(a->get_const_row_ptrs()),
-                acc::as_cuda_range(b_vals),
+                acc::as_device_range(b_vals),
                 as_device_type(beta->get_const_values()),
-                acc::as_cuda_range(c_vals));
+                acc::as_device_range(c_vals));
     } else {
         GKO_KERNEL_NOT_FOUND;
     }
@@ -168,21 +173,21 @@ void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_cols = static_cast<IndexType>(to_sort->get_size()[1]);
     const auto row_ptrs = to_sort->get_const_row_ptrs();
     const auto col_idxs = to_sort->get_col_idxs();
-    if (cusparse::is_supported<ValueType, IndexType>::value) {
-        const auto handle = exec->get_cusparse_handle();
-        auto descr = cusparse::create_mat_descr();
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        const auto handle = exec->get_sparselib_handle();
+        auto descr = sparselib::create_mat_descr();
         array<IndexType> permutation_array(exec, to_sort->get_num_nonzeros());
         auto permutation = permutation_array.get_data();
         components::fill_seq_array(exec, permutation,
                                    to_sort->get_num_nonzeros());
         size_type buffer_size{};
-        cusparse::csrsort_buffer_size(handle, num_rows, num_cols, nnz, row_ptrs,
-                                      col_idxs, buffer_size);
+        sparselib::csrsort_buffer_size(handle, num_rows, num_cols, nnz,
+                                       row_ptrs, col_idxs, buffer_size);
         array<char> buffer_array{exec, buffer_size};
         auto buffer = buffer_array.get_data();
-        cusparse::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs,
-                          col_idxs, permutation, buffer);
-        cusparse::destroy(descr);
+        sparselib::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs,
+                           col_idxs, permutation, buffer);
+        sparselib::destroy(descr);
     } else {
         fallback_sort(exec, to_sort);
     }
diff --git a/cuda/multigrid/pgm_kernels.cu b/cuda/multigrid/pgm_kernels.cu
index b5e9fa1612d..75c3dd911ad 100644
--- a/cuda/multigrid/pgm_kernels.cu
+++ b/cuda/multigrid/pgm_kernels.cu
@@ -8,8 +8,6 @@
 #include <memory>
 
 
-#include <cuda.h>
-#include <cusparse.h>
 #include <thrust/device_ptr.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/reduce.h>
@@ -21,8 +19,8 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
 
 
 namespace gko {
diff --git a/cuda/preconditioner/batch_preconditioners.cuh b/cuda/preconditioner/batch_preconditioners.cuh
index 0eae8650bdc..e83d6e04ee9 100644
--- a/cuda/preconditioner/batch_preconditioners.cuh
+++ b/cuda/preconditioner/batch_preconditioners.cuh
@@ -6,9 +6,9 @@
 #define GKO_CUDA_PRECONDITIONER_BATCH_PRECONDITIONERS_CUH_
 
 
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/preconditioner/batch_jacobi_helpers.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 
 
diff --git a/cuda/preconditioner/isai_kernels.cu b/cuda/preconditioner/isai_kernels.cu
index 6551f32bb86..d0dd516466a 100644
--- a/cuda/preconditioner/isai_kernels.cu
+++ b/cuda/preconditioner/isai_kernels.cu
@@ -10,12 +10,13 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/merging.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernel.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.cu
similarity index 100%
rename from cuda/preconditioner/jacobi_advanced_apply_kernel.cu
rename to cuda/preconditioner/jacobi_advanced_apply_kernels.cu
diff --git a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
similarity index 95%
rename from cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu
rename to cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
index 5633ad15a4b..10ede90da7e 100644
--- a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu
+++ b/cuda/preconditioner/jacobi_advanced_apply_kernels.instantiate.cu
@@ -8,14 +8,14 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/warp_blas.cuh"
 #include "cuda/preconditioner/jacobi_common.hpp"
@@ -32,7 +32,7 @@ namespace cuda {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc"
+#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc"
 
 
 // clang-format off
diff --git a/cuda/preconditioner/jacobi_common.hpp.in b/cuda/preconditioner/jacobi_common.hpp.in
index fe99fd88227..aeb47fec97e 100644
--- a/cuda/preconditioner/jacobi_common.hpp.in
+++ b/cuda/preconditioner/jacobi_common.hpp.in
@@ -6,7 +6,7 @@
 #include <ginkgo/core/synthesizer/containers.hpp>
 
 
-#include "cuda/base/config.hpp"
+#include "common/cuda_hip/base/config.hpp"
 
 namespace gko {
 namespace kernels {
diff --git a/cuda/preconditioner/jacobi_generate_kernel.cu b/cuda/preconditioner/jacobi_generate_kernels.cu
similarity index 100%
rename from cuda/preconditioner/jacobi_generate_kernel.cu
rename to cuda/preconditioner/jacobi_generate_kernels.cu
diff --git a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
similarity index 94%
rename from cuda/preconditioner/jacobi_generate_instantiate.inc.cu
rename to cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
index a76c4fba271..129c50625f4 100644
--- a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu
+++ b/cuda/preconditioner/jacobi_generate_kernels.instantiate.cu
@@ -9,14 +9,14 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/diagonal_block_manipulation.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
@@ -35,7 +35,7 @@ namespace cuda {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc"
+#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc"
 
 
 // clang-format off
diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu
index 2508f33acb9..bce2ff23303 100644
--- a/cuda/preconditioner/jacobi_kernels.cu
+++ b/cuda/preconditioner/jacobi_kernels.cu
@@ -8,13 +8,14 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/preconditioner/jacobi_common.hpp"
 
@@ -30,8 +31,12 @@ namespace cuda {
 namespace jacobi {
 
 
-// a total of 32 warps (1024 threads)
+// a total of 32 warps, or 16 on the HIP HCC platform (1024 threads)
+#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
+constexpr int default_num_warps = 16;
+#else  // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC
 constexpr int default_num_warps = 32;
+#endif
 // with current architectures, at most 32 warps can be scheduled per SM (and
 // current GPUs have at most 84 SMs)
 constexpr int default_grid_size = 32 * 32 * 128;
diff --git a/cuda/preconditioner/jacobi_simple_apply_kernel.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.cu
similarity index 100%
rename from cuda/preconditioner/jacobi_simple_apply_kernel.cu
rename to cuda/preconditioner/jacobi_simple_apply_kernels.cu
diff --git a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
similarity index 95%
rename from cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu
rename to cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
index 07689daa815..15f6dc138ad 100644
--- a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu
+++ b/cuda/preconditioner/jacobi_simple_apply_kernels.instantiate.cu
@@ -8,14 +8,14 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/warp_blas.cuh"
 #include "cuda/preconditioner/jacobi_common.hpp"
@@ -32,7 +32,7 @@ namespace cuda {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc"
+#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc"
 
 
 // clang-format off
diff --git a/cuda/reorder/rcm_kernels.cu b/cuda/reorder/rcm_kernels.cu
index d699d00dfb6..72322016fba 100644
--- a/cuda/reorder/rcm_kernels.cu
+++ b/cuda/reorder/rcm_kernels.cu
@@ -25,9 +25,9 @@
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "common/cuda_hip/components/memory.hpp"
 #include "core/base/array_access.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/components/memory.cuh"
 #include "cuda/components/thread_ids.cuh"
 
 
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
index 0ce95e2d34f..58e1a6b7b0d 100644
--- a/cuda/solver/batch_bicgstab_kernels.cu
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -13,15 +13,16 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/kernel_config.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu
index f429e5f22f0..398e831eb09 100644
--- a/cuda/solver/batch_cg_kernels.cu
+++ b/cuda/solver/batch_cg_kernels.cu
@@ -13,15 +13,15 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "cuda/base/batch_struct.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/kernel_config.hpp"
 #include "cuda/base/thrust.cuh"
-#include "cuda/base/types.hpp"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu
index 107835ca1b5..3dbefadf22a 100644
--- a/cuda/solver/cb_gmres_kernels.cu
+++ b/cuda/solver/cb_gmres_kernels.cu
@@ -8,25 +8,25 @@
 #include <algorithm>
 
 
+#include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/base/range.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
 
-#include "accessor/cuda_helper.hpp"
+#include "accessor/cuda_hip_helper.hpp"
 #include "accessor/range.hpp"
 #include "accessor/reduced_row_major.hpp"
 #include "accessor/scaled_reduced_row_major.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/array_access.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/solver/cb_gmres_accessor.hpp"
-#include "cuda/base/config.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
@@ -44,6 +44,8 @@ namespace cb_gmres {
 
 
 constexpr int default_block_size = 512;
+// default_dot_dim cannot be 64 in HIP because 64 * 64 exceeds the max block
+// size limit.
 constexpr int default_dot_dim = 32;
 constexpr int default_dot_size = default_dot_dim * default_dot_dim;
 
@@ -116,7 +118,7 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
     restart_1_kernel<block_size>
         <<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
             residual->get_size()[0], residual->get_size()[1], krylov_dim,
-            acc::as_cuda_range(krylov_bases),
+            acc::as_device_range(krylov_bases),
             as_device_type(residual_norm_collection->get_values()),
             residual_norm_collection->get_stride());
     kernels::cuda::dense::compute_norm2_dispatch(exec, residual, residual_norm,
@@ -145,7 +147,7 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
                 residual_norm->get_stride(),
                 as_device_type(arnoldi_norm->get_const_values() +
                                2 * stride_arnoldi),
-                stride_arnoldi, acc::as_cuda_range(krylov_bases));
+                stride_arnoldi, acc::as_device_range(krylov_bases));
     }
 
     const auto grid_dim_2 =
@@ -158,7 +160,7 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
             residual->get_stride(),
             as_device_type(residual_norm->get_const_values()),
             as_device_type(residual_norm_collection->get_values()),
-            acc::as_cuda_range(krylov_bases),
+            acc::as_device_range(krylov_bases),
             as_device_type(next_krylov_basis->get_values()),
             next_krylov_basis->get_stride(),
             as_device_type(final_iter_nums->get_data()));
@@ -212,6 +214,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
         as_device_type(next_krylov_basis->get_const_values()),
         stride_next_krylov, as_device_type(arnoldi_norm->get_values()),
         as_device_type(stop_status));
+    // nrmP = norm(next_krylov_basis)
     zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg,
                 hessenberg_iter->get_values());
     if (dim_size[1] > 1) {
@@ -219,7 +222,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
             <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
                 dim_size[0], dim_size[1],
                 as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov, acc::as_cuda_range(krylov_bases),
+                stride_next_krylov, acc::as_device_range(krylov_bases),
                 as_device_type(hessenberg_iter->get_values()),
                 stride_hessenberg, as_device_type(stop_status));
     } else {
@@ -228,7 +231,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                exec->get_stream()>>>(
                 dim_size[0],
                 as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov, acc::as_cuda_range(krylov_bases),
+                stride_next_krylov, acc::as_device_range(krylov_bases),
                 as_device_type(hessenberg_iter->get_values()),
                 stride_hessenberg, as_device_type(stop_status));
     }
@@ -240,7 +243,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
            default_block_size, 0, exec->get_stream()>>>(
             iter + 1, dim_size[0], dim_size[1],
             as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
-            acc::as_cuda_range(krylov_bases),
+            acc::as_device_range(krylov_bases),
             as_device_type(hessenberg_iter->get_const_values()),
             stride_hessenberg, as_device_type(stop_status));
 
@@ -269,7 +272,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
            exec->get_stream()>>>(
             dim_size[1], as_device_type(arnoldi_norm->get_values()),
             stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
-            stride_hessenberg, iter + 1, acc::as_cuda_range(krylov_bases),
+            stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases),
             as_device_type(stop_status), as_device_type(reorth_status),
             as_device_type(num_reorth->get_data()));
     num_reorth_host = get_element(*num_reorth, 0);
@@ -282,7 +285,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                 <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
                     dim_size[0], dim_size[1],
                     as_device_type(next_krylov_basis->get_const_values()),
-                    stride_next_krylov, acc::as_cuda_range(krylov_bases),
+                    stride_next_krylov, acc::as_device_range(krylov_bases),
                     as_device_type(buffer_iter->get_values()), stride_buffer,
                     as_device_type(stop_status));
         } else {
@@ -291,7 +294,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                    exec->get_stream()>>>(
                     dim_size[0],
                     as_device_type(next_krylov_basis->get_const_values()),
-                    stride_next_krylov, acc::as_cuda_range(krylov_bases),
+                    stride_next_krylov, acc::as_device_range(krylov_bases),
                     as_device_type(buffer_iter->get_values()), stride_buffer,
                     as_device_type(stop_status));
         }
@@ -303,7 +306,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                default_block_size, 0, exec->get_stream()>>>(
                 iter + 1, dim_size[0], dim_size[1],
                 as_device_type(next_krylov_basis->get_values()),
-                stride_next_krylov, acc::as_cuda_range(krylov_bases),
+                stride_next_krylov, acc::as_device_range(krylov_bases),
                 as_device_type(hessenberg_iter->get_values()),
                 stride_hessenberg,
                 as_device_type(buffer_iter->get_const_values()), stride_buffer,
@@ -335,18 +338,19 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                exec->get_stream()>>>(
                 dim_size[1], as_device_type(arnoldi_norm->get_values()),
                 stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg, iter + 1, acc::as_cuda_range(krylov_bases),
+                stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases),
                 as_device_type(stop_status), as_device_type(reorth_status),
                 num_reorth->get_data());
         num_reorth_host = get_element(*num_reorth, 0);
+        // num_reorth_host := number of next_krylov vectors to be
+        // reorthogonalized
     }
-
     update_krylov_next_krylov_kernel<default_block_size>
         <<<ceildiv(dim_size[0] * stride_next_krylov, default_block_size),
            default_block_size, 0, exec->get_stream()>>>(
             iter, dim_size[0], dim_size[1],
             as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
-            acc::as_cuda_range(krylov_bases),
+            acc::as_device_range(krylov_bases),
             as_device_type(hessenberg_iter->get_const_values()),
             stride_hessenberg, as_device_type(stop_status));
     // next_krylov_basis /= hessenberg(iter, iter + 1)
@@ -460,7 +464,7 @@ void calculate_qy(std::shared_ptr<const DefaultExecutor> exec,
 
     calculate_Qy_kernel<block_size>
         <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            num_rows, num_cols, acc::as_cuda_range(krylov_bases),
+            num_rows, num_cols, acc::as_device_range(krylov_bases),
             as_device_type(y->get_const_values()), y->get_stride(),
             as_device_type(before_preconditioner->get_values()),
             stride_before_preconditioner,
diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh
index cb627e04b53..992974e95ef 100644
--- a/cuda/solver/common_trs_kernels.cuh
+++ b/cuda/solver/common_trs_kernels.cuh
@@ -20,15 +20,15 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "core/base/array_access.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/pointer_mode_guard.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
-#include "cuda/components/memory.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
 
@@ -66,7 +66,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
     CudaSolveStruct(std::shared_ptr<const gko::CudaExecutor> exec,
                     const matrix::Csr<ValueType, IndexType>* matrix,
                     size_type num_rhs, bool is_upper, bool unit_diag)
-        : handle{exec->get_cusparse_handle()},
+        : handle{exec->get_sparselib_handle()},
           spsm_descr{},
           descr_a{},
           num_rhs{num_rhs},
@@ -75,18 +75,18 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
         if (num_rhs == 0) {
             return;
         }
-        cusparse::pointer_mode_guard pm_guard(handle);
-        spsm_descr = cusparse::create_spsm_descr();
-        descr_a = cusparse::create_csr(
+        sparselib::pointer_mode_guard pm_guard(handle);
+        spsm_descr = sparselib::create_spsm_descr();
+        descr_a = sparselib::create_csr(
             matrix->get_size()[0], matrix->get_size()[1],
             matrix->get_num_stored_elements(),
             const_cast<IndexType*>(matrix->get_const_row_ptrs()),
             const_cast<IndexType*>(matrix->get_const_col_idxs()),
             const_cast<ValueType*>(matrix->get_const_values()));
-        cusparse::set_attribute<cusparseFillMode_t>(
+        sparselib::set_attribute<cusparseFillMode_t>(
             descr_a, CUSPARSE_SPMAT_FILL_MODE,
             is_upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER);
-        cusparse::set_attribute<cusparseDiagType_t>(
+        sparselib::set_attribute<cusparseDiagType_t>(
             descr_a, CUSPARSE_SPMAT_DIAG_TYPE,
             unit_diag ? CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT);
 
@@ -94,28 +94,28 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
         // workaround suggested by NVIDIA engineers: for some reason
         // cusparse needs non-nullptr input vectors even for analysis
         // also make sure they are aligned by 16 bytes
-        auto descr_b = cusparse::create_dnmat(
+        auto descr_b = sparselib::create_dnmat(
             dim<2>{matrix->get_size()[0], num_rhs}, matrix->get_size()[1],
             reinterpret_cast<ValueType*>(0xDEAD0));
-        auto descr_c = cusparse::create_dnmat(
+        auto descr_c = sparselib::create_dnmat(
             dim<2>{matrix->get_size()[0], num_rhs}, matrix->get_size()[1],
             reinterpret_cast<ValueType*>(0xDEAF0));
 
-        auto work_size = cusparse::spsm_buffer_size(
-            handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-            CUSPARSE_OPERATION_NON_TRANSPOSE, one<ValueType>(), descr_a,
+        auto work_size = sparselib::spsm_buffer_size(
+            handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
+            SPARSELIB_OPERATION_NON_TRANSPOSE, one<ValueType>(), descr_a,
             descr_b, descr_c, CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr);
 
         work.resize_and_reset(work_size);
 
-        cusparse::spsm_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                                CUSPARSE_OPERATION_NON_TRANSPOSE,
-                                one<ValueType>(), descr_a, descr_b, descr_c,
-                                CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr,
-                                work.get_data());
+        sparselib::spsm_analysis(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
+                                 SPARSELIB_OPERATION_NON_TRANSPOSE,
+                                 one<ValueType>(), descr_a, descr_b, descr_c,
+                                 CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr,
+                                 work.get_data());
 
-        cusparse::destroy(descr_b);
-        cusparse::destroy(descr_c);
+        sparselib::destroy(descr_b);
+        sparselib::destroy(descr_c);
     }
 
     void solve(const matrix::Csr<ValueType, IndexType>*,
@@ -134,30 +134,30 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
                 "provided at generation time. Check the value specified in "
                 ".with_num_rhs(...)."};
         }
-        cusparse::pointer_mode_guard pm_guard(handle);
-        auto descr_b = cusparse::create_dnmat(
+        sparselib::pointer_mode_guard pm_guard(handle);
+        auto descr_b = sparselib::create_dnmat(
             input->get_size(), input->get_stride(),
             const_cast<ValueType*>(input->get_const_values()));
-        auto descr_c = cusparse::create_dnmat(
+        auto descr_c = sparselib::create_dnmat(
             output->get_size(), output->get_stride(), output->get_values());
 
-        cusparse::spsm_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                             CUSPARSE_OPERATION_NON_TRANSPOSE, one<ValueType>(),
-                             descr_a, descr_b, descr_c,
-                             CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr);
+        sparselib::spsm_solve(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
+                              SPARSELIB_OPERATION_NON_TRANSPOSE,
+                              one<ValueType>(), descr_a, descr_b, descr_c,
+                              CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr);
 
-        cusparse::destroy(descr_b);
-        cusparse::destroy(descr_c);
+        sparselib::destroy(descr_b);
+        sparselib::destroy(descr_c);
     }
 
     ~CudaSolveStruct()
     {
         if (descr_a) {
-            cusparse::destroy(descr_a);
+            sparselib::destroy(descr_a);
             descr_a = nullptr;
         }
         if (spsm_descr) {
-            cusparse::destroy(spsm_descr);
+            sparselib::destroy(spsm_descr);
             spsm_descr = nullptr;
         }
     }
@@ -189,7 +189,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
                     const matrix::Csr<ValueType, IndexType>* matrix,
                     size_type num_rhs, bool is_upper, bool unit_diag)
         : exec{exec},
-          handle{exec->get_cusparse_handle()},
+          handle{exec->get_sparselib_handle()},
           algorithm{},
           solve_info{},
           policy{},
@@ -200,23 +200,23 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
         if (num_rhs == 0) {
             return;
         }
-        cusparse::pointer_mode_guard pm_guard(handle);
-        factor_descr = cusparse::create_mat_descr();
-        solve_info = cusparse::create_solve_info();
-        cusparse::set_mat_fill_mode(
+        sparselib::pointer_mode_guard pm_guard(handle);
+        factor_descr = sparselib::create_mat_descr();
+        solve_info = sparselib::create_solve_info();
+        sparselib::set_mat_fill_mode(
             factor_descr,
             is_upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER);
-        cusparse::set_mat_diag_type(
+        sparselib::set_mat_diag_type(
             factor_descr,
             unit_diag ? CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT);
         algorithm = 0;
-        policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
+        policy = SPARSELIB_SOLVE_POLICY_USE_LEVEL;
 
         size_type work_size{};
 
-        cusparse::buffer_size_ext(
-            handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE,
-            CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs,
+        sparselib::buffer_size_ext(
+            handle, algorithm, SPARSELIB_OPERATION_NON_TRANSPOSE,
+            SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs,
             matrix->get_num_stored_elements(), one<ValueType>(), factor_descr,
             matrix->get_const_values(), matrix->get_const_row_ptrs(),
             matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy,
@@ -225,9 +225,9 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
         // allocate workspace
         work.resize_and_reset(work_size);
 
-        cusparse::csrsm2_analysis(
-            handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE,
-            CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs,
+        sparselib::csrsm2_analysis(
+            handle, algorithm, SPARSELIB_OPERATION_NON_TRANSPOSE,
+            SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs,
             matrix->get_num_stored_elements(), one<ValueType>(), factor_descr,
             matrix->get_const_values(), matrix->get_const_row_ptrs(),
             matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy,
@@ -250,11 +250,11 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
                 "provided at generation time. Check the value specified in "
                 ".with_num_rhs(...)."};
         }
-        cusparse::pointer_mode_guard pm_guard(handle);
+        sparselib::pointer_mode_guard pm_guard(handle);
         dense::copy(exec, input, output);
-        cusparse::csrsm2_solve(
-            handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE,
-            CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0],
+        sparselib::csrsm2_solve(
+            handle, algorithm, SPARSELIB_OPERATION_NON_TRANSPOSE,
+            SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0],
             output->get_stride(), matrix->get_num_stored_elements(),
             one<ValueType>(), factor_descr, matrix->get_const_values(),
             matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
@@ -265,11 +265,11 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
     ~CudaSolveStruct()
     {
         if (factor_descr) {
-            cusparse::destroy(factor_descr);
+            sparselib::destroy(factor_descr);
             factor_descr = nullptr;
         }
         if (solve_info) {
-            cusparse::destroy(solve_info);
+            sparselib::destroy(solve_info);
             solve_info = nullptr;
         }
     }
@@ -304,7 +304,7 @@ void generate_kernel(std::shared_ptr<const CudaExecutor> exec,
     if (matrix->get_size()[0] == 0) {
         return;
     }
-    if (cusparse::is_supported<ValueType, IndexType>::value) {
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
         solve_struct = std::make_shared<CudaSolveStruct<ValueType, IndexType>>(
             exec, matrix, num_rhs, is_upper, unit_diag);
     } else {
@@ -327,7 +327,7 @@ void solve_kernel(std::shared_ptr<const CudaExecutor> exec,
     }
     using vec = matrix::Dense<ValueType>;
 
-    if (cusparse::is_supported<ValueType, IndexType>::value) {
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
         if (auto cuda_solve_struct =
                 dynamic_cast<const CudaSolveStruct<ValueType, IndexType>*>(
                     solve_struct)) {
diff --git a/cuda/solver/idr_kernels.cu b/cuda/solver/idr_kernels.cu
index 9c97d99f13c..f7e89c9d9d8 100644
--- a/cuda/solver/idr_kernels.cu
+++ b/cuda/solver/idr_kernels.cu
@@ -13,14 +13,15 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/randlib_bindings.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/fill_array_kernels.hpp"
-#include "cuda/base/config.hpp"
-#include "cuda/base/cublas_bindings.hpp"
-#include "cuda/base/curand_bindings.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
-#include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 
@@ -69,14 +70,14 @@ void initialize_subspace_vectors(std::shared_ptr<const DefaultExecutor> exec,
                                  bool deterministic)
 {
     if (!deterministic) {
-        auto gen = curand::rand_generator(std::random_device{}(),
-                                          CURAND_RNG_PSEUDO_DEFAULT,
-                                          exec->get_stream());
-        curand::rand_vector(
+        auto gen = randlib::rand_generator(std::random_device{}(),
+                                           RANDLIB_RNG_PSEUDO_DEFAULT,
+                                           exec->get_stream());
+        randlib::rand_vector(
             gen,
             subspace_vectors->get_size()[0] * subspace_vectors->get_stride(),
             0.0, 1.0, subspace_vectors->get_values());
-        curand::destroy(gen);
+        randlib::destroy(gen);
     }
 }
 
@@ -145,9 +146,8 @@ void update_g_and_u(std::shared_ptr<const DefaultExecutor> exec,
                 as_device_type(alpha->get_values()),
                 stop_status->get_const_data());
         } else {
-            cublas::dot(exec->get_cublas_handle(), size, p_i, 1,
-                        g_k->get_values(), g_k->get_stride(),
-                        alpha->get_values());
+            blas::dot(exec->get_blas_handle(), size, p_i, 1, g_k->get_values(),
+                      g_k->get_stride(), alpha->get_values());
         }
         update_g_k_and_u_kernel<default_block_size>
             <<<ceildiv(size * g_k->get_stride(), default_block_size),
@@ -196,8 +196,8 @@ void update_m(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
                 as_device_type(g_k->get_const_values()), g_k->get_stride(),
                 as_device_type(m_i), stop_status->get_const_data());
         } else {
-            cublas::dot(exec->get_cublas_handle(), size, p_i, 1,
-                        g_k->get_const_values(), g_k->get_stride(), m_i);
+            blas::dot(exec->get_blas_handle(), size, p_i, 1,
+                      g_k->get_const_values(), g_k->get_stride(), m_i);
         }
     }
 }
diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu
index 46b4cb4c2e4..002cc0140cb 100644
--- a/cuda/solver/lower_trs_kernels.cu
+++ b/cuda/solver/lower_trs_kernels.cu
@@ -17,9 +17,9 @@
 #include <ginkgo/core/solver/triangular.hpp>
 
 
-#include "cuda/base/cusparse_bindings.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/solver/common_trs_kernels.cuh"
 
 
diff --git a/cuda/solver/multigrid_kernels.cu b/cuda/solver/multigrid_kernels.cu
index 4eea02883b2..1d31130623a 100644
--- a/cuda/solver/multigrid_kernels.cu
+++ b/cuda/solver/multigrid_kernels.cu
@@ -11,9 +11,10 @@
 #include <ginkgo/core/base/types.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "core/base/array_access.hpp"
 #include "core/components/fill_array_kernels.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/thread_ids.cuh"
 
 
diff --git a/cuda/solver/upper_trs_kernels.cu b/cuda/solver/upper_trs_kernels.cu
index a8ee5f77cca..e1e01538f79 100644
--- a/cuda/solver/upper_trs_kernels.cu
+++ b/cuda/solver/upper_trs_kernels.cu
@@ -17,9 +17,9 @@
 #include <ginkgo/core/solver/triangular.hpp>
 
 
-#include "cuda/base/cusparse_bindings.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/solver/common_trs_kernels.cuh"
 
 
diff --git a/cuda/stop/criterion_kernels.cu b/cuda/stop/criterion_kernels.cu
index 17bcbbc1567..e54b5d140f2 100644
--- a/cuda/stop/criterion_kernels.cu
+++ b/cuda/stop/criterion_kernels.cu
@@ -10,8 +10,8 @@
 #include <ginkgo/core/stop/stopping_status.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/thread_ids.cuh"
 
 
diff --git a/cuda/stop/residual_norm_kernels.cu b/cuda/stop/residual_norm_kernels.cu
index 18102d91ec5..7146d0cbf04 100644
--- a/cuda/stop/residual_norm_kernels.cu
+++ b/cuda/stop/residual_norm_kernels.cu
@@ -10,9 +10,9 @@
 #include <ginkgo/core/stop/residual_norm.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "core/base/array_access.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/components/thread_ids.cuh"
 
 
diff --git a/cuda/test/base/math.cu b/cuda/test/base/math.cu
index c7f70fe3011..944e7642223 100644
--- a/cuda/test/base/math.cu
+++ b/cuda/test/base/math.cu
@@ -17,8 +17,8 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "cuda/base/math.hpp"
-#include "cuda/base/types.hpp"
 #include "cuda/test/utils.hpp"
 
 
diff --git a/cuda/test/components/cooperative_groups.cu b/cuda/test/components/cooperative_groups.cu
index 1b514842e84..c9d9e6bf124 100644
--- a/cuda/test/components/cooperative_groups.cu
+++ b/cuda/test/components/cooperative_groups.cu
@@ -2,9 +2,6 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include "cuda/components/cooperative_groups.cuh"
-
-
 #include <memory>
 
 
@@ -15,7 +12,8 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "cuda/base/config.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "cuda/test/utils.hpp"
 
 
diff --git a/cuda/test/components/merging.cu b/cuda/test/components/merging.cu
index 6ef7d3ab3c4..37b032eb794 100644
--- a/cuda/test/components/merging.cu
+++ b/cuda/test/components/merging.cu
@@ -18,7 +18,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "cuda/components/cooperative_groups.cuh"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "cuda/test/utils.hpp"
 
 
diff --git a/cuda/test/components/searching.cu b/cuda/test/components/searching.cu
index 0eeb383c05c..ffe00c247c0 100644
--- a/cuda/test/components/searching.cu
+++ b/cuda/test/components/searching.cu
@@ -17,7 +17,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "cuda/components/cooperative_groups.cuh"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "cuda/test/utils.hpp"
 
 
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 8c68efae046..ee373243842 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -93,7 +93,7 @@ string(REPLACE ";" "," GKO_DPCPP_JACOBI_BLOCK_SIZES_CODE "${GKO_DPCPP_JACOBI_BLO
 configure_file(preconditioner/jacobi_common.hpp.in preconditioner/jacobi_common.hpp)
 
 ginkgo_compile_features(ginkgo_dpcpp)
-target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP _ONEDPL_COMPILE_KERNEL=0)
+target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP GKO_DEVICE_NAMESPACE=dpcpp _ONEDPL_COMPILE_KERNEL=0)
 
 set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE)
 target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}")
@@ -126,7 +126,7 @@ ginkgo_default_includes(ginkgo_dpcpp)
 ginkgo_install_library(ginkgo_dpcpp)
 
 if (GINKGO_CHECK_CIRCULAR_DEPS)
-    ginkgo_check_headers(ginkgo_dpcpp GKO_COMPILING_DPCPP)
+    ginkgo_check_headers(ginkgo_dpcpp "GKO_COMPILING_DPCPP;GKO_DEVICE_NAMESPACE=dpcpp")
 endif()
 
 if(GINKGO_BUILD_TESTS)
diff --git a/dpcpp/test/base/CMakeLists.txt b/dpcpp/test/base/CMakeLists.txt
index bb9c8a75050..38ecad08271 100644
--- a/dpcpp/test/base/CMakeLists.txt
+++ b/dpcpp/test/base/CMakeLists.txt
@@ -2,4 +2,4 @@ ginkgo_create_dpcpp_test(executor)
 ginkgo_create_dpcpp_test(dim3)
 ginkgo_create_dpcpp_test(kernel_launch)
 # set correct flags for kernel_launch.hpp
-target_compile_definitions(dpcpp_test_base_kernel_launch PRIVATE GKO_COMPILING_DPCPP)
+target_compile_definitions(dpcpp_test_base_kernel_launch PRIVATE GKO_COMPILING_DPCPP GKO_DEVICE_NAMESPACE=dpcpp)
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 046fd1e4d7a..bf2d6a6cf58 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -31,12 +31,12 @@ set(GINKGO_HIP_SOURCES
     factorization/par_ic_kernels.hip.cpp
     factorization/par_ict_kernels.hip.cpp
     factorization/par_ilu_kernels.hip.cpp
-    factorization/par_ilut_approx_filter_kernel.hip.cpp
-    factorization/par_ilut_filter_kernel.hip.cpp
+    factorization/par_ilut_approx_filter_kernels.hip.cpp
+    factorization/par_ilut_filter_kernels.hip.cpp
     factorization/par_ilut_select_common.hip.cpp
-    factorization/par_ilut_select_kernel.hip.cpp
-    factorization/par_ilut_spgeam_kernel.hip.cpp
-    factorization/par_ilut_sweep_kernel.hip.cpp
+    factorization/par_ilut_select_kernels.hip.cpp
+    factorization/par_ilut_spgeam_kernels.hip.cpp
+    factorization/par_ilut_sweep_kernels.hip.cpp
     matrix/batch_csr_kernels.hip.cpp
     matrix/batch_dense_kernels.hip.cpp
     matrix/batch_ell_kernels.hip.cpp
@@ -51,10 +51,10 @@ set(GINKGO_HIP_SOURCES
     multigrid/pgm_kernels.hip.cpp
     preconditioner/batch_jacobi_kernels.hip.cpp
     preconditioner/isai_kernels.hip.cpp
-    preconditioner/jacobi_advanced_apply_kernel.hip.cpp
-    preconditioner/jacobi_generate_kernel.hip.cpp
+    preconditioner/jacobi_advanced_apply_kernels.hip.cpp
+    preconditioner/jacobi_generate_kernels.hip.cpp
     preconditioner/jacobi_kernels.hip.cpp
-    preconditioner/jacobi_simple_apply_kernel.hip.cpp
+    preconditioner/jacobi_simple_apply_kernels.hip.cpp
     reorder/rcm_kernels.hip.cpp
     solver/batch_bicgstab_kernels.hip.cpp
     solver/batch_cg_kernels.hip.cpp
@@ -86,28 +86,28 @@ else()
 endif()
 foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_HIP_JACOBI_BLOCK_SIZES)
     configure_file(
-        preconditioner/jacobi_generate_instantiate.inc.hip.cpp
-        preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
+        preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
+        preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
     configure_file(
-        preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp
-        preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
+        preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
+        preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
     configure_file(
-        preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp
-        preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
+        preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
+        preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
     # The 3D indexing used in Jacobi kernel triggers an instruction selection bug in Debug builds
     # Probably the same as https://github.com/llvm/llvm-project/issues/67574
     # Fixed in ROCm 6.0 https://github.com/ROCm/llvm-project/commit/cd7f574a1fd1d3f3e8b9c1cae61fa8133a51de5f
     # and in LLVM trunk https://github.com/llvm/llvm-project/commit/cc3d2533cc2e4ea06981b86ede5087fbf801e789
     set_source_files_properties(
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
         PROPERTIES
         COMPILE_OPTIONS $<$<CONFIG:Debug>:-O2>)
     list(APPEND GINKGO_HIP_SOURCES
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
-        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp
+        ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_kernels.instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp)
 endforeach()
 string(REPLACE ";" "," GKO_HIP_JACOBI_BLOCK_SIZES_CODE "${GKO_HIP_JACOBI_BLOCK_SIZES}")
 configure_file(preconditioner/jacobi_common.hip.hpp.in preconditioner/jacobi_common.hip.hpp)
@@ -119,7 +119,7 @@ target_include_directories(ginkgo_hip
     PRIVATE
         ${CMAKE_CURRENT_BINARY_DIR}/.. # for generated headers like jacobi_common.hip.hpp
         )
-target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP)
+target_compile_definitions(ginkgo_hip PRIVATE GKO_COMPILING_HIP GKO_DEVICE_NAMESPACE=hip)
 
 target_link_libraries(ginkgo_hip PUBLIC ginkgo_device)
 target_link_libraries(ginkgo_hip PRIVATE hip::host roc::hipblas roc::hipsparse hip::hiprand roc::rocrand)
@@ -138,7 +138,7 @@ ginkgo_default_includes(ginkgo_hip)
 ginkgo_install_library(ginkgo_hip)
 
 if (GINKGO_CHECK_CIRCULAR_DEPS)
-    ginkgo_check_headers(ginkgo_hip GKO_COMPILING_HIP)
+    ginkgo_check_headers(ginkgo_hip "GKO_COMPILING_HIP;GKO_DEVICE_NAMESPACE=hip")
 endif()
 
 if(GINKGO_BUILD_TESTS)
diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp
index f5a1dba3977..74e6c34dc5d 100644
--- a/hip/base/batch_multi_vector_kernels.hip.cpp
+++ b/hip/base/batch_multi_vector_kernels.hip.cpp
@@ -5,7 +5,6 @@
 #include "core/base/batch_multi_vector_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
@@ -14,13 +13,14 @@
 #include <ginkgo/core/base/range_accessors.hpp>
 
 
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipblas_bindings.hip.hpp"
-#include "hip/base/pointer_mode_guard.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp
index fa44a22b554..4f09ec66bb8 100644
--- a/hip/base/batch_struct.hip.hpp
+++ b/hip/base/batch_struct.hip.hpp
@@ -10,9 +10,9 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "core/base/batch_struct.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/types.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp
index fbad841fd0f..89dc67255fc 100644
--- a/hip/base/config.hip.hpp
+++ b/hip/base/config.hip.hpp
@@ -6,15 +6,10 @@
 #define GKO_HIP_BASE_CONFIG_HIP_HPP_
 
 
-#include <hip/hip_runtime.h>
-
-
-#include <hip/device_functions.h>
-
-
 #include <ginkgo/core/base/types.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/math.hip.hpp"
 
 
diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp
index 58376c2175b..be897510056 100644
--- a/hip/base/device.hip.cpp
+++ b/hip/base/device.hip.cpp
@@ -5,14 +5,12 @@
 #include <ginkgo/core/base/device.hpp>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/stream.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/scoped_device_id.hip.hpp"
 
 
diff --git a/hip/base/device_matrix_data_kernels.hip.cpp b/hip/base/device_matrix_data_kernels.hip.cpp
index 745ba955014..5a0b762ea57 100644
--- a/hip/base/device_matrix_data_kernels.hip.cpp
+++ b/hip/base/device_matrix_data_kernels.hip.cpp
@@ -14,8 +14,8 @@
 #include <thrust/tuple.h>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/base/exception.hip.cpp b/hip/base/exception.hip.cpp
index aed5e803d60..f0e17f4e873 100644
--- a/hip/base/exception.hip.cpp
+++ b/hip/base/exception.hip.cpp
@@ -8,7 +8,6 @@
 #include <string>
 
 
-#include <hip/hip_runtime.h>
 #if HIP_VERSION >= 50200000
 #include <hipblas/hipblas.h>
 #include <hiprand/hiprand.h>
@@ -23,6 +22,9 @@
 #include <ginkgo/core/base/types.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+
+
 namespace gko {
 
 
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index 2694ce4177f..4b5ce7afa7b 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -8,15 +8,13 @@
 #include <iostream>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/device.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
-#include "hip/base/config.hip.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/hipblas_bindings.hip.hpp"
 #include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/scoped_device_id.hip.hpp"
diff --git a/hip/base/hipblas_bindings.hip.hpp b/hip/base/hipblas_bindings.hip.hpp
index f4dd3f1a1e8..d5dc94d6138 100644
--- a/hip/base/hipblas_bindings.hip.hpp
+++ b/hip/base/hipblas_bindings.hip.hpp
@@ -6,7 +6,6 @@
 #define GKO_HIP_BASE_HIPBLAS_BINDINGS_HIP_HPP_
 
 
-#include <hip/hip_runtime.h>
 #if HIP_VERSION >= 50200000
 #include <hipblas/hipblas.h>
 #else
@@ -18,8 +17,9 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 
 
 namespace gko {
@@ -260,6 +260,20 @@ inline void destroy_hipblas_handle(hipblasContext* handle)
 
 
 }  // namespace hipblas
+
+
+namespace blas {
+
+
+using namespace hipblas;
+
+
+#define BLAS_OP_N HIPBLAS_OP_N
+#define BLAS_OP_T HIPBLAS_OP_T
+#define BLAS_OP_C HIPBLAS_OP_C
+
+
+}  // namespace blas
 }  // namespace hip
 }  // namespace kernels
 }  // namespace gko
diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp
index 471abb3ccd5..9fd7ade8231 100644
--- a/hip/base/hiprand_bindings.hip.hpp
+++ b/hip/base/hiprand_bindings.hip.hpp
@@ -6,7 +6,6 @@
 #define GKO_HIP_BASE_HIPRAND_BINDINGS_HIP_HPP_
 
 
-#include <hip/hip_runtime.h>
 #if HIP_VERSION >= 50200000
 #include <hiprand/hiprand.h>
 #else
@@ -17,8 +16,9 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 
 
 namespace gko {
@@ -90,6 +90,18 @@ GKO_BIND_HIPRAND_RANDOM_VECTOR(std::complex<double>,
 
 
 }  // namespace hiprand
+
+
+namespace randlib {
+
+
+using namespace hiprand;
+
+
+#define RANDLIB_RNG_PSEUDO_DEFAULT HIPRAND_RNG_PSEUDO_DEFAULT
+
+
+}  // namespace randlib
 }  // namespace hip
 }  // namespace kernels
 }  // namespace gko
diff --git a/hip/base/hipsparse_bindings.hip.hpp b/hip/base/hipsparse_bindings.hip.hpp
index 62c7e60995e..0337f0a03c6 100644
--- a/hip/base/hipsparse_bindings.hip.hpp
+++ b/hip/base/hipsparse_bindings.hip.hpp
@@ -6,7 +6,6 @@
 #define GKO_HIP_BASE_HIPSPARSE_BINDINGS_HIP_HPP_
 
 
-#include <hip/hip_runtime.h>
 #if HIP_VERSION >= 50200000
 #include <hipsparse/hipsparse.h>
 #else
@@ -18,7 +17,8 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "hip/base/types.hip.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
 
 
 namespace gko {
@@ -955,6 +955,20 @@ GKO_BIND_HIPSPARSE_IC0(std::complex<double>, hipsparseZcsric02);
 
 
 }  // namespace hipsparse
+
+
+namespace sparselib {
+
+
+using namespace hipsparse;
+
+
+#define SPARSELIB_OPERATION_TRANSPOSE HIPSPARSE_OPERATION_TRANSPOSE
+#define SPARSELIB_OPERATION_NON_TRANSPOSE HIPSPARSE_OPERATION_NON_TRANSPOSE
+#define SPARSELIB_SOLVE_POLICY_USE_LEVEL HIPSPARSE_SOLVE_POLICY_USE_LEVEL
+
+
+}  // namespace sparselib
 }  // namespace hip
 }  // namespace kernels
 }  // namespace gko
diff --git a/hip/base/hipsparse_block_bindings.hip.hpp b/hip/base/hipsparse_block_bindings.hip.hpp
index eb9e8a31481..6fb70c4571c 100644
--- a/hip/base/hipsparse_block_bindings.hip.hpp
+++ b/hip/base/hipsparse_block_bindings.hip.hpp
@@ -6,7 +6,6 @@
 #define GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_
 
 
-#include <hip/hip_runtime.h>
 #if HIP_VERSION >= 50200000
 #include <hipsparse/hipsparse.h>
 #else
@@ -17,8 +16,9 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/hipsparse_bindings.hip.hpp"
-#include "hip/base/types.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/base/kernel_launch.hip.hpp b/hip/base/kernel_launch.hip.hpp
index 1a00e99cac7..890b9922a4c 100644
--- a/hip/base/kernel_launch.hip.hpp
+++ b/hip/base/kernel_launch.hip.hpp
@@ -8,12 +8,12 @@
 #endif
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/tuple.h>
 
 
-#include "accessor/hip_helper.hpp"
-#include "hip/base/types.hip.hpp"
+#include "accessor/cuda_hip_helper.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
@@ -24,21 +24,21 @@ namespace hip {
 
 template <typename AccessorType>
 struct to_device_type_impl<gko::acc::range<AccessorType>&> {
-    using type = std::decay_t<decltype(gko::acc::as_hip_range(
+    using type = std::decay_t<decltype(gko::acc::as_device_range(
         std::declval<gko::acc::range<AccessorType>>()))>;
     static type map_to_device(gko::acc::range<AccessorType>& range)
     {
-        return gko::acc::as_hip_range(range);
+        return gko::acc::as_device_range(range);
     }
 };
 
 template <typename AccessorType>
 struct to_device_type_impl<const gko::acc::range<AccessorType>&> {
-    using type = std::decay_t<decltype(gko::acc::as_hip_range(
+    using type = std::decay_t<decltype(gko::acc::as_device_range(
         std::declval<gko::acc::range<AccessorType>>()))>;
     static type map_to_device(const gko::acc::range<AccessorType>& range)
     {
-        return gko::acc::as_hip_range(range);
+        return gko::acc::as_device_range(range);
     }
 };
 
diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp
index 7c5d0c01c9c..c32fb592de0 100644
--- a/hip/base/kernel_launch_reduction.hip.hpp
+++ b/hip/base/kernel_launch_reduction.hip.hpp
@@ -8,9 +8,9 @@
 #endif
 
 
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
diff --git a/hip/base/kernel_launch_solver.hip.hpp b/hip/base/kernel_launch_solver.hip.hpp
index 18532c9754c..eda18f35eab 100644
--- a/hip/base/kernel_launch_solver.hip.hpp
+++ b/hip/base/kernel_launch_solver.hip.hpp
@@ -8,7 +8,7 @@
 #endif
 
 
-#include <hip/hip_runtime.h>
+#include "common/cuda_hip/base/runtime.hpp"
 
 
 namespace gko {
diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp
index 0e14bf9f511..5fde8f518c6 100644
--- a/hip/base/memory.hip.cpp
+++ b/hip/base/memory.hip.cpp
@@ -5,12 +5,10 @@
 #include <ginkgo/core/base/memory.hpp>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/scoped_device_id.hip.hpp"
 
 
diff --git a/hip/base/pointer_mode_guard.hip.hpp b/hip/base/pointer_mode_guard.hip.hpp
index 2908164cccd..5cd4b3ec58f 100644
--- a/hip/base/pointer_mode_guard.hip.hpp
+++ b/hip/base/pointer_mode_guard.hip.hpp
@@ -9,7 +9,6 @@
 #include <exception>
 
 
-#include <hip/hip_runtime.h>
 #if HIP_VERSION >= 50200000
 #include <hipblas/hipblas.h>
 #include <hipsparse/hipsparse.h>
@@ -24,6 +23,9 @@
 #include <ginkgo/core/base/std_extensions.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+
+
 namespace gko {
 namespace kernels {
 namespace hip {
diff --git a/hip/base/roctx.hip.cpp b/hip/base/roctx.hip.cpp
index 0ed12a54786..46dad3be816 100644
--- a/hip/base/roctx.hip.cpp
+++ b/hip/base/roctx.hip.cpp
@@ -2,10 +2,10 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
-#include <hip/hip_runtime.h>
+#include <ginkgo/config.hpp>
 
 
-#include <ginkgo/config.hpp>
+#include "common/cuda_hip/base/runtime.hpp"
 
 
 #if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX
diff --git a/hip/base/scoped_device_id.hip.cpp b/hip/base/scoped_device_id.hip.cpp
index ab6ed703da8..1fd7211b106 100644
--- a/hip/base/scoped_device_id.hip.cpp
+++ b/hip/base/scoped_device_id.hip.cpp
@@ -6,12 +6,10 @@
 #include <utility>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/scoped_device_id.hip.hpp"
 
 
diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp
index 93c1fc008d9..b56c5104428 100644
--- a/hip/base/stream.hip.cpp
+++ b/hip/base/stream.hip.cpp
@@ -5,14 +5,12 @@
 #include <ginkgo/core/base/stream.hpp>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/device.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/scoped_device_id.hip.hpp"
 
 
diff --git a/hip/base/timer.hip.cpp b/hip/base/timer.hip.cpp
index 44fe5b7cbeb..bd81d9f3be5 100644
--- a/hip/base/timer.hip.cpp
+++ b/hip/base/timer.hip.cpp
@@ -5,12 +5,10 @@
 #include <ginkgo/core/base/timer.hpp>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "hip/base/scoped_device_id.hip.hpp"
 
 
diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp
index 8827b2bea41..9ae2224c064 100644
--- a/hip/base/types.hip.hpp
+++ b/hip/base/types.hip.hpp
@@ -14,7 +14,8 @@
 
 #include <hip/hip_complex.h>
 #include <hip/hip_fp16.h>
-#include <hip/hip_runtime.h>
+
+
 #if HIP_VERSION >= 50200000
 #include <hipblas/hipblas.h>
 #else
@@ -26,6 +27,9 @@
 #include <ginkgo/core/base/matrix_data.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+
+
 namespace gko {
 
 
@@ -430,6 +434,10 @@ GKO_INLINE GKO_ATTRIBUTES constexpr
 }
 
 
+using deviceComplex = hipComplex;
+using deviceDoubleComplex = hipDoubleComplex;
+
+
 }  // namespace hip
 }  // namespace kernels
 }  // namespace gko
diff --git a/hip/components/atomic.hip.hpp b/hip/components/atomic.hip.hpp
index f57705ff408..0dc8d7a3b46 100644
--- a/hip/components/atomic.hip.hpp
+++ b/hip/components/atomic.hip.hpp
@@ -9,8 +9,8 @@
 #include <type_traits>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 
 
 namespace gko {
@@ -21,38 +21,6 @@ namespace hip {
 #include "common/cuda_hip/components/atomic.hpp.inc"
 
 
-/**
- * @internal
- *
- * @note It is not 'real' complex<float> atomic add operation
- */
-__forceinline__ __device__ thrust::complex<float> atomic_add(
-    thrust::complex<float>* __restrict__ address, thrust::complex<float> val)
-{
-    hipComplex* addr = reinterpret_cast<hipComplex*>(address);
-    // Separate to real part and imag part
-    auto real = atomic_add(static_cast<float*>(&(addr->x)), val.real());
-    auto imag = atomic_add(static_cast<float*>(&(addr->y)), val.imag());
-    return {real, imag};
-}
-
-
-/**
- * @internal
- *
- * @note It is not 'real' complex<double> atomic add operation
- */
-__forceinline__ __device__ thrust::complex<double> atomic_add(
-    thrust::complex<double>* __restrict__ address, thrust::complex<double> val)
-{
-    hipDoubleComplex* addr = reinterpret_cast<hipDoubleComplex*>(address);
-    // Separate to real part and imag part
-    auto real = atomic_add(static_cast<double*>(&(addr->x)), val.real());
-    auto imag = atomic_add(static_cast<double*>(&(addr->y)), val.imag());
-    return {real, imag};
-}
-
-
 }  // namespace hip
 }  // namespace kernels
 }  // namespace gko
diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp
index 247218a1457..e81441a092b 100644
--- a/hip/components/cooperative_groups.hip.hpp
+++ b/hip/components/cooperative_groups.hip.hpp
@@ -9,8 +9,8 @@
 #include <type_traits>
 
 
-#include "hip/base/config.hip.hpp"
-#include "hip/base/types.hip.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
 
 
 namespace gko {
diff --git a/hip/components/diagonal_block_manipulation.hip.hpp b/hip/components/diagonal_block_manipulation.hip.hpp
index 0261c7549c5..290511e7583 100644
--- a/hip/components/diagonal_block_manipulation.hip.hpp
+++ b/hip/components/diagonal_block_manipulation.hip.hpp
@@ -9,9 +9,9 @@
 #include <type_traits>
 
 
-#include "hip/base/config.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 
 
 namespace gko {
diff --git a/hip/components/format_conversion.hip.hpp b/hip/components/format_conversion.hip.hpp
index 59c0405a874..07daf486d84 100644
--- a/hip/components/format_conversion.hip.hpp
+++ b/hip/components/format_conversion.hip.hpp
@@ -6,14 +6,12 @@
 #define GKO_HIP_COMPONENTS_FORMAT_CONVERSION_HIP_HPP_
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "hip/components/cooperative_groups.hip.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
diff --git a/hip/components/memory.hip.hpp b/hip/components/memory.hip.hpp
index fd4fbb8ce11..4bb6fa19ec0 100644
--- a/hip/components/memory.hip.hpp
+++ b/hip/components/memory.hip.hpp
@@ -13,7 +13,7 @@
 #include <ginkgo/core/base/math.hpp>
 
 
-#include "hip/base/types.hip.hpp"
+#include "common/cuda_hip/base/types.hpp"
 
 
 namespace gko {
diff --git a/hip/components/prefix_sum.hip.hpp b/hip/components/prefix_sum.hip.hpp
index b5065589d8e..5acde03cbec 100644
--- a/hip/components/prefix_sum.hip.hpp
+++ b/hip/components/prefix_sum.hip.hpp
@@ -9,8 +9,8 @@
 #include <type_traits>
 
 
-#include "hip/base/hipblas_bindings.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
diff --git a/hip/components/reduction.hip.hpp b/hip/components/reduction.hip.hpp
index c8fa5e58b4f..fb0539952ff 100644
--- a/hip/components/reduction.hip.hpp
+++ b/hip/components/reduction.hip.hpp
@@ -9,16 +9,15 @@
 #include <type_traits>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/array_access.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
 
@@ -57,7 +56,6 @@ __host__ ValueType reduce_add_array(std::shared_ptr<const HipExecutor> exec,
 
         block_results.resize_and_reset(grid_dim);
 
-
         reduce_add_array<<<grid_dim, default_reduce_block_size, 0,
                            exec->get_stream()>>>(
             size, as_device_type(source),
diff --git a/hip/components/searching.hip.hpp b/hip/components/searching.hip.hpp
index 2a6be767c2c..9222de9e1d6 100644
--- a/hip/components/searching.hip.hpp
+++ b/hip/components/searching.hip.hpp
@@ -6,7 +6,7 @@
 #define GKO_HIP_COMPONENTS_SEARCHING_HIP_HPP_
 
 
-#include "hip/base/config.hip.hpp"
+#include "common/cuda_hip/base/config.hpp"
 #include "hip/components/intrinsics.hip.hpp"
 
 
diff --git a/hip/components/segment_scan.hip.hpp b/hip/components/segment_scan.hip.hpp
index 7f98d08cf69..93ebb35833a 100644
--- a/hip/components/segment_scan.hip.hpp
+++ b/hip/components/segment_scan.hip.hpp
@@ -6,7 +6,7 @@
 #define GKO_HIP_COMPONENTS_SEGMENT_SCAN_HIP_HPP_
 
 
-#include "hip/components/cooperative_groups.hip.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 
 
 namespace gko {
diff --git a/hip/components/sorting.hip.hpp b/hip/components/sorting.hip.hpp
index 730c3c56401..4a664aee453 100644
--- a/hip/components/sorting.hip.hpp
+++ b/hip/components/sorting.hip.hpp
@@ -6,8 +6,8 @@
 #define GKO_HIP_COMPONENTS_SORTING_HIP_HPP_
 
 
-#include "hip/base/config.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 
 
 namespace gko {
diff --git a/hip/components/syncfree.hip.hpp b/hip/components/syncfree.hip.hpp
index 9fe48944b56..7627a0a2781 100644
--- a/hip/components/syncfree.hip.hpp
+++ b/hip/components/syncfree.hip.hpp
@@ -9,11 +9,11 @@
 #include <ginkgo/core/base/array.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "core/components/fill_array_kernels.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/memory.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/components/thread_ids.hip.hpp b/hip/components/thread_ids.hip.hpp
index 03761983e02..6f0bd44ba9c 100644
--- a/hip/components/thread_ids.hip.hpp
+++ b/hip/components/thread_ids.hip.hpp
@@ -6,17 +6,12 @@
 #define GKO_HIP_COMPONENTS_THREAD_IDS_HIP_HPP_
 
 
-#include "hip/base/config.hip.hpp"
+#include "common/cuda_hip/base/config.hpp"
 
 
 namespace gko {
 namespace kernels {
 namespace hip {
-/**
- * @brief The HIP thread namespace.
- *
- * @ingroup hip_thread
- */
 namespace thread {
 
 
diff --git a/hip/distributed/vector_kernels.hip.cpp b/hip/distributed/vector_kernels.hip.cpp
index 320d847ed85..fc6718dec0d 100644
--- a/hip/distributed/vector_kernels.hip.cpp
+++ b/hip/distributed/vector_kernels.hip.cpp
@@ -5,9 +5,6 @@
 #include "core/distributed/vector_kernels.hpp"
 
 
-#include <functional>
-
-
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/transform_iterator.h>
diff --git a/hip/factorization/cholesky_kernels.hip.cpp b/hip/factorization/cholesky_kernels.hip.cpp
index 1dd94bb05d0..419db21b811 100644
--- a/hip/factorization/cholesky_kernels.hip.cpp
+++ b/hip/factorization/cholesky_kernels.hip.cpp
@@ -20,15 +20,15 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/format_conversion_kernels.hpp"
 #include "core/factorization/elimination_forest.hpp"
 #include "core/factorization/lu_kernels.hpp"
 #include "core/matrix/csr_lookup.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/syncfree.hip.hpp"
@@ -80,19 +80,19 @@ void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
     }
     // sort postorder_cols inside rows
     {
-        const auto handle = exec->get_hipsparse_handle();
-        auto descr = hipsparse::create_mat_descr();
+        const auto handle = exec->get_sparselib_handle();
+        auto descr = sparselib::create_mat_descr();
         array<IndexType> permutation_array(exec, mtx_nnz);
         auto permutation = permutation_array.get_data();
         components::fill_seq_array(exec, permutation, mtx_nnz);
         size_type buffer_size{};
-        hipsparse::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz,
+        sparselib::csrsort_buffer_size(handle, num_rows, num_rows, mtx_nnz,
                                        row_ptrs, postorder_cols, buffer_size);
         array<char> buffer_array{exec, buffer_size};
         auto buffer = buffer_array.get_data();
-        hipsparse::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs,
+        sparselib::csrsort(handle, num_rows, num_rows, mtx_nnz, descr, row_ptrs,
                            postorder_cols, permutation, buffer);
-        hipsparse::destroy(descr);
+        sparselib::destroy(descr);
     }
     // count nonzeros per row of L
     {
diff --git a/hip/factorization/factorization_kernels.hip.cpp b/hip/factorization/factorization_kernels.hip.cpp
index a2de4912fdb..4080768bc07 100644
--- a/hip/factorization/factorization_kernels.hip.cpp
+++ b/hip/factorization/factorization_kernels.hip.cpp
@@ -5,17 +5,16 @@
 #include "core/factorization/factorization_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/array_access.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
 #include "hip/components/searching.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
diff --git a/hip/factorization/ic_kernels.hip.cpp b/hip/factorization/ic_kernels.hip.cpp
index 7a845547d0d..edda974fd36 100644
--- a/hip/factorization/ic_kernels.hip.cpp
+++ b/hip/factorization/ic_kernels.hip.cpp
@@ -5,13 +5,11 @@
 #include "core/factorization/ic_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 
 
-#include "hip/base/hipsparse_bindings.hip.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
 
 
 namespace gko {
@@ -30,32 +28,32 @@ void compute(std::shared_ptr<const DefaultExecutor> exec,
              matrix::Csr<ValueType, IndexType>* m)
 {
     const auto id = exec->get_device_id();
-    auto handle = exec->get_hipsparse_handle();
-    auto desc = hipsparse::create_mat_descr();
-    auto info = hipsparse::create_ic0_info();
+    auto handle = exec->get_sparselib_handle();
+    auto desc = sparselib::create_mat_descr();
+    auto info = sparselib::create_ic0_info();
 
     // get buffer size for IC
     IndexType num_rows = m->get_size()[0];
     IndexType nnz = m->get_num_stored_elements();
     size_type buffer_size{};
-    hipsparse::ic0_buffer_size(handle, num_rows, nnz, desc,
+    sparselib::ic0_buffer_size(handle, num_rows, nnz, desc,
                                m->get_const_values(), m->get_const_row_ptrs(),
                                m->get_const_col_idxs(), info, buffer_size);
 
     array<char> buffer{exec, buffer_size};
 
     // set up IC(0)
-    hipsparse::ic0_analysis(handle, num_rows, nnz, desc, m->get_const_values(),
+    sparselib::ic0_analysis(handle, num_rows, nnz, desc, m->get_const_values(),
                             m->get_const_row_ptrs(), m->get_const_col_idxs(),
-                            info, HIPSPARSE_SOLVE_POLICY_USE_LEVEL,
+                            info, SPARSELIB_SOLVE_POLICY_USE_LEVEL,
                             buffer.get_data());
 
-    hipsparse::ic0(handle, num_rows, nnz, desc, m->get_values(),
+    sparselib::ic0(handle, num_rows, nnz, desc, m->get_values(),
                    m->get_const_row_ptrs(), m->get_const_col_idxs(), info,
-                   HIPSPARSE_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
+                   SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
 
-    hipsparse::destroy_ic0_info(info);
-    hipsparse::destroy(desc);
+    sparselib::destroy_ic0_info(info);
+    sparselib::destroy(desc);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);
diff --git a/hip/factorization/ilu_kernels.hip.cpp b/hip/factorization/ilu_kernels.hip.cpp
index 071d3721536..f50df5ca75b 100644
--- a/hip/factorization/ilu_kernels.hip.cpp
+++ b/hip/factorization/ilu_kernels.hip.cpp
@@ -5,13 +5,11 @@
 #include "core/factorization/ilu_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 
 
-#include "hip/base/hipsparse_bindings.hip.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
 
 
 namespace gko {
@@ -30,32 +28,32 @@ void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
                 matrix::Csr<ValueType, IndexType>* m)
 {
     const auto id = exec->get_device_id();
-    auto handle = exec->get_hipsparse_handle();
-    auto desc = hipsparse::create_mat_descr();
-    auto info = hipsparse::create_ilu0_info();
+    auto handle = exec->get_sparselib_handle();
+    auto desc = sparselib::create_mat_descr();
+    auto info = sparselib::create_ilu0_info();
 
     // get buffer size for ILU
     IndexType num_rows = m->get_size()[0];
     IndexType nnz = m->get_num_stored_elements();
     size_type buffer_size{};
-    hipsparse::ilu0_buffer_size(handle, num_rows, nnz, desc,
+    sparselib::ilu0_buffer_size(handle, num_rows, nnz, desc,
                                 m->get_const_values(), m->get_const_row_ptrs(),
                                 m->get_const_col_idxs(), info, buffer_size);
 
     array<char> buffer{exec, buffer_size};
 
     // set up ILU(0)
-    hipsparse::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(),
+    sparselib::ilu0_analysis(handle, num_rows, nnz, desc, m->get_const_values(),
                              m->get_const_row_ptrs(), m->get_const_col_idxs(),
-                             info, HIPSPARSE_SOLVE_POLICY_USE_LEVEL,
+                             info, SPARSELIB_SOLVE_POLICY_USE_LEVEL,
                              buffer.get_data());
 
-    hipsparse::ilu0(handle, num_rows, nnz, desc, m->get_values(),
+    sparselib::ilu0(handle, num_rows, nnz, desc, m->get_values(),
                     m->get_const_row_ptrs(), m->get_const_col_idxs(), info,
-                    HIPSPARSE_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
+                    SPARSELIB_SOLVE_POLICY_USE_LEVEL, buffer.get_data());
 
-    hipsparse::destroy_ilu0_info(info);
-    hipsparse::destroy(desc);
+    sparselib::destroy_ilu0_info(info);
+    sparselib::destroy(desc);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/hip/factorization/lu_kernels.hip.cpp b/hip/factorization/lu_kernels.hip.cpp
index e1c60103dd3..ec3e771134e 100644
--- a/hip/factorization/lu_kernels.hip.cpp
+++ b/hip/factorization/lu_kernels.hip.cpp
@@ -17,11 +17,11 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/allocator.hpp"
 #include "core/matrix/csr_lookup.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/syncfree.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
diff --git a/hip/factorization/par_ic_kernels.hip.cpp b/hip/factorization/par_ic_kernels.hip.cpp
index dd91ac27339..e4cd0b2470b 100644
--- a/hip/factorization/par_ic_kernels.hip.cpp
+++ b/hip/factorization/par_ic_kernels.hip.cpp
@@ -10,9 +10,9 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/memory.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp
index 4b27383bff5..7f5dba82eba 100644
--- a/hip/factorization/par_ict_kernels.hip.cpp
+++ b/hip/factorization/par_ict_kernels.hip.cpp
@@ -5,9 +5,6 @@
 #include "core/factorization/par_ict_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -15,6 +12,8 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
@@ -22,7 +21,6 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/memory.hip.hpp"
 #include "hip/components/merging.hip.hpp"
 #include "hip/components/prefix_sum.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
@@ -49,8 +47,7 @@ using compiled_kernels =
     syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
 
 
-#include "common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc"
-#include "common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc"
+#include "common/cuda_hip/factorization/par_ict_kernels.hpp.inc"
 
 
 namespace {
diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp
index b10941d44f1..49608d6801f 100644
--- a/hip/factorization/par_ilu_kernels.hip.cpp
+++ b/hip/factorization/par_ilu_kernels.hip.cpp
@@ -5,16 +5,13 @@
 #include "core/factorization/par_ilu_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
-#include <ginkgo/core/base/std_extensions.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/memory.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
diff --git a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp
similarity index 97%
rename from hip/factorization/par_ilut_approx_filter_kernel.hip.cpp
rename to hip/factorization/par_ilut_approx_filter_kernels.hip.cpp
index d730e33e418..b5612ea29c6 100644
--- a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp
+++ b/hip/factorization/par_ilut_approx_filter_kernels.hip.cpp
@@ -8,9 +8,6 @@
 #include <algorithm>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -18,16 +15,17 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
 #include "hip/components/prefix_sum.hip.hpp"
 #include "hip/components/sorting.hip.hpp"
diff --git a/hip/factorization/par_ilut_filter_kernel.hip.cpp b/hip/factorization/par_ilut_filter_kernels.hip.cpp
similarity index 96%
rename from hip/factorization/par_ilut_filter_kernel.hip.cpp
rename to hip/factorization/par_ilut_filter_kernels.hip.cpp
index eef1044878e..e6d0a6348cc 100644
--- a/hip/factorization/par_ilut_filter_kernel.hip.cpp
+++ b/hip/factorization/par_ilut_filter_kernels.hip.cpp
@@ -5,9 +5,6 @@
 #include "core/factorization/par_ilut_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -15,15 +12,16 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
diff --git a/hip/factorization/par_ilut_select_common.hip.cpp b/hip/factorization/par_ilut_select_common.hip.cpp
index 85c2eaa7036..ddad307dc62 100644
--- a/hip/factorization/par_ilut_select_common.hip.cpp
+++ b/hip/factorization/par_ilut_select_common.hip.cpp
@@ -4,7 +4,7 @@
 
 // force-top: on
 // prevent compilation failure related to disappearing assert(...) statements
-#include <hip/hip_runtime.h>
+#include "common/cuda_hip/base/runtime.hpp"
 // force-top: off
 
 
diff --git a/hip/factorization/par_ilut_select_kernel.hip.cpp b/hip/factorization/par_ilut_select_kernels.hip.cpp
similarity index 99%
rename from hip/factorization/par_ilut_select_kernel.hip.cpp
rename to hip/factorization/par_ilut_select_kernels.hip.cpp
index b6d93e65b24..b259133b95d 100644
--- a/hip/factorization/par_ilut_select_kernel.hip.cpp
+++ b/hip/factorization/par_ilut_select_kernels.hip.cpp
@@ -8,14 +8,12 @@
 #include <algorithm>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
diff --git a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp
similarity index 98%
rename from hip/factorization/par_ilut_spgeam_kernel.hip.cpp
rename to hip/factorization/par_ilut_spgeam_kernels.hip.cpp
index ad102e49488..df77b1ba7a2 100644
--- a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp
+++ b/hip/factorization/par_ilut_spgeam_kernels.hip.cpp
@@ -5,9 +5,6 @@
 #include "core/factorization/par_ilut_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -15,13 +12,14 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
 #include "hip/components/merging.hip.hpp"
 #include "hip/components/prefix_sum.hip.hpp"
diff --git a/hip/factorization/par_ilut_sweep_kernel.hip.cpp b/hip/factorization/par_ilut_sweep_kernels.hip.cpp
similarity index 97%
rename from hip/factorization/par_ilut_sweep_kernel.hip.cpp
rename to hip/factorization/par_ilut_sweep_kernels.hip.cpp
index bdcecc609d5..0f1e6455812 100644
--- a/hip/factorization/par_ilut_sweep_kernel.hip.cpp
+++ b/hip/factorization/par_ilut_sweep_kernels.hip.cpp
@@ -5,9 +5,6 @@
 #include "core/factorization/par_ilut_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -15,6 +12,8 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/memory.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/coo_builder.hpp"
 #include "core/matrix/csr_builder.hpp"
@@ -22,7 +21,6 @@
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
-#include "hip/components/memory.hip.hpp"
 #include "hip/components/merging.hip.hpp"
 #include "hip/components/prefix_sum.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
@@ -85,7 +83,6 @@ void compute_l_u_factors(syn::value_list<int, subwarp_size>,
     }
 }
 
-
 GKO_ENABLE_IMPLEMENTATION_SELECTION(select_compute_l_u_factors,
                                     compute_l_u_factors);
 
diff --git a/hip/matrix/batch_csr_kernels.hip.cpp b/hip/matrix/batch_csr_kernels.hip.cpp
index 432213f3083..de73576ffed 100644
--- a/hip/matrix/batch_csr_kernels.hip.cpp
+++ b/hip/matrix/batch_csr_kernels.hip.cpp
@@ -5,7 +5,6 @@
 #include "core/matrix/batch_csr_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/functional.h>
 
 
@@ -14,12 +13,13 @@
 #include <ginkgo/core/matrix/batch_csr.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp
index 0d03d4ea10b..5d3b9d8cef9 100644
--- a/hip/matrix/batch_dense_kernels.hip.cpp
+++ b/hip/matrix/batch_dense_kernels.hip.cpp
@@ -5,19 +5,21 @@
 #include "core/matrix/batch_dense_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/functional.h>
 
 
+#include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp
index 221a3ec65dd..d415f114c3b 100644
--- a/hip/matrix/batch_ell_kernels.hip.cpp
+++ b/hip/matrix/batch_ell_kernels.hip.cpp
@@ -5,7 +5,6 @@
 #include "core/matrix/batch_ell_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/functional.h>
 
 
@@ -14,12 +13,13 @@
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp
index 6c98146161e..16a267d95b6 100644
--- a/hip/matrix/batch_struct.hip.hpp
+++ b/hip/matrix/batch_struct.hip.hpp
@@ -13,8 +13,8 @@
 #include <ginkgo/core/matrix/batch_ell.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "core/base/batch_struct.hpp"
-#include "hip/base/types.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/matrix/coo_kernels.hip.cpp b/hip/matrix/coo_kernels.hip.cpp
index 5e32e1d8502..8f7a050ef87 100644
--- a/hip/matrix/coo_kernels.hip.cpp
+++ b/hip/matrix/coo_kernels.hip.cpp
@@ -5,9 +5,6 @@
 #include "core/matrix/coo_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -15,25 +12,21 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
 #include "core/matrix/dense_kernels.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/format_conversion.hip.hpp"
 #include "hip/components/segment_scan.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
 namespace gko {
 namespace kernels {
-/**
- * @brief The HIP namespace.
- *
- * @ingroup hip
- */
 namespace hip {
 /**
  * @brief The Coordinate matrix format namespace.
diff --git a/hip/matrix/csr_kernels.template.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp
index 599a2df3669..8b3579f049c 100644
--- a/hip/matrix/csr_kernels.template.hip.cpp
+++ b/hip/matrix/csr_kernels.template.hip.cpp
@@ -8,7 +8,6 @@
 #include <algorithm>
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/copy.h>
 #include <thrust/count.h>
 #include <thrust/device_ptr.h>
@@ -28,7 +27,13 @@
 #include <ginkgo/core/matrix/sellp.hpp>
 
 
-#include "accessor/hip_helper.hpp"
+#include "accessor/cuda_hip_helper.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/array_access.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
@@ -39,14 +44,9 @@
 #include "core/matrix/csr_lookup.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/pointer_mode_guard.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
 #include "hip/components/merging.hip.hpp"
 #include "hip/components/prefix_sum.hip.hpp"
@@ -133,10 +133,11 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
                 kernel::abstract_merge_path_spmv<items_per_thread>
                     <<<grid, block, 0, exec->get_stream()>>>(
                         static_cast<IndexType>(a->get_size()[0]),
-                        acc::as_hip_range(a_vals), a->get_const_col_idxs(),
+                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
                         as_device_type(a->get_const_row_ptrs()),
                         as_device_type(a->get_const_srow()),
-                        acc::as_hip_range(b_vals), acc::as_hip_range(c_vals),
+                        acc::as_device_range(b_vals),
+                        acc::as_device_range(c_vals),
                         as_device_type(row_out.get_data()),
                         as_device_type(val_out.get_data()));
             }
@@ -144,7 +145,7 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
                 abstract_reduce<<<1, spmv_block_size, 0, exec->get_stream()>>>(
                     grid_num, as_device_type(val_out.get_data()),
                     as_device_type(row_out.get_data()),
-                    acc::as_hip_range(c_vals));
+                    acc::as_device_range(c_vals));
 
         } else if (alpha != nullptr && beta != nullptr) {
             if (grid_num > 0) {
@@ -152,12 +153,12 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
                     <<<grid, block, 0, exec->get_stream()>>>(
                         static_cast<IndexType>(a->get_size()[0]),
                         as_device_type(alpha->get_const_values()),
-                        acc::as_hip_range(a_vals), a->get_const_col_idxs(),
+                        acc::as_device_range(a_vals), a->get_const_col_idxs(),
                         as_device_type(a->get_const_row_ptrs()),
                         as_device_type(a->get_const_srow()),
-                        acc::as_hip_range(b_vals),
+                        acc::as_device_range(b_vals),
                         as_device_type(beta->get_const_values()),
-                        acc::as_hip_range(c_vals),
+                        acc::as_device_range(c_vals),
                         as_device_type(row_out.get_data()),
                         as_device_type(val_out.get_data()));
             }
@@ -166,7 +167,7 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
                     grid_num, as_device_type(val_out.get_data()),
                     as_device_type(row_out.get_data()),
                     as_device_type(alpha->get_const_values()),
-                    acc::as_hip_range(c_vals));
+                    acc::as_device_range(c_vals));
         } else {
             GKO_KERNEL_NOT_FOUND;
         }
@@ -262,21 +263,21 @@ void classical_spmv(syn::value_list<int, subwarp_size>,
         if (grid.x > 0 && grid.y > 0) {
             kernel::abstract_classical_spmv<subwarp_size>
                 <<<grid, block, 0, exec->get_stream()>>>(
-                    a->get_size()[0], acc::as_hip_range(a_vals),
+                    a->get_size()[0], acc::as_device_range(a_vals),
                     a->get_const_col_idxs(),
                     as_device_type(a->get_const_row_ptrs()),
-                    acc::as_hip_range(b_vals), acc::as_hip_range(c_vals));
+                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
         }
     } else if (alpha != nullptr && beta != nullptr) {
         if (grid.x > 0 && grid.y > 0) {
             kernel::abstract_classical_spmv<subwarp_size>
                 <<<grid, block, 0, exec->get_stream()>>>(
                     a->get_size()[0], as_device_type(alpha->get_const_values()),
-                    acc::as_hip_range(a_vals), a->get_const_col_idxs(),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
                     as_device_type(a->get_const_row_ptrs()),
-                    acc::as_hip_range(b_vals),
+                    acc::as_device_range(b_vals),
                     as_device_type(beta->get_const_values()),
-                    acc::as_hip_range(c_vals));
+                    acc::as_device_range(c_vals));
         }
     } else {
         GKO_KERNEL_NOT_FOUND;
@@ -318,20 +319,20 @@ void load_balance_spmv(std::shared_ptr<const HipExecutor> exec,
                                         exec->get_stream()>>>(
                     nwarps, static_cast<IndexType>(a->get_size()[0]),
                     as_device_type(alpha->get_const_values()),
-                    acc::as_hip_range(a_vals), a->get_const_col_idxs(),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
                     as_device_type(a->get_const_row_ptrs()),
                     as_device_type(a->get_const_srow()),
-                    acc::as_hip_range(b_vals), acc::as_hip_range(c_vals));
+                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
             }
         } else {
             if (csr_grid.x > 0 && csr_grid.y > 0) {
                 kernel::abstract_spmv<<<csr_grid, csr_block, 0,
                                         exec->get_stream()>>>(
                     nwarps, static_cast<IndexType>(a->get_size()[0]),
-                    acc::as_hip_range(a_vals), a->get_const_col_idxs(),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
                     as_device_type(a->get_const_row_ptrs()),
                     as_device_type(a->get_const_srow()),
-                    acc::as_hip_range(b_vals), acc::as_hip_range(c_vals));
+                    acc::as_device_range(b_vals), acc::as_device_range(c_vals));
             }
         }
     }
@@ -346,24 +347,24 @@ bool try_general_sparselib_spmv(std::shared_ptr<const HipExecutor> exec,
                                 const ValueType* beta,
                                 matrix::Dense<ValueType>* c)
 {
-    bool try_sparselib = hipsparse::is_supported<ValueType, IndexType>::value;
+    bool try_sparselib = sparselib::is_supported<ValueType, IndexType>::value;
     try_sparselib =
         try_sparselib && b->get_stride() == 1 && c->get_stride() == 1;
     // rocSPARSE has issues with zero matrices
     try_sparselib = try_sparselib && a->get_num_stored_elements() > 0;
     if (try_sparselib) {
-        auto descr = hipsparse::create_mat_descr();
+        auto descr = sparselib::create_mat_descr();
 
         auto row_ptrs = a->get_const_row_ptrs();
         auto col_idxs = a->get_const_col_idxs();
 
-        hipsparse::spmv(exec->get_hipsparse_handle(),
-                        HIPSPARSE_OPERATION_NON_TRANSPOSE, a->get_size()[0],
+        sparselib::spmv(exec->get_sparselib_handle(),
+                        SPARSELIB_OPERATION_NON_TRANSPOSE, a->get_size()[0],
                         a->get_size()[1], a->get_num_stored_elements(), alpha,
                         descr, a->get_const_values(), row_ptrs, col_idxs,
                         b->get_const_values(), beta, c->get_values());
 
-        hipsparse::destroy(descr);
+        sparselib::destroy(descr);
     }
     return try_sparselib;
 }
@@ -397,8 +398,8 @@ bool try_sparselib_spmv(std::shared_ptr<const HipExecutor> exec,
         return try_general_sparselib_spmv(exec, alpha->get_const_values(), a, b,
                                           beta->get_const_values(), c);
     } else {
-        auto handle = exec->get_hipsparse_handle();
-        hipsparse::pointer_mode_guard pm_guard(handle);
+        auto handle = exec->get_sparselib_handle();
+        sparselib::pointer_mode_guard pm_guard(handle);
         const auto valpha = one<ValueType>();
         const auto vbeta = zero<ValueType>();
         return try_general_sparselib_spmv(exec, &valpha, a, b, &vbeta, c);
@@ -535,14 +536,14 @@ void spgemm(std::shared_ptr<const HipExecutor> exec,
             const matrix::Csr<ValueType, IndexType>* b,
             matrix::Csr<ValueType, IndexType>* c)
 {
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_hipsparse_handle();
-        hipsparse::pointer_mode_guard pm_guard(handle);
-        auto a_descr = hipsparse::create_mat_descr();
-        auto b_descr = hipsparse::create_mat_descr();
-        auto c_descr = hipsparse::create_mat_descr();
-        auto d_descr = hipsparse::create_mat_descr();
-        auto info = hipsparse::create_spgemm_info();
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
+        sparselib::pointer_mode_guard pm_guard(handle);
+        auto a_descr = sparselib::create_mat_descr();
+        auto b_descr = sparselib::create_mat_descr();
+        auto c_descr = sparselib::create_mat_descr();
+        auto d_descr = sparselib::create_mat_descr();
+        auto info = sparselib::create_spgemm_info();
 
         auto alpha = one<ValueType>();
         auto a_nnz = static_cast<IndexType>(a->get_num_stored_elements());
@@ -566,7 +567,7 @@ void spgemm(std::shared_ptr<const HipExecutor> exec,
 
         // allocate buffer
         size_type buffer_size{};
-        hipsparse::spgemm_buffer_size(
+        sparselib::spgemm_buffer_size(
             handle, m, n, k, &alpha, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
             b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr,
             zero_nnz, null_index, null_index, info, buffer_size);
@@ -575,7 +576,7 @@ void spgemm(std::shared_ptr<const HipExecutor> exec,
 
         // count nnz
         IndexType c_nnz{};
-        hipsparse::spgemm_nnz(
+        sparselib::spgemm_nnz(
             handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr,
             b_nnz, b_row_ptrs, b_col_idxs, d_descr, zero_nnz, null_index,
             null_index, c_descr, c_row_ptrs, &c_nnz, info, buffer);
@@ -585,17 +586,17 @@ void spgemm(std::shared_ptr<const HipExecutor> exec,
         c_vals_array.resize_and_reset(c_nnz);
         auto c_col_idxs = c_col_idxs_array.get_data();
         auto c_vals = c_vals_array.get_data();
-        hipsparse::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals,
+        sparselib::spgemm(handle, m, n, k, &alpha, a_descr, a_nnz, a_vals,
                           a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
                           b_row_ptrs, b_col_idxs, null_value, d_descr, zero_nnz,
                           null_value, null_index, null_index, c_descr, c_vals,
                           c_row_ptrs, c_col_idxs, info, buffer);
 
-        hipsparse::destroy_spgemm_info(info);
-        hipsparse::destroy(d_descr);
-        hipsparse::destroy(c_descr);
-        hipsparse::destroy(b_descr);
-        hipsparse::destroy(a_descr);
+        sparselib::destroy_spgemm_info(info);
+        sparselib::destroy(d_descr);
+        sparselib::destroy(c_descr);
+        sparselib::destroy(b_descr);
+        sparselib::destroy(a_descr);
     } else {
         GKO_NOT_IMPLEMENTED;
     }
@@ -611,14 +612,14 @@ void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
                      const matrix::Csr<ValueType, IndexType>* d,
                      matrix::Csr<ValueType, IndexType>* c)
 {
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_hipsparse_handle();
-        hipsparse::pointer_mode_guard pm_guard(handle);
-        auto a_descr = hipsparse::create_mat_descr();
-        auto b_descr = hipsparse::create_mat_descr();
-        auto c_descr = hipsparse::create_mat_descr();
-        auto d_descr = hipsparse::create_mat_descr();
-        auto info = hipsparse::create_spgemm_info();
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
+        sparselib::pointer_mode_guard pm_guard(handle);
+        auto a_descr = sparselib::create_mat_descr();
+        auto b_descr = sparselib::create_mat_descr();
+        auto c_descr = sparselib::create_mat_descr();
+        auto d_descr = sparselib::create_mat_descr();
+        auto info = sparselib::create_spgemm_info();
 
         auto a_nnz = static_cast<IndexType>(a->get_num_stored_elements());
         auto a_vals = a->get_const_values();
@@ -640,7 +641,7 @@ void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
 
         // allocate buffer
         size_type buffer_size{};
-        hipsparse::spgemm_buffer_size(
+        sparselib::spgemm_buffer_size(
             handle, m, n, k, &one_value, a_descr, a_nnz, a_row_ptrs, a_col_idxs,
             b_descr, b_nnz, b_row_ptrs, b_col_idxs, null_value, d_descr,
             IndexType{}, null_index, null_index, info, buffer_size);
@@ -651,7 +652,7 @@ void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
         array<IndexType> c_tmp_row_ptrs_array(exec, m + 1);
         auto c_tmp_row_ptrs = c_tmp_row_ptrs_array.get_data();
         IndexType c_nnz{};
-        hipsparse::spgemm_nnz(
+        sparselib::spgemm_nnz(
             handle, m, n, k, a_descr, a_nnz, a_row_ptrs, a_col_idxs, b_descr,
             b_nnz, b_row_ptrs, b_col_idxs, d_descr, IndexType{}, null_index,
             null_index, c_descr, c_tmp_row_ptrs, &c_nnz, info, buffer);
@@ -661,7 +662,7 @@ void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
         array<ValueType> c_tmp_vals_array(exec, c_nnz);
         auto c_tmp_col_idxs = c_tmp_col_idxs_array.get_data();
         auto c_tmp_vals = c_tmp_vals_array.get_data();
-        hipsparse::spgemm(handle, m, n, k, &one_value, a_descr, a_nnz, a_vals,
+        sparselib::spgemm(handle, m, n, k, &one_value, a_descr, a_nnz, a_vals,
                           a_row_ptrs, a_col_idxs, b_descr, b_nnz, b_vals,
                           b_row_ptrs, b_col_idxs, null_value, d_descr,
                           IndexType{}, null_value, null_index, null_index,
@@ -669,11 +670,11 @@ void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
                           info, buffer);
 
         // destroy hipsparse context
-        hipsparse::destroy_spgemm_info(info);
-        hipsparse::destroy(d_descr);
-        hipsparse::destroy(c_descr);
-        hipsparse::destroy(b_descr);
-        hipsparse::destroy(a_descr);
+        sparselib::destroy_spgemm_info(info);
+        sparselib::destroy(d_descr);
+        sparselib::destroy(c_descr);
+        sparselib::destroy(b_descr);
+        sparselib::destroy(a_descr);
 
         auto total_nnz = c_nnz + d->get_num_stored_elements();
         auto nnz_per_row = total_nnz / m;
@@ -701,12 +702,12 @@ void transpose(std::shared_ptr<const HipExecutor> exec,
     if (orig->get_size()[0] == 0) {
         return;
     }
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
         hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC;
         hipsparseIndexBase_t idxBase = HIPSPARSE_INDEX_BASE_ZERO;
 
-        hipsparse::transpose(
-            exec->get_hipsparse_handle(), orig->get_size()[0],
+        sparselib::transpose(
+            exec->get_sparselib_handle(), orig->get_size()[0],
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
@@ -728,12 +729,12 @@ void conj_transpose(std::shared_ptr<const HipExecutor> exec,
     const auto block_size = default_block_size;
     const auto grid_size =
         ceildiv(trans->get_num_stored_elements(), block_size);
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
         hipsparseAction_t copyValues = HIPSPARSE_ACTION_NUMERIC;
         hipsparseIndexBase_t idxBase = HIPSPARSE_INDEX_BASE_ZERO;
 
-        hipsparse::transpose(
-            exec->get_hipsparse_handle(), orig->get_size()[0],
+        sparselib::transpose(
+            exec->get_sparselib_handle(), orig->get_size()[0],
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
@@ -753,9 +754,9 @@ template <typename ValueType, typename IndexType>
 void sort_by_column_index(std::shared_ptr<const HipExecutor> exec,
                           matrix::Csr<ValueType, IndexType>* to_sort)
 {
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_hipsparse_handle();
-        auto descr = hipsparse::create_mat_descr();
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
+        auto descr = sparselib::create_mat_descr();
         auto m = IndexType(to_sort->get_size()[0]);
         auto n = IndexType(to_sort->get_size()[1]);
         auto nnz = IndexType(to_sort->get_num_stored_elements());
@@ -771,23 +772,23 @@ void sort_by_column_index(std::shared_ptr<const HipExecutor> exec,
         // init identity permutation
         array<IndexType> permutation_array(exec, nnz);
         auto permutation = permutation_array.get_data();
-        hipsparse::create_identity_permutation(handle, nnz, permutation);
+        components::fill_seq_array(exec, permutation, nnz);
 
         // allocate buffer
         size_type buffer_size{};
-        hipsparse::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs,
+        sparselib::csrsort_buffer_size(handle, m, n, nnz, row_ptrs, col_idxs,
                                        buffer_size);
         array<char> buffer_array{exec, buffer_size};
         auto buffer = buffer_array.get_data();
 
         // sort column indices
-        hipsparse::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs,
+        sparselib::csrsort(handle, m, n, nnz, descr, row_ptrs, col_idxs,
                            permutation, buffer);
 
         // sort values
-        hipsparse::gather(handle, nnz, tmp_vals, vals, permutation);
+        sparselib::gather(handle, nnz, tmp_vals, vals, permutation);
 
-        hipsparse::destroy(descr);
+        sparselib::destroy(descr);
     } else {
         fallback_sort(exec, to_sort);
     }
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
index 36e581049e0..8fed3c97c1b 100644
--- a/hip/matrix/dense_kernels.hip.cpp
+++ b/hip/matrix/dense_kernels.hip.cpp
@@ -5,9 +5,6 @@
 #include "core/matrix/dense_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -20,12 +17,13 @@
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/utils.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipblas_bindings.hip.hpp"
-#include "hip/base/pointer_mode_guard.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
@@ -56,11 +54,11 @@ void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                           matrix::Dense<ValueType>* result, array<char>& tmp)
 {
     if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
-        if (hipblas::is_supported<ValueType>::value) {
-            auto handle = exec->get_hipblas_handle();
-            hipblas::dot(handle, x->get_size()[0], x->get_const_values(),
-                         x->get_stride(), y->get_const_values(),
-                         y->get_stride(), result->get_values());
+        if (blas::is_supported<ValueType>::value) {
+            auto handle = exec->get_blas_handle();
+            blas::dot(handle, x->get_size()[0], x->get_const_values(),
+                      x->get_stride(), y->get_const_values(), y->get_stride(),
+                      result->get_values());
         } else {
             compute_dot(exec, x, y, result, tmp);
         }
@@ -81,11 +79,11 @@ void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                                array<char>& tmp)
 {
     if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
-        if (hipblas::is_supported<ValueType>::value) {
-            auto handle = exec->get_hipblas_handle();
-            hipblas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
-                              x->get_stride(), y->get_const_values(),
-                              y->get_stride(), result->get_values());
+        if (blas::is_supported<ValueType>::value) {
+            auto handle = exec->get_blas_handle();
+            blas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
+                           x->get_stride(), y->get_const_values(),
+                           y->get_stride(), result->get_values());
         } else {
             compute_conj_dot(exec, x, y, result, tmp);
         }
@@ -105,10 +103,10 @@ void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                             array<char>& tmp)
 {
     if (x->get_size()[1] == 1) {
-        if (hipblas::is_supported<ValueType>::value) {
-            auto handle = exec->get_hipblas_handle();
-            hipblas::norm2(handle, x->get_size()[0], x->get_const_values(),
-                           x->get_stride(), result->get_values());
+        if (blas::is_supported<ValueType>::value) {
+            auto handle = exec->get_blas_handle();
+            blas::norm2(handle, x->get_size()[0], x->get_const_values(),
+                        x->get_stride(), result->get_values());
         } else {
             compute_norm2(exec, x, result, tmp);
         }
@@ -127,19 +125,18 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* b,
                   matrix::Dense<ValueType>* c)
 {
-    if (hipblas::is_supported<ValueType>::value) {
-        auto handle = exec->get_hipblas_handle();
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
         if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
             if (a->get_size()[1] > 0) {
-                hipblas::pointer_mode_guard pm_guard(handle);
+                blas::pointer_mode_guard pm_guard(handle);
                 auto alpha = one<ValueType>();
                 auto beta = zero<ValueType>();
-                hipblas::gemm(handle, HIPBLAS_OP_N, HIPBLAS_OP_N,
-                              c->get_size()[1], c->get_size()[0],
-                              a->get_size()[1], &alpha, b->get_const_values(),
-                              b->get_stride(), a->get_const_values(),
-                              a->get_stride(), &beta, c->get_values(),
-                              c->get_stride());
+                blas::gemm(handle, BLAS_OP_N, BLAS_OP_N, c->get_size()[1],
+                           c->get_size()[0], a->get_size()[1], &alpha,
+                           b->get_const_values(), b->get_stride(),
+                           a->get_const_values(), a->get_stride(), &beta,
+                           c->get_values(), c->get_stride());
             } else {
                 dense::fill(exec, c, zero<ValueType>());
             }
@@ -158,15 +155,15 @@ void apply(std::shared_ptr<const DefaultExecutor> exec,
            const matrix::Dense<ValueType>* a, const matrix::Dense<ValueType>* b,
            const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
 {
-    if (hipblas::is_supported<ValueType>::value) {
+    if (blas::is_supported<ValueType>::value) {
         if (c->get_size()[0] > 0 && c->get_size()[1] > 0) {
             if (a->get_size()[1] > 0) {
-                hipblas::gemm(
-                    exec->get_hipblas_handle(), HIPBLAS_OP_N, HIPBLAS_OP_N,
-                    c->get_size()[1], c->get_size()[0], a->get_size()[1],
-                    alpha->get_const_values(), b->get_const_values(),
-                    b->get_stride(), a->get_const_values(), a->get_stride(),
-                    beta->get_const_values(), c->get_values(), c->get_stride());
+                blas::gemm(exec->get_blas_handle(), BLAS_OP_N, BLAS_OP_N,
+                           c->get_size()[1], c->get_size()[0], a->get_size()[1],
+                           alpha->get_const_values(), b->get_const_values(),
+                           b->get_stride(), a->get_const_values(),
+                           a->get_stride(), beta->get_const_values(),
+                           c->get_values(), c->get_stride());
             } else {
                 dense::scale(exec, beta, c);
             }
@@ -184,17 +181,17 @@ void transpose(std::shared_ptr<const DefaultExecutor> exec,
                const matrix::Dense<ValueType>* orig,
                matrix::Dense<ValueType>* trans)
 {
-    if (hipblas::is_supported<ValueType>::value) {
-        auto handle = exec->get_hipblas_handle();
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
         if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
-            hipblas::pointer_mode_guard pm_guard(handle);
+            blas::pointer_mode_guard pm_guard(handle);
             auto alpha = one<ValueType>();
             auto beta = zero<ValueType>();
-            hipblas::geam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N,
-                          orig->get_size()[0], orig->get_size()[1], &alpha,
-                          orig->get_const_values(), orig->get_stride(), &beta,
-                          trans->get_const_values(), trans->get_stride(),
-                          trans->get_values(), trans->get_stride());
+            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, orig->get_size()[0],
+                       orig->get_size()[1], &alpha, orig->get_const_values(),
+                       orig->get_stride(), &beta, trans->get_const_values(),
+                       trans->get_stride(), trans->get_values(),
+                       trans->get_stride());
         }
     } else {
         GKO_NOT_IMPLEMENTED;
@@ -209,17 +206,17 @@ void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* orig,
                     matrix::Dense<ValueType>* trans)
 {
-    if (hipblas::is_supported<ValueType>::value) {
-        auto handle = exec->get_hipblas_handle();
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
         if (orig->get_size()[0] > 0 && orig->get_size()[1] > 0) {
-            hipblas::pointer_mode_guard pm_guard(handle);
+            blas::pointer_mode_guard pm_guard(handle);
             auto alpha = one<ValueType>();
             auto beta = zero<ValueType>();
-            hipblas::geam(handle, HIPBLAS_OP_C, HIPBLAS_OP_N,
-                          orig->get_size()[0], orig->get_size()[1], &alpha,
-                          orig->get_const_values(), orig->get_stride(), &beta,
-                          trans->get_values(), trans->get_stride(),
-                          trans->get_values(), trans->get_stride());
+            blas::geam(handle, BLAS_OP_C, BLAS_OP_N, orig->get_size()[0],
+                       orig->get_size()[1], &alpha, orig->get_const_values(),
+                       orig->get_stride(), &beta, trans->get_const_values(),
+                       trans->get_stride(), trans->get_values(),
+                       trans->get_stride());
         }
     } else {
         GKO_NOT_IMPLEMENTED;
diff --git a/hip/matrix/diagonal_kernels.hip.cpp b/hip/matrix/diagonal_kernels.hip.cpp
index deedb9543ec..01033004c6b 100644
--- a/hip/matrix/diagonal_kernels.hip.cpp
+++ b/hip/matrix/diagonal_kernels.hip.cpp
@@ -5,16 +5,14 @@
 #include "core/matrix/diagonal_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "hip/base/hipsparse_bindings.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp
index 51c34430f5c..4f1ff6a3539 100644
--- a/hip/matrix/ell_kernels.hip.cpp
+++ b/hip/matrix/ell_kernels.hip.cpp
@@ -8,9 +8,6 @@
 #include <array>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -18,19 +15,20 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "accessor/hip_helper.hpp"
+#include "accessor/cuda_hip_helper.hpp"
 #include "accessor/reduced_row_major.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/format_conversion.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
@@ -133,20 +131,21 @@ void abstract_spmv(syn::value_list<int, info>,
         if (grid_size.x > 0 && grid_size.y > 0) {
             kernel::spmv<num_thread_per_worker, atomic>
                 <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    nrows, num_worker_per_row, acc::as_hip_range(a_vals),
+                    nrows, num_worker_per_row, acc::as_device_range(a_vals),
                     a->get_const_col_idxs(), stride,
-                    num_stored_elements_per_row, acc::as_hip_range(b_vals),
+                    num_stored_elements_per_row, acc::as_device_range(b_vals),
                     as_device_type(c->get_values()), c->get_stride());
         }
     } else if (alpha != nullptr && beta != nullptr) {
+        const auto alpha_val = acc::range<a_accessor>(
+            std::array<acc::size_type, 1>{1}, alpha->get_const_values());
         if (grid_size.x > 0 && grid_size.y > 0) {
-            const auto alpha_val = acc::range<a_accessor>(
-                std::array<acc::size_type, 1>{1}, alpha->get_const_values());
             kernel::spmv<num_thread_per_worker, atomic>
                 <<<grid_size, block_size, 0, exec->get_stream()>>>(
-                    nrows, num_worker_per_row, acc::as_hip_range(alpha_val),
-                    acc::as_hip_range(a_vals), a->get_const_col_idxs(), stride,
-                    num_stored_elements_per_row, acc::as_hip_range(b_vals),
+                    nrows, num_worker_per_row, acc::as_device_range(alpha_val),
+                    acc::as_device_range(a_vals), a->get_const_col_idxs(),
+                    stride, num_stored_elements_per_row,
+                    acc::as_device_range(b_vals),
                     as_device_type(beta->get_const_values()),
                     as_device_type(c->get_values()), c->get_stride());
         }
@@ -215,7 +214,7 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
     const int num_worker_per_row = std::get<2>(data);
 
     /**
-     * info is the parameter for selecting the hip kernel.
+     * info is the parameter for selecting the device kernel.
      * for info == 0, it uses the kernel by warp_size threads with atomic
      * operation for other value, it uses the kernel without atomic_add
      */
@@ -249,7 +248,7 @@ void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
     const int num_worker_per_row = std::get<2>(data);
 
     /**
-     * info is the parameter for selecting the hip kernel.
+     * info is the parameter for selecting the device kernel.
      * for info == 0, it uses the kernel by warp_size threads with atomic
      * operation for other value, it uses the kernel without atomic_add
      */
diff --git a/hip/matrix/fbcsr_kernels.template.hip.cpp b/hip/matrix/fbcsr_kernels.template.hip.cpp
index b84e7644e80..0286aff0bba 100644
--- a/hip/matrix/fbcsr_kernels.template.hip.cpp
+++ b/hip/matrix/fbcsr_kernels.template.hip.cpp
@@ -8,7 +8,6 @@
 #include <algorithm>
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/copy.h>
 #include <thrust/count.h>
 #include <thrust/device_ptr.h>
@@ -25,6 +24,13 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "common/unified/base/kernel_launch.hpp"
 #include "core/base/array_access.hpp"
 #include "core/base/block_sizes.hpp"
@@ -34,22 +40,17 @@
 #include "core/matrix/csr_lookup.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipblas_bindings.hip.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/hipsparse_block_bindings.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/pointer_mode_guard.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/merging.hip.hpp"
 #include "hip/components/prefix_sum.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
 
+
 namespace gko {
 namespace kernels {
 namespace hip {
@@ -82,15 +83,15 @@ void dense_transpose(std::shared_ptr<const HipExecutor> exec,
     if (nrows == 0) {
         return;
     }
-    if (hipblas::is_supported<ValueType>::value) {
-        auto handle = exec->get_hipblas_handle();
+    if (blas::is_supported<ValueType>::value) {
+        auto handle = exec->get_blas_handle();
         {
-            hipblas::pointer_mode_guard pm_guard(handle);
+            blas::pointer_mode_guard pm_guard(handle);
             auto alpha = one<ValueType>();
             auto beta = zero<ValueType>();
-            hipblas::geam(handle, HIPBLAS_OP_T, HIPBLAS_OP_N, nrows, ncols,
-                          &alpha, orig, orig_stride, &beta, trans, trans_stride,
-                          trans, trans_stride);
+            blas::geam(handle, BLAS_OP_T, BLAS_OP_N, nrows, ncols, &alpha, orig,
+                       orig_stride, &beta, trans, trans_stride, trans,
+                       trans_stride);
         }
     } else {
         GKO_NOT_IMPLEMENTED;
@@ -116,12 +117,12 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
         dense::fill(exec, c, zero<ValueType>());
         return;
     }
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_hipsparse_handle();
-        hipsparse::pointer_mode_guard pm_guard(handle);
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
+        sparselib::pointer_mode_guard pm_guard(handle);
         const auto alpha = one<ValueType>();
         const auto beta = zero<ValueType>();
-        auto descr = hipsparse::create_mat_descr();
+        auto descr = sparselib::create_mat_descr();
         const auto row_ptrs = a->get_const_row_ptrs();
         const auto col_idxs = a->get_const_col_idxs();
         const auto values = a->get_const_values();
@@ -135,21 +136,21 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
         const auto in_stride = b->get_stride();
         const auto out_stride = c->get_stride();
         if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
-            hipsparse::bsrmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, mb, nb,
+            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
                              nnzb, &alpha, descr, values, row_ptrs, col_idxs,
                              bs, b->get_const_values(), &beta, c->get_values());
         } else {
             const auto trans_stride = nrows;
             auto trans_c = array<ValueType>(exec, nrows * nrhs);
-            hipsparse::bsrmm(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
-                             HIPSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
+            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
+                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
                              &alpha, descr, values, row_ptrs, col_idxs, bs,
                              b->get_const_values(), in_stride, &beta,
                              trans_c.get_data(), trans_stride);
             dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
                             out_stride, c->get_values());
         }
-        hipsparse::destroy(descr);
+        sparselib::destroy(descr);
     } else {
         GKO_NOT_IMPLEMENTED;
     }
@@ -173,11 +174,11 @@ void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
         dense::scale(exec, beta, c);
         return;
     }
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
-        auto handle = exec->get_hipsparse_handle();
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        auto handle = exec->get_sparselib_handle();
         const auto alphp = alpha->get_const_values();
         const auto betap = beta->get_const_values();
-        auto descr = hipsparse::create_mat_descr();
+        auto descr = sparselib::create_mat_descr();
         const auto row_ptrs = a->get_const_row_ptrs();
         const auto col_idxs = a->get_const_col_idxs();
         const auto values = a->get_const_values();
@@ -191,7 +192,7 @@ void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
         const auto in_stride = b->get_stride();
         const auto out_stride = c->get_stride();
         if (nrhs == 1 && in_stride == 1 && out_stride == 1) {
-            hipsparse::bsrmv(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, mb, nb,
+            sparselib::bsrmv(handle, SPARSELIB_OPERATION_NON_TRANSPOSE, mb, nb,
                              nnzb, alphp, descr, values, row_ptrs, col_idxs, bs,
                              b->get_const_values(), betap, c->get_values());
         } else {
@@ -199,27 +200,83 @@ void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
             auto trans_c = array<ValueType>(exec, nrows * nrhs);
             dense_transpose(exec, nrows, nrhs, out_stride, c->get_values(),
                             trans_stride, trans_c.get_data());
-            hipsparse::bsrmm(handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
-                             HIPSPARSE_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
+            sparselib::bsrmm(handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
+                             SPARSELIB_OPERATION_TRANSPOSE, mb, nrhs, nb, nnzb,
                              alphp, descr, values, row_ptrs, col_idxs, bs,
                              b->get_const_values(), in_stride, betap,
                              trans_c.get_data(), trans_stride);
             dense_transpose(exec, nrhs, nrows, trans_stride, trans_c.get_data(),
                             out_stride, c->get_values());
         }
-        hipsparse::destroy(descr);
+        sparselib::destroy(descr);
     } else {
         GKO_NOT_IMPLEMENTED;
     }
 }
 
 
+namespace {
+
+// Launches kernel::transpose_blocks for a compile-time block size mat_blk_sz.
+template <int mat_blk_sz, typename ValueType, typename IndexType>
+void transpose_blocks_impl(syn::value_list<int, mat_blk_sz>,
+                           std::shared_ptr<const DefaultExecutor> exec,
+                           matrix::Fbcsr<ValueType, IndexType>* const mat)
+{
+    constexpr int subwarp_size = config::warp_size;  // threads per stored block
+    const auto nbnz = mat->get_num_stored_blocks();
+    const auto numthreads = nbnz * subwarp_size;
+    const auto block_size = default_block_size;
+    const auto grid_dim = ceildiv(numthreads, block_size);
+    if (grid_dim > 0) {  // skip the launch for an empty grid
+        kernel::transpose_blocks<mat_blk_sz, subwarp_size>
+            <<<grid_dim, block_size, 0, exec->get_stream()>>>(
+                nbnz, mat->get_values());
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_transpose_blocks,
+                                    transpose_blocks_impl);
+
+
+}  // namespace
+
+
 template <typename ValueType, typename IndexType>
 void transpose(const std::shared_ptr<const DefaultExecutor> exec,
-               const matrix::Fbcsr<ValueType, IndexType>* const input,
-               matrix::Fbcsr<ValueType, IndexType>* const output)
+               const matrix::Fbcsr<ValueType, IndexType>* const orig,
+               matrix::Fbcsr<ValueType, IndexType>* const trans)
 {
-    fallback_transpose(exec, input, output);
+#ifdef GKO_COMPILING_CUDA  // the branch below uses cusparse-only types
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        const int bs = orig->get_block_size();
+        const IndexType nnzb =
+            static_cast<IndexType>(orig->get_num_stored_blocks());
+        cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
+        cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
+        const IndexType buffer_size = sparselib::bsr_transpose_buffersize(
+            exec->get_sparselib_handle(), orig->get_num_block_rows(),
+            orig->get_num_block_cols(), nnzb, orig->get_const_values(),
+            orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs);
+        array<char> buffer_array(exec, buffer_size);
+        auto buffer = buffer_array.get_data();  // bsr_transpose workspace
+        sparselib::bsr_transpose(
+            exec->get_sparselib_handle(), orig->get_num_block_rows(),
+            orig->get_num_block_cols(), nnzb, orig->get_const_values(),
+            orig->get_const_row_ptrs(), orig->get_const_col_idxs(), bs, bs,
+            trans->get_values(), trans->get_col_idxs(), trans->get_row_ptrs(),
+            copyValues, idxBase, buffer);
+
+        // transpose each block of trans with the kernel compiled for size bs
+        select_transpose_blocks(
+            fixedblock::compiled_kernels(),
+            [bs](int compiled_block_size) { return bs == compiled_block_size; },
+            syn::value_list<int>(), syn::type_list<>(), exec, trans);
+    } else
+#endif  // GKO_COMPILING_CUDA
+    {  // reached without CUDA or when is_supported is false
+        fallback_transpose(exec, orig, trans);
+    }
 }
 
 
diff --git a/hip/matrix/fft_kernels.hip.cpp b/hip/matrix/fft_kernels.hip.cpp
index dc397b20892..31e180b4414 100644
--- a/hip/matrix/fft_kernels.hip.cpp
+++ b/hip/matrix/fft_kernels.hip.cpp
@@ -8,7 +8,6 @@
 #include <array>
 
 
-#include <hip/hip_runtime.h>
 #if HIP_VERSION >= 50200000
 #include <hipfft/hipfft.h>
 #else
@@ -21,6 +20,9 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+
+
 namespace gko {
 
 
diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp
index 8028dd0777f..f1e15c946e0 100644
--- a/hip/matrix/sellp_kernels.hip.cpp
+++ b/hip/matrix/sellp_kernels.hip.cpp
@@ -5,9 +5,6 @@
 #include "core/matrix/sellp_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
@@ -15,10 +12,11 @@
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/hip/matrix/sparsity_csr_kernels.hip.cpp
index e5a6900cdfe..487b134d28a 100644
--- a/hip/matrix/sparsity_csr_kernels.hip.cpp
+++ b/hip/matrix/sparsity_csr_kernels.hip.cpp
@@ -5,25 +5,25 @@
 #include "core/matrix/sparsity_csr_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/sort.h>
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
-#include "accessor/hip_helper.hpp"
+#include "accessor/cuda_hip_helper.hpp"
 #include "accessor/reduced_row_major.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/mixed_precision_types.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/components/format_conversion_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
@@ -42,7 +42,11 @@ namespace sparsity_csr {
 
 constexpr int classical_oversubscription = 32;
 constexpr int default_block_size = 512;
+#ifdef GKO_COMPILING_HIP
 constexpr int spmv_block_size = 256;
+#else  // GKO_COMPILING_CUDA
+constexpr int spmv_block_size = 128;
+#endif  // GKO_COMPILING_HIP
 constexpr int warps_in_block = 4;
 
 
@@ -106,16 +110,16 @@ void classical_spmv(syn::value_list<int, subwarp_size>,
                 a->get_size()[0], as_device_type(a->get_const_value()),
                 a->get_const_col_idxs(),
                 as_device_type(a->get_const_row_ptrs()),
-                acc::as_hip_range(b_vals), acc::as_hip_range(c_vals));
+                acc::as_device_range(b_vals), acc::as_device_range(c_vals));
     } else if (alpha != nullptr && beta != nullptr) {
         kernel::abstract_classical_spmv<subwarp_size>
             <<<grid, block, 0, exec->get_stream()>>>(
                 a->get_size()[0], as_device_type(alpha->get_const_values()),
                 as_device_type(a->get_const_value()), a->get_const_col_idxs(),
                 as_device_type(a->get_const_row_ptrs()),
-                acc::as_hip_range(b_vals),
+                acc::as_device_range(b_vals),
                 as_device_type(beta->get_const_values()),
-                acc::as_hip_range(c_vals));
+                acc::as_device_range(c_vals));
     } else {
         GKO_KERNEL_NOT_FOUND;
     }
@@ -169,21 +173,21 @@ void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_cols = static_cast<IndexType>(to_sort->get_size()[1]);
     const auto row_ptrs = to_sort->get_const_row_ptrs();
     const auto col_idxs = to_sort->get_col_idxs();
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
-        const auto handle = exec->get_hipsparse_handle();
-        auto descr = hipsparse::create_mat_descr();
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
+        const auto handle = exec->get_sparselib_handle();
+        auto descr = sparselib::create_mat_descr();
         array<IndexType> permutation_array(exec, to_sort->get_num_nonzeros());
         auto permutation = permutation_array.get_data();
         components::fill_seq_array(exec, permutation,
                                    to_sort->get_num_nonzeros());
         size_type buffer_size{};
-        hipsparse::csrsort_buffer_size(handle, num_rows, num_cols, nnz,
+        sparselib::csrsort_buffer_size(handle, num_rows, num_cols, nnz,
                                        row_ptrs, col_idxs, buffer_size);
         array<char> buffer_array{exec, buffer_size};
         auto buffer = buffer_array.get_data();
-        hipsparse::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs,
+        sparselib::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs,
                            col_idxs, permutation, buffer);
-        hipsparse::destroy(descr);
+        sparselib::destroy(descr);
     } else {
         fallback_sort(exec, to_sort);
     }
diff --git a/hip/multigrid/pgm_kernels.hip.cpp b/hip/multigrid/pgm_kernels.hip.cpp
index ed81d1c66dc..18c1f0957c4 100644
--- a/hip/multigrid/pgm_kernels.hip.cpp
+++ b/hip/multigrid/pgm_kernels.hip.cpp
@@ -19,8 +19,8 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/batch_preconditioners.hip.hpp b/hip/preconditioner/batch_preconditioners.hip.hpp
index 6d58244a41a..f3969c16b81 100644
--- a/hip/preconditioner/batch_preconditioners.hip.hpp
+++ b/hip/preconditioner/batch_preconditioners.hip.hpp
@@ -6,9 +6,9 @@
 #define GKO_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HIP_HPP_
 
 
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/preconditioner/batch_jacobi_helpers.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 
 
diff --git a/hip/preconditioner/isai_kernels.hip.cpp b/hip/preconditioner/isai_kernels.hip.cpp
index 7339bd0a754..4eaf65cc438 100644
--- a/hip/preconditioner/isai_kernels.hip.cpp
+++ b/hip/preconditioner/isai_kernels.hip.cpp
@@ -5,21 +5,18 @@
 #include "core/preconditioner/isai_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/merging.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
diff --git a/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp
similarity index 100%
rename from hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp
rename to hip/preconditioner/jacobi_advanced_apply_kernels.hip.cpp
diff --git a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
similarity index 94%
rename from hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp
rename to hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
index 326b9f6b720..358c6f3b337 100644
--- a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp
+++ b/hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.hip.cpp
@@ -5,20 +5,18 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/warp_blas.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
@@ -35,7 +33,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc"
+#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.hpp.inc"
 
 
 // clang-format off
diff --git a/hip/preconditioner/jacobi_common.hip.hpp.in b/hip/preconditioner/jacobi_common.hip.hpp.in
index 6e9c279a46f..2185e124db6 100644
--- a/hip/preconditioner/jacobi_common.hip.hpp.in
+++ b/hip/preconditioner/jacobi_common.hip.hpp.in
@@ -6,7 +6,7 @@
 #include <ginkgo/core/synthesizer/containers.hpp>
 
 
-#include "hip/base/config.hip.hpp"
+#include "common/cuda_hip/base/config.hpp"
 
 
 namespace gko {
diff --git a/hip/preconditioner/jacobi_generate_kernel.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.hip.cpp
similarity index 91%
rename from hip/preconditioner/jacobi_generate_kernel.hip.cpp
rename to hip/preconditioner/jacobi_generate_kernels.hip.cpp
index 713be193250..6365f6c132e 100644
--- a/hip/preconditioner/jacobi_generate_kernel.hip.cpp
+++ b/hip/preconditioner/jacobi_generate_kernels.hip.cpp
@@ -5,21 +5,19 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/diagonal_block_manipulation.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
@@ -38,7 +36,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc"
+#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc"
 
 
 template <int warps_per_block, int max_block_size, typename ValueType,
diff --git a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
similarity index 94%
rename from hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp
rename to hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
index 86a3b799590..4634f8a0c57 100644
--- a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp
+++ b/hip/preconditioner/jacobi_generate_kernels.instantiate.hip.cpp
@@ -9,14 +9,14 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/diagonal_block_manipulation.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
@@ -35,7 +35,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc"
+#include "common/cuda_hip/preconditioner/jacobi_generate_kernels.hpp.inc"
 
 
 // clang-format off
diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp
index 1646a7fb376..a3b2b7e5412 100644
--- a/hip/preconditioner/jacobi_kernels.hip.cpp
+++ b/hip/preconditioner/jacobi_kernels.hip.cpp
@@ -5,19 +5,17 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
 
@@ -34,9 +32,9 @@ namespace jacobi {
 
 
 // a total of 32/16 warps (1024 threads)
-#if GINKGO_HIP_PLATFORM_HCC
+#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
 constexpr int default_num_warps = 16;
-#else  // GINKGO_HIP_PLATFORM_NVCC
+#else  // !defined(GKO_COMPILING_HIP) || GINKGO_HIP_PLATFORM_NVCC
 constexpr int default_num_warps = 32;
 #endif
 // with current architectures, at most 32 warps can be scheduled per SM (and
diff --git a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
similarity index 93%
rename from hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp
rename to hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
index 0763e986d41..37b78f17469 100644
--- a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp
+++ b/hip/preconditioner/jacobi_simple_apply_kernels.hip.cpp
@@ -5,20 +5,18 @@
 #include "core/preconditioner/jacobi_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/warp_blas.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
@@ -35,7 +33,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc"
+#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc"
 
 
 template <int warps_per_block, int max_block_size, typename ValueType,
diff --git a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
similarity index 95%
rename from hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp
rename to hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
index be485af5730..421a32c3efc 100644
--- a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp
+++ b/hip/preconditioner/jacobi_simple_apply_kernels.instantiate.hip.cpp
@@ -8,14 +8,14 @@
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/extended_float.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/preconditioner/jacobi_utils.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/warp_blas.hip.hpp"
 #include "hip/preconditioner/jacobi_common.hip.hpp"
@@ -32,7 +32,7 @@ namespace hip {
 namespace jacobi {
 
 
-#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc"
+#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.hpp.inc"
 
 
 // clang-format off
diff --git a/hip/reorder/rcm_kernels.hip.cpp b/hip/reorder/rcm_kernels.hip.cpp
index 0c83c728e79..9a5739064eb 100644
--- a/hip/reorder/rcm_kernels.hip.cpp
+++ b/hip/reorder/rcm_kernels.hip.cpp
@@ -25,9 +25,9 @@
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "common/cuda_hip/components/memory.hpp"
 #include "core/base/array_access.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/components/memory.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
index c62c11405a5..fdeb0580931 100644
--- a/hip/solver/batch_bicgstab_kernels.hip.cpp
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -5,7 +5,6 @@
 #include "core/solver/batch_bicgstab_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
@@ -14,15 +13,16 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp
index d61eead6fab..47c2bc498eb 100644
--- a/hip/solver/batch_cg_kernels.hip.cpp
+++ b/hip/solver/batch_cg_kernels.hip.cpp
@@ -5,7 +5,6 @@
 #include "core/solver/batch_cg_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
@@ -14,15 +13,16 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
 #include "core/solver/batch_dispatch.hpp"
 #include "hip/base/batch_struct.hip.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/base/thrust.hip.hpp"
-#include "hip/base/types.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp
index 794ac9fd8a6..2f2df4ddf84 100644
--- a/hip/solver/cb_gmres_kernels.hip.cpp
+++ b/hip/solver/cb_gmres_kernels.hip.cpp
@@ -14,19 +14,19 @@
 #include <ginkgo/core/stop/stopping_status.hpp>
 
 
-#include "accessor/hip_helper.hpp"
+#include "accessor/cuda_hip_helper.hpp"
 #include "accessor/range.hpp"
 #include "accessor/reduced_row_major.hpp"
 #include "accessor/scaled_reduced_row_major.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/base/array_access.hpp"
 #include "core/components/fill_array_kernels.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/solver/cb_gmres_accessor.hpp"
-#include "hip/base/config.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
@@ -118,7 +118,7 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
     restart_1_kernel<block_size>
         <<<grid_dim_1, block_dim, 0, exec->get_stream()>>>(
             residual->get_size()[0], residual->get_size()[1], krylov_dim,
-            acc::as_hip_range(krylov_bases),
+            acc::as_device_range(krylov_bases),
             as_device_type(residual_norm_collection->get_values()),
             residual_norm_collection->get_stride());
     kernels::hip::dense::compute_norm2_dispatch(exec, residual, residual_norm,
@@ -147,7 +147,7 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
                 residual_norm->get_stride(),
                 as_device_type(arnoldi_norm->get_const_values() +
                                2 * stride_arnoldi),
-                stride_arnoldi, acc::as_hip_range(krylov_bases));
+                stride_arnoldi, acc::as_device_range(krylov_bases));
     }
 
     const auto grid_dim_2 =
@@ -160,7 +160,7 @@ void restart(std::shared_ptr<const DefaultExecutor> exec,
             residual->get_stride(),
             as_device_type(residual_norm->get_const_values()),
             as_device_type(residual_norm_collection->get_values()),
-            acc::as_hip_range(krylov_bases),
+            acc::as_device_range(krylov_bases),
             as_device_type(next_krylov_basis->get_values()),
             next_krylov_basis->get_stride(),
             as_device_type(final_iter_nums->get_data()));
@@ -214,7 +214,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
         as_device_type(next_krylov_basis->get_const_values()),
         stride_next_krylov, as_device_type(arnoldi_norm->get_values()),
         as_device_type(stop_status));
-    // nrmP = norm(next_krylov_basis
+    // nrmP = norm(next_krylov_basis)
     zero_matrix(exec, iter + 1, dim_size[1], stride_hessenberg,
                 hessenberg_iter->get_values());
     if (dim_size[1] > 1) {
@@ -222,7 +222,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
             <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
                 dim_size[0], dim_size[1],
                 as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov, acc::as_hip_range(krylov_bases),
+                stride_next_krylov, acc::as_device_range(krylov_bases),
                 as_device_type(hessenberg_iter->get_values()),
                 stride_hessenberg, as_device_type(stop_status));
     } else {
@@ -231,7 +231,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                exec->get_stream()>>>(
                 dim_size[0],
                 as_device_type(next_krylov_basis->get_const_values()),
-                stride_next_krylov, acc::as_hip_range(krylov_bases),
+                stride_next_krylov, acc::as_device_range(krylov_bases),
                 as_device_type(hessenberg_iter->get_values()),
                 stride_hessenberg, as_device_type(stop_status));
     }
@@ -243,7 +243,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
            default_block_size, 0, exec->get_stream()>>>(
             iter + 1, dim_size[0], dim_size[1],
             as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
-            acc::as_hip_range(krylov_bases),
+            acc::as_device_range(krylov_bases),
             as_device_type(hessenberg_iter->get_const_values()),
             stride_hessenberg, as_device_type(stop_status));
 
@@ -272,7 +272,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
            exec->get_stream()>>>(
             dim_size[1], as_device_type(arnoldi_norm->get_values()),
             stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
-            stride_hessenberg, iter + 1, acc::as_hip_range(krylov_bases),
+            stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases),
             as_device_type(stop_status), as_device_type(reorth_status),
             as_device_type(num_reorth->get_data()));
     num_reorth_host = get_element(*num_reorth, 0);
@@ -285,7 +285,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                 <<<grid_size_num_iters, block_size, 0, exec->get_stream()>>>(
                     dim_size[0], dim_size[1],
                     as_device_type(next_krylov_basis->get_const_values()),
-                    stride_next_krylov, acc::as_hip_range(krylov_bases),
+                    stride_next_krylov, acc::as_device_range(krylov_bases),
                     as_device_type(buffer_iter->get_values()), stride_buffer,
                     as_device_type(stop_status));
         } else {
@@ -294,7 +294,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                    exec->get_stream()>>>(
                     dim_size[0],
                     as_device_type(next_krylov_basis->get_const_values()),
-                    stride_next_krylov, acc::as_hip_range(krylov_bases),
+                    stride_next_krylov, acc::as_device_range(krylov_bases),
                     as_device_type(buffer_iter->get_values()), stride_buffer,
                     as_device_type(stop_status));
         }
@@ -306,7 +306,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                default_block_size, 0, exec->get_stream()>>>(
                 iter + 1, dim_size[0], dim_size[1],
                 as_device_type(next_krylov_basis->get_values()),
-                stride_next_krylov, acc::as_hip_range(krylov_bases),
+                stride_next_krylov, acc::as_device_range(krylov_bases),
                 as_device_type(hessenberg_iter->get_values()),
                 stride_hessenberg,
                 as_device_type(buffer_iter->get_const_values()), stride_buffer,
@@ -338,7 +338,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
                exec->get_stream()>>>(
                 dim_size[1], as_device_type(arnoldi_norm->get_values()),
                 stride_arnoldi, as_device_type(hessenberg_iter->get_values()),
-                stride_hessenberg, iter + 1, acc::as_hip_range(krylov_bases),
+                stride_hessenberg, iter + 1, acc::as_device_range(krylov_bases),
                 as_device_type(stop_status), as_device_type(reorth_status),
                 num_reorth->get_data());
         num_reorth_host = get_element(*num_reorth, 0);
@@ -350,7 +350,7 @@ void finish_arnoldi_CGS(std::shared_ptr<const DefaultExecutor> exec,
            default_block_size, 0, exec->get_stream()>>>(
             iter, dim_size[0], dim_size[1],
             as_device_type(next_krylov_basis->get_values()), stride_next_krylov,
-            acc::as_hip_range(krylov_bases),
+            acc::as_device_range(krylov_bases),
             as_device_type(hessenberg_iter->get_const_values()),
             stride_hessenberg, as_device_type(stop_status));
     // next_krylov_basis /= hessenberg(iter, iter + 1)
@@ -464,7 +464,7 @@ void calculate_qy(std::shared_ptr<const DefaultExecutor> exec,
 
     calculate_Qy_kernel<block_size>
         <<<grid_dim, block_dim, 0, exec->get_stream()>>>(
-            num_rows, num_cols, acc::as_hip_range(krylov_bases),
+            num_rows, num_cols, acc::as_device_range(krylov_bases),
             as_device_type(y->get_const_values()), y->get_stride(),
             as_device_type(before_preconditioner->get_values()),
             stride_before_preconditioner,
diff --git a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp
index d05bc1a9f6f..9fac4be8547 100644
--- a/hip/solver/common_trs_kernels.hip.hpp
+++ b/hip/solver/common_trs_kernels.hip.hpp
@@ -10,7 +10,6 @@
 #include <memory>
 
 
-#include <hip/hip_runtime.h>
 #if HIP_VERSION >= 50200000
 #include <hipsparse/hipsparse.h>
 #else
@@ -22,12 +21,13 @@
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/pointer_mode_guard.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
-#include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/pointer_mode_guard.hip.hpp"
-#include "hip/base/types.hip.hpp"
 
 
 namespace gko {
@@ -63,7 +63,7 @@ struct SolveStruct : gko::solver::SolveStruct {
             factor_descr, unit_diag ? HIPSPARSE_DIAG_TYPE_UNIT
                                     : HIPSPARSE_DIAG_TYPE_NON_UNIT));
         GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateCsrsv2Info(&solve_info));
-        policy = HIPSPARSE_SOLVE_POLICY_USE_LEVEL;
+        policy = SPARSELIB_SOLVE_POLICY_USE_LEVEL;
     }
 
     SolveStruct(const SolveStruct&) = delete;
@@ -114,18 +114,18 @@ void generate_kernel(std::shared_ptr<const HipExecutor> exec,
     if (matrix->get_size()[0] == 0) {
         return;
     }
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
         solve_struct =
             std::make_shared<solver::hip::SolveStruct>(is_upper, unit_diag);
         if (auto hip_solve_struct =
                 std::dynamic_pointer_cast<solver::hip::SolveStruct>(
                     solve_struct)) {
-            auto handle = exec->get_hipsparse_handle();
+            auto handle = exec->get_sparselib_handle();
 
             {
-                hipsparse::pointer_mode_guard pm_guard(handle);
-                hipsparse::csrsv2_buffer_size(
-                    handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                sparselib::pointer_mode_guard pm_guard(handle);
+                sparselib::csrsv2_buffer_size(
+                    handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
                     matrix->get_size()[0], matrix->get_num_stored_elements(),
                     hip_solve_struct->factor_descr, matrix->get_const_values(),
                     matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
@@ -139,8 +139,8 @@ void generate_kernel(std::shared_ptr<const HipExecutor> exec,
                 hip_solve_struct->factor_work_vec =
                     exec->alloc<void*>(hip_solve_struct->factor_work_size);
 
-                hipsparse::csrsv2_analysis(
-                    handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                sparselib::csrsv2_analysis(
+                    handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
                     matrix->get_size()[0], matrix->get_num_stored_elements(),
                     hip_solve_struct->factor_descr, matrix->get_const_values(),
                     matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
@@ -170,17 +170,17 @@ void solve_kernel(std::shared_ptr<const HipExecutor> exec,
     }
     using vec = matrix::Dense<ValueType>;
 
-    if (hipsparse::is_supported<ValueType, IndexType>::value) {
+    if (sparselib::is_supported<ValueType, IndexType>::value) {
         if (auto hip_solve_struct =
                 dynamic_cast<const solver::hip::SolveStruct*>(solve_struct)) {
             ValueType one = 1.0;
-            auto handle = exec->get_hipsparse_handle();
+            auto handle = exec->get_sparselib_handle();
 
             {
-                hipsparse::pointer_mode_guard pm_guard(handle);
+                sparselib::pointer_mode_guard pm_guard(handle);
                 if (b->get_stride() == 1) {
-                    hipsparse::csrsv2_solve(
-                        handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                    sparselib::csrsv2_solve(
+                        handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
                         matrix->get_size()[0],
                         matrix->get_num_stored_elements(), &one,
                         hip_solve_struct->factor_descr,
@@ -194,8 +194,8 @@ void solve_kernel(std::shared_ptr<const HipExecutor> exec,
                     dense::transpose(exec, b, trans_b);
                     dense::transpose(exec, x, trans_x);
                     for (IndexType i = 0; i < trans_b->get_size()[0]; i++) {
-                        hipsparse::csrsv2_solve(
-                            handle, HIPSPARSE_OPERATION_NON_TRANSPOSE,
+                        sparselib::csrsv2_solve(
+                            handle, SPARSELIB_OPERATION_NON_TRANSPOSE,
                             matrix->get_size()[0],
                             matrix->get_num_stored_elements(), &one,
                             hip_solve_struct->factor_descr,
diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp
index 83dbfe61f48..b1ef414c091 100644
--- a/hip/solver/idr_kernels.hip.cpp
+++ b/hip/solver/idr_kernels.hip.cpp
@@ -9,20 +9,19 @@
 #include <random>
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 
 
+#include "common/cuda_hip/base/blas_bindings.hpp"
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/randlib_bindings.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "core/components/fill_array_kernels.hpp"
-#include "hip/base/hipblas_bindings.hip.hpp"
-#include "hip/base/hiprand_bindings.hip.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
-#include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
@@ -71,14 +70,14 @@ void initialize_subspace_vectors(std::shared_ptr<const DefaultExecutor> exec,
                                  bool deterministic)
 {
     if (!deterministic) {
-        auto gen = hiprand::rand_generator(std::random_device{}(),
-                                           HIPRAND_RNG_PSEUDO_DEFAULT,
+        auto gen = randlib::rand_generator(std::random_device{}(),
+                                           RANDLIB_RNG_PSEUDO_DEFAULT,
                                            exec->get_stream());
-        hiprand::rand_vector(
+        randlib::rand_vector(
             gen,
             subspace_vectors->get_size()[0] * subspace_vectors->get_stride(),
             0.0, 1.0, subspace_vectors->get_values());
-        hiprand::destroy(gen);
+        randlib::destroy(gen);
     }
 }
 
@@ -147,9 +146,8 @@ void update_g_and_u(std::shared_ptr<const DefaultExecutor> exec,
                 as_device_type(alpha->get_values()),
                 stop_status->get_const_data());
         } else {
-            hipblas::dot(exec->get_hipblas_handle(), size, p_i, 1,
-                         g_k->get_values(), g_k->get_stride(),
-                         alpha->get_values());
+            blas::dot(exec->get_blas_handle(), size, p_i, 1, g_k->get_values(),
+                      g_k->get_stride(), alpha->get_values());
         }
         update_g_k_and_u_kernel<default_block_size>
             <<<ceildiv(size * g_k->get_stride(), default_block_size),
@@ -198,8 +196,8 @@ void update_m(std::shared_ptr<const DefaultExecutor> exec, const size_type nrhs,
                 as_device_type(g_k->get_const_values()), g_k->get_stride(),
                 as_device_type(m_i), stop_status->get_const_data());
         } else {
-            hipblas::dot(exec->get_hipblas_handle(), size, p_i, 1,
-                         g_k->get_const_values(), g_k->get_stride(), m_i);
+            blas::dot(exec->get_blas_handle(), size, p_i, 1,
+                      g_k->get_const_values(), g_k->get_stride(), m_i);
         }
     }
 }
diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp
index 08f35d3d674..d355940a487 100644
--- a/hip/solver/lower_trs_kernels.hip.cpp
+++ b/hip/solver/lower_trs_kernels.hip.cpp
@@ -8,7 +8,6 @@
 #include <memory>
 
 
-#include <hip/hip_runtime.h>
 #if HIP_VERSION >= 50200000
 #include <hipsparse/hipsparse.h>
 #else
@@ -21,9 +20,10 @@
 #include <ginkgo/core/solver/triangular.hpp>
 
 
-#include "hip/base/hipsparse_bindings.hip.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/solver/common_trs_kernels.hip.hpp"
 
 
diff --git a/hip/solver/multigrid_kernels.hip.cpp b/hip/solver/multigrid_kernels.hip.cpp
index 41aab8003bd..f68105ba6d8 100644
--- a/hip/solver/multigrid_kernels.hip.cpp
+++ b/hip/solver/multigrid_kernels.hip.cpp
@@ -5,18 +5,16 @@
 #include "core/solver/multigrid_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "core/base/array_access.hpp"
 #include "core/components/fill_array_kernels.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp
index cd6b0719844..2a31e450d27 100644
--- a/hip/solver/upper_trs_kernels.hip.cpp
+++ b/hip/solver/upper_trs_kernels.hip.cpp
@@ -8,7 +8,6 @@
 #include <memory>
 
 
-#include <hip/hip_runtime.h>
 #if HIP_VERSION >= 50200000
 #include <hipsparse/hipsparse.h>
 #else
@@ -21,9 +20,10 @@
 #include <ginkgo/core/solver/triangular.hpp>
 
 
-#include "hip/base/hipsparse_bindings.hip.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/sparselib_bindings.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/solver/common_trs_kernels.hip.hpp"
 
 
diff --git a/hip/stop/criterion_kernels.hip.cpp b/hip/stop/criterion_kernels.hip.cpp
index 8c7caeb32b8..3d24daa5bd5 100644
--- a/hip/stop/criterion_kernels.hip.cpp
+++ b/hip/stop/criterion_kernels.hip.cpp
@@ -10,8 +10,8 @@
 #include <ginkgo/core/stop/stopping_status.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp
index d790dd652f0..7f2b0646ea2 100644
--- a/hip/stop/residual_norm_kernels.hip.cpp
+++ b/hip/stop/residual_norm_kernels.hip.cpp
@@ -5,17 +5,15 @@
 #include "core/stop/residual_norm_kernels.hpp"
 
 
-#include <hip/hip_runtime.h>
-
-
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/stop/residual_norm.hpp>
 
 
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
 #include "core/base/array_access.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp
index 2c25f5b3a7a..8462cbe5716 100644
--- a/hip/test/base/math.hip.cpp
+++ b/hip/test/base/math.hip.cpp
@@ -23,8 +23,8 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
+#include "common/cuda_hip/base/types.hpp"
 #include "hip/base/math.hip.hpp"
-#include "hip/base/types.hip.hpp"
 #include "hip/test/utils.hip.hpp"
 
 
diff --git a/hip/test/components/cooperative_groups.hip.cpp b/hip/test/components/cooperative_groups.hip.cpp
index d22dfeca0b6..53f4b9a72a0 100644
--- a/hip/test/components/cooperative_groups.hip.cpp
+++ b/hip/test/components/cooperative_groups.hip.cpp
@@ -8,9 +8,6 @@
 // force-top: off
 
 
-#include "hip/components/cooperative_groups.hip.hpp"
-
-
 #include <cstring>
 #include <memory>
 
@@ -22,7 +19,8 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "hip/base/types.hip.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "hip/test/utils.hip.hpp"
 
 
diff --git a/hip/test/components/merging.hip.cpp b/hip/test/components/merging.hip.cpp
index 7bfab76f795..b8ee2f03d29 100644
--- a/hip/test/components/merging.hip.cpp
+++ b/hip/test/components/merging.hip.cpp
@@ -24,7 +24,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "hip/components/cooperative_groups.hip.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "hip/test/utils.hip.hpp"
 
 
diff --git a/hip/test/components/searching.hip.cpp b/hip/test/components/searching.hip.cpp
index 1db0c6e9562..2662d367f4d 100644
--- a/hip/test/components/searching.hip.cpp
+++ b/hip/test/components/searching.hip.cpp
@@ -23,7 +23,7 @@
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "hip/components/cooperative_groups.hip.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "hip/test/utils.hip.hpp"
 
 
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index c1e3f54a720..761405c0b3d 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -1600,14 +1600,29 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
      *
      * @return  the cublas handle (cublasContext*) for this executor
      */
-    cublasContext* get_cublas_handle() const { return cublas_handle_.get(); }
+    GKO_DEPRECATED("use get_blas_handle() instead")
+    cublasContext* get_cublas_handle() const { return get_blas_handle(); }
+
+    /**
+     * Get the cublas handle (cublasContext*) for this executor.
+     */
+    cublasContext* get_blas_handle() const { return cublas_handle_.get(); }
 
     /**
      * Get the cusparse handle for this executor
      *
      * @return the cusparse handle (cusparseContext*) for this executor
      */
+    GKO_DEPRECATED("use get_sparselib_handle() instead")
     cusparseContext* get_cusparse_handle() const
+    {
+        return get_sparselib_handle();
+    }
+
+    /**
+     * Get the cusparse handle (cusparseContext*) for this executor.
+     */
+    cusparseContext* get_sparselib_handle() const
     {
         return cusparse_handle_.get();
     }
@@ -1805,14 +1820,29 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
      *
      * @return  the hipblas handle (hipblasContext*) for this executor
      */
-    hipblasContext* get_hipblas_handle() const { return hipblas_handle_.get(); }
+    GKO_DEPRECATED("use get_blas_handle() instead")
+    hipblasContext* get_hipblas_handle() const { return get_blas_handle(); }
+
+    /**
+     * Get the hipblas handle (hipblasContext*) for this executor.
+     */
+    hipblasContext* get_blas_handle() const { return hipblas_handle_.get(); }
 
     /**
      * Get the hipsparse handle for this executor
      *
      * @return the hipsparse handle (hipsparseContext*) for this executor
      */
+    GKO_DEPRECATED("use get_sparselib_handle() instead")
     hipsparseContext* get_hipsparse_handle() const
+    {
+        return get_sparselib_handle();
+    }
+
+    /**
+     * Get the hipsparse handle (hipsparseContext*) for this executor.
+     */
+    hipsparseContext* get_sparselib_handle() const
     {
         return hipsparse_handle_.get();
     }
diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt
index 59d49e44140..41bec80673f 100644
--- a/omp/CMakeLists.txt
+++ b/omp/CMakeLists.txt
@@ -54,7 +54,7 @@ target_sources(ginkgo_omp
     )
 
 ginkgo_compile_features(ginkgo_omp)
-target_compile_definitions(ginkgo_omp PRIVATE GKO_COMPILING_OMP)
+target_compile_definitions(ginkgo_omp PRIVATE GKO_COMPILING_OMP GKO_DEVICE_NAMESPACE=omp)
 
 # TODO FIXME: Currently nvhpc 22.7+ optimizations break the omp jacobi's custom
 # precision implementation (mantissa segmentation)
@@ -94,7 +94,7 @@ ginkgo_default_includes(ginkgo_omp)
 ginkgo_install_library(ginkgo_omp)
 
 if (GINKGO_CHECK_CIRCULAR_DEPS)
-    ginkgo_check_headers(ginkgo_omp GKO_COMPILING_OMP)
+    ginkgo_check_headers(ginkgo_omp "GKO_COMPILING_OMP;GKO_DEVICE_NAMESPACE=omp")
 endif()
 
 if(GINKGO_BUILD_TESTS)
diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp
index ab15e1a99a3..07749d9bed2 100644
--- a/test/base/batch_multi_vector_kernels.cpp
+++ b/test/base/batch_multi_vector_kernels.cpp
@@ -312,8 +312,8 @@ TEST_F(MultiVector, CopySingleIsEquivalentToRef)
 
     gko::kernels::reference::batch_multi_vector::copy(this->ref, x.get(),
                                                       y.get());
-    gko::kernels::EXEC_NAMESPACE::batch_multi_vector::copy(this->exec, dx.get(),
-                                                           dy.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_multi_vector::copy(
+        this->exec, dx.get(), dy.get());
 
     GKO_ASSERT_BATCH_MTX_NEAR(dy, y, 0.0);
 }
@@ -325,8 +325,8 @@ TEST_F(MultiVector, CopyIsEquivalentToRef)
 
     gko::kernels::reference::batch_multi_vector::copy(this->ref, x.get(),
                                                       y.get());
-    gko::kernels::EXEC_NAMESPACE::batch_multi_vector::copy(this->exec, dx.get(),
-                                                           dy.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::batch_multi_vector::copy(
+        this->exec, dx.get(), dy.get());
 
     GKO_ASSERT_BATCH_MTX_NEAR(dy, y, 0.0);
 }
diff --git a/test/base/executor.cpp b/test/base/executor.cpp
index 8ea3b01fb24..541360d01d4 100644
--- a/test/base/executor.cpp
+++ b/test/base/executor.cpp
@@ -72,7 +72,7 @@ TEST_F(Executor, RunsCorrectOperation)
 
     exec->run(ExampleOperation(value));
 
-    ASSERT_EQ(EXEC_NAMESPACE::value, value);
+    ASSERT_EQ(GKO_DEVICE_NAMESPACE::value, value);
 }
 
 
@@ -104,7 +104,7 @@ TEST_F(Executor, RunsCorrectLambdaOperation)
 
     exec->run(omp_lambda, cuda_lambda, hip_lambda, dpcpp_lambda);
 
-    ASSERT_EQ(EXEC_NAMESPACE::value, value);
+    ASSERT_EQ(GKO_DEVICE_NAMESPACE::value, value);
 }
 
 
diff --git a/test/base/index_range.cpp b/test/base/index_range.cpp
index 044202fd8e2..b16b5fb9046 100644
--- a/test/base/index_range.cpp
+++ b/test/base/index_range.cpp
@@ -30,7 +30,7 @@ class IndexRange : public CommonTestFixture {
 void run_range_for(std::shared_ptr<gko::EXEC_TYPE> exec,
                    gko::array<int>& result_array)
 {
-    gko::kernels::EXEC_NAMESPACE::run_kernel(
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto result, auto size) {
             for (auto i : gko::irange<int>{size}) {
diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp
index 55e1268a77a..c746a5b3461 100644
--- a/test/base/kernel_launch_generic.cpp
+++ b/test/base/kernel_launch_generic.cpp
@@ -46,7 +46,7 @@ move_only_type move_only_val{};
 
 namespace gko {
 namespace kernels {
-namespace EXEC_NAMESPACE {
+namespace GKO_DEVICE_NAMESPACE {
 
 
 template <>
@@ -57,7 +57,7 @@ struct to_device_type_impl<move_only_type&> {
 };
 
 
-}  // namespace EXEC_NAMESPACE
+}  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
 }  // namespace gko
 
@@ -108,7 +108,7 @@ class KernelLaunch : public CommonTestFixture {
 // nvcc doesn't like device lambdas declared in complex classes, move it out
 void run1d(std::shared_ptr<gko::EXEC_TYPE> exec, size_type dim, int* data)
 {
-    gko::kernels::EXEC_NAMESPACE::run_kernel(
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto dummy) {
             static_assert(is_same<decltype(i), int64>::value, "index");
@@ -129,7 +129,7 @@ TEST_F(KernelLaunch, Runs1D)
 
 void run1d(std::shared_ptr<gko::EXEC_TYPE> exec, gko::array<int>& data)
 {
-    gko::kernels::EXEC_NAMESPACE::run_kernel(
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d_ptr, auto dummy) {
             static_assert(is_same<decltype(i), int64>::value, "index");
@@ -155,7 +155,7 @@ TEST_F(KernelLaunch, Runs1DArray)
 
 void run1d(std::shared_ptr<gko::EXEC_TYPE> exec, Mtx* m)
 {
-    gko::kernels::EXEC_NAMESPACE::run_kernel(
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr, auto dummy) {
             static_assert(is_same<decltype(i), int64>::value, "index");
@@ -193,7 +193,7 @@ TEST_F(KernelLaunch, Runs1DDense)
 
 void run2d(std::shared_ptr<gko::EXEC_TYPE> exec, int* data)
 {
-    gko::kernels::EXEC_NAMESPACE::run_kernel(
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto dummy) {
             static_assert(is_same<decltype(i), int64>::value, "index");
@@ -215,7 +215,7 @@ TEST_F(KernelLaunch, Runs2D)
 
 void run2d(std::shared_ptr<gko::EXEC_TYPE> exec, gko::array<int>& data)
 {
-    gko::kernels::EXEC_NAMESPACE::run_kernel(
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr, auto dummy) {
             static_assert(is_same<decltype(i), int64>::value, "index");
@@ -242,7 +242,7 @@ TEST_F(KernelLaunch, Runs2DArray)
 
 void run2d(std::shared_ptr<gko::EXEC_TYPE> exec, Mtx* m1, Mtx* m2, Mtx* m3)
 {
-    gko::kernels::EXEC_NAMESPACE::run_kernel_solver(
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_solver(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3,
                       auto d4, auto d2_ptr, auto d3_ptr, auto dummy) {
@@ -280,8 +280,8 @@ void run2d(std::shared_ptr<gko::EXEC_TYPE> exec, Mtx* m1, Mtx* m2, Mtx* m3)
         },
         dim<2>{4, 4}, m2->get_stride(), m1, static_cast<const Mtx*>(m1),
         m1->get_const_values(),
-        gko::kernels::EXEC_NAMESPACE::default_stride(m2),
-        gko::kernels::EXEC_NAMESPACE::row_vector(m3), m2->get_values(),
+        gko::kernels::GKO_DEVICE_NAMESPACE::default_stride(m2),
+        gko::kernels::GKO_DEVICE_NAMESPACE::row_vector(m3), m2->get_values(),
         m3->get_values(), move_only_val);
 }
 
@@ -297,7 +297,7 @@ void run1d_reduction(std::shared_ptr<gko::EXEC_TYPE> exec)
 {
     gko::array<int64> output{exec, {-1l}};
     auto run_reduction = [&](int64 init, size_type size) {
-        gko::kernels::EXEC_NAMESPACE::run_kernel_reduction(
+        gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_reduction(
             exec,
             [] GKO_KERNEL(auto i, auto a, auto dummy) {
                 static_assert(is_same<decltype(i), int64>::value, "index");
@@ -343,7 +343,7 @@ void run1d_reduction_cached(std::shared_ptr<gko::EXEC_TYPE> exec,
     gko::array<char> temp(exec);
     for (const auto& size : sizes) {
         temp.clear();
-        gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached(
+        gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_reduction_cached(
             exec, [] GKO_KERNEL(auto i) { return i + 1; },
             [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); },
             [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(),
@@ -366,7 +366,7 @@ void run2d_reduction(std::shared_ptr<gko::EXEC_TYPE> exec)
 {
     gko::array<int64> output{exec, {-1l}};
     auto run_reduction = [&](int64 init, gko::dim<2> size) {
-        gko::kernels::EXEC_NAMESPACE::run_kernel_reduction(
+        gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_reduction(
             exec,
             [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) {
                 static_assert(is_same<decltype(i), int64>::value, "index");
@@ -435,7 +435,7 @@ void run2d_reduction_cached(std::shared_ptr<gko::EXEC_TYPE> exec,
     gko::array<char> temp(exec);
     for (const auto& dim : dims) {
         temp.clear();
-        gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached(
+        gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_reduction_cached(
             exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; },
             [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); },
             [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(),
@@ -482,7 +482,7 @@ void run2d_row_reduction(std::shared_ptr<gko::EXEC_TYPE> exec)
                     static_cast<int64>(num_cols) * (num_cols + 1) * (i + 1);
             }
 
-            gko::kernels::EXEC_NAMESPACE::run_kernel_row_reduction(
+            gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_row_reduction(
                 exec,
                 [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) {
                     static_assert(is_same<decltype(i), int64>::value, "index");
@@ -527,7 +527,7 @@ void run2d_row_reduction_cached(std::shared_ptr<gko::EXEC_TYPE> exec,
             host_ref.get_data()[i] = dim[1] + i + 1;
         }
 
-        gko::kernels::EXEC_NAMESPACE::run_kernel_row_reduction_cached(
+        gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_row_reduction_cached(
             exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; },
             [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); },
             [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(),
@@ -576,7 +576,7 @@ void run2d_col_reduction(std::shared_ptr<gko::EXEC_TYPE> exec)
                     static_cast<int64>(num_rows) * (num_rows + 1) * (i + 1);
             }
 
-            gko::kernels::EXEC_NAMESPACE::run_kernel_col_reduction(
+            gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_col_reduction(
                 exec,
                 [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) {
                     static_assert(is_same<decltype(i), int64>::value, "index");
@@ -620,7 +620,7 @@ void run2d_col_reduction_cached(std::shared_ptr<gko::EXEC_TYPE> exec,
             host_ref.get_data()[i] = dim[0] + i + 1;
         }
 
-        gko::kernels::EXEC_NAMESPACE::run_kernel_col_reduction_cached(
+        gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel_col_reduction_cached(
             exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; },
             [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); },
             [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(),
diff --git a/test/components/absolute_array_kernels.cpp b/test/components/absolute_array_kernels.cpp
index 6e00ad6e185..08dd52f35e3 100644
--- a/test/components/absolute_array_kernels.cpp
+++ b/test/components/absolute_array_kernels.cpp
@@ -46,7 +46,7 @@ class AbsoluteArray : public CommonTestFixture {
 
 TEST_F(AbsoluteArray, InplaceEqualsReference)
 {
-    gko::kernels::EXEC_NAMESPACE::components::inplace_absolute_array(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::inplace_absolute_array(
         exec, dvals.get_data(), total_size);
     gko::kernels::reference::components::inplace_absolute_array(
         ref, vals.get_data(), total_size);
@@ -57,7 +57,7 @@ TEST_F(AbsoluteArray, InplaceEqualsReference)
 
 TEST_F(AbsoluteArray, InplaceComplexEqualsReference)
 {
-    gko::kernels::EXEC_NAMESPACE::components::inplace_absolute_array(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::inplace_absolute_array(
         exec, dcomplex_vals.get_data(), total_size);
     gko::kernels::reference::components::inplace_absolute_array(
         ref, complex_vals.get_data(), total_size);
@@ -71,7 +71,7 @@ TEST_F(AbsoluteArray, OutplaceEqualsReference)
     gko::array<value_type> abs_vals(ref, total_size);
     gko::array<value_type> dabs_vals(exec, total_size);
 
-    gko::kernels::EXEC_NAMESPACE::components::outplace_absolute_array(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::outplace_absolute_array(
         exec, dvals.get_const_data(), total_size, dabs_vals.get_data());
     gko::kernels::reference::components::outplace_absolute_array(
         ref, vals.get_const_data(), total_size, abs_vals.get_data());
@@ -85,7 +85,7 @@ TEST_F(AbsoluteArray, OutplaceComplexEqualsReference)
     gko::array<value_type> abs_vals(ref, total_size);
     gko::array<value_type> dabs_vals(exec, total_size);
 
-    gko::kernels::EXEC_NAMESPACE::components::outplace_absolute_array(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::outplace_absolute_array(
         exec, dcomplex_vals.get_const_data(), total_size, dabs_vals.get_data());
     gko::kernels::reference::components::outplace_absolute_array(
         ref, complex_vals.get_const_data(), total_size, abs_vals.get_data());
diff --git a/test/components/fill_array_kernels.cpp b/test/components/fill_array_kernels.cpp
index 9ccf63e5c88..3997c5830ea 100644
--- a/test/components/fill_array_kernels.cpp
+++ b/test/components/fill_array_kernels.cpp
@@ -47,7 +47,7 @@ TYPED_TEST_SUITE(FillArray, gko::test::ValueAndIndexTypes,
 TYPED_TEST(FillArray, EqualsReference)
 {
     using T = typename TestFixture::value_type;
-    gko::kernels::EXEC_NAMESPACE::components::fill_array(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::fill_array(
         this->exec, this->dvals.get_data(), this->total_size, T(1523));
 
     GKO_ASSERT_ARRAY_EQ(this->vals, this->dvals);
@@ -57,7 +57,7 @@ TYPED_TEST(FillArray, EqualsReference)
 TYPED_TEST(FillArray, FillSeqEqualsReference)
 {
     using T = typename TestFixture::value_type;
-    gko::kernels::EXEC_NAMESPACE::components::fill_seq_array(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::fill_seq_array(
         this->exec, this->dvals.get_data(), this->total_size);
 
     GKO_ASSERT_ARRAY_EQ(this->seqs, this->dvals);
diff --git a/test/components/format_conversion_kernels.cpp b/test/components/format_conversion_kernels.cpp
index fee77ea5986..053171ffbe2 100644
--- a/test/components/format_conversion_kernels.cpp
+++ b/test/components/format_conversion_kernels.cpp
@@ -63,7 +63,7 @@ TYPED_TEST(FormatConversion, ConvertsEmptyPtrsToIdxs)
     ptrs.fill(0);
     TypeParam* output = nullptr;
 
-    gko::kernels::EXEC_NAMESPACE::components::convert_ptrs_to_idxs(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_ptrs_to_idxs(
         this->exec, ptrs.get_const_data(), this->size, output);
 
     // mustn't segfault
@@ -75,7 +75,7 @@ TYPED_TEST(FormatConversion, ConvertPtrsToIdxs)
     auto ref_idxs = this->idxs;
     this->idxs.fill(-1);
 
-    gko::kernels::EXEC_NAMESPACE::components::convert_ptrs_to_idxs(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_ptrs_to_idxs(
         this->exec, this->ptrs.get_const_data(), this->size,
         this->idxs.get_data());
 
@@ -90,7 +90,7 @@ TYPED_TEST(FormatConversion, ConvertsEmptyIdxsToPtrs)
     this->ptrs.fill(-1);
     TypeParam* input = nullptr;
 
-    gko::kernels::EXEC_NAMESPACE::components::convert_idxs_to_ptrs(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_idxs_to_ptrs(
         this->exec, input, 0, this->size, this->ptrs.get_data());
 
     GKO_ASSERT_ARRAY_EQ(this->ptrs, ref_ptrs);
@@ -102,7 +102,7 @@ TYPED_TEST(FormatConversion, ConvertIdxsToPtrsIsEquivalentToRef)
     auto ref_ptrs = this->ptrs;
     this->ptrs.fill(-1);
 
-    gko::kernels::EXEC_NAMESPACE::components::convert_idxs_to_ptrs(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_idxs_to_ptrs(
         this->exec, this->idxs.get_const_data(), this->idxs.get_size(),
         this->size, this->ptrs.get_data());
 
@@ -115,7 +115,7 @@ TYPED_TEST(FormatConversion, ConvertPtrsToSizesIsEquivalentToRef)
     auto ref_sizes = this->sizes;
     this->sizes.fill(12345);
 
-    gko::kernels::EXEC_NAMESPACE::components::convert_ptrs_to_sizes(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::convert_ptrs_to_sizes(
         this->exec, this->ptrs.get_const_data(), this->size,
         this->sizes.get_data());
 
diff --git a/test/components/prefix_sum_kernels.cpp b/test/components/prefix_sum_kernels.cpp
index cf1777bb6ae..73cb0c7874e 100644
--- a/test/components/prefix_sum_kernels.cpp
+++ b/test/components/prefix_sum_kernels.cpp
@@ -57,7 +57,7 @@ TYPED_TEST(PrefixSum, EqualsReference)
         SCOPED_TRACE(size);
         gko::kernels::reference::components::prefix_sum_nonnegative(
             this->ref, this->vals.get_data(), size);
-        gko::kernels::EXEC_NAMESPACE::components::prefix_sum_nonnegative(
+        gko::kernels::GKO_DEVICE_NAMESPACE::components::prefix_sum_nonnegative(
             this->exec, this->dvals.get_data(), size);
 
         GKO_ASSERT_ARRAY_EQ(this->vals, this->dvals);
@@ -74,7 +74,7 @@ TYPED_TEST(PrefixSum, WorksCloseToOverflow)
                      std::is_unsigned<TypeParam>::value;
     gko::array<TypeParam> data{this->exec, I<TypeParam>({max - 1, 1, 0})};
 
-    gko::kernels::EXEC_NAMESPACE::components::prefix_sum_nonnegative(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::prefix_sum_nonnegative(
         this->exec, data.get_data(), data.get_size());
 
     GKO_ASSERT_ARRAY_EQ(data, I<TypeParam>({0, max - 1, max}));
@@ -86,7 +86,7 @@ TYPED_TEST(PrefixSum, DoesntOverflowFromLastElement)
     const auto max = std::numeric_limits<TypeParam>::max();
     gko::array<TypeParam> data{this->exec, I<TypeParam>({2, max - 1})};
 
-    gko::kernels::EXEC_NAMESPACE::components::prefix_sum_nonnegative(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::prefix_sum_nonnegative(
         this->exec, data.get_data(), data.get_size());
 
     GKO_ASSERT_ARRAY_EQ(data, I<TypeParam>({0, 2}));
@@ -103,7 +103,7 @@ TYPED_TEST(PrefixSum, ThrowsOnOverflow)
                                {max / 3, max / 2, max / 4, max / 3, max / 4}};
 
     ASSERT_THROW(
-        gko::kernels::EXEC_NAMESPACE::components::prefix_sum_nonnegative(
+        gko::kernels::GKO_DEVICE_NAMESPACE::components::prefix_sum_nonnegative(
             this->exec, data.get_data(), data.get_size()),
         gko::OverflowError);
 }
diff --git a/test/components/reduce_array_kernels.cpp b/test/components/reduce_array_kernels.cpp
index cd6c2a8d7bf..dfc2e046c84 100644
--- a/test/components/reduce_array_kernels.cpp
+++ b/test/components/reduce_array_kernels.cpp
@@ -50,7 +50,7 @@ TYPED_TEST(ReduceArray, EqualsReference)
 {
     gko::kernels::reference::components::reduce_add_array(this->ref, this->vals,
                                                           this->out);
-    gko::kernels::EXEC_NAMESPACE::components::reduce_add_array(
+    gko::kernels::GKO_DEVICE_NAMESPACE::components::reduce_add_array(
         this->exec, this->dvals, this->dout);
 
     GKO_ASSERT_ARRAY_EQ(this->out, this->dout);
diff --git a/test/distributed/index_map_kernels.cpp b/test/distributed/index_map_kernels.cpp
index 458ca594a56..cafd7b4da35 100644
--- a/test/distributed/index_map_kernels.cpp
+++ b/test/distributed/index_map_kernels.cpp
@@ -97,7 +97,7 @@ TEST_F(IndexMapBuildMapping, BuildMappingSameAsRef)
     gko::kernels::reference::index_map::build_mapping(
         ref, part.get(), query, target_ids, remote_local_idxs,
         remote_global_idxs, remote_sizes);
-    gko::kernels::EXEC_NAMESPACE::index_map::build_mapping(
+    gko::kernels::GKO_DEVICE_NAMESPACE::index_map::build_mapping(
         exec, dpart.get(), dquery, dtarget_ids, dremote_local_idxs,
         dremote_global_idxs, dremote_sizes);
 
@@ -136,7 +136,7 @@ class IndexMap : public CommonTestFixture {
         gko::kernels::reference::index_map::build_mapping(
             ref, part.get(), connections, target_ids, flat_remote_local_idxs,
             flat_remote_global_idxs, remote_sizes);
-        gko::kernels::EXEC_NAMESPACE::index_map::build_mapping(
+        gko::kernels::GKO_DEVICE_NAMESPACE::index_map::build_mapping(
             exec, dpart.get(), dconnections, dtarget_ids,
             dflat_remote_local_idxs, dflat_remote_global_idxs, dremote_sizes);
 
@@ -247,7 +247,7 @@ TEST_F(IndexMap, GetLocalWithLocalIndexSpaceSameAsRef)
         ref, part.get(), target_ids, to_device_const(remote_global_idxs),
         this_rank, query, gko::experimental::distributed::index_space::local,
         result);
-    gko::kernels::EXEC_NAMESPACE::index_map::map_to_local(
+    gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local(
         exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs),
         this_rank, dquery, gko::experimental::distributed::index_space::local,
         dresult);
@@ -275,7 +275,7 @@ TEST_F(IndexMap, GetLocalWithLocalIndexSpaceWithInvalidIndexSameAsRef)
         ref, part.get(), target_ids, to_device_const(remote_global_idxs),
         this_rank, query, gko::experimental::distributed::index_space::local,
         result);
-    gko::kernels::EXEC_NAMESPACE::index_map::map_to_local(
+    gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local(
         exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs),
         this_rank, dquery, gko::experimental::distributed::index_space::local,
         dresult);
@@ -304,7 +304,7 @@ TEST_F(IndexMap, GetLocalWithNonLocalIndexSpaceSameAsRef)
         ref, part.get(), target_ids, to_device_const(remote_global_idxs),
         this_rank, query,
         gko::experimental::distributed::index_space::non_local, result);
-    gko::kernels::EXEC_NAMESPACE::index_map::map_to_local(
+    gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local(
         exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs),
         this_rank, dquery,
         gko::experimental::distributed::index_space::non_local, dresult);
@@ -330,7 +330,7 @@ TEST_F(IndexMap, GetLocalWithNonLocalIndexSpaceWithInvalidIndexSameAsRef)
         ref, part.get(), target_ids, to_device_const(remote_global_idxs),
         this_rank, query,
         gko::experimental::distributed::index_space::non_local, result);
-    gko::kernels::EXEC_NAMESPACE::index_map::map_to_local(
+    gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local(
         exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs),
         this_rank, dquery,
         gko::experimental::distributed::index_space::non_local, dresult);
@@ -355,7 +355,7 @@ TEST_F(IndexMap, GetLocalWithCombinedIndexSpaceSameAsRef)
         ref, part.get(), target_ids, to_device_const(remote_global_idxs),
         this_rank, query, gko::experimental::distributed::index_space::combined,
         result);
-    gko::kernels::EXEC_NAMESPACE::index_map::map_to_local(
+    gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local(
         exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs),
         this_rank, dquery,
         gko::experimental::distributed::index_space::combined, dresult);
@@ -385,7 +385,7 @@ TEST_F(IndexMap, GetLocalWithCombinedIndexSpaceWithInvalidIndexSameAsRef)
         ref, part.get(), target_ids, to_device_const(remote_global_idxs),
         this_rank, query,
         gko::experimental::distributed::index_space::non_local, result);
-    gko::kernels::EXEC_NAMESPACE::index_map::map_to_local(
+    gko::kernels::GKO_DEVICE_NAMESPACE::index_map::map_to_local(
         exec, dpart.get(), dtarget_ids, to_device_const(dremote_global_idxs),
         this_rank, dquery,
         gko::experimental::distributed::index_space::non_local, dresult);
diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp
index 5e3677db2f4..8445aee6a0e 100644
--- a/test/distributed/matrix_kernels.cpp
+++ b/test/distributed/matrix_kernels.cpp
@@ -72,7 +72,7 @@ class Matrix : public CommonTestFixture {
                     ref, input, row_partition.get(), col_partition.get(), part,
                     local_row_idxs, local_col_idxs, local_values,
                     non_local_row_idxs, non_local_col_idxs, non_local_values);
-            gko::kernels::EXEC_NAMESPACE::distributed_matrix::
+            gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix::
                 separate_local_nonlocal(
                     exec, d_input, d_row_partition.get(), d_col_partition.get(),
                     part, d_local_row_idxs, d_local_col_idxs, d_local_values,
diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp
index 8121a720908..9e985ffec9e 100644
--- a/test/distributed/partition_helper_kernels.cpp
+++ b/test/distributed/partition_helper_kernels.cpp
@@ -147,8 +147,8 @@ TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRanges)
     auto offsets = make_array(this->exec, create_ranges<index_type>(100));
     bool result = false;
 
-    gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges(
-        this->exec, offsets, result);
+    gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::
+        check_consecutive_ranges(this->exec, offsets, result);
 
     ASSERT_TRUE(result);
 }
@@ -163,8 +163,8 @@ TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges)
         make_array(this->exec, remove_indices(full_range_ends, removal_idxs));
     bool result = true;
 
-    gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges(
-        this->exec, start_ends, result);
+    gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::
+        check_consecutive_ranges(this->exec, start_ends, result);
 
     ASSERT_FALSE(result);
 }
@@ -176,8 +176,8 @@ TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleRange)
     auto start_ends = make_array(this->ref, create_ranges<index_type>(1));
     bool result = false;
 
-    gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges(
-        this->exec, start_ends, result);
+    gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::
+        check_consecutive_ranges(this->exec, start_ends, result);
 
     ASSERT_TRUE(result);
 }
@@ -189,8 +189,8 @@ TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleElement)
     auto start_ends = gko::array<index_type>(this->exec, {1});
     bool result = false;
 
-    gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges(
-        this->exec, start_ends, result);
+    gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::
+        check_consecutive_ranges(this->exec, start_ends, result);
 
     ASSERT_TRUE(result);
 }
@@ -206,7 +206,7 @@ TYPED_TEST(PartitionHelpers, CanSortConsecutiveRanges)
     auto expected_start_ends = start_ends;
     auto expected_part_ids = part_ids_arr;
 
-    gko::kernels::EXEC_NAMESPACE::partition_helpers::sort_by_range_start(
+    gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::sort_by_range_start(
         this->exec, start_ends, part_ids_arr);
 
     GKO_ASSERT_ARRAY_EQ(expected_start_ends, start_ends);
@@ -227,7 +227,7 @@ TYPED_TEST(PartitionHelpers, CanSortNonConsecutiveRanges)
     auto part_ids_arr = gko::array<comm_index_type>(
         this->exec, shuffled.second.begin(), shuffled.second.end());
 
-    gko::kernels::EXEC_NAMESPACE::partition_helpers::sort_by_range_start(
+    gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::sort_by_range_start(
         this->exec, start_ends, part_ids_arr);
 
     GKO_ASSERT_ARRAY_EQ(expected_start_ends, start_ends);
@@ -242,7 +242,7 @@ TYPED_TEST(PartitionHelpers, CanCompressRanges)
     auto ranges = make_array(this->exec, create_ranges(expected_offsets));
     gko::array<index_type> offsets{this->exec, expected_offsets.size()};
 
-    gko::kernels::EXEC_NAMESPACE::partition_helpers::compress_ranges(
+    gko::kernels::GKO_DEVICE_NAMESPACE::partition_helpers::compress_ranges(
         this->exec, ranges, offsets);
 
     GKO_ASSERT_ARRAY_EQ(offsets, make_array(this->exec, expected_offsets));
diff --git a/test/distributed/vector_kernels.cpp b/test/distributed/vector_kernels.cpp
index e8e3d6a7e7b..86faca6b2b2 100644
--- a/test/distributed/vector_kernels.cpp
+++ b/test/distributed/vector_kernels.cpp
@@ -61,7 +61,7 @@ class Vector : public CommonTestFixture {
 
             gko::kernels::reference::distributed_vector::build_local(
                 ref, input, partition.get(), part, output.get());
-            gko::kernels::EXEC_NAMESPACE::distributed_vector::build_local(
+            gko::kernels::GKO_DEVICE_NAMESPACE::distributed_vector::build_local(
                 exec, d_input, d_partition.get(), part, d_output.get());
 
             GKO_ASSERT_MTX_NEAR(output, d_output, 0);
diff --git a/test/factorization/cholesky_kernels.cpp b/test/factorization/cholesky_kernels.cpp
index 82c59477fd8..c1d0a6c7336 100644
--- a/test/factorization/cholesky_kernels.cpp
+++ b/test/factorization/cholesky_kernels.cpp
@@ -150,7 +150,7 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicCount)
 
         gko::kernels::reference::cholesky::symbolic_count(
             this->ref, mtx.get(), *forest, row_nnz.get_data(), this->tmp);
-        gko::kernels::EXEC_NAMESPACE::cholesky::symbolic_count(
+        gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::symbolic_count(
             this->exec, dmtx.get(), *dforest, drow_nnz.get_data(), this->dtmp);
 
         GKO_ASSERT_ARRAY_EQ(drow_nnz, row_nnz);
@@ -189,12 +189,12 @@ TYPED_TEST(CholeskySymbolic, KernelSymbolicFactorize)
         std::unique_ptr<elimination_forest> dforest;
         gko::factorization::compute_elim_forest(dmtx.get(), dforest);
         gko::array<index_type> dtmp_ptrs{this->exec, num_rows + 1};
-        gko::kernels::EXEC_NAMESPACE::cholesky::symbolic_count(
+        gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::symbolic_count(
             this->exec, dmtx.get(), *dforest, dtmp_ptrs.get_data(), this->dtmp);
 
         gko::kernels::reference::cholesky::symbolic_factorize(
             this->ref, mtx.get(), *forest, l_factor.get(), this->tmp);
-        gko::kernels::EXEC_NAMESPACE::cholesky::symbolic_factorize(
+        gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::symbolic_factorize(
             this->exec, dmtx.get(), *dforest, dl_factor.get(), this->dtmp);
 
         GKO_ASSERT_MTX_EQ_SPARSITY(dl_factor, l_factor);
@@ -239,7 +239,7 @@ TYPED_TEST(CholeskySymbolic, KernelForestFromFactorWorks)
         elimination_forest dforest{this->exec,
                                    static_cast<index_type>(mtx->get_size()[0])};
 
-        gko::kernels::EXEC_NAMESPACE::cholesky::forest_from_factor(
+        gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::forest_from_factor(
             this->exec, dfactors.get(), dforest);
 
         this->assert_equal_forests(*forest, dforest);
@@ -367,7 +367,7 @@ TYPED_TEST(Cholesky, KernelInitializeIsEquivalentToRef)
     this->forall_matrices([this] {
         const auto nnz = this->mtx_chol->get_num_stored_elements();
         std::fill_n(this->mtx_chol->get_values(), nnz, gko::zero<value_type>());
-        gko::kernels::EXEC_NAMESPACE::components::fill_array(
+        gko::kernels::GKO_DEVICE_NAMESPACE::components::fill_array(
             this->exec, this->dmtx_chol->get_values(), nnz,
             gko::zero<value_type>());
         gko::array<index_type> diag_idxs{this->ref, this->num_rows};
@@ -380,7 +380,7 @@ TYPED_TEST(Cholesky, KernelInitializeIsEquivalentToRef)
             this->row_descs.get_const_data(), this->storage.get_const_data(),
             diag_idxs.get_data(), transpose_idxs.get_data(),
             this->mtx_chol.get());
-        gko::kernels::EXEC_NAMESPACE::cholesky::initialize(
+        gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::initialize(
             this->exec, this->dmtx.get(),
             this->dstorage_offsets.get_const_data(),
             this->drow_descs.get_const_data(), this->dstorage.get_const_data(),
@@ -410,7 +410,7 @@ TYPED_TEST(Cholesky, KernelFactorizeIsEquivalentToRef)
             this->row_descs.get_const_data(), this->storage.get_const_data(),
             diag_idxs.get_data(), transpose_idxs.get_data(),
             this->mtx_chol.get());
-        gko::kernels::EXEC_NAMESPACE::cholesky::initialize(
+        gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::initialize(
             this->exec, this->dmtx.get(),
             this->dstorage_offsets.get_const_data(),
             this->drow_descs.get_const_data(), this->dstorage.get_const_data(),
@@ -422,7 +422,7 @@ TYPED_TEST(Cholesky, KernelFactorizeIsEquivalentToRef)
             this->row_descs.get_const_data(), this->storage.get_const_data(),
             diag_idxs.get_const_data(), transpose_idxs.get_const_data(),
             *this->forest, this->mtx_chol.get(), tmp);
-        gko::kernels::EXEC_NAMESPACE::cholesky::factorize(
+        gko::kernels::GKO_DEVICE_NAMESPACE::cholesky::factorize(
             this->exec, this->dstorage_offsets.get_const_data(),
             this->drow_descs.get_const_data(), this->dstorage.get_const_data(),
             ddiag_idxs.get_const_data(), dtranspose_idxs.get_const_data(),
diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp
index fdcaa0cfad0..0ea06bed506 100644
--- a/test/factorization/lu_kernels.cpp
+++ b/test/factorization/lu_kernels.cpp
@@ -156,7 +156,7 @@ TYPED_TEST(Lu, KernelInitializeIsEquivalentToRef)
         std::fill_n(this->mtx_lu->get_values(),
                     this->mtx_lu->get_num_stored_elements(),
                     gko::zero<value_type>());
-        gko::kernels::EXEC_NAMESPACE::components::fill_array(
+        gko::kernels::GKO_DEVICE_NAMESPACE::components::fill_array(
             this->exec, this->dmtx_lu->get_values(),
             this->dmtx_lu->get_num_stored_elements(), gko::zero<value_type>());
         gko::array<index_type> diag_idxs{this->ref, this->num_rows};
@@ -166,7 +166,7 @@ TYPED_TEST(Lu, KernelInitializeIsEquivalentToRef)
             this->ref, this->mtx.get(), this->storage_offsets.get_const_data(),
             this->row_descs.get_const_data(), this->storage.get_const_data(),
             diag_idxs.get_data(), this->mtx_lu.get());
-        gko::kernels::EXEC_NAMESPACE::lu_factorization::initialize(
+        gko::kernels::GKO_DEVICE_NAMESPACE::lu_factorization::initialize(
             this->exec, this->dmtx.get(),
             this->dstorage_offsets.get_const_data(),
             this->drow_descs.get_const_data(), this->dstorage.get_const_data(),
@@ -191,7 +191,7 @@ TYPED_TEST(Lu, KernelFactorizeIsEquivalentToRef)
             this->ref, this->mtx.get(), this->storage_offsets.get_const_data(),
             this->row_descs.get_const_data(), this->storage.get_const_data(),
             diag_idxs.get_data(), this->mtx_lu.get());
-        gko::kernels::EXEC_NAMESPACE::lu_factorization::initialize(
+        gko::kernels::GKO_DEVICE_NAMESPACE::lu_factorization::initialize(
             this->exec, this->dmtx.get(),
             this->dstorage_offsets.get_const_data(),
             this->drow_descs.get_const_data(), this->dstorage.get_const_data(),
@@ -201,7 +201,7 @@ TYPED_TEST(Lu, KernelFactorizeIsEquivalentToRef)
             this->ref, this->storage_offsets.get_const_data(),
             this->row_descs.get_const_data(), this->storage.get_const_data(),
             diag_idxs.get_const_data(), this->mtx_lu.get(), tmp);
-        gko::kernels::EXEC_NAMESPACE::lu_factorization::factorize(
+        gko::kernels::GKO_DEVICE_NAMESPACE::lu_factorization::factorize(
             this->exec, this->dstorage_offsets.get_const_data(),
             this->drow_descs.get_const_data(), this->dstorage.get_const_data(),
             ddiag_idxs.get_const_data(), this->dmtx_lu.get(), dtmp);
diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp
index 57086a1550d..40a40b5acf5 100644
--- a/test/factorization/par_ic_kernels.cpp
+++ b/test/factorization/par_ic_kernels.cpp
@@ -100,7 +100,7 @@ TYPED_TEST(ParIc, KernelInitFactorIsEquivalentToRef)
 
     gko::kernels::reference::par_ic_factorization::init_factor(
         this->ref, this->mtx_l.get());
-    gko::kernels::EXEC_NAMESPACE::par_ic_factorization::init_factor(
+    gko::kernels::GKO_DEVICE_NAMESPACE::par_ic_factorization::init_factor(
         this->exec, this->dmtx_l.get());
 
     GKO_ASSERT_MTX_NEAR(this->mtx_l, this->dmtx_l, r<value_type>::value);
@@ -118,7 +118,7 @@ TYPED_TEST(ParIc, KernelComputeFactorIsEquivalentToRef)
 
     gko::kernels::reference::par_ic_factorization::compute_factor(
         this->ref, 1, mtx_l_coo.get(), this->mtx_l_ani_init.get());
-    gko::kernels::EXEC_NAMESPACE::par_ic_factorization::compute_factor(
+    gko::kernels::GKO_DEVICE_NAMESPACE::par_ic_factorization::compute_factor(
         this->exec, 100, dmtx_l_coo.get(), this->dmtx_l_ani_init.get());
 
     GKO_ASSERT_MTX_NEAR(this->mtx_l_ani_init, this->dmtx_l_ani_init, 1e-4);
diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp
index 254c2e4a40e..81d1dd83ffb 100644
--- a/test/factorization/par_ict_kernels.cpp
+++ b/test/factorization/par_ict_kernels.cpp
@@ -118,7 +118,7 @@ TYPED_TEST(ParIct, KernelAddCandidatesIsEquivalentToRef)
     gko::kernels::reference::par_ict_factorization::add_candidates(
         this->ref, mtx_llh.get(), this->mtx.get(), this->mtx_l.get(),
         res_mtx_l.get());
-    gko::kernels::EXEC_NAMESPACE::par_ict_factorization::add_candidates(
+    gko::kernels::GKO_DEVICE_NAMESPACE::par_ict_factorization::add_candidates(
         this->exec, dmtx_llh.get(), this->dmtx.get(), this->dmtx_l.get(),
         dres_mtx_l.get());
 
@@ -140,9 +140,9 @@ TYPED_TEST(ParIct, KernelComputeFactorIsEquivalentToRef)
     gko::kernels::reference::par_ict_factorization::compute_factor(
         this->ref, this->mtx_ani.get(), this->mtx_l_ani.get(), mtx_l_coo.get());
     for (int i = 0; i < 20; ++i) {
-        gko::kernels::EXEC_NAMESPACE::par_ict_factorization::compute_factor(
-            this->exec, this->dmtx_ani.get(), this->dmtx_l_ani.get(),
-            dmtx_l_coo.get());
+        gko::kernels::GKO_DEVICE_NAMESPACE::par_ict_factorization::
+            compute_factor(this->exec, this->dmtx_ani.get(),
+                           this->dmtx_l_ani.get(), dmtx_l_coo.get());
     }
 
     GKO_ASSERT_MTX_NEAR(this->mtx_l_ani, this->dmtx_l_ani, 1e-2);
diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp
index 94e2eb6512f..0d853af0745 100644
--- a/test/factorization/par_ilu_kernels.cpp
+++ b/test/factorization/par_ilu_kernels.cpp
@@ -89,8 +89,8 @@ class ParIlu : public CommonTestFixture {
     {
         gko::kernels::reference::factorization::initialize_row_ptrs_l_u(
             ref, mtx.get(), l_row_ptrs, u_row_ptrs);
-        gko::kernels::EXEC_NAMESPACE::factorization::initialize_row_ptrs_l_u(
-            exec, dmtx.get(), dl_row_ptrs, du_row_ptrs);
+        gko::kernels::GKO_DEVICE_NAMESPACE::factorization::
+            initialize_row_ptrs_l_u(exec, dmtx.get(), dl_row_ptrs, du_row_ptrs);
     }
 
     void initialize_lu(std::unique_ptr<Csr>& l, std::unique_ptr<Csr>& u,
@@ -121,7 +121,7 @@ class ParIlu : public CommonTestFixture {
 
         gko::kernels::reference::factorization::initialize_l_u(
             ref, mtx.get(), l.get(), u.get());
-        gko::kernels::EXEC_NAMESPACE::factorization::initialize_l_u(
+        gko::kernels::GKO_DEVICE_NAMESPACE::factorization::initialize_l_u(
             exec, dmtx.get(), dl.get(), du.get());
     }
 
@@ -139,7 +139,7 @@ class ParIlu : public CommonTestFixture {
 
         gko::kernels::reference::par_ilu_factorization::compute_l_u_factors(
             ref, iterations, coo.get(), l.get(), u_transpose_mtx.get());
-        gko::kernels::EXEC_NAMESPACE::par_ilu_factorization::
+        gko::kernels::GKO_DEVICE_NAMESPACE::par_ilu_factorization::
             compute_l_u_factors(exec, iterations, dcoo.get(), dl.get(),
                                 u_transpose_dmtx.get());
         auto u_lin_op = u_transpose_mtx->transpose();
@@ -160,7 +160,7 @@ TYPED_TEST(ParIlu, KernelAddDiagonalElementsSortedEquivalentToRef)
 
     gko::kernels::reference::factorization::add_diagonal_elements(
         this->ref, mtx.get(), true);
-    gko::kernels::EXEC_NAMESPACE::factorization::add_diagonal_elements(
+    gko::kernels::GKO_DEVICE_NAMESPACE::factorization::add_diagonal_elements(
         this->exec, dmtx.get(), true);
 
     ASSERT_TRUE(mtx->is_sorted_by_column_index());
@@ -176,7 +176,7 @@ TYPED_TEST(ParIlu, KernelAddDiagonalElementsUnsortedEquivalentToRef)
 
     gko::kernels::reference::factorization::add_diagonal_elements(
         this->ref, mtx.get(), false);
-    gko::kernels::EXEC_NAMESPACE::factorization::add_diagonal_elements(
+    gko::kernels::GKO_DEVICE_NAMESPACE::factorization::add_diagonal_elements(
         this->exec, dmtx.get(), false);
 
     ASSERT_FALSE(mtx->is_sorted_by_column_index());
@@ -193,7 +193,7 @@ TYPED_TEST(ParIlu, KernelAddDiagonalElementsNonSquareEquivalentToRef)
 
     gko::kernels::reference::factorization::add_diagonal_elements(
         this->ref, mtx.get(), true);
-    gko::kernels::EXEC_NAMESPACE::factorization::add_diagonal_elements(
+    gko::kernels::GKO_DEVICE_NAMESPACE::factorization::add_diagonal_elements(
         this->exec, dmtx.get(), true);
 
     ASSERT_TRUE(mtx->is_sorted_by_column_index());
diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp
index c4ad7fe412a..7d46f7979ac 100644
--- a/test/factorization/par_ilut_kernels.cpp
+++ b/test/factorization/par_ilut_kernels.cpp
@@ -151,8 +151,8 @@ class ParIlut : public CommonTestFixture {
 
         gko::kernels::reference::par_ilut_factorization::threshold_select(
             ref, mtx.get(), rank, tmp, tmp2, res);
-        gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::threshold_select(
-            exec, dmtx.get(), rank, dtmp, dtmp2, dres);
+        gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization::
+            threshold_select(exec, dmtx.get(), rank, dtmp, dtmp2, dres);
 
         ASSERT_NEAR(res, dres, tolerance);
     }
@@ -174,9 +174,9 @@ class ParIlut : public CommonTestFixture {
 
         gko::kernels::reference::par_ilut_factorization::threshold_filter(
             ref, local_mtx.get(), threshold, res.get(), res_coo.get(), lower);
-        gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::threshold_filter(
-            exec, local_dmtx.get(), threshold, dres.get(), dres_coo.get(),
-            lower);
+        gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization::
+            threshold_filter(exec, local_dmtx.get(), threshold, dres.get(),
+                             dres_coo.get(), lower);
 
         GKO_ASSERT_MTX_NEAR(res, dres, 0);
         GKO_ASSERT_MTX_EQ_SPARSITY(res, dres);
@@ -208,7 +208,7 @@ class ParIlut : public CommonTestFixture {
         gko::kernels::reference::par_ilut_factorization::
             threshold_filter_approx(ref, mtx.get(), rank, tmp, threshold,
                                     res.get(), res_coo.get());
-        gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::
+        gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization::
             threshold_filter_approx(exec, dmtx.get(), rank, dtmp, dthreshold,
                                     dres.get(), dres_coo.get());
 
@@ -283,8 +283,9 @@ TYPED_TEST(ParIlut, KernelThresholdFilterNullptrCooIsEquivalentToRef)
 
     gko::kernels::reference::par_ilut_factorization::threshold_filter(
         this->ref, this->mtx_l.get(), 0.5, res.get(), null_coo, true);
-    gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::threshold_filter(
-        this->exec, this->dmtx_l.get(), 0.5, dres.get(), null_coo, true);
+    gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization::
+        threshold_filter(this->exec, this->dmtx_l.get(), 0.5, dres.get(),
+                         null_coo, true);
 
     GKO_ASSERT_MTX_NEAR(res, dres, 0);
     GKO_ASSERT_MTX_EQ_SPARSITY(res, dres);
@@ -346,7 +347,7 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef)
     gko::kernels::reference::par_ilut_factorization::threshold_filter_approx(
         this->ref, this->mtx_l.get(), rank, tmp, threshold, res.get(),
         null_coo);
-    gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::
+    gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization::
         threshold_filter_approx(this->exec, this->dmtx_l.get(), rank, dtmp,
                                 dthreshold, dres.get(), null_coo);
 
@@ -393,7 +394,7 @@ TYPED_TEST(ParIlut, KernelAddCandidatesIsEquivalentToRef)
     gko::kernels::reference::par_ilut_factorization::add_candidates(
         this->ref, mtx_lu.get(), this->mtx_square.get(), this->mtx_l2.get(),
         this->mtx_u.get(), res_mtx_l.get(), res_mtx_u.get());
-    gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::add_candidates(
+    gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization::add_candidates(
         this->exec, dmtx_lu.get(), this->dmtx_square.get(), this->dmtx_l2.get(),
         this->dmtx_u.get(), dres_mtx_l.get(), dres_mtx_u.get());
 
@@ -422,7 +423,7 @@ TYPED_TEST(ParIlut, KernelComputeLUIsEquivalentToRef)
         this->ref, this->mtx_ani.get(), this->mtx_l_ani.get(), mtx_l_coo.get(),
         this->mtx_u_ani.get(), mtx_u_coo.get(), this->mtx_ut_ani.get());
     for (int i = 0; i < 20; ++i) {
-        gko::kernels::EXEC_NAMESPACE::par_ilut_factorization::
+        gko::kernels::GKO_DEVICE_NAMESPACE::par_ilut_factorization::
             compute_l_u_factors(this->exec, this->dmtx_ani.get(),
                                 this->dmtx_l_ani.get(), dmtx_l_coo.get(),
                                 this->dmtx_u_ani.get(), dmtx_u_coo.get(),
diff --git a/test/matrix/csr_kernels.cpp b/test/matrix/csr_kernels.cpp
index 347425175bb..d3a7bb8f8e5 100644
--- a/test/matrix/csr_kernels.cpp
+++ b/test/matrix/csr_kernels.cpp
@@ -149,7 +149,7 @@ void assert_lookup_correct(std::shared_ptr<const gko::EXEC_TYPE> exec,
     const auto row_ptrs = mtx->get_const_row_ptrs();
     const auto col_idxs = mtx->get_const_col_idxs();
     gko::array<bool> correct{exec, {true}};
-    gko::kernels::EXEC_NAMESPACE::run_kernel(
+    gko::kernels::GKO_DEVICE_NAMESPACE::run_kernel(
         exec,
         [] GKO_KERNEL(auto row, auto num_cols, auto row_ptrs, auto col_idxs,
                       auto storage_offsets, auto storage, auto row_descs,
@@ -215,7 +215,7 @@ TYPED_TEST(CsrLookup, BuildLookupWorks)
         // otherwise things might crash
         gko::kernels::reference::csr::build_lookup_offsets(
             this->ref, row_ptrs, col_idxs, num_rows, allowed, storage_offsets);
-        gko::kernels::EXEC_NAMESPACE::csr::build_lookup_offsets(
+        gko::kernels::GKO_DEVICE_NAMESPACE::csr::build_lookup_offsets(
             this->exec, drow_ptrs, dcol_idxs, num_rows, allowed,
             dstorage_offsets);
 
@@ -238,7 +238,7 @@ TYPED_TEST(CsrLookup, BuildLookupWorks)
         gko::kernels::reference::csr::build_lookup(
             this->ref, row_ptrs, col_idxs, num_rows, allowed, storage_offsets,
             row_descs, storage);
-        gko::kernels::EXEC_NAMESPACE::csr::build_lookup(
+        gko::kernels::GKO_DEVICE_NAMESPACE::csr::build_lookup(
             this->exec, drow_ptrs, dcol_idxs, num_rows, allowed,
             dstorage_offsets, drow_descs, dstorage);
 
diff --git a/test/matrix/csr_kernels2.cpp b/test/matrix/csr_kernels2.cpp
index 713593b4ae5..4ff8e749766 100644
--- a/test/matrix/csr_kernels2.cpp
+++ b/test/matrix/csr_kernels2.cpp
@@ -1346,7 +1346,7 @@ TEST_F(Csr, CalculateNnzPerRowInSpanIsEquivalentToRef)
 
     gko::kernels::reference::csr::calculate_nonzeros_per_row_in_span(
         this->ref, this->mtx2.get(), rspan, cspan, &row_nnz);
-    gko::kernels::EXEC_NAMESPACE::csr::calculate_nonzeros_per_row_in_span(
+    gko::kernels::GKO_DEVICE_NAMESPACE::csr::calculate_nonzeros_per_row_in_span(
         this->exec, this->dmtx2.get(), rspan, cspan, &drow_nnz);
 
     GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
@@ -1382,7 +1382,7 @@ TEST_F(Csr, ComputeSubmatrixIsEquivalentToRef)
 
     gko::kernels::reference::csr::compute_submatrix(this->ref, this->mtx2.get(),
                                                     rspan, cspan, smat1.get());
-    gko::kernels::EXEC_NAMESPACE::csr::compute_submatrix(
+    gko::kernels::GKO_DEVICE_NAMESPACE::csr::compute_submatrix(
         this->exec, this->dmtx2.get(), rspan, cspan, sdmat1.get());
 
     GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
@@ -1408,8 +1408,9 @@ TEST_F(Csr, CalculateNnzPerRowInIndexSetIsEquivalentToRef)
 
     gko::kernels::reference::csr::calculate_nonzeros_per_row_in_index_set(
         this->ref, this->mtx2.get(), rset, cset, row_nnz.get_data());
-    gko::kernels::EXEC_NAMESPACE::csr::calculate_nonzeros_per_row_in_index_set(
-        this->exec, this->dmtx2.get(), drset, dcset, drow_nnz.get_data());
+    gko::kernels::GKO_DEVICE_NAMESPACE::csr::
+        calculate_nonzeros_per_row_in_index_set(
+            this->exec, this->dmtx2.get(), drset, dcset, drow_nnz.get_data());
 
     GKO_ASSERT_ARRAY_EQ(row_nnz, drow_nnz);
 }
@@ -1446,7 +1447,7 @@ TEST_F(Csr, ComputeSubmatrixFromIndexSetIsEquivalentToRef)
 
     gko::kernels::reference::csr::compute_submatrix_from_index_set(
         this->ref, this->mtx2.get(), rset, cset, smat1.get());
-    gko::kernels::EXEC_NAMESPACE::csr::compute_submatrix_from_index_set(
+    gko::kernels::GKO_DEVICE_NAMESPACE::csr::compute_submatrix_from_index_set(
         this->exec, this->dmtx2.get(), drset, dcset, sdmat1.get());
 
     GKO_ASSERT_MTX_NEAR(sdmat1, smat1, 0.0);
@@ -1501,7 +1502,7 @@ TEST_F(Csr, CanDetectMissingDiagonalEntry)
     auto mtx = gko::clone(exec, ref_mtx);
     bool has_diags = true;
 
-    gko::kernels::EXEC_NAMESPACE::csr::check_diagonal_entries_exist(
+    gko::kernels::GKO_DEVICE_NAMESPACE::csr::check_diagonal_entries_exist(
         exec, mtx.get(), has_diags);
 
     ASSERT_FALSE(has_diags);
@@ -1516,7 +1517,7 @@ TEST_F(Csr, CanDetectWhenAllDiagonalEntriesArePresent)
     auto mtx = gko::clone(exec, ref_mtx);
     bool has_diags = true;
 
-    gko::kernels::EXEC_NAMESPACE::csr::check_diagonal_entries_exist(
+    gko::kernels::GKO_DEVICE_NAMESPACE::csr::check_diagonal_entries_exist(
         exec, mtx.get(), has_diags);
 
     ASSERT_TRUE(has_diags);
diff --git a/test/matrix/dense_kernels.cpp b/test/matrix/dense_kernels.cpp
index 25b82215dcd..56ca536187e 100644
--- a/test/matrix/dense_kernels.cpp
+++ b/test/matrix/dense_kernels.cpp
@@ -603,7 +603,7 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef)
 
     gko::kernels::reference::dense::count_nonzeros_per_row(
         ref, x.get(), nnz_per_row.get_data());
-    gko::kernels::EXEC_NAMESPACE::dense::count_nonzeros_per_row(
+    gko::kernels::GKO_DEVICE_NAMESPACE::dense::count_nonzeros_per_row(
         exec, dx.get(), dnnz_per_row.get_data());
 
     auto tmp = gko::array<gko::size_type>(ref, dnnz_per_row);
@@ -621,8 +621,8 @@ TEST_F(Dense, ComputeMaxNNZPerRowIsEquivalentToRef)
 
     gko::kernels::reference::dense::compute_max_nnz_per_row(ref, x.get(),
                                                             max_nnz);
-    gko::kernels::EXEC_NAMESPACE::dense::compute_max_nnz_per_row(exec, dx.get(),
-                                                                 dmax_nnz);
+    gko::kernels::GKO_DEVICE_NAMESPACE::dense::compute_max_nnz_per_row(
+        exec, dx.get(), dmax_nnz);
 
     ASSERT_EQ(max_nnz, dmax_nnz);
 }
@@ -2017,7 +2017,7 @@ TEST_F(Dense, ComputeNorm2SquaredIsEquivalentToRef)
 
     gko::kernels::reference::dense::compute_squared_norm2(
         ref, x.get(), norm_expected.get(), tmp);
-    gko::kernels::EXEC_NAMESPACE::dense::compute_squared_norm2(
+    gko::kernels::GKO_DEVICE_NAMESPACE::dense::compute_squared_norm2(
         exec, dx.get(), dnorm.get(), dtmp);
 
     GKO_ASSERT_MTX_NEAR(dnorm, norm_expected, r<value_type>::value);
@@ -2033,7 +2033,7 @@ TEST_F(Dense, ComputesSqrt)
     auto dmtx = gko::clone(exec, mtx);
 
     gko::kernels::reference::dense::compute_sqrt(ref, mtx.get());
-    gko::kernels::EXEC_NAMESPACE::dense::compute_sqrt(exec, dmtx.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::dense::compute_sqrt(exec, dmtx.get());
 
     GKO_ASSERT_MTX_NEAR(mtx, dmtx, r<value_type>::value);
 }
diff --git a/test/matrix/ell_kernels.cpp b/test/matrix/ell_kernels.cpp
index f6b9a9d1edb..b61d97a0a7a 100644
--- a/test/matrix/ell_kernels.cpp
+++ b/test/matrix/ell_kernels.cpp
@@ -533,7 +533,7 @@ TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef)
 
     gko::kernels::reference::ell::count_nonzeros_per_row(
         ref, mtx.get(), nnz_per_row.get_data());
-    gko::kernels::EXEC_NAMESPACE::ell::count_nonzeros_per_row(
+    gko::kernels::GKO_DEVICE_NAMESPACE::ell::count_nonzeros_per_row(
         exec, dmtx.get(), dnnz_per_row.get_data());
 
     GKO_ASSERT_ARRAY_EQ(nnz_per_row, dnnz_per_row);
diff --git a/test/matrix/sparsity_csr_kernels.cpp b/test/matrix/sparsity_csr_kernels.cpp
index 6fc3caf60ad..010bd7faa86 100644
--- a/test/matrix/sparsity_csr_kernels.cpp
+++ b/test/matrix/sparsity_csr_kernels.cpp
@@ -64,8 +64,8 @@ TEST_F(SparsityCsr, KernelDiagonalElementPrefixSumIsEquivalentToRef)
 
     gko::kernels::reference::sparsity_csr::diagonal_element_prefix_sum(
         ref, mtx.get(), prefix_sum.get_data());
-    gko::kernels::EXEC_NAMESPACE::sparsity_csr::diagonal_element_prefix_sum(
-        exec, dmtx.get(), dprefix_sum.get_data());
+    gko::kernels::GKO_DEVICE_NAMESPACE::sparsity_csr::
+        diagonal_element_prefix_sum(exec, dmtx.get(), dprefix_sum.get_data());
 
     GKO_ASSERT_ARRAY_EQ(prefix_sum, dprefix_sum);
 }
@@ -88,7 +88,7 @@ TEST_F(SparsityCsr, KernelRemoveDiagonalElementsIsEquivalentToRef)
     gko::kernels::reference::sparsity_csr::remove_diagonal_elements(
         ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
         prefix_sum.get_const_data(), out_mtx.get());
-    gko::kernels::EXEC_NAMESPACE::sparsity_csr::remove_diagonal_elements(
+    gko::kernels::GKO_DEVICE_NAMESPACE::sparsity_csr::remove_diagonal_elements(
         exec, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(),
         dprefix_sum.get_const_data(), dout_mtx.get());
 
diff --git a/test/multigrid/pgm_kernels.cpp b/test/multigrid/pgm_kernels.cpp
index a5f2d32fe32..10e5cf01a7a 100644
--- a/test/multigrid/pgm_kernels.cpp
+++ b/test/multigrid/pgm_kernels.cpp
@@ -159,8 +159,8 @@ TEST_F(Pgm, MatchEdgeIsEquivalentToRef)
     auto d_x = d_unfinished_agg;
 
     gko::kernels::reference::pgm::match_edge(ref, strongest_neighbor, x);
-    gko::kernels::EXEC_NAMESPACE::pgm::match_edge(exec, d_strongest_neighbor,
-                                                  d_x);
+    gko::kernels::GKO_DEVICE_NAMESPACE::pgm::match_edge(
+        exec, d_strongest_neighbor, d_x);
 
     GKO_ASSERT_ARRAY_EQ(d_x, x);
 }
@@ -173,8 +173,8 @@ TEST_F(Pgm, CountUnaggIsEquivalentToRef)
     index_type d_num_unagg;
 
     gko::kernels::reference::pgm::count_unagg(ref, unfinished_agg, &num_unagg);
-    gko::kernels::EXEC_NAMESPACE::pgm::count_unagg(exec, d_unfinished_agg,
-                                                   &d_num_unagg);
+    gko::kernels::GKO_DEVICE_NAMESPACE::pgm::count_unagg(exec, d_unfinished_agg,
+                                                         &d_num_unagg);
 
     ASSERT_EQ(d_num_unagg, num_unagg);
 }
@@ -187,7 +187,7 @@ TEST_F(Pgm, RenumberIsEquivalentToRef)
     index_type d_num_agg;
 
     gko::kernels::reference::pgm::renumber(ref, agg, &num_agg);
-    gko::kernels::EXEC_NAMESPACE::pgm::renumber(exec, d_agg, &d_num_agg);
+    gko::kernels::GKO_DEVICE_NAMESPACE::pgm::renumber(exec, d_agg, &d_num_agg);
 
     ASSERT_EQ(d_num_agg, num_agg);
     GKO_ASSERT_ARRAY_EQ(d_agg, agg);
@@ -203,7 +203,7 @@ TEST_F(Pgm, FindStrongestNeighborIsEquivalentToRef)
 
     gko::kernels::reference::pgm::find_strongest_neighbor(
         ref, weight_csr.get(), weight_diag.get(), agg, snb);
-    gko::kernels::EXEC_NAMESPACE::pgm::find_strongest_neighbor(
+    gko::kernels::GKO_DEVICE_NAMESPACE::pgm::find_strongest_neighbor(
         exec, d_weight_csr.get(), d_weight_diag.get(), d_agg, d_snb);
 
     GKO_ASSERT_ARRAY_EQ(d_snb, snb);
@@ -220,7 +220,7 @@ TEST_F(Pgm, AssignToExistAggIsEquivalentToRef)
 
     gko::kernels::reference::pgm::assign_to_exist_agg(
         ref, weight_csr.get(), weight_diag.get(), x, intermediate_agg);
-    gko::kernels::EXEC_NAMESPACE::pgm::assign_to_exist_agg(
+    gko::kernels::GKO_DEVICE_NAMESPACE::pgm::assign_to_exist_agg(
         exec, d_weight_csr.get(), d_weight_diag.get(), d_x, d_intermediate_agg);
 
     GKO_ASSERT_ARRAY_EQ(d_x, x);
@@ -234,9 +234,10 @@ TEST_F(Pgm, AssignToExistAggUnderteminsticIsEquivalentToRef)
     auto d_intermediate_agg = gko::array<index_type>(exec, 0);
     index_type d_num_unagg;
 
-    gko::kernels::EXEC_NAMESPACE::pgm::assign_to_exist_agg(
+    gko::kernels::GKO_DEVICE_NAMESPACE::pgm::assign_to_exist_agg(
         exec, d_weight_csr.get(), d_weight_diag.get(), d_x, d_intermediate_agg);
-    gko::kernels::EXEC_NAMESPACE::pgm::count_unagg(exec, d_agg, &d_num_unagg);
+    gko::kernels::GKO_DEVICE_NAMESPACE::pgm::count_unagg(exec, d_agg,
+                                                         &d_num_unagg);
 
     // only test whether all elements are aggregated.
     GKO_ASSERT_EQ(d_num_unagg, 0);
@@ -257,7 +258,7 @@ TEST_F(Pgm, GatherIndexIsEquivalentToRef)
     gko::kernels::reference::pgm::gather_index(ref, num, orig.get_const_data(),
                                                map.get_const_data(),
                                                result.get_data());
-    gko::kernels::EXEC_NAMESPACE::pgm::gather_index(
+    gko::kernels::GKO_DEVICE_NAMESPACE::pgm::gather_index(
         exec, num, d_orig.get_const_data(), d_map.get_const_data(),
         d_result.get_data());
 
diff --git a/test/preconditioner/batch_jacobi_kernels.cpp b/test/preconditioner/batch_jacobi_kernels.cpp
index 30dbfa271ee..f8a1bd015ef 100644
--- a/test/preconditioner/batch_jacobi_kernels.cpp
+++ b/test/preconditioner/batch_jacobi_kernels.cpp
@@ -117,7 +117,7 @@ class BatchJacobi : public CommonTestFixture {
                                   const gko::batch::BatchLinOp* prec,
                                   const Mtx* mtx, const MVec* b, MVec* x,
                                   LogData& log_data) {
-            gko::kernels::EXEC_NAMESPACE::batch_bicgstab::apply<
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::apply<
                 typename Mtx::value_type>(executor, settings, mtx, prec, b, x,
                                           log_data);
         };
diff --git a/test/preconditioner/isai_kernels.cpp b/test/preconditioner/isai_kernels.cpp
index 57f8c14ac27..6e737d31790 100644
--- a/test/preconditioner/isai_kernels.cpp
+++ b/test/preconditioner/isai_kernels.cpp
@@ -122,7 +122,7 @@ TEST_F(Isai, IsaiGenerateLinverseShortIsEquivalentToRef)
 
     gko::kernels::reference::isai::generate_tri_inverse(
         ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_tri_inverse(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_tri_inverse(
         exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
         true);
 
@@ -145,7 +145,7 @@ TEST_F(Isai, IsaiGenerateUinverseShortIsEquivalentToRef)
 
     gko::kernels::reference::isai::generate_tri_inverse(
         ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_tri_inverse(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_tri_inverse(
         exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
         false);
 
@@ -168,7 +168,7 @@ TEST_F(Isai, IsaiGenerateAinverseShortIsEquivalentToRef)
 
     gko::kernels::reference::isai::generate_general_inverse(
         ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_general_inverse(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_general_inverse(
         exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
         false);
 
@@ -191,7 +191,7 @@ TEST_F(Isai, IsaiGenerateSpdinverseShortIsEquivalentToRef)
 
     gko::kernels::reference::isai::generate_general_inverse(
         ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_general_inverse(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_general_inverse(
         exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
         true);
 
@@ -214,7 +214,7 @@ TEST_F(Isai, IsaiGenerateLinverseLongIsEquivalentToRef)
 
     gko::kernels::reference::isai::generate_tri_inverse(
         ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_tri_inverse(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_tri_inverse(
         exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
         true);
 
@@ -237,7 +237,7 @@ TEST_F(Isai, IsaiGenerateUinverseLongIsEquivalentToRef)
 
     gko::kernels::reference::isai::generate_tri_inverse(
         ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_tri_inverse(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_tri_inverse(
         exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
         false);
 
@@ -260,7 +260,7 @@ TEST_F(Isai, IsaiGenerateAinverseLongIsEquivalentToRef)
 
     gko::kernels::reference::isai::generate_general_inverse(
         ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_general_inverse(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_general_inverse(
         exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
         false);
 
@@ -283,7 +283,7 @@ TEST_F(Isai, IsaiGenerateSpdinverseLongIsEquivalentToRef)
 
     gko::kernels::reference::isai::generate_general_inverse(
         ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_general_inverse(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_general_inverse(
         exec, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(),
         false);
 
@@ -315,7 +315,7 @@ TEST_F(Isai, IsaiGenerateExcessLinverseLongIsEquivalentToRef)
     gko::kernels::reference::isai::generate_excess_system(
         ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(),
         excess.get(), e_rhs.get(), 0, num_rows);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system(
         exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(),
         da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows);
 
@@ -346,7 +346,7 @@ TEST_F(Isai, IsaiGenerateExcessUinverseLongIsEquivalentToRef)
     gko::kernels::reference::isai::generate_excess_system(
         ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(),
         excess.get(), e_rhs.get(), 0, num_rows);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system(
         exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(),
         da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows);
 
@@ -377,7 +377,7 @@ TEST_F(Isai, IsaiGenerateExcessAinverseLongIsEquivalentToRef)
     gko::kernels::reference::isai::generate_excess_system(
         ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(),
         excess.get(), e_rhs.get(), 0, num_rows);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system(
         exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(),
         da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows);
 
@@ -408,7 +408,7 @@ TEST_F(Isai, IsaiGenerateExcessSpdinverseLongIsEquivalentToRef)
     gko::kernels::reference::isai::generate_excess_system(
         ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(),
         excess.get(), e_rhs.get(), 0, num_rows);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system(
         exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(),
         da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows);
 
@@ -439,7 +439,7 @@ TEST_F(Isai, IsaiGeneratePartialExcessIsEquivalentToRef)
     gko::kernels::reference::isai::generate_excess_system(
         ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(),
         excess.get(), e_rhs.get(), 5u, 10u);
-    gko::kernels::EXEC_NAMESPACE::isai::generate_excess_system(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::generate_excess_system(
         exec, d_mtx.get(), d_inverse.get(), da1.get_const_data(),
         da2.get_const_data(), dexcess.get(), de_rhs.get(), 5u, 10u);
 
@@ -467,7 +467,7 @@ TEST_F(Isai, IsaiScaleExcessSolutionIsEquivalentToRef)
 
     gko::kernels::reference::isai::scale_excess_solution(
         ref, a1.get_const_data(), e_rhs.get(), 0, num_rows);
-    gko::kernels::EXEC_NAMESPACE::isai::scale_excess_solution(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::scale_excess_solution(
         exec, da1.get_const_data(), de_rhs.get(), 0, num_rows);
 
     GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0);
@@ -490,7 +490,7 @@ TEST_F(Isai, IsaiScalePartialExcessSolutionIsEquivalentToRef)
 
     gko::kernels::reference::isai::scale_excess_solution(
         ref, a1.get_const_data(), e_rhs.get(), 5u, 10u);
-    gko::kernels::EXEC_NAMESPACE::isai::scale_excess_solution(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::scale_excess_solution(
         exec, da1.get_const_data(), de_rhs.get(), 5u, 10u);
 
     GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0);
@@ -514,7 +514,7 @@ TEST_F(Isai, IsaiScatterExcessSolutionLIsEquivalentToRef)
 
     gko::kernels::reference::isai::scatter_excess_solution(
         ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows);
-    gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution(
         exec, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, num_rows);
 
     GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0);
@@ -540,7 +540,7 @@ TEST_F(Isai, IsaiScatterExcessSolutionUIsEquivalentToRef)
 
     gko::kernels::reference::isai::scatter_excess_solution(
         ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows);
-    gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution(
         exec, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, num_rows);
 
     GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0);
@@ -566,7 +566,7 @@ TEST_F(Isai, IsaiScatterExcessSolutionAIsEquivalentToRef)
 
     gko::kernels::reference::isai::scatter_excess_solution(
         ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows);
-    gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution(
         exec, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, num_rows);
 
     GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0);
@@ -592,7 +592,7 @@ TEST_F(Isai, IsaiScatterExcessSolutionSpdIsEquivalentToRef)
 
     gko::kernels::reference::isai::scatter_excess_solution(
         ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows);
-    gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution(
         exec, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, num_rows);
 
     GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0);
@@ -618,7 +618,7 @@ TEST_F(Isai, IsaiScatterPartialExcessSolutionIsEquivalentToRef)
 
     gko::kernels::reference::isai::scatter_excess_solution(
         ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 5u, 10u);
-    gko::kernels::EXEC_NAMESPACE::isai::scatter_excess_solution(
+    gko::kernels::GKO_DEVICE_NAMESPACE::isai::scatter_excess_solution(
         exec, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 5u, 10u);
 
     GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0);
diff --git a/test/solver/batch_bicgstab_kernels.cpp b/test/solver/batch_bicgstab_kernels.cpp
index 821a8a6d29c..14bca65e41f 100644
--- a/test/solver/batch_bicgstab_kernels.cpp
+++ b/test/solver/batch_bicgstab_kernels.cpp
@@ -52,7 +52,7 @@ class BatchBicgstab : public CommonTestFixture {
                                   const gko::batch::BatchLinOp* prec,
                                   const Mtx* mtx, const MVec* b, MVec* x,
                                   LogData& log_data) {
-            gko::kernels::EXEC_NAMESPACE::batch_bicgstab::apply<
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_bicgstab::apply<
                 typename Mtx::value_type>(executor, settings, mtx, prec, b, x,
                                           log_data);
         };
diff --git a/test/solver/batch_cg_kernels.cpp b/test/solver/batch_cg_kernels.cpp
index 49f0db2a09b..7c013020686 100644
--- a/test/solver/batch_cg_kernels.cpp
+++ b/test/solver/batch_cg_kernels.cpp
@@ -50,7 +50,7 @@ class BatchCg : public CommonTestFixture {
                                   const gko::batch::BatchLinOp* prec,
                                   const Mtx* mtx, const MVec* b, MVec* x,
                                   LogData& log_data) {
-            gko::kernels::EXEC_NAMESPACE::batch_cg::apply<
+            gko::kernels::GKO_DEVICE_NAMESPACE::batch_cg::apply<
                 typename Mtx::value_type>(executor, settings, mtx, prec, b, x,
                                           log_data);
         };
diff --git a/test/solver/bicg_kernels.cpp b/test/solver/bicg_kernels.cpp
index 616f7eff096..ab63b01f9cc 100644
--- a/test/solver/bicg_kernels.cpp
+++ b/test/solver/bicg_kernels.cpp
@@ -139,7 +139,7 @@ TEST_F(Bicg, BicgInitializeIsEquivalentToRef)
     gko::kernels::reference::bicg::initialize(
         ref, b.get(), r.get(), z.get(), p.get(), q.get(), prev_rho.get(),
         rho.get(), r2.get(), z2.get(), p2.get(), q2.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::bicg::initialize(
+    gko::kernels::GKO_DEVICE_NAMESPACE::bicg::initialize(
         exec, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(),
         d_prev_rho.get(), d_rho.get(), d_r2.get(), d_z2.get(), d_p2.get(),
         d_q2.get(), d_stop_status.get());
@@ -165,7 +165,7 @@ TEST_F(Bicg, BicgStep1IsEquivalentToRef)
     gko::kernels::reference::bicg::step_1(ref, p.get(), z.get(), p2.get(),
                                           z2.get(), rho.get(), prev_rho.get(),
                                           stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::bicg::step_1(
+    gko::kernels::GKO_DEVICE_NAMESPACE::bicg::step_1(
         exec, d_p.get(), d_z.get(), d_p2.get(), d_z2.get(), d_rho.get(),
         d_prev_rho.get(), d_stop_status.get());
 
@@ -183,7 +183,7 @@ TEST_F(Bicg, BicgStep2IsEquivalentToRef)
     gko::kernels::reference::bicg::step_2(
         ref, x.get(), r.get(), r2.get(), p.get(), q.get(), q2.get(), beta.get(),
         rho.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::bicg::step_2(
+    gko::kernels::GKO_DEVICE_NAMESPACE::bicg::step_2(
         exec, d_x.get(), d_r.get(), d_r2.get(), d_p.get(), d_q.get(),
         d_q2.get(), d_beta.get(), d_rho.get(), d_stop_status.get());
 
diff --git a/test/solver/bicgstab_kernels.cpp b/test/solver/bicgstab_kernels.cpp
index a63ff7f39f4..4f68edd6a8e 100644
--- a/test/solver/bicgstab_kernels.cpp
+++ b/test/solver/bicgstab_kernels.cpp
@@ -176,7 +176,7 @@ TEST_F(Bicgstab, BicgstabInitializeIsEquivalentToRef)
         ref, b.get(), r.get(), rr.get(), y.get(), s.get(), t.get(), z.get(),
         v.get(), p.get(), prev_rho.get(), rho.get(), alpha.get(), beta.get(),
         gamma.get(), omega.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::bicgstab::initialize(
+    gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::initialize(
         exec, d_b.get(), d_r.get(), d_rr.get(), d_y.get(), d_s.get(), d_t.get(),
         d_z.get(), d_v.get(), d_p.get(), d_prev_rho.get(), d_rho.get(),
         d_alpha.get(), d_beta.get(), d_gamma.get(), d_omega.get(),
@@ -207,7 +207,7 @@ TEST_F(Bicgstab, BicgstabStep1IsEquivalentToRef)
     gko::kernels::reference::bicgstab::step_1(
         ref, r.get(), p.get(), v.get(), rho.get(), prev_rho.get(), alpha.get(),
         omega.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::bicgstab::step_1(
+    gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::step_1(
         exec, d_r.get(), d_p.get(), d_v.get(), d_rho.get(), d_prev_rho.get(),
         d_alpha.get(), d_omega.get(), d_stop_status.get());
 
@@ -222,7 +222,7 @@ TEST_F(Bicgstab, BicgstabStep2IsEquivalentToRef)
     gko::kernels::reference::bicgstab::step_2(ref, r.get(), s.get(), v.get(),
                                               rho.get(), alpha.get(),
                                               beta.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::bicgstab::step_2(
+    gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::step_2(
         exec, d_r.get(), d_s.get(), d_v.get(), d_rho.get(), d_alpha.get(),
         d_beta.get(), d_stop_status.get());
 
@@ -238,7 +238,7 @@ TEST_F(Bicgstab, BicgstabStep3IsEquivalentToRef)
     gko::kernels::reference::bicgstab::step_3(
         ref, x.get(), r.get(), s.get(), t.get(), y.get(), z.get(), alpha.get(),
         beta.get(), gamma.get(), omega.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::bicgstab::step_3(
+    gko::kernels::GKO_DEVICE_NAMESPACE::bicgstab::step_3(
         exec, d_x.get(), d_r.get(), d_s.get(), d_t.get(), d_y.get(), d_z.get(),
         d_alpha.get(), d_beta.get(), d_gamma.get(), d_omega.get(),
         d_stop_status.get());
diff --git a/test/solver/cb_gmres_kernels.cpp b/test/solver/cb_gmres_kernels.cpp
index 4f854a26180..3b5f5956c2e 100644
--- a/test/solver/cb_gmres_kernels.cpp
+++ b/test/solver/cb_gmres_kernels.cpp
@@ -209,7 +209,7 @@ TEST_F(CbGmres, CbGmresInitialize1IsEquivalentToRef)
     gko::kernels::reference::cb_gmres::initialize(
         ref, b.get(), residual.get(), givens_sin.get(), givens_cos.get(),
         stop_status.get(), default_krylov_dim_mixed);
-    gko::kernels::EXEC_NAMESPACE::cb_gmres::initialize(
+    gko::kernels::GKO_DEVICE_NAMESPACE::cb_gmres::initialize(
         exec, d_b.get(), d_residual.get(), d_givens_sin.get(),
         d_givens_cos.get(), d_stop_status.get(), default_krylov_dim_mixed);
 
@@ -230,7 +230,7 @@ TEST_F(CbGmres, CbGmresInitialize2IsEquivalentToRef)
         residual_norm_collection.get(), arnoldi_norm.get(),
         range_helper.get_range(), next_krylov_basis.get(),
         final_iter_nums.get(), tmp, default_krylov_dim_mixed);
-    gko::kernels::EXEC_NAMESPACE::cb_gmres::restart(
+    gko::kernels::GKO_DEVICE_NAMESPACE::cb_gmres::restart(
         exec, d_residual.get(), d_residual_norm.get(),
         d_residual_norm_collection.get(), d_arnoldi_norm.get(),
         d_range_helper.get_range(), d_next_krylov_basis.get(),
@@ -255,7 +255,7 @@ TEST_F(CbGmres, CbGmresStep1IsEquivalentToRef)
         range_helper.get_range(), hessenberg_iter.get(), buffer_iter.get(),
         arnoldi_norm.get(), iter, final_iter_nums.get(), stop_status.get(),
         reorth_status.get(), num_reorth.get());
-    gko::kernels::EXEC_NAMESPACE::cb_gmres::arnoldi(
+    gko::kernels::GKO_DEVICE_NAMESPACE::cb_gmres::arnoldi(
         exec, d_next_krylov_basis.get(), d_givens_sin.get(), d_givens_cos.get(),
         d_residual_norm.get(), d_residual_norm_collection.get(),
         d_range_helper.get_range(), d_hessenberg_iter.get(),
@@ -285,7 +285,7 @@ TEST_F(CbGmres, CbGmresStep2IsEquivalentToRef)
         ref, residual_norm_collection.get(),
         range_helper.get_range().get_accessor().to_const(), hessenberg.get(),
         y.get(), before_preconditioner.get(), final_iter_nums.get());
-    gko::kernels::EXEC_NAMESPACE::cb_gmres::solve_krylov(
+    gko::kernels::GKO_DEVICE_NAMESPACE::cb_gmres::solve_krylov(
         exec, d_residual_norm_collection.get(),
         d_range_helper.get_range().get_accessor().to_const(),
         d_hessenberg.get(), d_y.get(), d_before_preconditioner.get(),
diff --git a/test/solver/cg_kernels.cpp b/test/solver/cg_kernels.cpp
index 41af16489a2..be9dc052314 100644
--- a/test/solver/cg_kernels.cpp
+++ b/test/solver/cg_kernels.cpp
@@ -114,7 +114,7 @@ TEST_F(Cg, CgInitializeIsEquivalentToRef)
     gko::kernels::reference::cg::initialize(ref, b.get(), r.get(), z.get(),
                                             p.get(), q.get(), prev_rho.get(),
                                             rho.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::cg::initialize(
+    gko::kernels::GKO_DEVICE_NAMESPACE::cg::initialize(
         exec, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(),
         d_prev_rho.get(), d_rho.get(), d_stop_status.get());
 
@@ -134,9 +134,9 @@ TEST_F(Cg, CgStep1IsEquivalentToRef)
 
     gko::kernels::reference::cg::step_1(ref, p.get(), z.get(), rho.get(),
                                         prev_rho.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::cg::step_1(exec, d_p.get(), d_z.get(),
-                                             d_rho.get(), d_prev_rho.get(),
-                                             d_stop_status.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::cg::step_1(
+        exec, d_p.get(), d_z.get(), d_rho.get(), d_prev_rho.get(),
+        d_stop_status.get());
 
     GKO_ASSERT_MTX_NEAR(d_p, p, ::r<value_type>::value);
     GKO_ASSERT_MTX_NEAR(d_z, z, ::r<value_type>::value);
@@ -149,9 +149,9 @@ TEST_F(Cg, CgStep2IsEquivalentToRef)
     gko::kernels::reference::cg::step_2(ref, x.get(), r.get(), p.get(), q.get(),
                                         beta.get(), rho.get(),
                                         stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::cg::step_2(exec, d_x.get(), d_r.get(),
-                                             d_p.get(), d_q.get(), d_beta.get(),
-                                             d_rho.get(), d_stop_status.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::cg::step_2(
+        exec, d_x.get(), d_r.get(), d_p.get(), d_q.get(), d_beta.get(),
+        d_rho.get(), d_stop_status.get());
 
     GKO_ASSERT_MTX_NEAR(d_x, x, ::r<value_type>::value);
     GKO_ASSERT_MTX_NEAR(d_r, r, ::r<value_type>::value);
diff --git a/test/solver/cgs_kernels.cpp b/test/solver/cgs_kernels.cpp
index 123f76727b5..6c2bab293e3 100644
--- a/test/solver/cgs_kernels.cpp
+++ b/test/solver/cgs_kernels.cpp
@@ -167,7 +167,7 @@ TEST_F(Cgs, CgsInitializeIsEquivalentToRef)
         ref, b.get(), r.get(), r_tld.get(), p.get(), q.get(), u.get(),
         u_hat.get(), v_hat.get(), t.get(), alpha.get(), beta.get(), gamma.get(),
         rho_prev.get(), rho.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::cgs::initialize(
+    gko::kernels::GKO_DEVICE_NAMESPACE::cgs::initialize(
         exec, d_b.get(), d_r.get(), d_r_tld.get(), d_p.get(), d_q.get(),
         d_u.get(), d_u_hat.get(), d_v_hat.get(), d_t.get(), d_alpha.get(),
         d_beta.get(), d_gamma.get(), d_rho_prev.get(), d_rho.get(),
@@ -197,7 +197,7 @@ TEST_F(Cgs, CgsStep1IsEquivalentToRef)
     gko::kernels::reference::cgs::step_1(ref, r.get(), u.get(), p.get(),
                                          q.get(), beta.get(), rho.get(),
                                          rho_prev.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::cgs::step_1(
+    gko::kernels::GKO_DEVICE_NAMESPACE::cgs::step_1(
         exec, d_r.get(), d_u.get(), d_p.get(), d_q.get(), d_beta.get(),
         d_rho.get(), d_rho_prev.get(), d_stop_status.get());
 
@@ -214,7 +214,7 @@ TEST_F(Cgs, CgsStep2IsEquivalentToRef)
     gko::kernels::reference::cgs::step_2(ref, u.get(), v_hat.get(), q.get(),
                                          t.get(), alpha.get(), rho.get(),
                                          gamma.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::cgs::step_2(
+    gko::kernels::GKO_DEVICE_NAMESPACE::cgs::step_2(
         exec, d_u.get(), d_v_hat.get(), d_q.get(), d_t.get(), d_alpha.get(),
         d_rho.get(), d_gamma.get(), d_stop_status.get());
 
@@ -231,7 +231,7 @@ TEST_F(Cgs, CgsStep3IsEquivalentToRef)
     gko::kernels::reference::cgs::step_3(ref, t.get(), u_hat.get(), r.get(),
                                          x.get(), alpha.get(),
                                          stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::cgs::step_3(
+    gko::kernels::GKO_DEVICE_NAMESPACE::cgs::step_3(
         exec, d_t.get(), d_u_hat.get(), d_r.get(), d_x.get(), d_alpha.get(),
         d_stop_status.get());
 
diff --git a/test/solver/fcg_kernels.cpp b/test/solver/fcg_kernels.cpp
index faf7225c883..f1f09f759bc 100644
--- a/test/solver/fcg_kernels.cpp
+++ b/test/solver/fcg_kernels.cpp
@@ -122,7 +122,7 @@ TEST_F(Fcg, FcgInitializeIsEquivalentToRef)
     gko::kernels::reference::fcg::initialize(
         ref, b.get(), r.get(), z.get(), p.get(), q.get(), t.get(),
         prev_rho.get(), rho.get(), rho_t.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::fcg::initialize(
+    gko::kernels::GKO_DEVICE_NAMESPACE::fcg::initialize(
         exec, d_b.get(), d_r.get(), d_z.get(), d_p.get(), d_q.get(), d_t.get(),
         d_prev_rho.get(), d_rho.get(), d_rho_t.get(), d_stop_status.get());
 
@@ -144,9 +144,9 @@ TEST_F(Fcg, FcgStep1IsEquivalentToRef)
 
     gko::kernels::reference::fcg::step_1(ref, p.get(), z.get(), rho_t.get(),
                                          prev_rho.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::fcg::step_1(exec, d_p.get(), d_z.get(),
-                                              d_rho_t.get(), d_prev_rho.get(),
-                                              d_stop_status.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::fcg::step_1(
+        exec, d_p.get(), d_z.get(), d_rho_t.get(), d_prev_rho.get(),
+        d_stop_status.get());
 
     GKO_ASSERT_MTX_NEAR(d_p, p, ::r<value_type>::value);
     GKO_ASSERT_MTX_NEAR(d_z, z, ::r<value_type>::value);
@@ -159,7 +159,7 @@ TEST_F(Fcg, FcgStep2IsEquivalentToRef)
     gko::kernels::reference::fcg::step_2(ref, x.get(), r.get(), t.get(),
                                          p.get(), q.get(), beta.get(),
                                          rho.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::fcg::step_2(
+    gko::kernels::GKO_DEVICE_NAMESPACE::fcg::step_2(
         exec, d_x.get(), d_r.get(), d_t.get(), d_p.get(), d_q.get(),
         d_beta.get(), d_rho.get(), d_stop_status.get());
 
diff --git a/test/solver/gcr_kernels.cpp b/test/solver/gcr_kernels.cpp
index 575d55ded87..7a00b3fed30 100644
--- a/test/solver/gcr_kernels.cpp
+++ b/test/solver/gcr_kernels.cpp
@@ -153,7 +153,7 @@ TEST_F(Gcr, GcrKernelInitializeIsEquivalentToRef)
 
     gko::kernels::reference::gcr::initialize(ref, b.get(), residual.get(),
                                              stop_status.get_data());
-    gko::kernels::EXEC_NAMESPACE::gcr::initialize(
+    gko::kernels::GKO_DEVICE_NAMESPACE::gcr::initialize(
         exec, d_b.get(), d_residual.get(), d_stop_status.get_data());
 
     GKO_ASSERT_MTX_NEAR(d_residual, residual, r<value_type>::value);
@@ -168,7 +168,7 @@ TEST_F(Gcr, GcrKernelRestartIsEquivalentToRef)
     gko::kernels::reference::gcr::restart(ref, residual.get(), A_residual.get(),
                                           p_bases.get(), Ap_bases.get(),
                                           final_iter_nums.get_data());
-    gko::kernels::EXEC_NAMESPACE::gcr::restart(
+    gko::kernels::GKO_DEVICE_NAMESPACE::gcr::restart(
         exec, d_residual.get(), d_A_residual.get(), d_p_bases.get(),
         d_Ap_bases.get(), d_final_iter_nums.get_data());
 
@@ -186,7 +186,7 @@ TEST_F(Gcr, GcrStep1IsEquivalentToRef)
     gko::kernels::reference::gcr::step_1(ref, x.get(), residual.get(), p.get(),
                                          Ap.get(), Ap_norm.get(), rAp.get(),
                                          stop_status.get_data());
-    gko::kernels::EXEC_NAMESPACE::gcr::step_1(
+    gko::kernels::GKO_DEVICE_NAMESPACE::gcr::step_1(
         exec, d_x.get(), d_residual.get(), d_p.get(), d_Ap.get(),
         d_Ap_norm.get(), d_rAp.get(), d_stop_status.get_data());
 
diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp
index ac9139d81aa..08259c91ce0 100644
--- a/test/solver/gmres_kernels.cpp
+++ b/test/solver/gmres_kernels.cpp
@@ -159,7 +159,7 @@ TEST_F(Gmres, GmresKernelInitializeIsEquivalentToRef)
     gko::kernels::reference::common_gmres::initialize(
         ref, b.get(), residual.get(), givens_sin.get(), givens_cos.get(),
         stop_status.get_data());
-    gko::kernels::EXEC_NAMESPACE::common_gmres::initialize(
+    gko::kernels::GKO_DEVICE_NAMESPACE::common_gmres::initialize(
         exec, d_b.get(), d_residual.get(), d_givens_sin.get(),
         d_givens_cos.get(), d_stop_status.get_data());
 
@@ -180,7 +180,7 @@ TEST_F(Gmres, GmresKernelRestartIsEquivalentToRef)
         ref, residual.get(), residual_norm.get(),
         residual_norm_collection.get(), krylov_bases.get(),
         final_iter_nums.get_data());
-    gko::kernels::EXEC_NAMESPACE::gmres::restart(
+    gko::kernels::GKO_DEVICE_NAMESPACE::gmres::restart(
         exec, d_residual.get(), d_residual_norm.get(),
         d_residual_norm_collection.get(), d_krylov_bases.get(),
         d_final_iter_nums.get_data());
@@ -202,7 +202,7 @@ TEST_F(Gmres, GmresKernelHessenbergQRIsEquivalentToRef)
         ref, givens_sin.get(), givens_cos.get(), residual_norm.get(),
         residual_norm_collection.get(), hessenberg_iter.get(), iter,
         final_iter_nums.get_data(), stop_status.get_const_data());
-    gko::kernels::EXEC_NAMESPACE::common_gmres::hessenberg_qr(
+    gko::kernels::GKO_DEVICE_NAMESPACE::common_gmres::hessenberg_qr(
         exec, d_givens_sin.get(), d_givens_cos.get(), d_residual_norm.get(),
         d_residual_norm_collection.get(), d_hessenberg_iter.get(), iter,
         d_final_iter_nums.get_data(), d_stop_status.get_const_data());
@@ -228,7 +228,7 @@ TEST_F(Gmres, GmresKernelHessenbergQROnSingleRHSIsEquivalentToRef)
         ref, givens_sin.get(), givens_cos.get(), residual_norm.get(),
         residual_norm_collection.get(), hessenberg_iter.get(), iter,
         final_iter_nums.get_data(), stop_status.get_const_data());
-    gko::kernels::EXEC_NAMESPACE::common_gmres::hessenberg_qr(
+    gko::kernels::GKO_DEVICE_NAMESPACE::common_gmres::hessenberg_qr(
         exec, d_givens_sin.get(), d_givens_cos.get(), d_residual_norm.get(),
         d_residual_norm_collection.get(), d_hessenberg_iter.get(), iter,
         d_final_iter_nums.get_data(), d_stop_status.get_const_data());
@@ -252,7 +252,7 @@ TEST_F(Gmres, GmresKernelSolveKrylovIsEquivalentToRef)
     gko::kernels::reference::common_gmres::solve_krylov(
         ref, residual_norm_collection.get(), hessenberg.get(), y.get(),
         final_iter_nums.get_const_data(), stop_status.get_const_data());
-    gko::kernels::EXEC_NAMESPACE::common_gmres::solve_krylov(
+    gko::kernels::GKO_DEVICE_NAMESPACE::common_gmres::solve_krylov(
         exec, d_residual_norm_collection.get(), d_hessenberg.get(), d_y.get(),
         d_final_iter_nums.get_const_data(), d_stop_status.get_const_data());
 
@@ -267,7 +267,7 @@ TEST_F(Gmres, GmresKernelMultiAxpyIsEquivalentToRef)
     gko::kernels::reference::gmres::multi_axpy(
         ref, krylov_bases.get(), y.get(), before_preconditioner.get(),
         final_iter_nums.get_const_data(), stop_status.get_data());
-    gko::kernels::EXEC_NAMESPACE::gmres::multi_axpy(
+    gko::kernels::GKO_DEVICE_NAMESPACE::gmres::multi_axpy(
         exec, d_krylov_bases.get(), d_y.get(), d_before_preconditioner.get(),
         d_final_iter_nums.get_const_data(), d_stop_status.get_data());
 
diff --git a/test/solver/idr_kernels.cpp b/test/solver/idr_kernels.cpp
index 31c7df99168..b165824dbe0 100644
--- a/test/solver/idr_kernels.cpp
+++ b/test/solver/idr_kernels.cpp
@@ -160,7 +160,7 @@ TEST_F(Idr, IdrInitializeIsEquivalentToRef)
 
     gko::kernels::reference::idr::initialize(ref, nrhs, m.get(), p.get(), true,
                                              stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::idr::initialize(
+    gko::kernels::GKO_DEVICE_NAMESPACE::idr::initialize(
         exec, nrhs, d_m.get(), d_p.get(), true, d_stop_status.get());
 
     GKO_ASSERT_MTX_NEAR(m, d_m, rr<value_type>::value);
@@ -176,7 +176,7 @@ TEST_F(Idr, IdrStep1IsEquivalentToRef)
     gko::kernels::reference::idr::step_1(ref, nrhs, k, m.get(), f.get(),
                                          r.get(), g.get(), c.get(), v.get(),
                                          stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::idr::step_1(
+    gko::kernels::GKO_DEVICE_NAMESPACE::idr::step_1(
         exec, nrhs, k, d_m.get(), d_f.get(), d_r.get(), d_g.get(), d_c.get(),
         d_v.get(), d_stop_status.get());
 
@@ -192,9 +192,9 @@ TEST_F(Idr, IdrStep2IsEquivalentToRef)
     gko::size_type k = 2;
     gko::kernels::reference::idr::step_2(ref, nrhs, k, omega.get(), v.get(),
                                          c.get(), u.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::idr::step_2(exec, nrhs, k, d_omega.get(),
-                                              d_v.get(), d_c.get(), d_u.get(),
-                                              d_stop_status.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::idr::step_2(
+        exec, nrhs, k, d_omega.get(), d_v.get(), d_c.get(), d_u.get(),
+        d_stop_status.get());
 
     GKO_ASSERT_MTX_NEAR(u, d_u, rr<value_type>::value);
 }
@@ -208,7 +208,7 @@ TEST_F(Idr, IdrStep3IsEquivalentToRef)
     gko::kernels::reference::idr::step_3(
         ref, nrhs, k, p.get(), g.get(), v.get(), u.get(), m.get(), f.get(),
         alpha.get(), r.get(), x.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::idr::step_3(
+    gko::kernels::GKO_DEVICE_NAMESPACE::idr::step_3(
         exec, nrhs, k, d_p.get(), d_g.get(), d_v.get(), d_u.get(), d_m.get(),
         d_f.get(), d_alpha.get(), d_r.get(), d_x.get(), d_stop_status.get());
 
@@ -230,7 +230,7 @@ TEST_F(Idr, IdrComputeOmegaIsEquivalentToRef)
     gko::kernels::reference::idr::compute_omega(ref, nrhs, kappa, tht.get(),
                                                 residual_norm.get(),
                                                 omega.get(), stop_status.get());
-    gko::kernels::EXEC_NAMESPACE::idr::compute_omega(
+    gko::kernels::GKO_DEVICE_NAMESPACE::idr::compute_omega(
         exec, nrhs, kappa, d_tht.get(), d_residual_norm.get(), d_omega.get(),
         d_stop_status.get());
 
diff --git a/test/solver/ir_kernels.cpp b/test/solver/ir_kernels.cpp
index 99550dfd99f..7a8e84324bd 100644
--- a/test/solver/ir_kernels.cpp
+++ b/test/solver/ir_kernels.cpp
@@ -55,7 +55,7 @@ TEST_F(Ir, InitializeIsEquivalentToRef)
     auto d_stop_status = gko::array<gko::stopping_status>(exec, stop_status);
 
     gko::kernels::reference::ir::initialize(ref, &stop_status);
-    gko::kernels::EXEC_NAMESPACE::ir::initialize(exec, &d_stop_status);
+    gko::kernels::GKO_DEVICE_NAMESPACE::ir::initialize(exec, &d_stop_status);
 
     auto tmp = gko::array<gko::stopping_status>(ref, d_stop_status);
     for (int i = 0; i < stop_status.get_size(); ++i) {
diff --git a/test/solver/multigrid_kernels.cpp b/test/solver/multigrid_kernels.cpp
index 139cb1a4647..4b4b0157df5 100644
--- a/test/solver/multigrid_kernels.cpp
+++ b/test/solver/multigrid_kernels.cpp
@@ -144,7 +144,7 @@ TEST_F(Multigrid, MultigridKCycleStep1IsEquivalentToRef)
 
     gko::kernels::reference::multigrid::kcycle_step_1(
         ref, alpha.get(), rho.get(), v.get(), g.get(), d.get(), e.get());
-    gko::kernels::EXEC_NAMESPACE::multigrid::kcycle_step_1(
+    gko::kernels::GKO_DEVICE_NAMESPACE::multigrid::kcycle_step_1(
         exec, d_alpha.get(), d_rho.get(), d_v.get(), d_g.get(), d_d.get(),
         d_e.get());
 
@@ -161,7 +161,7 @@ TEST_F(Multigrid, MultigridKCycleStep2IsEquivalentToRef)
     gko::kernels::reference::multigrid::kcycle_step_2(
         ref, alpha.get(), rho.get(), gamma.get(), beta.get(), zeta.get(),
         d.get(), e.get());
-    gko::kernels::EXEC_NAMESPACE::multigrid::kcycle_step_2(
+    gko::kernels::GKO_DEVICE_NAMESPACE::multigrid::kcycle_step_2(
         exec, d_alpha.get(), d_rho.get(), d_gamma.get(), d_beta.get(),
         d_zeta.get(), d_d.get(), d_e.get());
 
@@ -179,11 +179,11 @@ TEST_F(Multigrid, MultigridKCycleCheckStopIsEquivalentToRef)
 
     gko::kernels::reference::multigrid::kcycle_check_stop(
         ref, old_norm.get(), new_norm.get(), 1.0, is_stop_10);
-    gko::kernels::EXEC_NAMESPACE::multigrid::kcycle_check_stop(
+    gko::kernels::GKO_DEVICE_NAMESPACE::multigrid::kcycle_check_stop(
         exec, d_old_norm.get(), d_new_norm.get(), 1.0, d_is_stop_10);
     gko::kernels::reference::multigrid::kcycle_check_stop(
         ref, old_norm.get(), new_norm.get(), 0.5, is_stop_5);
-    gko::kernels::EXEC_NAMESPACE::multigrid::kcycle_check_stop(
+    gko::kernels::GKO_DEVICE_NAMESPACE::multigrid::kcycle_check_stop(
         exec, d_old_norm.get(), d_new_norm.get(), 0.5, d_is_stop_5);
 
     GKO_ASSERT_EQ(d_is_stop_10, is_stop_10);