diff --git a/.github/_typos.toml b/.github/_typos.toml
new file mode 100644
index 00000000000..4b9d9be6403
--- /dev/null
+++ b/.github/_typos.toml
@@ -0,0 +1,23 @@
+[files]
+extend-exclude = ["third_party/*", "*.svg"]
+
+[default.extend-words]
+dout = "dout"
+nd = "nd"
+tht = "tht"
+automatical = "automatical"
+strat = "strat"
+entrie = "entrie"
+agregate = "agregate" # since that script name is already in ginkgo-data repo
+
+[default.extend-identifiers]
+set_complex_subpsace = "set_complex_subpsace" # remove when deprecated function is gone
+HSA_HEADER = "HSA_HEADER"
+conj_operaton = "conj_operaton" # considered interface break in range.hpp
+imag_operaton = "imag_operaton" # considered interface break in range.hpp
+real_operaton = "real_operaton" # considered interface break in range.hpp
+one_operaton = "one_operaton" # considered interface break in range.hpp
+abs_operaton = "abs_operaton" # considered interface break in range.hpp
+max_operaton = "max_operaton" # considered interface break in range.hpp
+min_operaton = "min_operaton" # considered interface break in range.hpp
+squared_norm_operaton = "squared_norm_operaton" # considered interface break in range.hpp
diff --git a/.github/workflows/bot-pr-updated.yml b/.github/workflows/bot-pr-updated.yml
index ae357c9db96..8554ca3b1e9 100644
--- a/.github/workflows/bot-pr-updated.yml
+++ b/.github/workflows/bot-pr-updated.yml
@@ -28,7 +28,7 @@ jobs:
     runs-on: ubuntu-latest
     if: github.event.pull_request.author_association == 'COLLABORATOR' || github.event.pull_request.author_association == 'MEMBER' || github.event.pull_request.author_association == 'OWNER'
     env:
-      CMAKE_FLAGS: -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=DEBUG -DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF -DGINKGO_BUILD_BENCHMARKS=OFF -DGINKGO_BUILD_HWLOC=OFF -DGINKGO_BUILD_REFERENCE=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_DPCPP=OFF
+      CMAKE_FLAGS: -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=DEBUG -DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF -DGINKGO_BUILD_BENCHMARKS=OFF -DGINKGO_BUILD_HWLOC=OFF -DGINKGO_BUILD_REFERENCE=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_SYCL=OFF
     steps:
       - name: Checkout the new code (shallow clone)
         uses: actions/checkout@v3
diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml
index e612c72b7e7..db18b510e21 100644
--- a/.github/workflows/intel.yml
+++ b/.github/workflows/intel.yml
@@ -21,7 +21,8 @@ jobs:
       fail-fast: false
       matrix:
         config:
-        - {build_type: "Release", name: "intel/release/shared", "mixed": "ON"}
+        - {compiler: "dpcpp", build_type: "Release", name: "intel/dpcpp/release/shared", mixed: "ON"}
+        - {compiler: "icpx", build_type: "Release", name: "intel/icpx/release/shared", mixed: "OFF"}
     name: ${{ matrix.config.name }}
     runs-on: [gpu_intel]
 
@@ -35,9 +36,9 @@ jobs:
         spack find --loaded
         mkdir build
         cd build
-        cmake .. -DCMAKE_INSTALL_PREFIX=install_ginkgo -DCMAKE_CXX_COMPILER=dpcpp -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_DPCPP_SINGLE_MODE=ON
+        cmake .. -DCMAKE_INSTALL_PREFIX=install_ginkgo -DGINKGO_COMPILER_FLAGS="-ffp-model=precise" -DCMAKE_CXX_COMPILER=${{ matrix.config.compiler }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_MIXED_PRECISION=${{ matrix.config.mixed }} -DGINKGO_DPCPP_SINGLE_MODE=ON
         make -j8
-        SYCL_DEVICE_FILTER=level_zero ctest -j10 --output-on-failure
+        ONEAPI_DEVICE_SELECTOR=level_zero:gpu ctest -j10 --output-on-failure
 
     - name: install
       run: |
diff --git a/.github/workflows/spell_check.yml b/.github/workflows/spell_check.yml
new file mode 100644
index 00000000000..0049dce9180
--- /dev/null
+++ b/.github/workflows/spell_check.yml
@@ -0,0 +1,16 @@
+name: Test GitHub Action
+on:
+  pull_request:
+    types: [opened, synchronize]
+
+jobs:
+  run:
+    name: Spell Check with Typos
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - name: Check for typos
+      uses: crate-ci/typos@master
+      with:
+        config: .github/_typos.toml
+
diff --git a/.github/workflows/windows-msvc-ref.yml b/.github/workflows/windows-msvc-ref.yml
index f7d73e2fd82..87f9214876d 100644
--- a/.github/workflows/windows-msvc-ref.yml
+++ b/.github/workflows/windows-msvc-ref.yml
@@ -27,8 +27,10 @@ jobs:
       fail-fast: false
       matrix:
         config:
-        - {shared: "ON", build_type: "Debug", name: "reference/debug/shared"}
+        # Debug shared exceeds symbol limit
+        # - {shared: "ON", build_type: "Debug", name: "reference/debug/shared"}
         - {shared: "OFF", build_type: "Release", name: "reference/release/static"}
+        - {shared: "ON", build_type: "Release", name: "reference/release/shared"}
         # Debug static needs too much storage
         # - {shared: "OFF", build_type: "Debug", name: "reference/debug/static"}
     name: msvc/${{ matrix.config.name }}
diff --git a/.gitignore b/.gitignore
index af0a88ef513..827f4025a2e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,9 @@ compile_commands.json
 CTestTestfile.cmake
 build
 
+### Python
+__pycache__
+
 ### IDE
 # Clion
 .idea
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f81e271288c..ab78943a409 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -19,6 +19,13 @@ include:
   - local: '.gitlab/rules.yml'
   - local: '.gitlab/scripts.yml'
   - local: '.gitlab/variables.yml'
+  # This is a workaround to conditionally make the branch pipelines
+  # interruptible, because the flag does not directly support rules [1].
+  #
+  # [1] https://gitlab.com/gitlab-org/gitlab/-/issues/194023#note_1225906002
+  - local: '.gitlab/add-interrupt.yml'
+    rules:
+      - if: $CI_COMMIT_BRANCH != "master" && $CI_COMMIT_BRANCH != "develop" && $CI_COMMIT_TAG !~ /^v\d+\.\d+\.\d+/
 
 sync:
   stage: sync
@@ -86,20 +93,6 @@ trigger_pipeline:
 
 # Build jobs
 # Job with example runs.
-# cuda 9.2 and friends
-build/cuda92/nompi/gcc/all/release/shared:
-  extends:
-    - .build_and_test_template
-    - .default_variables
-    - .quick_test_condition
-    - .use_gko-cuda92-mvapich2-gnu7-llvm50-intel2017
-  variables:
-    BUILD_OMP: "ON"
-    BUILD_CUDA: "ON"
-    BUILD_HIP: "ON"
-    BUILD_TYPE: "Release"
-    CUDA_ARCH: 61
-
 # cuda 10.1 and friends
 # Build CUDA NVIDIA without omp
 # Make sure that our jobs run when HWLOC is
@@ -120,6 +113,7 @@ build/cuda101/nompi/clang/cuda_wo_omp/release/shared:
     CUDA_ARCH: 35
 
 # Job with example runs.
+# Also explicitly test PAPI SDE
 build/cuda101/openmpi/gcc/all/debug/shared:
   extends:
     - .build_template
@@ -133,6 +127,7 @@ build/cuda101/openmpi/gcc/all/debug/shared:
     MPI_AS_ROOT: "ON"
     BUILD_HIP: "ON"
     BUILD_TYPE: "Debug"
+    BUILD_PAPI_SDE: "ON"
     RUN_EXAMPLES: "ON"
     CUDA_ARCH: 35
 
@@ -169,7 +164,6 @@ build/cuda101/nompi/clang/all/release/static:
 #    MPI_AS_ROOT: "ON"
 #    BUILD_HIP: "OFF"
 #    BUILD_TYPE: "Release"
-#    CUDA_ARCH: 61
 
 
 #build/clang-cuda101/nompi/clang/cuda/debug/static:
@@ -187,7 +181,6 @@ build/cuda101/nompi/clang/all/release/static:
 #    BUILD_TYPE: "Debug"
 #    FAST_TESTS: "ON"
 #    BUILD_SHARED_LIBS: "OFF"
-#    CUDA_ARCH: 61
 
 
 # cuda 10.2 and friends
@@ -272,6 +265,8 @@ test/cuda110/mvapich2/gcc/cuda/debug/shared:
     SLURM_GRES: "gpu:4"
     SLURM_TIME: "02:00:00"
   dependencies: null
+  # FIXME: current slurm always reports failure even if all tests are passed.
+  allow_failure: yes
   needs: [ "build/cuda110/mvapich2/gcc/cuda/debug/shared" ]
 
 
@@ -302,9 +297,11 @@ test/cuda110/nompi/clang/cuda/release/static:
   variables:
     USE_NAME: "cuda110-nompi-clang-${CI_PIPELINE_ID}"
     SLURM_PARTITION: "accelerated"
-    SLURM_GRES: "gpu:1"
+    SLURM_GRES: "gpu:4"
     SLURM_TIME: "01:30:00"
   dependencies: null
+  # FIXME: current slurm always reports failure even if all tests are passed.
+  allow_failure: yes
   needs: [ "build/cuda110/nompi/clang/cuda/release/static" ]
 
 
@@ -336,9 +333,11 @@ test/cuda110/nompi/intel/cuda/debug/static:
   variables:
     USE_NAME: "cuda110-nompi-intel-${CI_PIPELINE_ID}"
     SLURM_PARTITION: "accelerated"
-    SLURM_GRES: "gpu:1"
+    SLURM_GRES: "gpu:4"
     SLURM_TIME: "02:00:00"
   dependencies: null
+  # FIXME: current slurm always reports failure even if all tests are passed.
+  allow_failure: yes
   needs: [ "build/cuda110/nompi/intel/cuda/debug/static" ]
 
 
@@ -348,7 +347,7 @@ build/cuda114/nompi/gcc/cuda/debug/shared:
     - .build_and_test_template
     - .default_variables
     - .quick_test_condition
-    - .use_gko_cuda114-openmpi-gnu11-llvm12
+    - .use_gko_cuda114-openmpi-gnu10-llvm12
   variables:
     BUILD_OMP: "ON"
     BUILD_CUDA: "ON"
@@ -358,7 +357,6 @@ build/cuda114/nompi/gcc/cuda/debug/shared:
     CXX_FLAGS: "-Wno-error=maybe-uninitialized"
     # disable spurious unused argument warning
     EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"
-    CUDA_ARCH: 61
 
 
 # nvhpc and friends
@@ -381,7 +379,6 @@ build/nvhpc233/cuda120/nompi/nvcpp/release/static:
     CXX_FLAGS: "--diag_suppress=useless_using_declaration,declared_but_not_referenced"
     # disable spurious unused argument warning
     EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"
-    CUDA_ARCH: 61
 
 build/nvhpc227/cuda117/nompi/nvcpp/debug/shared:
   extends:
@@ -401,7 +398,6 @@ build/nvhpc227/cuda117/nompi/nvcpp/debug/shared:
     CXX_FLAGS: "--diag_suppress=useless_using_declaration,declared_but_not_referenced"
     # disable spurious unused argument warning
     EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"
-    CUDA_ARCH: 61
 
 # ROCm 4.5 and friends
 build/amd/nompi/gcc/rocm45/release/shared:
@@ -538,24 +534,13 @@ build/nocuda/openmpi/clang/omp/debug/static:
     FAST_TESTS: "ON"
     BUILD_SHARED_LIBS: "OFF"
 
-test/nocuda/openmpi/clang/omp/debug/static:
-  extends:
-    - .build_and_test_template
-    - .default_variables
-    - .full_test_condition
-    - .use_gko-nocuda-openmpi-gnu9-llvm8
-  variables:
-    USE_NAME: "nocuda-openmpi-clang-${CI_PIPELINE_ID}"
-  dependencies: null
-  needs: [ "build/nocuda/openmpi/clang/omp/debug/static" ]
-
 # nocuda with the oldest supported compiler
 build/nocuda/nompi/gcc/omp/release/static:
   extends:
     - .build_and_test_template
     - .default_variables
     - .quick_test_condition
-    - .use_gko-nocuda-mvapich2-gnu5-llvm39-intel2018
+    - .use_gko-nocuda-mvapich2-gnu5-llvm39-intel2019
   variables:
     BUILD_OMP: "ON"
     BUILD_TYPE: "Release"
@@ -566,7 +551,7 @@ build/nocuda-nomixed/nompi/clang/omp/release/static:
     - .build_and_test_template
     - .default_variables
     - .full_test_condition
-    - .use_gko-nocuda-mvapich2-gnu5-llvm39-intel2018
+    - .use_gko-nocuda-mvapich2-gnu5-llvm39-intel2019
   variables:
     C_COMPILER: "clang"
     CXX_COMPILER: "clang++"
@@ -602,7 +587,7 @@ build/nocuda-nomixed/nompi/clang/omp/debug/static:
     BUILD_SHARED_LIBS: "OFF"
     MIXED_PRECISION: "OFF"
 
-build/dpcpp/2022-1/cpu/release/static:
+build/dpcpp/2022-1/cpu/release/shared:
   extends:
     - .build_and_test_template
     - .default_variables
@@ -611,10 +596,11 @@ build/dpcpp/2022-1/cpu/release/static:
   variables:
     C_COMPILER: "gcc"
     CXX_COMPILER: "dpcpp"
-    BUILD_DPCPP: "ON"
+    BUILD_SYCL: "ON"
+    GKO_COMPILER_FLAGS: "-ffp-model=precise"
     BUILD_TYPE: "Release"
     BUILD_SHARED_LIBS: "ON"
-    SYCL_DEVICE_TYPE: "CPU"
+    SYCL_DEVICE_FILTER: "*:cpu"
     SLURM_PARTITION: "cpu"
     SLURM_TIME: "2:00:00"
     # This job is not in exclusive mode
@@ -629,11 +615,12 @@ build/dpcpp/igpu/release/shared:
   variables:
     C_COMPILER: "gcc"
     CXX_COMPILER: "dpcpp"
-    BUILD_DPCPP: "ON"
+    BUILD_SYCL: "ON"
+    GKO_COMPILER_FLAGS: "-ffp-model=precise"
     BUILD_TYPE: "Release"
     BUILD_SHARED_LIBS: "ON"
     DPCPP_SINGLE_MODE: "ON"
-    SYCL_DEVICE_TYPE: "GPU"
+    ONEAPI_DEVICE_SELECTOR: "*:gpu"
 
 # TODO: Enable when debug shared library size issues are fixed
 # build/dpcpp/level_zero_igpu/debug/shared:
@@ -645,11 +632,12 @@ build/dpcpp/igpu/release/shared:
 #   variables:
 #     C_COMPILER: "gcc"
 #     CXX_COMPILER: "dpcpp"
-#     BUILD_DPCPP: "ON"
+#     BUILD_SYCL: "ON"
+#     GKO_COMPILER_FLAGS: "-ffp-model=precise"
 #     BUILD_TYPE: "Debug"
 #     BUILD_SHARED_LIBS: "ON"
 #     DPCPP_SINGLE_MODE: "ON"
-#     SYCL_DEVICE_FILTER: "Level_Zero:GPU"
+#     ONEAPI_DEVICE_SELECTOR: "level_zero:gpu"
 
 # It gives two available backends of GPU on tests
 build/dpcpp/dgpu/release/static:
@@ -661,11 +649,12 @@ build/dpcpp/dgpu/release/static:
   variables:
     C_COMPILER: "gcc"
     CXX_COMPILER: "dpcpp"
-    BUILD_DPCPP: "ON"
+    BUILD_SYCL: "ON"
+    GKO_COMPILER_FLAGS: "-ffp-model=precise"
     BUILD_TYPE: "Release"
     BUILD_SHARED_LIBS: "OF"
     DPCPP_SINGLE_MODE: "ON"
-    SYCL_DEVICE_TYPE: "GPU"
+    ONEAPI_DEVICE_SELECTOR: "*:gpu"
 
 build/dpcpp/level_zero_dgpu/release/shared:
   extends:
@@ -676,10 +665,26 @@ build/dpcpp/level_zero_dgpu/release/shared:
   variables:
     C_COMPILER: "gcc"
     CXX_COMPILER: "dpcpp"
-    BUILD_DPCPP: "ON"
+    BUILD_SYCL: "ON"
+    GKO_COMPILER_FLAGS: "-ffp-model=precise"
+    BUILD_TYPE: "Release"
+    DPCPP_SINGLE_MODE: "ON"
+    ONEAPI_DEVICE_SELECTOR: "level_zero:gpu"
+
+build/icpx/level_zero_dgpu/release/shared:
+  extends:
+    - .build_and_test_template
+    - .default_variables
+    - .quick_test_condition
+    - .use_gko-oneapi-dgpu
+  variables:
+    C_COMPILER: "icx"
+    CXX_COMPILER: "icpx"
+    BUILD_SYCL: "ON"
+    GKO_COMPILER_FLAGS: "-ffp-model=precise"
     BUILD_TYPE: "Release"
     DPCPP_SINGLE_MODE: "ON"
-    SYCL_DEVICE_FILTER: "Level_Zero:GPU"
+    ONEAPI_DEVICE_SELECTOR: "level_zero:gpu"
 
 # Job with important warnings as error
 warnings:
@@ -694,6 +699,7 @@ warnings:
     BUILD_CUDA: "ON"
     BUILD_HIP: "ON"
     CXX_FLAGS: "-Werror=pedantic -pedantic-errors"
+    GKO_COMPILER_FLAGS: "-Wpedantic"
   allow_failure: yes
 
 # Ensure kernel modules do not depend on core
@@ -818,6 +824,7 @@ sonarqube_cov:
 # Deploy documentation to github-pages
 gh-pages:
   stage: deploy
+  interruptible: false
   extends:
     - .default_variables
     - .deploy_condition
@@ -833,7 +840,7 @@ gh-pages:
         -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE}
         -DBUILD_SHARED_LIBS=ON ${EXTRA_CMAKE_FLAGS} -DGINKGO_DEVEL_TOOLS=OFF
         -DGINKGO_BUILD_REFERENCE=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_BUILD_CUDA=OFF
-        -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_DPCPP=OFF -DGINKGO_BUILD_MPI=OFF
+        -DGINKGO_BUILD_HIP=OFF -DGINKGO_BUILD_SYCL=OFF -DGINKGO_BUILD_MPI=OFF
         -DGINKGO_BUILD_TESTS=OFF -DGINKGO_BUILD_EXAMPLES=OFF
         -DGINKGO_BUILD_DOC=ON -DGINKGO_DOC_GENERATE_PDF=ON
     - make usr
@@ -921,6 +928,7 @@ cudamemcheck:
 
 new-issue-on-failure:
   stage: on-failure
+  interruptible: false
   extends:
     - .default_variables
     - .use_status-job-settings
diff --git a/.gitlab/add-interrupt.yml b/.gitlab/add-interrupt.yml
new file mode 100644
index 00000000000..cf6fd95fe1e
--- /dev/null
+++ b/.gitlab/add-interrupt.yml
@@ -0,0 +1,2 @@
+default:
+  interruptible: true
diff --git a/.gitlab/image.yml b/.gitlab/image.yml
index 50dfbe9d2f8..eb1ab5128af 100644
--- a/.gitlab/image.yml
+++ b/.gitlab/image.yml
@@ -17,19 +17,13 @@
     - cpu
     - amdci
 
-.use_gko-nocuda-mvapich2-gnu5-llvm39-intel2018:
-  image: ginkgohub/cpu:mvapich2-gnu5-llvm39-intel2018
+.use_gko-nocuda-mvapich2-gnu5-llvm39-intel2019:
+  image: ginkgohub/cpu:mvapich2-gnu5-llvm39-intel2019
   tags:
     - private_ci
     - cpu
     - controller
 
-.use_gko-cuda92-mvapich2-gnu7-llvm50-intel2017:
-  image: ginkgohub/cuda:92-mvapich2-gnu7-llvm50-intel2017
-  tags:
-    - private_ci
-    - nvidia-gpu
-
 .use_gko-cuda101-openmpi-gnu8-llvm7-intel2019:
   image: ginkgohub/cuda:101-openmpi-gnu8-llvm7-intel2019
   tags:
@@ -56,8 +50,8 @@
     - private_ci
     - horeka
 
-.use_gko_cuda114-openmpi-gnu11-llvm12:
-  image: ginkgohub/cuda:114-openmpi-gnu11-llvm12
+.use_gko_cuda114-openmpi-gnu10-llvm12:
+  image: ginkgohub/cuda:114-openmpi-gnu10-llvm12
   tags:
     - private_ci
     - nvidia-gpu
@@ -78,15 +72,13 @@
   image: ginkgohub/rocm:45-mvapich2-gnu8-llvm8
   tags:
     - private_ci
-    - amdci
-    - gpu
+    - amd-gpu
 
 .use_gko-rocm502-nompi-gnu11-llvm11:
   image: ginkgohub/rocm:502-openmpi-gnu11-llvm11
   tags:
     - private_ci
-    - amdci
-    - gpu
+    - amd-gpu
 
 .use_gko-oneapi-cpu:
   image: ginkgohub/oneapi:2022.1
diff --git a/.gitlab/scripts.yml b/.gitlab/scripts.yml
index 537f2e5e83e..504aa7dad40 100644
--- a/.gitlab/scripts.yml
+++ b/.gitlab/scripts.yml
@@ -22,8 +22,7 @@
   script:
     - mkdir -p ${CI_JOB_NAME} && cd ${CI_JOB_NAME}
     - if [ -n "${CUDA_ARCH}" ]; then
-      CUDA_ARCH_STR=-DGINKGO_CUDA_ARCHITECTURES=${CUDA_ARCH};
-      CUDA_HOST_STR=-DCMAKE_CUDA_HOST_COMPILER=$(which ${CXX_COMPILER});
+      export CUDA_ARCH_STR=-DGINKGO_CUDA_ARCHITECTURES=${CUDA_ARCH};
       fi
     - if [[ "${MPI_AS_ROOT}" == "ON" ]];then
       export OMPI_ALLOW_RUN_AS_ROOT=1;
@@ -32,17 +31,19 @@
     - if [[ "${BUILD_MPI}" == "ON" ]]; then
       MPI_STR=-DGINKGO_MPI_EXEC_SUFFIX=${MPI_SUFFIX};
       fi
+    - export CC=${C_COMPILER} CXX=${CXX_COMPILER} CUDAHOSTCXX=${CXX_COMPILER} CUDACXX=${CUDA_COMPILER}
     - cmake ${CI_PROJECT_DIR}${CI_PROJECT_DIR_SUFFIX}
         -GNinja
-        -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${CXX_COMPILER}
-        -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE}
+        -DCMAKE_BUILD_TYPE=${BUILD_TYPE}
         -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
-        ${EXTRA_CMAKE_FLAGS} ${CUDA_ARCH_STR} ${CUDA_HOST_STR}
+        ${EXTRA_CMAKE_FLAGS} ${CUDA_ARCH_STR}
+        -DGINKGO_COMPILER_FLAGS=${GKO_COMPILER_FLAGS}
         -DGINKGO_DEVEL_TOOLS=OFF -DGINKGO_BUILD_REFERENCE=${BUILD_REFERENCE}
         -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA}
-        -DGINKGO_BUILD_HIP=${BUILD_HIP}
+        -DGINKGO_BUILD_HIP=${BUILD_HIP} -DGINKGO_BUILD_SYCL=${BUILD_SYCL}
         -DGINKGO_BUILD_MPI=${BUILD_MPI} ${MPI_STR}
         -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC}
+        -DGINKGO_BUILD_PAPI_SDE=${BUILD_PAPI_SDE}
         -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
         -DGINKGO_FAST_TESTS=${FAST_TESTS}
         -DGINKGO_TEST_NONDEFAULT_STREAM=${NONDEFAULT_STREAM}
@@ -52,6 +53,7 @@
         -DGINKGO_DPCPP_SINGLE_MODE=${DPCPP_SINGLE_MODE}
         -DGINKGO_EXPORT_BUILD_DIR=${EXPORT_BUILD_DIR}
     - ninja -j${NUM_CORES} -l${CI_LOAD_LIMIT} install
+    - awk '!/^#/ { print ($2 - $1)/1000 " " $4 }' .ninja_log | sort -nr
     - if [ "${EXPORT_BUILD_DIR}" == "ON" ]; then ninja test_exportbuild; fi
     - LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH ninja test_pkgconfig
   dependencies: []
@@ -63,11 +65,11 @@
   script:
     - mkdir -p ${CI_JOB_NAME} && cd ${CI_JOB_NAME}
     - if [ -n "${CUDA_ARCH}" ]; then
-      CUDA_ARCH_STR=-DGINKGO_CUDA_ARCHITECTURES=${CUDA_ARCH};
-      CUDA_HOST_STR=-DCMAKE_CUDA_HOST_COMPILER=$(which ${CXX_COMPILER});
+      export CUDA_ARCH_STR=-DGINKGO_CUDA_ARCHITECTURES=${CUDA_ARCH};
       fi
     - if [ -n "${SYCL_DEVICE_TYPE}" ]; then export SYCL_DEVICE_TYPE; fi
     - if [ -n "${SYCL_DEVICE_FILTER}" ]; then export SYCL_DEVICE_FILTER; fi
+    - if [ -n "${ONEAPI_DEVICE_SELECTOR}" ]; then export ONEAPI_DEVICE_SELECTOR; fi
     - if [[ "${MPI_AS_ROOT}" == "ON" ]];then
       export OMPI_ALLOW_RUN_AS_ROOT=1;
       export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1;
@@ -75,17 +77,18 @@
     - if [[ "${BUILD_MPI}" == "ON" ]]; then
       MPI_STR=-DGINKGO_MPI_EXEC_SUFFIX=${MPI_SUFFIX};
       fi
+    - export CC=${C_COMPILER} CXX=${CXX_COMPILER} CUDAHOSTCXX=${CXX_COMPILER} CUDACXX=${CUDA_COMPILER}
     - cmake ${CI_PROJECT_DIR}${CI_PROJECT_DIR_SUFFIX}
-        -GNinja
-        -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${CXX_COMPILER}
-        -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE}
+        -GNinja -DCMAKE_BUILD_TYPE=${BUILD_TYPE}
         -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
-        ${EXTRA_CMAKE_FLAGS} ${CUDA_ARCH_STR} ${CUDA_HOST_STR}
+        ${EXTRA_CMAKE_FLAGS} ${CUDA_ARCH_STR}
+        -DGINKGO_COMPILER_FLAGS=${GKO_COMPILER_FLAGS}
         -DGINKGO_DEVEL_TOOLS=OFF -DGINKGO_BUILD_REFERENCE=${BUILD_REFERENCE}
         -DGINKGO_BUILD_OMP=${BUILD_OMP} -DGINKGO_BUILD_CUDA=${BUILD_CUDA}
-        -DGINKGO_BUILD_HIP=${BUILD_HIP}
+        -DGINKGO_BUILD_HIP=${BUILD_HIP} -DGINKGO_BUILD_SYCL=${BUILD_SYCL}
         -DGINKGO_BUILD_MPI=${BUILD_MPI} ${MPI_STR}
         -DGINKGO_BUILD_HWLOC=${BUILD_HWLOC}
+        -DGINKGO_BUILD_PAPI_SDE=${BUILD_PAPI_SDE}
         -DGINKGO_BUILD_TESTS=ON -DGINKGO_BUILD_EXAMPLES=ON
         -DGINKGO_FAST_TESTS=${FAST_TESTS}
         -DGINKGO_MIXED_PRECISION=${MIXED_PRECISION}
@@ -94,9 +97,10 @@
         -DGINKGO_RUN_EXAMPLES=${RUN_EXAMPLES}
         -DGINKGO_EXPORT_BUILD_DIR=${EXPORT_BUILD_DIR}
     - ninja -j${NUM_CORES} -l${CI_LOAD_LIMIT} install
+    - awk '!/^#/ { print ($2 - $1)/1000 " " $4 }' .ninja_log | sort -nr
     - |
         (( $(ctest -N | tail -1 | sed 's/Total Tests: //') != 0 )) || exit 1
-    - ctest -V --timeout 6000
+    - ctest --output-on-failure --timeout 6000 ${CTEST_EXTRA_ARGS}
     - ninja test_install
     - pushd test/test_install
     - ninja install
@@ -148,7 +152,7 @@
     - cd ${CI_JOB_NAME/test/build}
     - |
         (( $(ctest -N | tail -1 | sed 's/Total Tests: //') != 0 )) || exit 1
-    - ctest -V --timeout 6000
+    - ctest --output-on-failure --timeout 6000 ${CTEST_EXTRA_ARGS}
     - ninja test_install
     - pushd test/test_install
     - ninja install
diff --git a/.gitlab/variables.yml b/.gitlab/variables.yml
index 6ae62b8c899..6c75d60d069 100644
--- a/.gitlab/variables.yml
+++ b/.gitlab/variables.yml
@@ -11,8 +11,11 @@
     BUILD_OMP: "OFF"
     BUILD_CUDA: "OFF"
     BUILD_HIP: "OFF"
+    BUILD_SYCL: "OFF"
     BUILD_HWLOC: "ON"
+    BUILD_PAPI_SDE: "OFF"
     BUILD_MPI: "OFF"
+    GKO_COMPILER_FLAGS: ""
     MPI_AS_ROOT: "OFF"
     FAST_TESTS: "OFF"
     NONDEFAULT_STREAM: "OFF"
diff --git a/ABOUT-LICENSING.md b/ABOUT-LICENSING.md
index df081e2211b..d6e68911d1a 100644
--- a/ABOUT-LICENSING.md
+++ b/ABOUT-LICENSING.md
@@ -76,7 +76,7 @@ the following license:
 
 When compiling Ginkgo with `-DGINKGO_BUILD_BENCHMARKS=ON` the build system will
 download, build, and link [gflags](https://github.com/gflags/gflags) and
-[RapidJSON](https://github.com/Tencent/rapidjson) with the
+[nlohmann-json](https://github.com/nlohmann/json) with the
 benchmark suites. gtest is available under the following license:
 
 > Copyright (c) 2006, Google Inc.
@@ -108,110 +108,22 @@ benchmark suites. gtest is available under the following license:
 > (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 > OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-RapidJSON is available under the following license (note that Ginkgo's build
-system automatically removes the `bin/jsonchecker/` directory which is licensed
-under the problematic JSON license):
+nlohmann-json is available under the following license:
 
-> Tencent is pleased to support the open source community by making RapidJSON
-> available.
->
-> Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip.  All
-> rights reserved.
->
-> If you have downloaded a copy of the RapidJSON binary from Tencent, please
-> note that the RapidJSON binary is licensed under the MIT License.  If you have
-> downloaded a copy of the RapidJSON source code from Tencent, please note that
-> RapidJSON source code is licensed under the MIT License, except for the
-> third-party components listed below which are subject to different license
-> terms.  Your integration of RapidJSON into your own projects may require
-> compliance with the MIT License, as well as the other licenses applicable to
-> the third-party components included within RapidJSON. To avoid the problematic
-> JSON license in your own projects, it's sufficient to exclude the
-> bin/jsonchecker/ directory, as it's the only code under the JSON license.  A
-> copy of the MIT License is included in this file.
->
-> Other dependencies and licenses:
->
-> Open Source Software Licensed Under the BSD License:
-> --------------------------------------------------------------------
->
-> The msinttypes r29
->
-> Copyright (c) 2006-2013 Alexander Chemeris
-> All rights reserved.
->
-> Redistribution and use in source and binary forms, with or without
-> modification, are permitted provided that the following conditions are met:
->
-> * Redistributions of source code must retain the above copyright notice, this
->   list of conditions and the following disclaimer.
-> * Redistributions in binary form must reproduce the above copyright notice,
->   this list of conditions and the following disclaimer in the documentation
->   and/or other materials provided with the distribution.
-> * Neither the name of  copyright holder nor the names of its contributors may
->   be used to endorse or promote products derived from this software without
->   specific prior written permission.
->
-> THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
-> EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-> WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-> DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
-> DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-> (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-> LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-> ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-> (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-> SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
->
-> Open Source Software Licensed Under the JSON License:
-> --------------------------------------------------------------------
->
-> json.org
-> Copyright (c) 2002
-> JSON.org All Rights Reserved.
->
-> JSON_checker
-> Copyright (c) 2002 JSON.org
-> All Rights Reserved.
->
->
-> Terms of the JSON License:
-> ---------------------------------------------------
->
-> Permission is hereby granted, free of charge, to any person obtaining a copy
-> of this software and associated documentation files (the "Software"), to deal
-> in the Software without restriction, including without limitation the rights
-> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-> copies of the Software, and to permit persons to whom the Software is
-> furnished to do so, subject to the following conditions:
->
-> The above copyright notice and this permission notice shall be included in all
-> copies or substantial portions of the Software.
->
-> The Software shall be used for Good, not Evil.
->
-> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-> SOFTWARE.
->
->
-> Terms of the MIT License:
-> --------------------------------------------------------------------
->
+> MIT License 
+> 
+> Copyright (c) 2013-2022 Niels Lohmann
+> 
 > Permission is hereby granted, free of charge, to any person obtaining a copy
 > of this software and associated documentation files (the "Software"), to deal
 > in the Software without restriction, including without limitation the rights
 > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 > copies of the Software, and to permit persons to whom the Software is
 > furnished to do so, subject to the following conditions:
->
+
 > The above copyright notice and this permission notice shall be included in all
 > copies or substantial portions of the Software.
->
+> 
 > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -220,7 +132,6 @@ under the problematic JSON license):
 > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 > SOFTWARE.
 
-
 For generating the documentation of Ginkgo, some scripts from the deal.II
 library are used. You can refer to the `doc/` folder to see which files are a
 modified version of deal.II's documentation generation scripts. Additionally,
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 34d53363898..90834b209dc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,121 @@ git log --first-parent
 
 Please visit our wiki [Changelog](https://github.com/ginkgo-project/ginkgo/wiki/Changelog) for unreleased changes.
 
+## Version 1.7.0
+
+The Ginkgo team is proud to announce the new Ginkgo minor release 1.7.0. This release brings new features such as:
+- Complete GPU-resident sparse direct solvers feature set and interfaces,
+- Improved Cholesky factorization performance,
+- A new MC64 reordering,
+- Batched iterative solver support with the BiCGSTAB solver with batched Dense and ELL matrix types,
+- MPI support for the SYCL backend,
+- Improved ParILU(T)/ParIC(T) preconditioner convergence,
+and more!
+
+If you face an issue, please first check our [known issues page](https://github.com/ginkgo-project/ginkgo/wiki/Known-Issues) and the [open issues list](https://github.com/ginkgo-project/ginkgo/issues) and if you do not find a solution, feel free to [open a new issue](https://github.com/ginkgo-project/ginkgo/issues/new/choose) or ask a question using the [github discussions](https://github.com/ginkgo-project/ginkgo/discussions).
+
+Supported systems and requirements:
++ For all platforms, CMake 3.16+
++ C++14 compliant compiler
++ Linux and macOS
+  + GCC: 5.5+
+  + clang: 3.9+
+  + Intel compiler: 2019+
+  + Apple Clang: 14.0 is tested. Earlier versions might also work.
+  + NVHPC: 22.7+
+  + Cray Compiler: 14.0.1+
+  + CUDA module: CMake 3.18+, and CUDA 10.1+ or NVHPC 22.7+
+  + HIP module: ROCm 4.5+
+  + DPC++ module: Intel oneAPI 2022.1+ with oneMKL and oneDPL. Set the CXX compiler to `dpcpp` or `icpx`.
+  + MPI: standard version 3.1+, ideally GPU Aware, for best performance
++ Windows
+  + MinGW: GCC 5.5+
+  + Microsoft Visual Studio: VS 2019+
+  + CUDA module: CUDA 10.1+, Microsoft Visual Studio
+  + OpenMP module: MinGW.
+
+### Version support changes
+
++ CUDA 9.2 is no longer supported and 10.0 is untested [#1382](https://github.com/ginkgo-project/ginkgo/pull/1382)
++ Ginkgo now requires CMake version 3.16 (and 3.18 for CUDA) [#1368](https://github.com/ginkgo-project/ginkgo/pull/1368)
+
+### Interface changes
+
++ `const` Factory parameters can no longer be modified through `with_*` functions, as this breaks const-correctness [#1336](https://github.com/ginkgo-project/ginkgo/pull/1336) [#1439](https://github.com/ginkgo-project/ginkgo/pull/1439)
+
+### New Deprecations
+
++ The `device_reset` parameter of CUDA and HIP executors no longer has an effect, and its `allocation_mode` parameters have been deprecated in favor of the `Allocator` interface. [#1315](https://github.com/ginkgo-project/ginkgo/pull/1315)
++ The CMake parameter `GINKGO_BUILD_DPCPP` has been deprecated in favor of `GINKGO_BUILD_SYCL`. [#1350](https://github.com/ginkgo-project/ginkgo/pull/1350)
++ The `gko::reorder::Rcm` interface has been deprecated in favor of `gko::experimental::reorder::Rcm` based on `Permutation`. [#1418](https://github.com/ginkgo-project/ginkgo/pull/1418)
++ The Permutation class' `permute_mask` functionality. [#1415](https://github.com/ginkgo-project/ginkgo/pull/1415)
++ Multiple functions with typos (`set_complex_subpsace()`, range functions such as `conj_operaton` etc). [#1348](https://github.com/ginkgo-project/ginkgo/pull/1348)
+
+### Summary of previous deprecations
++ `gko::lend()` is not necessary anymore.
++ The classes `RelativeResidualNorm` and `AbsoluteResidualNorm` are deprecated in favor of `ResidualNorm`.
++ The class `AmgxPgm` is deprecated in favor of `Pgm`.
++ Default constructors for the CSR `load_balance` and `automatical` strategies
++ The PolymorphicObject's move-semantic `copy_from` variant
++ The templated `SolverBase` class.
++ The class `MachineTopology` is deprecated in favor of `machine_topology`.
++ Logger constructors and create functions with the `executor` parameter.
++ The virtual, protected, Dense functions `compute_norm1_impl`, `add_scaled_impl`, etc.
++ Logger events for solvers and criterion without the additional `implicit_tau_sq` parameter.
++ The global `gko::solver::default_krylov_dim`, use instead `gko::solver::gmres_default_krylov_dim`.
+
+### Added features
+
++ Adds a batch::BatchLinOp class that forms a base class for batched linear operators such as batched matrix formats, solver and preconditioners [#1379](https://github.com/ginkgo-project/ginkgo/pull/1379)
++ Adds a batch::MultiVector class that enables operations such as dot, norm, scale on batched vectors [#1371](https://github.com/ginkgo-project/ginkgo/pull/1371)
++ Adds a batch::Dense matrix format that stores batched dense matrices and provides gemv operations for these dense matrices. [#1413](https://github.com/ginkgo-project/ginkgo/pull/1413)
++ Adds a batch::Ell matrix format that stores batched Ell matrices and provides spmv operations for these batched Ell matrices. [#1416](https://github.com/ginkgo-project/ginkgo/pull/1416) [#1437](https://github.com/ginkgo-project/ginkgo/pull/1437)
++ Add a batch::Bicgstab solver (class, core, and reference kernels) that enables iterative solution of batched linear systems [#1438](https://github.com/ginkgo-project/ginkgo/pull/1438).
++ Add device kernels (CUDA, HIP, and DPCPP) for batch::Bicgstab solver. [#1443](https://github.com/ginkgo-project/ginkgo/pull/1443).
++ New MC64 reordering algorithm which optimizes the diagonal product or sum of a matrix by permuting the rows, and computes additional scaling factors for equilibriation [#1120](https://github.com/ginkgo-project/ginkgo/pull/1120)
++ New interface for (non-symmetric) permutation and scaled permutation of Dense and Csr matrices [#1415](https://github.com/ginkgo-project/ginkgo/pull/1415)
++ LU and Cholesky Factorizations can now be separated into their factors [#1432](https://github.com/ginkgo-project/ginkgo/pull/1432)
++ New symbolic LU factorization algorithm that is optimized for matrices with an almost-symmetric sparsity pattern [#1445](https://github.com/ginkgo-project/ginkgo/pull/1445)
++ Sorting kernels for SparsityCsr on all backends [#1343](https://github.com/ginkgo-project/ginkgo/pull/1343)
++ Allow passing pre-generated local solver as factory parameter for the distributed Schwarz preconditioner [#1426](https://github.com/ginkgo-project/ginkgo/pull/1426)
++ Add DPCPP kernels for Partition [#1034](https://github.com/ginkgo-project/ginkgo/pull/1034), and CSR's `check_diagonal_entries` and `add_scaled_identity` functionality [#1436](https://github.com/ginkgo-project/ginkgo/pull/1436)
++ Adds a helper function to create a partition based on either local sizes, or local ranges [#1227](https://github.com/ginkgo-project/ginkgo/pull/1227)
++ Add function to compute arithmetic mean of dense and distributed vectors [#1275](https://github.com/ginkgo-project/ginkgo/pull/1275)
++ Adds `icpx` compiler supports [#1350](https://github.com/ginkgo-project/ginkgo/pull/1350)
++ All backends can be built simultaneously [#1333](https://github.com/ginkgo-project/ginkgo/pull/1333)
++ Emits a CMake warning in downstream projects that use different compilers than the installed Ginkgo [#1372](https://github.com/ginkgo-project/ginkgo/pull/1372)
++ Reordering algorithms in sparse_blas benchmark [#1354](https://github.com/ginkgo-project/ginkgo/pull/1354)
++ Benchmarks gained an `-allocator` parameter to specify device allocators [#1385](https://github.com/ginkgo-project/ginkgo/pull/1385)
++ Benchmarks gained an `-input_matrix` parameter that initializes the input JSON based on the filename [#1387](https://github.com/ginkgo-project/ginkgo/pull/1387)
++ Benchmark inputs can now be reordered as a preprocessing step [#1408](https://github.com/ginkgo-project/ginkgo/pull/1408)
+
+
+### Improvements
+
++ Significantly improve Cholesky factorization performance [#1366](https://github.com/ginkgo-project/ginkgo/pull/1366)
++ Improve parallel build performance [#1378](https://github.com/ginkgo-project/ginkgo/pull/1378)
++ Allow constrained parallel test execution using CTest resources [#1373](https://github.com/ginkgo-project/ginkgo/pull/1373)
++ Use arithmetic type more inside mixed precision ELL [#1414](https://github.com/ginkgo-project/ginkgo/pull/1414)
++ Most factory parameters of factory type no longer need to be constructed explicitly via `.on(exec)` [#1336](https://github.com/ginkgo-project/ginkgo/pull/1336) [#1439](https://github.com/ginkgo-project/ginkgo/pull/1439)
++ Improve ParILU(T)/ParIC(T) convergence by using more appropriate atomic operations [#1434](https://github.com/ginkgo-project/ginkgo/pull/1434)
+
+### Fixes
+
++ Fix an over-allocation for OpenMP reductions [#1369](https://github.com/ginkgo-project/ginkgo/pull/1369)
++ Fix DPCPP's common-kernel reduction for empty input sizes [#1362](https://github.com/ginkgo-project/ginkgo/pull/1362)
++ Fix several typos in the API and documentation [#1348](https://github.com/ginkgo-project/ginkgo/pull/1348)
++ Fix inconsistent `Threads` between generations [#1388](https://github.com/ginkgo-project/ginkgo/pull/1388)
++ Fix benchmark median condition [#1398](https://github.com/ginkgo-project/ginkgo/pull/1398)
++ Fix HIP 5.6.0 compilation [#1411](https://github.com/ginkgo-project/ginkgo/pull/1411)
++ Fix missing destruction of rand_generator from cuda/hip [#1417](https://github.com/ginkgo-project/ginkgo/pull/1417)
++ Fix PAPI logger destruction order [#1419](https://github.com/ginkgo-project/ginkgo/pull/1419)
++ Fix TAU logger compilation [#1422](https://github.com/ginkgo-project/ginkgo/pull/1422)
++ Fix relative criterion to not iterate if the residual is already zero [#1079](https://github.com/ginkgo-project/ginkgo/pull/1079)
++ Fix memory_order invocations with C++20 changes [#1402](https://github.com/ginkgo-project/ginkgo/pull/1402)
++ Fix `check_diagonal_entries_exist` report correctly when only missing diagonal value in the last rows. [#1440](https://github.com/ginkgo-project/ginkgo/pull/1440)
++ Fix checking OpenMPI version in cross-compilation settings [#1446](https://github.com/ginkgo-project/ginkgo/pull/1446)
++ Fix false-positive deprecation warnings in Ginkgo, especially for the old Rcm (it doesn't emit deprecation warnings anymore as a result but is still considered deprecated) [#1444](https://github.com/ginkgo-project/ginkgo/pull/1444)
+
 ## Version 1.6.0
 
 The Ginkgo team is proud to announce the new Ginkgo minor release 1.6.0. This release brings new features such as:
@@ -215,7 +330,7 @@ Supported systems and requirements:
 + Add reduce_add for arrays ([#831](https://github.com/ginkgo-project/ginkgo/pull/831))
 + Add utility to simplify Dense View creation from an existing Dense vector ([#1136](https://github.com/ginkgo-project/ginkgo/pull/1136)).
 + Add a custom transpose implementation for Fbcsr and Csr transpose for unsupported vendor types ([#1123](https://github.com/ginkgo-project/ginkgo/pull/1123))
-+ Make IDR random initilization deterministic ([#1116](https://github.com/ginkgo-project/ginkgo/pull/1116))
++ Make IDR random initialization deterministic ([#1116](https://github.com/ginkgo-project/ginkgo/pull/1116))
 + Move the algorithm choice for triangular solvers from Csr::strategy_type to a factory parameter ([#1088](https://github.com/ginkgo-project/ginkgo/pull/1088))
 + Update CUDA archCoresPerSM ([#1175](https://github.com/ginkgo-project/ginkgo/pull/1116))
 + Add kernels for Csr sparsity pattern lookup ([#994](https://github.com/ginkgo-project/ginkgo/pull/994))
@@ -620,7 +735,7 @@ page](https://github.com/ginkgo-project/ginkgo/wiki/Known-Issues).
 
 
 ### Additions
-+ Upper and lower triangular solvers ([#327](https://github.com/ginkgo-project/ginkgo/issues/327), [#336](https://github.com/ginkgo-project/ginkgo/issues/336), [#341](https://github.com/ginkgo-project/ginkgo/issues/341), [#342](https://github.com/ginkgo-project/ginkgo/issues/342)) 
++ Upper and lower triangular solvers ([#327](https://github.com/ginkgo-project/ginkgo/issues/327), [#336](https://github.com/ginkgo-project/ginkgo/issues/336), [#341](https://github.com/ginkgo-project/ginkgo/issues/341), [#342](https://github.com/ginkgo-project/ginkgo/issues/342))
 + New factorization support in Ginkgo, and addition of the ParILU
   algorithm ([#305](https://github.com/ginkgo-project/ginkgo/issues/305), [#315](https://github.com/ginkgo-project/ginkgo/issues/315), [#319](https://github.com/ginkgo-project/ginkgo/issues/319), [#324](https://github.com/ginkgo-project/ginkgo/issues/324))
 + New ILU preconditioner ([#348](https://github.com/ginkgo-project/ginkgo/issues/348), [#353](https://github.com/ginkgo-project/ginkgo/issues/353))
@@ -632,7 +747,7 @@ page](https://github.com/ginkgo-project/ginkgo/wiki/Known-Issues).
 + Allow benchmarking CuSPARSE spmv formats through Ginkgo's benchmarks ([#303](https://github.com/ginkgo-project/ginkgo/issues/303))
 + New benchmark for sparse matrix format conversions ([#312](https://github.com/ginkgo-project/ginkgo/issues/312)[#317](https://github.com/ginkgo-project/ginkgo/issues/317))
 + Add conversions between CSR and Hybrid formats ([#302](https://github.com/ginkgo-project/ginkgo/issues/302), [#310](https://github.com/ginkgo-project/ginkgo/issues/310))
-+ Support for sorting rows in the CSR format by column idices ([#322](https://github.com/ginkgo-project/ginkgo/issues/322))
++ Support for sorting rows in the CSR format by column indices ([#322](https://github.com/ginkgo-project/ginkgo/issues/322))
 + Addition of a CUDA COO SpMM kernel for improved performance ([#345](https://github.com/ginkgo-project/ginkgo/issues/345))
 + Addition of a LinOp to handle perturbations of the form (identity + scalar *
   basis * projector) ([#334](https://github.com/ginkgo-project/ginkgo/issues/334))
@@ -847,7 +962,7 @@ Ginkgo 1.0.0 is brought to you by:
 
 **Karlsruhe Institute of Technology**, Germany  
 **Universitat Jaume I**, Spain  
-**University of Tennessee, Knoxville**, US   
+**University of Tennessee, Knoxville**, US  
 
 These universities, along with various project grants, supported the development team and provided resources needed for the development of Ginkgo.
 
@@ -859,7 +974,7 @@ Ginkgo 1.0.0 contains contributions from:
 **Goran Flegar**, Universitat Jaume I  
 **Fritz Göbel**, Karlsruhe Institute of Technology  
 **Thomas Grützmacher**, Karlsruhe Institute of Technology  
-**Pratik Nayak**, Karlsruhe Institue of Technologgy  
+**Pratik Nayak**, Karlsruhe Institute of Technology  
 **Tobias Ribizel**, Karlsruhe Institute of Technology  
 **Yuhsiang Tsai**, National Taiwan University  
 
@@ -869,11 +984,11 @@ Supporting materials are provided by the following individuals:
 **Frithjof Fleischhammer** - the Ginkgo website  
 
 The development team is grateful to the following individuals for discussions and comments:
- 
+
 **Erik Boman**  
 **Jelena Držaić**  
 **Mike Heroux**  
 **Mark Hoemmen**  
-**Timo Heister**    
+**Timo Heister**  
 **Jens Saak**  
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d7940e7f40b..e4ffbc4efd5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,23 +1,20 @@
-cmake_minimum_required(VERSION 3.13)
+cmake_minimum_required(VERSION 3.16)
 
-# Use *_ROOT environment variables for find_package calls
-cmake_policy(SET CMP0074 NEW)
-
-# Let CAS handle the CUDA architecture flags (for now)
-# Windows still gives CMP0104 warning if putting it in cuda.
-if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
-    cmake_policy(SET CMP0104 OLD)
-endif()
-
-project(Ginkgo LANGUAGES C CXX VERSION 1.6.0 DESCRIPTION "A numerical linear algebra library targeting many-core architectures")
+project(Ginkgo LANGUAGES C CXX VERSION 1.7.0 DESCRIPTION "A numerical linear algebra library targeting many-core architectures")
 set(Ginkgo_VERSION_TAG "master")
 set(PROJECT_VERSION_TAG ${Ginkgo_VERSION_TAG})
+# Cuda and Hip also look for Threads. Set it before any find_package to ensure the Threads setting is not changed.
+set(THREADS_PREFER_PTHREAD_FLAG ON)
 
 # Determine which modules can be compiled
 include(cmake/hip_path.cmake)
 include(cmake/autodetect_executors.cmake)
 
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/Modules/")
+include(cmake/autodetect_system_libs.cmake)
+
+# rename helper
+include(cmake/rename.cmake)
 
 # Ginkgo configuration options
 option(GINKGO_DEVEL_TOOLS "Add development tools to the build system" OFF)
@@ -27,8 +24,9 @@ option(GINKGO_BUILD_BENCHMARKS "Build Ginkgo's benchmarks" ON)
 option(GINKGO_BUILD_REFERENCE "Compile reference CPU kernels" ON)
 option(GINKGO_BUILD_OMP "Compile OpenMP kernels for CPU" ${GINKGO_HAS_OMP})
 option(GINKGO_BUILD_MPI "Compile the MPI module" ${GINKGO_HAS_MPI})
-option(GINKGO_BUILD_DPCPP
-    "Compile DPC++ kernels for Intel GPUs or other DPC++ enabled hardware" ${GINKGO_HAS_DPCPP})
+gko_rename_cache(GINKGO_BUILD_DPCPP GINKGO_BUILD_SYCL BOOL "Compile SYCL kernels for Intel GPUs or other SYCL enabled hardware")
+option(GINKGO_BUILD_SYCL
+    "Compile SYCL kernels for Intel GPUs or other SYCL enabled hardware" ${GINKGO_HAS_SYCL})
 option(GINKGO_BUILD_CUDA "Compile kernels for NVIDIA GPUs" ${GINKGO_HAS_CUDA})
 option(GINKGO_BUILD_HIP "Compile kernels for AMD or NVIDIA GPUs" ${GINKGO_HAS_HIP})
 option(GINKGO_BUILD_DOC "Generate documentation" OFF)
@@ -56,7 +54,7 @@ set(GINKGO_VERBOSE_LEVEL "1" CACHE STRING
 if(MSVC)
     set(GINKGO_COMPILER_FLAGS "" CACHE STRING
         "Set the required CXX compiler flags, mainly used for warnings. Current default is ``")
-elseif(GINKGO_BUILD_DPCPP OR CMAKE_CXX_COMPILER MATCHES "dpcpp")
+elseif(GINKGO_BUILD_SYCL OR CMAKE_CXX_COMPILER MATCHES "dpcpp|icpx")
     # For now always use `-ffp-model=precise` with DPC++. This can be removed when
     # the floating point issues are fixed.
     set(GINKGO_COMPILER_FLAGS "-Wpedantic;-ffp-model=precise" CACHE STRING
@@ -68,8 +66,7 @@ endif()
 set(GINKGO_CUDA_COMPILER_FLAGS "" CACHE STRING
     "Set the required NVCC compiler flags, mainly used for warnings. Current default is an empty string")
 set(GINKGO_CUDA_ARCHITECTURES "Auto" CACHE STRING
-    "A list of target NVIDIA GPU achitectures. See README.md for more detail.")
-option(GINKGO_CUDA_DEFAULT_HOST_COMPILER "Tell Ginkgo to not automatically set the CUDA host compiler" OFF)
+    "A list of target NVIDIA GPU architectures. See README.md for more detail.")
 # the details of fine/coarse grain memory and unsafe atomic are available https://docs.olcf.ornl.gov/systems/crusher_quick_start_guide.html#floating-point-fp-atomic-operations-and-coarse-fine-grained-memory-allocations
 option(GINKGO_HIP_AMD_UNSAFE_ATOMIC "Compiler uses unsafe floating point atomic (only for AMD GPU and ROCM >= 5). Default is ON because we use hipMalloc, which is always on coarse grain. Must turn off when allocating memory on fine grain" ON)
 set(GINKGO_HIP_COMPILER_FLAGS "" CACHE STRING
@@ -80,19 +77,24 @@ set(GINKGO_HIP_CLANG_COMPILER_FLAGS "" CACHE STRING
     "Set the required HIP CLANG compiler flags. Current default is an empty string.")
 set(GINKGO_HIP_AMDGPU "" CACHE STRING
     "The amdgpu_target(s) variable passed to hipcc. The default is none (auto).")
+option(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS "Split template instantiations for slow-to-compile files. This improves parallel build performance" ON)
+mark_as_advanced(GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS)
 option(GINKGO_JACOBI_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA Jacobi algorithm" OFF)
 option(BUILD_SHARED_LIBS "Build shared (.so, .dylib, .dll) libraries" ON)
 if(MSVC OR WIN32 OR CYGWIN OR APPLE)
     option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is OFF. Ginkgo does not support HWLOC on Windows/MacOS" OFF)
 else()
-    option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is ON. If a system HWLOC is not found, then we try to build it ourselves. Switch this OFF to disable HWLOC." ON)
+    option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Enabled if a system installation is found." ${HWLOC_FOUND})
 endif()
+option(GINKGO_BUILD_PAPI_SDE "Build Ginkgo with PAPI SDE. Enabled if a system installation is found." ${PAPI_SDE_FOUND})
 option(GINKGO_DPCPP_SINGLE_MODE "Do not compile double kernels for the DPC++ backend." OFF)
 option(GINKGO_INSTALL_RPATH "Set the RPATH when installing its libraries." ON)
 option(GINKGO_INSTALL_RPATH_ORIGIN "Add $ORIGIN (Linux) or @loader_path (MacOS) to the installation RPATH." ON)
 option(GINKGO_INSTALL_RPATH_DEPENDENCIES "Add dependencies to the installation RPATH." OFF)
 option(GINKGO_FORCE_GPU_AWARE_MPI "Assert that the MPI library is GPU aware. This forces Ginkgo to assume that GPU aware functionality is available (OFF (default) or ON), but may fail
      catastrophically in case the MPI implementation is not GPU Aware, and GPU aware functionality has been forced" OFF)
+set(GINKGO_CI_TEST_OMP_PARALLELISM "4" CACHE STRING
+    "The number of OpenMP threads to use for a test binary during CTest resource file-constrained test.")
 
 # load executor-specific configuration
 if(GINKGO_BUILD_CUDA)
@@ -101,10 +103,13 @@ endif()
 if(GINKGO_BUILD_HIP)
     include(cmake/hip.cmake)
 endif()
+if(GINKGO_BUILD_SYCL)
+    include(cmake/sycl.cmake)
+endif()
 if(GINKGO_BUILD_OMP)
     find_package(OpenMP 3.0 REQUIRED)
 endif()
-set(THREADS_PREFER_PTHREAD_FLAG ON)
+
 find_package(Threads REQUIRED)
 include(cmake/build_type_helpers.cmake)
 
@@ -197,13 +202,6 @@ endif()
 include(CheckIncludeFileCXX)
 check_include_file_cxx(cxxabi.h GKO_HAVE_CXXABI_H)
 
-# Automatically find PAPI and search for the required 'sde' component
-set(GINKGO_HAVE_PAPI_SDE 0)
-find_package(PAPI OPTIONAL_COMPONENTS sde)
-if(PAPI_sde_FOUND)
-    set(GINKGO_HAVE_PAPI_SDE 1)
-endif()
-
 # Automatically find TAU
 set(GINKGO_HAVE_TAU 0)
 find_package(PerfStubs QUIET)
@@ -233,12 +231,6 @@ if(GINKGO_BUILD_HWLOC AND (MSVC OR WIN32 OR CYGWIN OR APPLE))
     set(GINKGO_BUILD_HWLOC OFF CACHE BOOL "Build Ginkgo with HWLOC. Default is OFF. Ginkgo does not support HWLOC on Windows/MacOS" FORCE)
     message(WARNING "Ginkgo does not support HWLOC on Windows/MacOS, switch GINKGO_BUILD_HWLOC to OFF")
 endif()
-if(GINKGO_BUILD_HWLOC)
-    set(GINKGO_HAVE_HWLOC 1)
-else()
-    set(GINKGO_HAVE_HWLOC 0)
-    message(STATUS "HWLOC is being forcibly switched off")
-endif()
 
 set(GINKGO_HAVE_GPU_AWARE_MPI OFF)
 set(GINKGO_FORCE_SPMV_BLOCKING_COMM OFF)
@@ -250,14 +242,21 @@ if(GINKGO_BUILD_MPI)
         set(GINKGO_HAVE_GPU_AWARE_MPI OFF)
     endif()
 
-    try_run(uses_openmpi gko_result_unused
+    # use try_compile instead of try_run to prevent cross-compiling issues
+    try_compile(uses_openmpi
         ${Ginkgo_BINARY_DIR}
         ${Ginkgo_SOURCE_DIR}/cmake/openmpi_test.cpp
+                COMPILE_DEFINITIONS -DCHECK_HAS_OPEN_MPI=1
         LINK_LIBRARIES MPI::MPI_CXX
-        RUN_OUTPUT_VARIABLE openmpi_version
         )
     if(uses_openmpi)
-        if(openmpi_version VERSION_LESS "4.1")
+        try_compile(valid_openmpi_version
+                    ${Ginkgo_BINARY_DIR}
+                    ${Ginkgo_SOURCE_DIR}/cmake/openmpi_test.cpp
+                    COMPILE_DEFINITIONS -DCHECK_OPEN_MPI_VERSION=1
+                    LINK_LIBRARIES MPI::MPI_CXX
+        )
+        if(NOT valid_openmpi_version)
             message(WARNING
                 "OpenMPI v4.0.x has a bug that forces us to use blocking communication in our distributed "
                 "matrix class. To enable faster, non-blocking communication, consider updating your OpenMPI version or "
@@ -268,17 +267,38 @@ if(GINKGO_BUILD_MPI)
 endif()
 
 # Try to find the third party packages before using our subdirectories
-include(cmake/package_helpers.cmake)
 if(GINKGO_BUILD_TESTS)
     find_package(GTest 1.10.0) # No need for QUIET as CMake ships FindGTest
 endif()
 if(GINKGO_BUILD_BENCHMARKS)
     find_package(gflags 2.2.2 QUIET)
-    find_package(RapidJSON 1.1.0 QUIET)
+    find_package(nlohmann_json 3.9.1 QUIET)
 endif()
+
+# System provided, third party libraries (not bundled!)
+set(GINKGO_HAVE_HWLOC 0)
 if(GINKGO_BUILD_HWLOC)
-    find_package(HWLOC 2.1) # No need for QUIET as we ship FindHWLOC
+    find_package(HWLOC 2.1)
+    if (HWLOC_FOUND)
+        set(GINKGO_HAVE_HWLOC 1)
+    else()
+        message(WARNING "HWLOC could not be found. HWLOC support will be disabled.")
+        set(GINKGO_BUILD_HWLOC OFF CACHE BOOL "HWLOC support was disabled because a system package could not be found." FORCE)
+    endif()
 endif()
+
+set(GINKGO_HAVE_PAPI_SDE 0)
+if(GINKGO_BUILD_PAPI_SDE)
+    find_package(PAPI 7.0.1.0 COMPONENTS sde)
+    if (PAPI_SDE_FOUND)
+        set(GINKGO_HAVE_PAPI_SDE 1)
+    else()
+        message(WARNING "PAPI (SDE) could not be found. PAPI_SDE support will be disabled.")
+        set(GINKGO_BUILD_PAPI_SDE OFF CACHE BOOL "PAPI_SDE support was disabled because a system package could not be found." FORCE)
+    endif()
+endif()
+
+# Bundled third party libraries
 add_subdirectory(third_party)    # Third-party tools and libraries
 
 if(MSVC)
@@ -289,7 +309,7 @@ if(MSVC)
     endif()
 endif()
 
-if(GINKGO_BUILD_DPCPP)
+if(GINKGO_BUILD_SYCL)
     ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_MAJOR_VERSION __LIBSYCL_MAJOR_VERSION)
     ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_VERSION __SYCL_COMPILER_VERSION)
 else()
@@ -312,7 +332,7 @@ endif()
 if(GINKGO_BUILD_HIP)
     add_subdirectory(hip)        # High-performance kernels for AMD or NVIDIA GPUs
 endif()
-if(GINKGO_BUILD_DPCPP)
+if(GINKGO_BUILD_SYCL)
     add_subdirectory(dpcpp)        # High-performance DPC++ kernels
 endif()
 if(GINKGO_BUILD_OMP)
@@ -505,3 +525,8 @@ else()
     FILE(READ ${PROJECT_BINARY_DIR}/minimal.log GINKGO_LOG_SUMMARY)
 endif()
 MESSAGE(STATUS "${GINKGO_LOG_SUMMARY}")
+
+# make sure no build files get committed accidentally
+if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/.gitignore)
+    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/.gitignore "*")
+endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1dd6f412876..8e2f3990aca 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -312,7 +312,7 @@ Thus, contributors should be aware of the following rules for blank lines:
         However, simply calling function `f` from function `g` does not imply
         that `f` and `g` are "related".
 2.  Statements within structures / classes are separated with 1 blank line.
-    There are no blank lines betweeen the first / last statement in the
+    There are no blank lines between the first / last statement in the
     structure / class.
     1.  _exception_: there is no blank line between an access modifier (`private`, `protected`, `public`) and the following statement.
        _example_:
diff --git a/INSTALL.md b/INSTALL.md
index 5f788ed0e28..4da58010ba8 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -31,7 +31,7 @@ Ginkgo adds the following additional switches to control what is being built:
 *   `-DGINKGO_FAST_TESTS={ON, OFF}` reduces the input sizes for a few slow tests
     to speed them up, default is `OFF`.
 *   `-DGINKGO_BUILD_BENCHMARKS={ON, OFF}` builds Ginkgo's benchmarks
-    (will download gflags and rapidjson), default is `ON`.
+    (will download gflags and nlohmann-json), default is `ON`.
 *   `-DGINKGO_BUILD_EXAMPLES={ON, OFF}` builds Ginkgo's examples, default is `ON`
 *   `-DGINKGO_BUILD_EXTLIB_EXAMPLE={ON, OFF}` builds the interfacing example
     with deal.II, default is `OFF`.
@@ -42,9 +42,10 @@ Ginkgo adds the following additional switches to control what is being built:
 *   `-DGINKGO_BUILD_CUDA={ON, OFF}` builds optimized cuda versions of the kernels
     (requires CUDA), default is `ON` if a CUDA compiler could be detected,
     `OFF` otherwise.
-*   `-DGINKGO_BUILD_DPCPP={ON, OFF}` builds optimized DPC++ versions of the
-    kernels (requires `CMAKE_CXX_COMPILER` to be set to the `dpcpp` compiler).
-    The default is `ON` if `CMAKE_CXX_COMPILER` is a DPC++ compiler, `OFF`
+*   `-DGINKGO_BUILD_DPCPP={ON, OFF}` is deprecated. Please use `GINKGO_BUILD_SYCL` instead.
+*   `-DGINKGO_BUILD_SYCL={ON, OFF}` builds optimized SYCL versions of the
+    kernels (requires `CMAKE_CXX_COMPILER` to be set to the `dpcpp` or `icpx` compiler).
+    The default is `ON` if `CMAKE_CXX_COMPILER` is a SYCL compiler, `OFF`
     otherwise.
 *   `-DGINKGO_BUILD_HIP={ON, OFF}` builds optimized HIP versions of the kernels
     (requires HIP), default is `ON` if an installation of HIP could be detected,
@@ -205,7 +206,7 @@ packages can be turned off by disabling the relevant options.
   Test](https://github.com/google/googletest);
 + GINKGO_BUILD_BENCHMARKS=ON: For argument management we use
   [gflags](https://github.com/gflags/gflags) and for JSON parsing we use
-  [RapidJSON](https://github.com/Tencent/rapidjson);
+  [nlohmann-json](https://github.com/nlohmann/json);
 + GINKGO_DEVEL_TOOLS=ON:
   [git-cmake-format](https://github.com/gflegar/git-cmake-format) is our CMake
   helper for code formatting.
@@ -224,7 +225,7 @@ packages can be turned off by disabling the relevant options.
 Ginkgo attempts to use pre-installed versions of these package if they match
 version requirements using `find_package`. Otherwise, the configuration step
 will download the files for each of the packages `GTest`, `gflags`,
-`RapidJSON` and `hwloc` and build them internally.
+`nlohmann-json` and `hwloc` and build them internally.
 
 Note that, if the external packages were not installed to the default location,
 the CMake option `-DCMAKE_PREFIX_PATH=<path-list>` needs to be set to the
diff --git a/README.md b/README.md
index be865e933f2..d873026a34f 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@
 Ginkgo is a high-performance linear algebra library for manycore systems, with a
 focus on the solution of sparse linear systems. It is implemented using modern C++
 (you will need an at least C++14 compliant compiler to build it), with GPU kernels
-implemented in CUDA, HIP, and DPC++.
+implemented in CUDA, HIP, and DPC++(SYCL).
 
 
 Performance
@@ -36,7 +36,7 @@ Prerequisites
 
 For Ginkgo core library:
 
-*   _cmake 3.13+_
+*   _cmake 3.16+_
 *   C++14 compliant compiler, one of:
     *   _gcc 5.5+_
     *   _clang 3.9+_
@@ -47,7 +47,8 @@ For Ginkgo core library:
 
 The Ginkgo CUDA module has the following __additional__ requirements:
 
-*   _CUDA 9.2+_ or _NVHPC Package 22.7+_
+*   _cmake 3.18+_ (If CUDA was installed through the NVIDIA HPC Toolkit, we require _cmake 3.22+_)
+*   _CUDA 10.1+_ or _NVHPC Package 22.7+_
 *   Any host compiler restrictions your version of CUDA may impose also apply
     here. For the newest CUDA version, this information can be found in the
     [CUDA installation guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)
@@ -58,13 +59,13 @@ The Ginkgo HIP module has the following __additional__ requirements:
 * _ROCm 4.5+_
 *    the HIP, hipBLAS, hipSPARSE, hip/rocRAND and rocThrust packages compiled with either:
     * _AMD_ backend (using the `clang` compiler)
-    * _9.2 <= CUDA < 11_ backend
+    * _10.1 <= CUDA < 11_ backend
 * if the hipFFT package is available, it is used to implement the FFT LinOps.
 
-The Ginkgo DPC++ module has the following __additional__ requirements:
+The Ginkgo DPC++(SYCL) module has the following __additional__ requirements:
 
-* _OneAPI 2021.3+_
-* Set `dpcpp` as the `CMAKE_CXX_COMPILER`
+* _oneAPI 2022.1+_
+* Set `dpcpp` or `icpx` as the `CMAKE_CXX_COMPILER`
 * `c++17` is used to compile Ginkgo
 * The following oneAPI packages should be available:
     * oneMKL
@@ -90,7 +91,7 @@ following:
 
 The Ginkgo CUDA module has the following __additional__ requirements:
 
-*   _CUDA 9.2+_
+*   _CUDA 10.1+_
 *   _Microsoft Visual Studio_
 *   Any host compiler restrictions your version of CUDA may impose also apply
     here. For the newest CUDA version, this information can be found in the
@@ -122,7 +123,7 @@ cmake -G "Unix Makefiles" .. && make
 By default, `GINKGO_BUILD_REFERENCE` is enabled. You should be able to run
 examples with this executor. By default, Ginkgo tries to enable the relevant
 modules depending on your machine environment (present of CUDA, ...). You can
-also explicitly compile with the OpenMP, CUDA, HIP or DPC++ modules enabled to
+also explicitly compile with the OpenMP, CUDA, HIP or DPC++(SYCL) modules enabled to
 run the examples with these executors. Please refer to the [Installation
 page](./INSTALL.md) for more details.
 
diff --git a/accessor/accessor_helper.hpp b/accessor/accessor_helper.hpp
index 5ee536d28db..5b80f4e13d8 100644
--- a/accessor/accessor_helper.hpp
+++ b/accessor/accessor_helper.hpp
@@ -78,7 +78,7 @@ struct row_major_helper_s {
         const std::array<SizeType, (total_dim > 1 ? total_dim - 1 : 0)>& stride,
         IndexType first, Indices&&... idxs)
     {
-        // The ASSERT size check must NOT be indexed with `dim_idx` directy,
+        // The ASSERT size check must NOT be indexed with `dim_idx` directly,
         // otherwise, it leads to a linker error. The reason is likely that
         // `std::array<size_type, N>::operator[](const size_type &)` uses a
         // reference. Since `dim_idx` is constexpr (and not defined in a
diff --git a/accessor/row_major.hpp b/accessor/row_major.hpp
index 757110f4912..9026cef2116 100644
--- a/accessor/row_major.hpp
+++ b/accessor/row_major.hpp
@@ -55,7 +55,7 @@ namespace acc {
  * constructor parameters for this class to the range (it will forward it to
  * this class).
  *
- * @warning For backward compatability reasons, a specialization is provided
+ * @warning For backward compatibility reasons, a specialization is provided
  *          for dimensionality == 2.
  *
  * @tparam ValueType  type of values this accessor returns
diff --git a/accessor/utils.hpp b/accessor/utils.hpp
index e692138ee4d..dfe30188f83 100644
--- a/accessor/utils.hpp
+++ b/accessor/utils.hpp
@@ -243,7 +243,7 @@ to_arithmetic_type(const Ref& ref)
  * @internal
  * Struct used for testing if an implicit cast is present. The constructor only
  * takes an OutType, so any argument of a type that is not implicitly
- * convertable to OutType is incompatible.
+ * convertible to OutType is incompatible.
  */
 template <typename OutType>
 struct test_for_implicit_cast {
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 040356f1666..347ecec7699 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -20,9 +20,7 @@ function(ginkgo_benchmark_cusparse_linops type def)
     endif()
     # make the dependency public to catch issues
     target_compile_definitions(cusparse_linops_${type} PUBLIC ${def})
-    target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo ${CUDA_RUNTIME_LIBS} ${CUBLAS} ${CUSPARSE})
-    target_include_directories(cusparse_linops_${type} SYSTEM PRIVATE ${CUDA_INCLUDE_DIRS})
-    target_compile_definitions(cusparse_linops_${type} PRIVATE ALLOWMP=1)
+    target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo CUDA::cudart CUDA::cublas CUDA::cusparse)
 endfunction()
 
 function(ginkgo_benchmark_hipsparse_linops type def)
@@ -48,7 +46,7 @@ endfunction()
 
 
 # Generates an executable for one precision. Each executable will be linked to
-# `ginkgo`, `gflags` and `rapidjson`.
+# `ginkgo`, `gflags` and `nlohmann-json`.
 # Note: This should only be used by `ginkgo_add_typed_benchmark_executables`
 #
 # \param name            name for the executable to create (including type suffix)
@@ -59,7 +57,7 @@ endfunction()
 # All remaining arguments will be treated as source files
 function(ginkgo_add_single_benchmark_executable name use_lib_linops macro_def type)
     add_executable("${name}" ${ARGN})
-    target_link_libraries("${name}" ginkgo gflags rapidjson)
+    target_link_libraries("${name}" ginkgo gflags nlohmann_json::nlohmann_json)
     # always include the device timer
     if (GINKGO_BUILD_CUDA)
         target_compile_definitions("${name}" PRIVATE HAS_CUDA_TIMER=1)
@@ -69,7 +67,7 @@ function(ginkgo_add_single_benchmark_executable name use_lib_linops macro_def ty
         target_compile_definitions("${name}" PRIVATE HAS_HIP_TIMER=1)
         target_link_libraries("${name}" hip_timer)
     endif()
-    if (GINKGO_BUILD_DPCPP)
+    if (GINKGO_BUILD_SYCL)
         target_compile_definitions("${name}" PRIVATE HAS_DPCPP_TIMER=1)
         target_link_libraries("${name}" dpcpp_timer)
     endif()
@@ -89,7 +87,7 @@ function(ginkgo_add_single_benchmark_executable name use_lib_linops macro_def ty
             target_compile_definitions("${name}" PRIVATE HAS_HIP=1)
             target_link_libraries("${name}" hipsparse_linops_${type})
         endif()
-        if (GINKGO_BUILD_DPCPP)
+        if (GINKGO_BUILD_SYCL)
             target_compile_definitions("${name}" PRIVATE HAS_DPCPP=1)
             target_link_libraries("${name}" onemkl_linops_${type})
         endif()
@@ -98,7 +96,7 @@ endfunction(ginkgo_add_single_benchmark_executable)
 
 
 # Generates an executable for each supported precision. Each executable will be
-# linked to `ginkgo`, `gflags` and `rapidjson`.
+# linked to `ginkgo`, `gflags` and `nlohmann-json`.
 #
 # \param name            base-name for the executable to create
 # \param use_lib_linops  Boolean indicating if linking against hipsparse/cusparse
@@ -122,8 +120,7 @@ if (GINKGO_BUILD_CUDA)
     ginkgo_benchmark_cusparse_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION)
     ginkgo_benchmark_cusparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION)
     add_library(cuda_timer utils/cuda_timer.cpp)
-    target_link_libraries(cuda_timer ginkgo ${CUDA_RUNTIME_LIBS})
-    target_include_directories(cuda_timer SYSTEM PRIVATE ${CUDA_INCLUDE_DIRS})
+    target_link_libraries(cuda_timer ginkgo CUDA::cudart)
 endif()
 if (GINKGO_BUILD_HIP)
     ginkgo_benchmark_hipsparse_linops(d GKO_BENCHMARK_USE_DOUBLE_PRECISION)
@@ -137,12 +134,14 @@ if (GINKGO_BUILD_HIP)
     target_link_libraries(hip_timer ginkgo)
 endif()
 
-if (GINKGO_BUILD_DPCPP)
+if (GINKGO_BUILD_SYCL)
     ginkgo_benchmark_onemkl_linops(d GKO_BENCHMARK_USE_DOUBLE_PRECISION)
     ginkgo_benchmark_onemkl_linops(s GKO_BENCHMARK_USE_SINGLE_PRECISION)
     ginkgo_benchmark_onemkl_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION)
     ginkgo_benchmark_onemkl_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION)
     add_library(dpcpp_timer utils/dpcpp_timer.dp.cpp)
+    target_compile_options(dpcpp_timer PRIVATE ${GINKGO_DPCPP_FLAGS})
+    gko_add_sycl_to_target(TARGET dpcpp_timer SOURCES utils/dpcpp_timer.dp.cpp)
     target_link_libraries(dpcpp_timer ginkgo)
 endif()
 
@@ -152,7 +151,7 @@ if (GINKGO_BUILD_MPI)
 endif()
 
 add_subdirectory(blas)
-add_subdirectory(conversions)
+add_subdirectory(conversion)
 add_subdirectory(matrix_generator)
 add_subdirectory(matrix_statistics)
 add_subdirectory(preconditioner)
@@ -160,22 +159,14 @@ add_subdirectory(solver)
 add_subdirectory(sparse_blas)
 add_subdirectory(spmv)
 add_subdirectory(tools)
+if (GINKGO_BUILD_TESTS)
+    add_subdirectory(test)
+endif()
 
-add_custom_target(make_run_all_benchmarks ALL)
-file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/run_all_benchmarks.sh
-    DESTINATION ${CMAKE_CURRENT_BINARY_DIR}
-    FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE
-    WORLD_READ WORLD_EXECUTE)
-
-add_custom_command(
-    TARGET make_run_all_benchmarks POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy
-            ${CMAKE_CURRENT_SOURCE_DIR}/run_all_benchmarks.sh
-            ${CMAKE_CURRENT_BINARY_DIR}/run_all_benchmarks.sh)
+configure_file(run_all_benchmarks.sh run_all_benchmarks.sh COPYONLY)
 
 add_custom_target(benchmark)
 add_custom_command(
     TARGET benchmark POST_BUILD
     COMMAND bash run_all_benchmarks.sh >/dev/null
-    DEPENDS make_run_all_benchmarks
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp
index ee2dc06d01b..f7ad8120a80 100644
--- a/benchmark/blas/blas.cpp
+++ b/benchmark/blas/blas.cpp
@@ -130,25 +130,17 @@ Parameters for a benchmark case are:
     stride_B: stride for B matrix in gemm (optional, default m)
     stride_C: stride for C matrix in gemm (optional, default m)
 )";
-    std::string format = example_config;
+    std::string format = Generator::get_example_config();
     initialize_argument_parsing(&argc, &argv, header, format);
 
     std::string extra_information = "The operations are " + FLAGS_operations;
     print_general_information(extra_information);
     auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
 
-    rapidjson::IStreamWrapper jcin(get_input_stream());
-    rapidjson::Document test_cases;
-    test_cases.ParseStream(jcin);
-    if (!test_cases.IsArray()) {
-        std::cerr
-            << "Input has to be a JSON array of benchmark configurations:\n"
-            << format;
-        std::exit(1);
-    }
+    auto test_cases = json::parse(get_input_stream());
 
-    run_blas_benchmarks(exec, get_timer(exec, FLAGS_gpu_timer), operation_map,
-                        test_cases, true);
+    run_test_cases(BlasBenchmark{operation_map}, exec,
+                   get_timer(exec, FLAGS_gpu_timer), test_cases);
 
-    std::cout << test_cases << std::endl;
+    std::cout << std::setw(4) << test_cases << std::endl;
 }
diff --git a/benchmark/blas/blas_common.hpp b/benchmark/blas/blas_common.hpp
index f36b7649ffc..1267dc57c15 100644
--- a/benchmark/blas/blas_common.hpp
+++ b/benchmark/blas/blas_common.hpp
@@ -43,7 +43,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "benchmark/utils/general.hpp"
+#include "benchmark/utils/iteration_control.hpp"
 #include "benchmark/utils/loggers.hpp"
+#include "benchmark/utils/runner.hpp"
 #include "benchmark/utils/timer.hpp"
 #include "benchmark/utils/types.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
@@ -70,14 +72,6 @@ DEFINE_string(
     "C has dimensions n x m and x and y have dimensions n x r");
 
 
-std::string example_config = R"(
-  [
-    { "n": 100 },
-    { "n": 200, "m": 200, "k": 200 }
-  ]
-)";
-
-
 class BenchmarkOperation {
 public:
     virtual ~BenchmarkOperation() = default;
@@ -404,172 +398,129 @@ struct dimensions {
 };
 
 
-dimensions parse_dims(rapidjson::Value& test_case)
-{
-    auto get_optional = [](rapidjson::Value& obj, const char* name,
-                           gko::size_type default_value) -> gko::size_type {
-        if (obj.HasMember(name)) {
-            return obj[name].GetUint64();
-        } else {
-            return default_value;
-        }
-    };
-
-    dimensions result;
-    result.n = test_case["n"].GetInt64();
-    result.k = get_optional(test_case, "k", result.n);
-    result.m = get_optional(test_case, "m", result.n);
-    result.r = get_optional(test_case, "r", 1);
-    if (test_case.HasMember("stride")) {
-        result.stride_x = test_case["stride"].GetInt64();
-        result.stride_y = result.stride_x;
-    } else {
-        result.stride_x = get_optional(test_case, "stride_x", result.r);
-        result.stride_y = get_optional(test_case, "stride_y", result.r);
+struct BlasBenchmark : Benchmark<dimensions> {
+    using map_type =
+        std::map<std::string,
+                 std::function<std::unique_ptr<BenchmarkOperation>(
+                     std::shared_ptr<const gko::Executor>, dimensions)>>;
+    map_type operation_map;
+    std::vector<std::string> operations;
+    std::string name;
+    bool do_print;
+
+    BlasBenchmark(map_type operation_map, bool do_print = true)
+        : operation_map{std::move(operation_map)},
+          name{"blas"},
+          operations{split(FLAGS_operations)},
+          do_print{do_print}
+    {}
+
+    const std::string& get_name() const override { return name; }
+
+    const std::vector<std::string>& get_operations() const override
+    {
+        return operations;
     }
-    result.stride_A = get_optional(test_case, "stride_A", result.k);
-    result.stride_B = get_optional(test_case, "stride_B", result.m);
-    result.stride_C = get_optional(test_case, "stride_C", result.m);
-    return result;
-}
 
+    bool should_print() const override { return do_print; }
 
-std::string describe(rapidjson::Value& test_case)
-{
-    std::stringstream ss;
-    auto optional_output = [&](const char* name) {
-        if (test_case.HasMember(name) && test_case[name].IsInt64()) {
-            ss << name << " = " << test_case[name].GetInt64() << " ";
-        }
-    };
-    optional_output("n");
-    optional_output("k");
-    optional_output("m");
-    optional_output("r");
-    optional_output("stride");
-    optional_output("stride_x");
-    optional_output("stride_y");
-    optional_output("stride_A");
-    optional_output("stride_B");
-    optional_output("stride_C");
-    return ss.str();
-}
+    std::string get_example_config() const override
+    {
+        return json::parse(R"([{"n": 100}, {"n": 200, "m": 200, "k": 200}])")
+            .dump(4);
+    }
 
+    bool validate_config(const json& value) const override
+    {
+        return value.contains("n") && value["n"].is_number_integer();
+    }
+
+    std::string describe_config(const json& test_case) const override
+    {
+        std::stringstream ss;
+        auto optional_output = [&](const char* name) {
+            if (test_case.contains(name) &&
+                test_case[name].is_number_integer()) {
+                ss << name << " = " << test_case[name].get<gko::int64>() << " ";
+            }
+        };
+        optional_output("n");
+        optional_output("k");
+        optional_output("m");
+        optional_output("r");
+        optional_output("stride");
+        optional_output("stride_x");
+        optional_output("stride_y");
+        optional_output("stride_A");
+        optional_output("stride_B");
+        optional_output("stride_C");
+        return ss.str();
+    }
+
+    dimensions setup(std::shared_ptr<gko::Executor> exec,
+                     json& test_case) const override
+    {
+        auto get_optional = [](json& obj, const char* name,
+                               gko::size_type default_value) -> gko::size_type {
+            if (obj.contains(name)) {
+                return obj[name].get<gko::uint64>();
+            } else {
+                return default_value;
+            }
+        };
+
+        dimensions result;
+        result.n = test_case["n"].get<gko::int64>();
+        result.k = get_optional(test_case, "k", result.n);
+        result.m = get_optional(test_case, "m", result.n);
+        result.r = get_optional(test_case, "r", 1);
+        if (test_case.contains("stride")) {
+            result.stride_x = test_case["stride"].get<gko::int64>();
+            result.stride_y = result.stride_x;
+        } else {
+            result.stride_x = get_optional(test_case, "stride_x", result.r);
+            result.stride_y = get_optional(test_case, "stride_y", result.r);
+        }
+        result.stride_A = get_optional(test_case, "stride_A", result.k);
+        result.stride_B = get_optional(test_case, "stride_B", result.m);
+        result.stride_C = get_optional(test_case, "stride_C", result.m);
+        return result;
+    }
 
-template <typename OpMap>
-void apply_blas(const char* operation_name, std::shared_ptr<gko::Executor> exec,
-                std::shared_ptr<Timer> timer, const OpMap& operation_map,
-                rapidjson::Value& test_case,
-                rapidjson::MemoryPoolAllocator<>& allocator)
-{
-    try {
-        auto& blas_case = test_case["blas"];
-        add_or_set_member(blas_case, operation_name,
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
 
-        auto op = operation_map.at(operation_name)(exec, parse_dims(test_case));
+    void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
+             annotate_functor annotate, dimensions& dims,
+             const std::string& operation_name,
+             json& operation_case) const override
+    {
+        auto op = operation_map.at(operation_name)(exec, dims);
 
         IterationControl ic(timer);
 
         // warm run
-        for (auto _ : ic.warmup_run()) {
-            op->prepare();
-            exec->synchronize();
-            op->run();
-            exec->synchronize();
+        {
+            auto range = annotate("warmup", FLAGS_warmup > 0);
+            for (auto _ : ic.warmup_run()) {
+                op->prepare();
+                exec->synchronize();
+                op->run();
+                exec->synchronize();
+            }
         }
 
         // timed run
         op->prepare();
         for (auto _ : ic.run()) {
+            auto range = annotate("repetition");
             op->run();
         }
         const auto runtime = ic.compute_time(FLAGS_timer_method);
         const auto flops = static_cast<double>(op->get_flops());
         const auto mem = static_cast<double>(op->get_memory());
         const auto repetitions = ic.get_num_repetitions();
-        add_or_set_member(blas_case[operation_name], "time", runtime,
-                          allocator);
-        add_or_set_member(blas_case[operation_name], "flops", flops / runtime,
-                          allocator);
-        add_or_set_member(blas_case[operation_name], "bandwidth", mem / runtime,
-                          allocator);
-        add_or_set_member(blas_case[operation_name], "repetitions", repetitions,
-                          allocator);
-
-        // compute and write benchmark data
-        add_or_set_member(blas_case[operation_name], "completed", true,
-                          allocator);
-    } catch (const std::exception& e) {
-        add_or_set_member(test_case["blas"][operation_name], "completed", false,
-                          allocator);
-        if (FLAGS_keep_errors) {
-            rapidjson::Value msg_value;
-            msg_value.SetString(e.what(), allocator);
-            add_or_set_member(test_case["blas"][operation_name], "error",
-                              msg_value, allocator);
-        }
-        std::cerr << "Error when processing test case " << test_case << "\n"
-                  << "what(): " << e.what() << std::endl;
+        operation_case["time"] = runtime;
+        operation_case["flops"] = flops / runtime;
+        operation_case["bandwidth"] = mem / runtime;
+        operation_case["repetitions"] = repetitions;
     }
-}
-
-
-template <typename OpMap>
-void run_blas_benchmarks(std::shared_ptr<gko::Executor> exec,
-                         std::shared_ptr<Timer> timer,
-                         const OpMap& operation_map,
-                         rapidjson::Document& test_cases, bool do_print)
-{
-    auto operations = split(FLAGS_operations, ',');
-    auto& allocator = test_cases.GetAllocator();
-    auto profiler_hook = create_profiler_hook(exec);
-    if (profiler_hook) {
-        exec->add_logger(profiler_hook);
-    }
-    auto annotate = annotate_functor{profiler_hook};
-
-    for (auto& test_case : test_cases.GetArray()) {
-        try {
-            // set up benchmark
-            if (!test_case.HasMember("blas")) {
-                test_case.AddMember("blas",
-                                    rapidjson::Value(rapidjson::kObjectType),
-                                    allocator);
-            }
-            auto& blas_case = test_case["blas"];
-            if (!FLAGS_overwrite &&
-                all_of(begin(operations), end(operations),
-                       [&blas_case](const std::string& s) {
-                           return blas_case.HasMember(s.c_str());
-                       })) {
-                continue;
-            }
-            if (do_print) {
-                std::clog << "Running test case: " << test_case << std::endl;
-            }
-            // annotate the test case
-            auto test_case_range = annotate(describe(test_case));
-            for (const auto& operation_name : operations) {
-                {
-                    auto operation_range = annotate(operation_name.c_str());
-                    apply_blas(operation_name.c_str(), exec, timer,
-                               operation_map, test_case, allocator);
-                }
-
-                if (do_print) {
-                    std::clog << "Current state:" << std::endl
-                              << test_cases << std::endl;
-
-                    backup_results(test_cases);
-                }
-            }
-        } catch (const std::exception& e) {
-            std::cerr << "Error setting up benchmark, what(): " << e.what()
-                      << std::endl;
-        }
-    }
-    if (profiler_hook) {
-        exec->remove_logger(profiler_hook);
-    }
-}
+};
diff --git a/benchmark/blas/distributed/CMakeLists.txt b/benchmark/blas/distributed/CMakeLists.txt
index 1371294efb8..a756b9c0071 100644
--- a/benchmark/blas/distributed/CMakeLists.txt
+++ b/benchmark/blas/distributed/CMakeLists.txt
@@ -1 +1 @@
-ginkgo_add_typed_benchmark_executables(multi-vector-distributed "NO" multi_vector.cpp)
+ginkgo_add_typed_benchmark_executables(multi_vector_distributed "NO" multi_vector.cpp)
diff --git a/benchmark/blas/distributed/multi_vector.cpp b/benchmark/blas/distributed/multi_vector.cpp
index 4d3b821ed2e..fe5eea5a38c 100644
--- a/benchmark/blas/distributed/multi_vector.cpp
+++ b/benchmark/blas/distributed/multi_vector.cpp
@@ -38,6 +38,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <iostream>
 
 
+#define GKO_BENCHMARK_DISTRIBUTED
+
+
 #include "benchmark/blas/blas_common.hpp"
 #include "benchmark/utils/general.hpp"
 #include "benchmark/utils/generator.hpp"
@@ -50,6 +53,10 @@ int main(int argc, char* argv[])
 {
     gko::experimental::mpi::environment mpi_env{argc, argv};
 
+    const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
+    const auto rank = comm.rank();
+    const auto do_print = rank == 0;
+
     std::string header = R"("
 A benchmark for measuring performance of Ginkgo's BLAS-like "
 operations.
@@ -60,26 +67,19 @@ Parameters for a benchmark case are:
     stride_x: stride for input vector x (optional, default r)
     stride_y: stride for in/out vector y (optional, default r)
 )";
-    std::string format = example_config;
-    initialize_argument_parsing(&argc, &argv, header, format);
+    std::string format = Generator::get_example_config();
+    initialize_argument_parsing(&argc, &argv, header, format, do_print);
 
-    std::string extra_information = "The operations are " + FLAGS_operations;
-    print_general_information(extra_information);
-
-    const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
-    const auto rank = comm.rank();
+    if (do_print) {
+        std::string extra_information =
+            "The operations are " + FLAGS_operations;
+        print_general_information(extra_information);
+    }
 
     auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get());
 
     std::string json_input = broadcast_json_input(get_input_stream(), comm);
-    rapidjson::Document test_cases;
-    test_cases.Parse(json_input.c_str());
-    if (!test_cases.IsArray()) {
-        std::cerr
-            << "Input has to be a JSON array of benchmark configurations:\n"
-            << format;
-        std::exit(1);
-    }
+    auto test_cases = json::parse(json_input);
 
     std::map<std::string,
              std::function<std::unique_ptr<BenchmarkOperation>(
@@ -127,10 +127,10 @@ Parameters for a benchmark case are:
                      exec, Generator{comm, {}}, dims.n, dims.r, dims.stride_y);
              }}};
 
-    run_blas_benchmarks(exec, get_mpi_timer(exec, comm, FLAGS_gpu_timer),
-                        operation_map, test_cases, rank == 0);
+    run_test_cases(BlasBenchmark{operation_map, do_print}, exec,
+                   get_mpi_timer(exec, comm, FLAGS_gpu_timer), test_cases);
 
-    if (rank == 0) {
-        std::cout << test_cases << std::endl;
+    if (do_print) {
+        std::cout << std::setw(4) << test_cases << std::endl;
     }
 }
diff --git a/benchmark/conversion/CMakeLists.txt b/benchmark/conversion/CMakeLists.txt
new file mode 100644
index 00000000000..7ecf578c055
--- /dev/null
+++ b/benchmark/conversion/CMakeLists.txt
@@ -0,0 +1 @@
+ginkgo_add_typed_benchmark_executables(conversion "NO" conversion.cpp)
diff --git a/benchmark/conversion/conversion.cpp b/benchmark/conversion/conversion.cpp
new file mode 100644
index 00000000000..e45046329d7
--- /dev/null
+++ b/benchmark/conversion/conversion.cpp
@@ -0,0 +1,207 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/ginkgo.hpp>
+
+
+#include <algorithm>
+#include <chrono>
+#include <cstdlib>
+#include <exception>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <typeinfo>
+
+
+#include "benchmark/utils/formats.hpp"
+#include "benchmark/utils/general.hpp"
+#include "benchmark/utils/general_matrix.hpp"
+#include "benchmark/utils/generator.hpp"
+#include "benchmark/utils/iteration_control.hpp"
+#include "benchmark/utils/runner.hpp"
+#include "benchmark/utils/timer.hpp"
+#include "benchmark/utils/types.hpp"
+
+
+#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
+#include "benchmark/utils/tuning_variables.hpp"
+#endif  // GINKGO_BENCHMARK_ENABLE_TUNING
+
+
+using Generator = DefaultSystemGenerator<>;
+
+
+struct ConversionBenchmark : Benchmark<gko::device_matrix_data<etype, itype>> {
+    std::string name;
+    std::vector<std::string> operations;
+
+    ConversionBenchmark() : name{"conversion"}
+    {
+        auto ref_exec = gko::ReferenceExecutor::create();
+        auto formats = split(FLAGS_formats);
+        for (const auto& from_format : formats) {
+            operations.push_back(from_format + "-read");
+            auto from_mtx =
+                formats::matrix_type_factory.at(from_format)(ref_exec);
+            // all pairs of conversions that are supported by Ginkgo
+            for (const auto& to_format : formats) {
+                if (from_format == to_format) {
+                    continue;
+                }
+                auto to_mtx =
+                    formats::matrix_type_factory.at(to_format)(ref_exec);
+                try {
+                    to_mtx->copy_from(from_mtx);
+                    operations.push_back(from_format + "-" + to_format);
+                } catch (const std::exception& e) {
+                }
+            }
+        }
+    }
+
+    const std::string& get_name() const override { return name; }
+
+    const std::vector<std::string>& get_operations() const override
+    {
+        return operations;
+    }
+
+    bool should_print() const override { return true; }
+
+    std::string get_example_config() const override
+    {
+        return Generator::get_example_config();
+    }
+
+    bool validate_config(const json& test_case) const override
+    {
+        return Generator::validate_config(test_case);
+    }
+
+    std::string describe_config(const json& test_case) const override
+    {
+        return Generator::describe_config(test_case);
+    }
+
+    gko::device_matrix_data<etype, itype> setup(
+        std::shared_ptr<gko::Executor> exec, json& test_case) const override
+    {
+        gko::matrix_data<etype, itype> data;
+        data = Generator::generate_matrix_data(test_case);
+        // no reordering here, as it doesn't impact conversions beyond
+        // dense-sparse conversions
+        std::clog << "Matrix is of size (" << data.size[0] << ", "
+                  << data.size[1] << "), " << data.nonzeros.size() << std::endl;
+        test_case["rows"] = data.size[0];
+        test_case["cols"] = data.size[1];
+        test_case["nonzeros"] = data.nonzeros.size();
+        return gko::device_matrix_data<etype, itype>::create_from_host(exec,
+                                                                       data);
+    }
+
+
+    void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
+             annotate_functor annotate,
+             gko::device_matrix_data<etype, itype>& data,
+             const std::string& operation_name,
+             json& operation_case) const override
+    {
+        auto split_it =
+            std::find(operation_name.begin(), operation_name.end(), '-');
+        std::string from_name{operation_name.begin(), split_it};
+        std::string to_name{split_it + 1, operation_name.end()};
+        auto mtx_from = formats::matrix_type_factory.at(from_name)(exec);
+        auto readable =
+            gko::as<gko::ReadableFromMatrixData<etype, itype>>(mtx_from.get());
+        IterationControl ic{timer};
+        if (to_name == "read") {
+            // warm run
+            {
+                auto range = annotate("warmup", FLAGS_warmup > 0);
+                for (auto _ : ic.warmup_run()) {
+                    exec->synchronize();
+                    readable->read(data);
+                    exec->synchronize();
+                }
+            }
+            // timed run
+            for (auto _ : ic.run()) {
+                auto range = annotate("repetition");
+                readable->read(data);
+            }
+        } else {
+            readable->read(data);
+            auto mtx_to = formats::matrix_type_factory.at(to_name)(exec);
+
+            // warm run
+            {
+                auto range = annotate("warmup", FLAGS_warmup > 0);
+                for (auto _ : ic.warmup_run()) {
+                    exec->synchronize();
+                    mtx_to->copy_from(mtx_from);
+                    exec->synchronize();
+                }
+            }
+            // timed run
+            for (auto _ : ic.run()) {
+                auto range = annotate("repetition");
+                mtx_to->copy_from(mtx_from);
+            }
+        }
+        operation_case["time"] = ic.compute_time(FLAGS_timer_method);
+        operation_case["repetitions"] = ic.get_num_repetitions();
+    }
+};
+
+
+int main(int argc, char* argv[])
+{
+    std::string header =
+        "A benchmark for measuring performance of Ginkgo's conversions.\n";
+    std::string format_str = Generator::get_example_config();
+    initialize_argument_parsing_matrix(&argc, &argv, header, format_str);
+
+    std::string extra_information =
+        std::string() + "The formats are " + FLAGS_formats;
+    print_general_information(extra_information);
+
+    auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
+    auto formats = split(FLAGS_formats, ',');
+
+    auto test_cases = json::parse(get_input_stream());
+
+    run_test_cases(ConversionBenchmark{}, exec,
+                   get_timer(exec, FLAGS_gpu_timer), test_cases);
+
+    std::cout << std::setw(4) << test_cases << std::endl;
+}
diff --git a/benchmark/conversions/CMakeLists.txt b/benchmark/conversions/CMakeLists.txt
deleted file mode 100644
index 0e0893c3aec..00000000000
--- a/benchmark/conversions/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-ginkgo_add_typed_benchmark_executables(conversions "NO" conversions.cpp)
diff --git a/benchmark/conversions/conversions.cpp b/benchmark/conversions/conversions.cpp
deleted file mode 100644
index ec7febf262f..00000000000
--- a/benchmark/conversions/conversions.cpp
+++ /dev/null
@@ -1,222 +0,0 @@
-/*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2023, the Ginkgo authors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-******************************<GINKGO LICENSE>*******************************/
-
-#include <ginkgo/ginkgo.hpp>
-
-
-#include <algorithm>
-#include <chrono>
-#include <cstdlib>
-#include <exception>
-#include <fstream>
-#include <iomanip>
-#include <iostream>
-#include <typeinfo>
-
-
-#include "benchmark/utils/formats.hpp"
-#include "benchmark/utils/general.hpp"
-#include "benchmark/utils/generator.hpp"
-#include "benchmark/utils/spmv_validation.hpp"
-#include "benchmark/utils/timer.hpp"
-#include "benchmark/utils/types.hpp"
-
-
-#ifdef GINKGO_BENCHMARK_ENABLE_TUNING
-#include "benchmark/utils/tuning_variables.hpp"
-#endif  // GINKGO_BENCHMARK_ENABLE_TUNING
-
-
-// This function supposes that management of `FLAGS_overwrite` is done before
-// calling it
-void convert_matrix(const gko::LinOp* matrix_from, const char* format_to,
-                    const char* conversion_name,
-                    std::shared_ptr<gko::Executor> exec,
-                    rapidjson::Value& test_case,
-                    rapidjson::MemoryPoolAllocator<>& allocator)
-{
-    try {
-        auto& conversion_case = test_case["conversions"];
-        add_or_set_member(conversion_case, conversion_name,
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
-
-        gko::matrix_data<etype, itype> data{gko::dim<2>{1, 1}, 1};
-        auto matrix_to = share(formats::matrix_factory(format_to, exec, data));
-
-        auto timer = get_timer(exec, FLAGS_gpu_timer);
-        IterationControl ic{timer};
-
-        // warm run
-        for (auto _ : ic.warmup_run()) {
-            exec->synchronize();
-            matrix_to->copy_from(matrix_from);
-            exec->synchronize();
-            matrix_to->clear();
-        }
-        // timed run
-        for (auto _ : ic.run()) {
-            matrix_to->copy_from(matrix_from);
-        }
-        add_or_set_member(conversion_case[conversion_name], "time",
-                          ic.compute_time(FLAGS_timer_method), allocator);
-        add_or_set_member(conversion_case[conversion_name], "repetitions",
-                          ic.get_num_repetitions(), allocator);
-
-        // compute and write benchmark data
-        add_or_set_member(conversion_case[conversion_name], "completed", true,
-                          allocator);
-    } catch (const std::exception& e) {
-        add_or_set_member(test_case["conversions"][conversion_name],
-                          "completed", false, allocator);
-        if (FLAGS_keep_errors) {
-            rapidjson::Value msg_value;
-            msg_value.SetString(e.what(), allocator);
-            add_or_set_member(test_case["conversions"][conversion_name],
-                              "error", msg_value, allocator);
-        }
-        std::cerr << "Error when processing test case " << test_case << "\n"
-                  << "what(): " << e.what() << std::endl;
-    }
-}
-
-
-int main(int argc, char* argv[])
-{
-    std::string header =
-        "A benchmark for measuring performance of Ginkgo's conversions.\n";
-    std::string format_str = example_config;
-    initialize_argument_parsing(&argc, &argv, header, format_str);
-
-    std::string extra_information =
-        std::string() + "The formats are " + FLAGS_formats + "\n";
-    print_general_information(extra_information);
-
-    auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
-    auto formats = split(FLAGS_formats, ',');
-
-    rapidjson::IStreamWrapper jcin(get_input_stream());
-    rapidjson::Document test_cases;
-    test_cases.ParseStream(jcin);
-    if (!test_cases.IsArray()) {
-        print_config_error_and_exit();
-    }
-
-    auto& allocator = test_cases.GetAllocator();
-    auto profiler_hook = create_profiler_hook(exec);
-    if (profiler_hook) {
-        exec->add_logger(profiler_hook);
-    }
-    auto annotate = annotate_functor{profiler_hook};
-
-    DefaultSystemGenerator<> generator{};
-
-    for (auto& test_case : test_cases.GetArray()) {
-        std::clog << "Benchmarking conversions. " << std::endl;
-        // set up benchmark
-        validate_option_object(test_case);
-        if (!test_case.HasMember("conversions")) {
-            test_case.AddMember("conversions",
-                                rapidjson::Value(rapidjson::kObjectType),
-                                allocator);
-        }
-        auto& conversion_case = test_case["conversions"];
-
-        std::clog << "Running test case: " << test_case << std::endl;
-        gko::matrix_data<etype, itype> data;
-        try {
-            data = generator.generate_matrix_data(test_case);
-        } catch (std::exception& e) {
-            std::cerr << "Error setting up matrix data, what(): " << e.what()
-                      << std::endl;
-            if (FLAGS_keep_errors) {
-                rapidjson::Value msg_value;
-                msg_value.SetString(e.what(), allocator);
-                add_or_set_member(test_case, "error", msg_value, allocator);
-            }
-            continue;
-        }
-        std::clog << "Matrix is of size (" << data.size[0] << ", "
-                  << data.size[1] << ")" << std::endl;
-        add_or_set_member(test_case, "size", data.size[0], allocator);
-        // annotate the test case
-        auto test_case_range = annotate(generator.describe_config(test_case));
-        for (const auto& format_from : formats) {
-            try {
-                auto matrix_from =
-                    share(formats::matrix_factory(format_from, exec, data));
-                for (const auto& format_to : formats) {
-                    if (format_from == format_to) {
-                        continue;
-                    }
-                    auto conversion_name =
-                        std::string(format_from) + "-" + format_to;
-
-                    if (!FLAGS_overwrite &&
-                        conversion_case.HasMember(conversion_name.c_str())) {
-                        continue;
-                    }
-                    {
-                        auto conversion_range =
-                            annotate(conversion_name.c_str());
-                        convert_matrix(matrix_from.get(), format_to.c_str(),
-                                       conversion_name.c_str(), exec, test_case,
-                                       allocator);
-                    }
-                    std::clog << "Current state:" << std::endl
-                              << test_cases << std::endl;
-                }
-                backup_results(test_cases);
-            } catch (const gko::AllocationError& e) {
-                for (const auto& format : formats::matrix_type_factory) {
-                    const auto format_to = std::get<0>(format);
-                    auto conversion_name =
-                        std::string(format_from) + "-" + format_to;
-                    add_or_set_member(
-                        test_case["conversions"][conversion_name.c_str()],
-                        "completed", false, allocator);
-                }
-                std::cerr << "Error when allocating data for type "
-                          << format_from << ". what(): " << e.what()
-                          << std::endl;
-                backup_results(test_cases);
-            } catch (const std::exception& e) {
-                std::cerr << "Error when running benchmark, what(): "
-                          << e.what() << std::endl;
-            }
-        }
-    }
-    if (profiler_hook) {
-        exec->remove_logger(profiler_hook);
-    }
-
-    std::cout << test_cases << std::endl;
-}
diff --git a/benchmark/matrix_generator/matrix_generator.cpp b/benchmark/matrix_generator/matrix_generator.cpp
index 138b5a9c2ce..193d95f897f 100644
--- a/benchmark/matrix_generator/matrix_generator.cpp
+++ b/benchmark/matrix_generator/matrix_generator.cpp
@@ -85,31 +85,33 @@ std::string input_format =
 // clang-format on
 
 
-void validate_option_object(const rapidjson::Value& value)
+void validate_option_object(const json& value)
 {
-    if (!value.IsObject() || !value.HasMember("filename") ||
-        !value["filename"].IsString() || !value.HasMember("problem") ||
-        !value["problem"].IsObject() || !value["problem"].HasMember("type") ||
-        !value["problem"]["type"].IsString()) {
+    if (!value.is_object() || !value.contains("filename") ||
+        !value["filename"].is_string() || !value.contains("problem") ||
+        !value["problem"].is_object() || !value["problem"].contains("type") ||
+        !value["problem"]["type"].is_string()) {
         print_config_error_and_exit(2);
     }
 }
 
 
 using generator_function = std::function<gko::matrix_data<etype, itype>(
-    rapidjson::Value&, std::default_random_engine&)>;
+    json&, std::default_random_engine&)>;
 
 
 // matrix generators
 gko::matrix_data<etype, itype> generate_block_diagonal(
-    rapidjson::Value& config, std::default_random_engine& engine)
+    json& config, std::default_random_engine& engine)
 {
-    if (!config.HasMember("num_blocks") || !config["num_blocks"].IsUint() ||
-        !config.HasMember("block_size") || !config["block_size"].IsUint()) {
+    if (!config.contains("num_blocks") ||
+        !config["num_blocks"].is_number_unsigned() ||
+        !config.contains("block_size") ||
+        !config["block_size"].is_number_unsigned()) {
         print_config_error_and_exit(2);
     }
-    auto num_blocks = config["num_blocks"].GetUint();
-    auto block_size = config["block_size"].GetUint();
+    auto num_blocks = config["num_blocks"].get<gko::uint64>();
+    auto block_size = config["block_size"].get<gko::uint64>();
     auto block = gko::matrix_data<etype, itype>(
         gko::dim<2>(block_size),
         std::uniform_real_distribution<rc_etype>(-1.0, 1.0), engine);
@@ -132,20 +134,18 @@ int main(int argc, char* argv[])
     std::clog << gko::version_info::get() << std::endl;
 
     auto engine = get_engine();
-    rapidjson::IStreamWrapper jcin(get_input_stream());
-    rapidjson::Document configurations;
-    configurations.ParseStream(jcin);
+    auto configurations = json::parse(get_input_stream());
 
-    if (!configurations.IsArray()) {
+    if (!configurations.is_array()) {
         print_config_error_and_exit(1);
     }
 
-    for (auto& config : configurations.GetArray()) {
+    for (auto& config : configurations) {
         try {
             validate_option_object(config);
             std::clog << "Generating matrix: " << config << std::endl;
-            auto filename = config["filename"].GetString();
-            auto type = config["problem"]["type"].GetString();
+            auto filename = config["filename"].get<std::string>();
+            auto type = config["problem"]["type"].get<std::string>();
             auto mdata = generator[type](config["problem"], engine);
             std::ofstream ofs(filename);
             gko::write_raw(ofs, mdata, gko::layout_type::coordinate);
diff --git a/benchmark/matrix_statistics/matrix_statistics.cpp b/benchmark/matrix_statistics/matrix_statistics.cpp
index 45f21ca1e35..576d6fa7d52 100644
--- a/benchmark/matrix_statistics/matrix_statistics.cpp
+++ b/benchmark/matrix_statistics/matrix_statistics.cpp
@@ -38,9 +38,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <iostream>
 
 
-#include "benchmark/utils/general.hpp"
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "benchmark/utils/general_matrix.hpp"
 #include "benchmark/utils/generator.hpp"
-#include "benchmark/utils/spmv_validation.hpp"
+#include "benchmark/utils/runner.hpp"
 #include "benchmark/utils/types.hpp"
 
 
@@ -51,9 +54,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 // See en.wikipedia.org/wiki/Five-number_summary
 // Quartile computation uses Method 3 from en.wikipedia.org/wiki/Quartile
-void compute_summary(const std::vector<gko::size_type>& dist,
-                     rapidjson::Value& out,
-                     rapidjson::MemoryPoolAllocator<>& allocator)
+void compute_summary(const std::vector<gko::size_type>& dist, json& out)
 {
     const auto q = dist.size() / 4;
     const auto r = dist.size() % 4;
@@ -72,23 +73,14 @@ void compute_summary(const std::vector<gko::size_type>& dist,
     };
     // clang-format on
 
-    add_or_set_member(out, "min", dist[0], allocator);
-    add_or_set_member(
-        out, "q1",
-        coefs[r][0] * static_cast<double>(dist[positions[r][0]]) +
-            coefs[r][1] * static_cast<double>(dist[positions[r][1]]),
-        allocator);
-    add_or_set_member(
-        out, "median",
-        coefs[r][2] * static_cast<double>(dist[positions[r][2]]) +
-            coefs[r][3] * static_cast<double>(dist[positions[r][3]]),
-        allocator);
-    add_or_set_member(
-        out, "q3",
-        coefs[r][4] * static_cast<double>(dist[positions[r][4]]) +
-            coefs[r][5] * static_cast<double>(dist[positions[r][5]]),
-        allocator);
-    add_or_set_member(out, "max", dist[dist.size() - 1], allocator);
+    out["min"] = dist.front();
+    out["q1"] = coefs[r][0] * static_cast<double>(dist[positions[r][0]]) +
+                coefs[r][1] * static_cast<double>(dist[positions[r][1]]);
+    out["median"] = coefs[r][2] * static_cast<double>(dist[positions[r][2]]) +
+                    coefs[r][3] * static_cast<double>(dist[positions[r][3]]);
+    out["q3"] = coefs[r][4] * static_cast<double>(dist[positions[r][4]]) +
+                coefs[r][5] * static_cast<double>(dist[positions[r][5]]);
+    out["max"] = dist.back();
 }
 
 
@@ -108,39 +100,30 @@ double compute_moment(int degree, const std::vector<gko::size_type>& dist,
 
 
 // See en.wikipedia.org/wiki/Moment_(mathematics)
-void compute_moments(const std::vector<gko::size_type>& dist,
-                     rapidjson::Value& out,
-                     rapidjson::MemoryPoolAllocator<>& allocator)
+void compute_moments(const std::vector<gko::size_type>& dist, json& out)
 {
     const auto mean = compute_moment(1, dist);
-    add_or_set_member(out, "mean", mean, allocator);
+    out["mean"] = mean;
     const auto variance = compute_moment(2, dist, mean);
-    add_or_set_member(out, "variance", variance, allocator);
+    out["variance"] = variance;
     const auto dev = std::sqrt(variance);
-    add_or_set_member(out, "skewness", compute_moment(3, dist, mean, dev),
-                      allocator);
-    add_or_set_member(out, "kurtosis", compute_moment(4, dist, mean, dev),
-                      allocator);
-    add_or_set_member(out, "hyperskewness", compute_moment(5, dist, mean, dev),
-                      allocator);
-    add_or_set_member(out, "hyperflatness", compute_moment(6, dist, mean, dev),
-                      allocator);
+    out["skewness"] = compute_moment(3, dist, mean, dev);
+    out["kurtosis"] = compute_moment(4, dist, mean, dev);
+    out["hyperskewness"] = compute_moment(5, dist, mean, dev);
+    out["hyperflatness"] = compute_moment(6, dist, mean, dev);
 }
 
 
-template <typename Allocator>
 void compute_distribution_properties(const std::vector<gko::size_type>& dist,
-                                     rapidjson::Value& out,
-                                     Allocator& allocator)
+                                     json& out)
 {
-    compute_summary(dist, out, allocator);
-    compute_moments(dist, out, allocator);
+    compute_summary(dist, out);
+    compute_moments(dist, out);
 }
 
 
-template <typename Allocator>
 void extract_matrix_statistics(gko::matrix_data<etype, gko::int64>& data,
-                               rapidjson::Value& problem, Allocator& allocator)
+                               json& problem)
 {
     std::vector<gko::size_type> row_dist(data.size[0]);
     std::vector<gko::size_type> col_dist(data.size[1]);
@@ -149,72 +132,95 @@ void extract_matrix_statistics(gko::matrix_data<etype, gko::int64>& data,
         ++col_dist[v.column];
     }
 
-    add_or_set_member(problem, "rows", data.size[0], allocator);
-    add_or_set_member(problem, "columns", data.size[1], allocator);
-    add_or_set_member(problem, "nonzeros", data.nonzeros.size(), allocator);
+    problem["rows"] = data.size[0];
+    problem["columns"] = data.size[1];
+    problem["nonzeros"] = data.nonzeros.size();
 
     std::sort(begin(row_dist), end(row_dist));
-    add_or_set_member(problem, "row_distribution",
-                      rapidjson::Value(rapidjson::kObjectType), allocator);
-    compute_distribution_properties(row_dist, problem["row_distribution"],
-                                    allocator);
+    problem["row_distribution"] = json::object();
+    compute_distribution_properties(row_dist, problem["row_distribution"]);
 
     std::sort(begin(col_dist), end(col_dist));
-    add_or_set_member(problem, "col_distribution",
-                      rapidjson::Value(rapidjson::kObjectType), allocator);
-    compute_distribution_properties(col_dist, problem["col_distribution"],
-                                    allocator);
+    problem["col_distribution"] = json::object();
+    compute_distribution_properties(col_dist, problem["col_distribution"]);
 }
 
 
-int main(int argc, char* argv[])
-{
-    std::string header =
-        "A utility that collects additional statistical properties of the "
-        "matrix.\n";
-    std::string format = example_config;
-    initialize_argument_parsing(&argc, &argv, header, format);
+using Generator = DefaultSystemGenerator<etype, gko::int64>;
 
-    std::clog << gko::version_info::get() << std::endl;
 
-    rapidjson::IStreamWrapper jcin(get_input_stream());
-    rapidjson::Document test_cases;
-    test_cases.ParseStream(jcin);
-    if (!test_cases.IsArray()) {
-        print_config_error_and_exit();
-    }
+struct empty_state {};
+
 
-    auto& allocator = test_cases.GetAllocator();
+struct MatrixStatistics : Benchmark<empty_state> {
+    std::string name;
+    std::vector<std::string> empty;
 
-    for (auto& test_case : test_cases.GetArray()) {
-        try {
-            // set up benchmark
-            validate_option_object(test_case);
-            if (!test_case.HasMember("problem")) {
-                test_case.AddMember("problem",
-                                    rapidjson::Value(rapidjson::kObjectType),
-                                    allocator);
-            }
-            auto& problem = test_case["problem"];
+    MatrixStatistics() : name{"problem"} {}
 
-            std::clog << "Running test case: " << test_case << std::endl;
+    const std::string& get_name() const override { return name; }
 
-            auto matrix =
-                DefaultSystemGenerator<etype, gko::int64>::generate_matrix_data(
-                    test_case);
+    const std::vector<std::string>& get_operations() const override
+    {
+        return empty;
+    }
 
-            std::clog << "Matrix is of size (" << matrix.size[0] << ", "
-                      << matrix.size[1] << ")" << std::endl;
-            add_or_set_member(test_case, "size", matrix.size[0], allocator);
+    bool should_print() const override { return true; }
 
-            extract_matrix_statistics(matrix, test_case["problem"], allocator);
+    std::string get_example_config() const override
+    {
+        return Generator::get_example_config();
+    }
 
-            backup_results(test_cases);
-        } catch (const std::exception& e) {
-            std::cerr << "Error extracting statistics, what(): " << e.what()
-                      << std::endl;
-        }
+    bool validate_config(const json& test_case) const override
+    {
+        return Generator::validate_config(test_case);
     }
 
-    std::cout << test_cases << std::endl;
+    std::string describe_config(const json& test_case) const override
+    {
+        return Generator::describe_config(test_case);
+    }
+
+    empty_state setup(std::shared_ptr<gko::Executor> exec,
+                      json& test_case) const override
+    {
+        auto data = Generator::generate_matrix_data(test_case);
+        // no reordering here, as it doesn't change statistics
+        std::clog << "Matrix is of size (" << data.size[0] << ", "
+                  << data.size[1] << "), " << data.nonzeros.size() << std::endl;
+        test_case["rows"] = data.size[0];
+        test_case["cols"] = data.size[1];
+        test_case["nonzeros"] = data.nonzeros.size();
+
+        extract_matrix_statistics(data, test_case["problem"]);
+        return {};
+    }
+
+
+    void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
+             annotate_functor annotate, empty_state& data,
+             const std::string& operation_name,
+             json& operation_case) const override
+    {}
+};
+
+
+int main(int argc, char* argv[])
+{
+    std::string header =
+        "A utility that collects additional statistical properties of the "
+        "matrix.\n";
+    std::string format = Generator::get_example_config();
+    initialize_argument_parsing_matrix(&argc, &argv, header, format);
+
+    std::clog << gko::version_info::get() << std::endl;
+
+    auto test_cases = json::parse(get_input_stream());
+    auto exec = gko::ReferenceExecutor::create();
+
+    run_test_cases(MatrixStatistics{}, exec, get_timer(exec, false),
+                   test_cases);
+
+    std::cout << std::setw(4) << test_cases << std::endl;
 }
diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp
index 281e64ddd76..d81dfaa4d5d 100644
--- a/benchmark/preconditioner/preconditioner.cpp
+++ b/benchmark/preconditioner/preconditioner.cpp
@@ -42,10 +42,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "benchmark/utils/formats.hpp"
 #include "benchmark/utils/general.hpp"
+#include "benchmark/utils/general_matrix.hpp"
 #include "benchmark/utils/generator.hpp"
+#include "benchmark/utils/iteration_control.hpp"
 #include "benchmark/utils/loggers.hpp"
 #include "benchmark/utils/preconditioners.hpp"
-#include "benchmark/utils/spmv_validation.hpp"
+#include "benchmark/utils/runner.hpp"
 #include "benchmark/utils/timer.hpp"
 #include "benchmark/utils/types.hpp"
 
@@ -128,34 +130,85 @@ std::string encode_parameters(const char* precond_name)
 }
 
 
-void run_preconditioner(const char* precond_name,
-                        std::shared_ptr<gko::Executor> exec,
-                        std::shared_ptr<const gko::LinOp> system_matrix,
-                        const vec<etype>* b, const vec<etype>* x,
-                        rapidjson::Value& test_case,
-                        rapidjson::MemoryPoolAllocator<>& allocator)
-{
-    try {
-        auto& precond_object = test_case["preconditioner"];
-        auto encoded_name = encode_parameters(precond_name);
+struct preconditioner_benchmark_state {
+    std::unique_ptr<gko::LinOp> x;
+    std::unique_ptr<gko::LinOp> b;
+    std::shared_ptr<const gko::LinOp> system_matrix;
+};
+
+
+using Generator = DefaultSystemGenerator<>;
+
+
+struct PreconditionerBenchmark : Benchmark<preconditioner_benchmark_state> {
+    std::string name;
+    std::vector<std::string> preconditioners;
+    std::map<std::string, std::string> precond_decoder;
 
-        if (!FLAGS_overwrite &&
-            precond_object.HasMember(encoded_name.c_str())) {
-            return;
+    PreconditionerBenchmark()
+        : name{"preconditioner"}, preconditioners{split(FLAGS_preconditioners)}
+    {
+        for (auto precond : split(FLAGS_preconditioners)) {
+            preconditioners.push_back(encode_parameters(precond.c_str()));
+            precond_decoder[preconditioners.back()] = precond;
         }
+    }
+
+    const std::string& get_name() const override { return name; }
+
+    const std::vector<std::string>& get_operations() const override
+    {
+        return preconditioners;
+    }
 
-        add_or_set_member(precond_object, encoded_name.c_str(),
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
-        auto& this_precond_data = precond_object[encoded_name.c_str()];
+    bool should_print() const override { return true; }
+
+    bool validate_config(const json& value) const override
+    {
+        return Generator::validate_config(value);
+    }
+
+    std::string get_example_config() const override
+    {
+        return Generator::get_example_config();
+    }
+
+    std::string describe_config(const json& test_case) const override
+    {
+        return Generator::describe_config(test_case);
+    }
+
+    preconditioner_benchmark_state setup(std::shared_ptr<gko::Executor> exec,
+                                         json& test_case) const override
+    {
+        preconditioner_benchmark_state state;
+        auto data = Generator::generate_matrix_data(test_case);
+        reorder(data, test_case);
+
+        state.system_matrix =
+            formats::matrix_factory(FLAGS_formats, exec, data);
+        state.b = Generator::create_multi_vector_random(exec, data.size[0]);
+        state.x = Generator::create_multi_vector(exec, data.size[0],
+                                                 gko::zero<etype>());
+
+        std::clog << "Matrix is of size (" << data.size[0] << ", "
+                  << data.size[1] << "), " << data.nonzeros.size() << std::endl;
+        test_case["rows"] = data.size[0];
+        test_case["cols"] = data.size[1];
+        test_case["nonzeros"] = data.nonzeros.size();
+        return state;
+    }
 
-        add_or_set_member(this_precond_data, "generate",
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
-        add_or_set_member(this_precond_data, "apply",
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
+
+    void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
+             annotate_functor annotate, preconditioner_benchmark_state& state,
+             const std::string& encoded_precond_name,
+             json& precond_case) const override
+    {
+        auto decoded_precond_name = precond_decoder.at(encoded_precond_name);
         for (auto stage : {"generate", "apply"}) {
-            add_or_set_member(this_precond_data[stage], "components",
-                              rapidjson::Value(rapidjson::kObjectType),
-                              allocator);
+            precond_case[stage] = json::object();
+            precond_case[stage]["components"] = json::object();
         }
 
         IterationControl ic_gen{get_timer(exec, FLAGS_gpu_timer)};
@@ -163,54 +216,57 @@ void run_preconditioner(const char* precond_name,
 
         {
             // fast run, gets total time
-            auto x_clone = clone(x);
-
-            auto precond = precond_factory.at(precond_name)(exec);
+            auto x_clone = clone(state.x);
 
+            auto precond = precond_factory.at(decoded_precond_name)(exec);
 
-            for (auto _ : ic_apply.warmup_run()) {
-                precond->generate(system_matrix)->apply(b, x_clone);
+            {
+                auto range = annotate("warmup", FLAGS_warmup > 0);
+                for (auto _ : ic_apply.warmup_run()) {
+                    precond->generate(state.system_matrix)
+                        ->apply(state.b, x_clone);
+                }
             }
 
             std::unique_ptr<gko::LinOp> precond_op;
             for (auto _ : ic_gen.run()) {
-                precond_op = precond->generate(system_matrix);
+                auto range = annotate("repetition generate");
+                precond_op = precond->generate(state.system_matrix);
             }
 
-            add_or_set_member(this_precond_data["generate"], "time",
-                              ic_gen.compute_time(FLAGS_timer_method),
-                              allocator);
-            add_or_set_member(this_precond_data["generate"], "repetitions",
-                              ic_gen.get_num_repetitions(), allocator);
+            precond_case["generate"]["time"] =
+                ic_gen.compute_time(FLAGS_timer_method);
+            precond_case["generate"]["repetitions"] =
+                ic_gen.get_num_repetitions();
 
             for (auto _ : ic_apply.run()) {
-                precond_op->apply(b, x_clone);
+                auto range = annotate("repetition apply");
+                precond_op->apply(state.b, x_clone);
             }
 
-            add_or_set_member(this_precond_data["apply"], "time",
-                              ic_apply.compute_time(FLAGS_timer_method),
-                              allocator);
-            add_or_set_member(this_precond_data["apply"], "repetitions",
-                              ic_apply.get_num_repetitions(), allocator);
+            precond_case["apply"]["time"] =
+                ic_apply.compute_time(FLAGS_timer_method);
+            precond_case["apply"]["repetitions"] =
+                ic_apply.get_num_repetitions();
         }
 
         if (FLAGS_detailed) {
             // slow run, times each component separately
-            auto x_clone = clone(x);
-            auto precond = precond_factory.at(precond_name)(exec);
+            auto x_clone = clone(state.x);
+            auto precond = precond_factory.at(decoded_precond_name)(exec);
 
             std::unique_ptr<gko::LinOp> precond_op;
             {
                 auto gen_logger = create_operations_logger(
                     FLAGS_gpu_timer, FLAGS_nested_names, exec,
-                    this_precond_data["generate"]["components"], allocator,
+                    precond_case["generate"]["components"],
                     ic_gen.get_num_repetitions());
                 exec->add_logger(gen_logger);
                 if (exec->get_master() != exec) {
                     exec->get_master()->add_logger(gen_logger);
                 }
                 for (auto i = 0u; i < ic_gen.get_num_repetitions(); ++i) {
-                    precond_op = precond->generate(system_matrix);
+                    precond_op = precond->generate(state.system_matrix);
                 }
                 if (exec->get_master() != exec) {
                     exec->get_master()->remove_logger(gen_logger);
@@ -220,38 +276,22 @@ void run_preconditioner(const char* precond_name,
 
             auto apply_logger = create_operations_logger(
                 FLAGS_gpu_timer, FLAGS_nested_names, exec,
-                this_precond_data["apply"]["components"], allocator,
+                precond_case["apply"]["components"],
                 ic_apply.get_num_repetitions());
             exec->add_logger(apply_logger);
             if (exec->get_master() != exec) {
                 exec->get_master()->add_logger(apply_logger);
             }
             for (auto i = 0u; i < ic_apply.get_num_repetitions(); ++i) {
-                precond_op->apply(b, x_clone);
+                precond_op->apply(state.b, x_clone);
             }
             if (exec->get_master() != exec) {
                 exec->get_master()->remove_logger(apply_logger);
             }
             exec->remove_logger(apply_logger);
         }
-
-        add_or_set_member(this_precond_data, "completed", true, allocator);
-    } catch (const std::exception& e) {
-        auto encoded_name = encode_parameters(precond_name);
-        add_or_set_member(test_case["preconditioner"], encoded_name.c_str(),
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
-        add_or_set_member(test_case["preconditioner"][encoded_name.c_str()],
-                          "completed", false, allocator);
-        if (FLAGS_keep_errors) {
-            rapidjson::Value msg_value;
-            msg_value.SetString(e.what(), allocator);
-            add_or_set_member(test_case["preconditioner"][encoded_name.c_str()],
-                              "error", msg_value, allocator);
-        }
-        std::cerr << "Error when processing test case " << test_case << "\n"
-                  << "what(): " << e.what() << std::endl;
     }
-}
+};
 
 
 int main(int argc, char* argv[])
@@ -260,11 +300,11 @@ int main(int argc, char* argv[])
     FLAGS_formats = "csr";
     std::string header =
         "A benchmark for measuring preconditioner performance.\n";
-    std::string format = example_config;
-    initialize_argument_parsing(&argc, &argv, header, format);
+    std::string format = Generator::get_example_config();
+    initialize_argument_parsing_matrix(&argc, &argv, header, format);
 
     std::string extra_information =
-        "Running with preconditioners: " + FLAGS_preconditioners + "\n";
+        "Running with preconditioners: " + FLAGS_preconditioners;
     print_general_information(extra_information);
 
     auto exec = get_executor(FLAGS_gpu_timer);
@@ -278,76 +318,10 @@ int main(int argc, char* argv[])
         std::exit(1);
     }
 
-    rapidjson::IStreamWrapper jcin(get_input_stream());
-    rapidjson::Document test_cases;
-    test_cases.ParseStream(jcin);
-    if (!test_cases.IsArray()) {
-        print_config_error_and_exit();
-    }
+    auto test_cases = json::parse(get_input_stream());
 
-    auto& allocator = test_cases.GetAllocator();
-    auto profiler_hook = create_profiler_hook(exec);
-    if (profiler_hook) {
-        exec->add_logger(profiler_hook);
-    }
-    auto annotate = annotate_functor{profiler_hook};
-    DefaultSystemGenerator<> generator{};
-
-    for (auto& test_case : test_cases.GetArray()) {
-        try {
-            // set up benchmark
-            validate_option_object(test_case);
-            if (!test_case.HasMember("preconditioner")) {
-                test_case.AddMember("preconditioner",
-                                    rapidjson::Value(rapidjson::kObjectType),
-                                    allocator);
-            }
-            auto& precond_object = test_case["preconditioner"];
-            if (!FLAGS_overwrite &&
-                all_of(begin(preconditioners), end(preconditioners),
-                       [&precond_object](const std::string& s) {
-                           return precond_object.HasMember(s.c_str());
-                       })) {
-                continue;
-            }
-            std::clog << "Running test case: " << test_case << std::endl;
-
-            // annotate the test case
-            auto test_case_range =
-                annotate(generator.describe_config(test_case));
-
-            auto data = generator.generate_matrix_data(test_case);
-
-            auto system_matrix =
-                share(formats::matrix_factory(FLAGS_formats, exec, data));
-            auto b = generator.create_multi_vector_random(
-                exec, system_matrix->get_size()[0]);
-            auto x = generator.create_multi_vector(
-                exec, system_matrix->get_size()[0], gko::zero<etype>());
-
-            std::clog << "Matrix is of size (" << system_matrix->get_size()[0]
-                      << ", " << system_matrix->get_size()[1] << ")"
-                      << std::endl;
-            add_or_set_member(test_case, "size", data.size[0], allocator);
-            for (const auto& precond_name : preconditioners) {
-                {
-                    auto precond_range = annotate(precond_name.c_str());
-                    run_preconditioner(precond_name.c_str(), exec,
-                                       system_matrix, b.get(), x.get(),
-                                       test_case, allocator);
-                }
-                std::clog << "Current state:" << std::endl
-                          << test_cases << std::endl;
-                backup_results(test_cases);
-            }
-        } catch (const std::exception& e) {
-            std::cerr << "Error setting up preconditioner, what(): " << e.what()
-                      << std::endl;
-        }
-    }
-    if (profiler_hook) {
-        exec->remove_logger(profiler_hook);
-    }
+    run_test_cases(PreconditionerBenchmark{}, exec,
+                   get_timer(exec, FLAGS_gpu_timer), test_cases);
 
-    std::cout << test_cases << std::endl;
+    std::cout << std::setw(4) << test_cases << std::endl;
 }
diff --git a/benchmark/run_all_benchmarks.sh b/benchmark/run_all_benchmarks.sh
old mode 100644
new mode 100755
diff --git a/benchmark/solver/distributed/CMakeLists.txt b/benchmark/solver/distributed/CMakeLists.txt
index ca6586f1acf..5f6acd5a06c 100644
--- a/benchmark/solver/distributed/CMakeLists.txt
+++ b/benchmark/solver/distributed/CMakeLists.txt
@@ -1 +1 @@
-ginkgo_add_typed_benchmark_executables(solver-distributed "YES" solver.cpp)
+ginkgo_add_typed_benchmark_executables(solver_distributed "YES" solver.cpp)
diff --git a/benchmark/solver/distributed/solver.cpp b/benchmark/solver/distributed/solver.cpp
index 2db71c16ca3..6577c12e52e 100644
--- a/benchmark/solver/distributed/solver.cpp
+++ b/benchmark/solver/distributed/solver.cpp
@@ -39,8 +39,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <set>
 
 
+#define GKO_BENCHMARK_DISTRIBUTED
+
+
 #include "benchmark/solver/solver_common.hpp"
-#include "benchmark/utils/general.hpp"
+#include "benchmark/utils/general_matrix.hpp"
 #include "benchmark/utils/generator.hpp"
 
 
@@ -52,7 +55,7 @@ struct Generator : public DistributedDefaultSystemGenerator<SolverGenerator> {
 
     std::unique_ptr<Vec> generate_rhs(std::shared_ptr<const gko::Executor> exec,
                                       const gko::LinOp* system_matrix,
-                                      rapidjson::Value& config) const
+                                      json& config) const
     {
         return Vec::create(
             exec, comm, gko::dim<2>{system_matrix->get_size()[0], FLAGS_nrhs},
@@ -82,9 +85,13 @@ int main(int argc, char* argv[])
     FLAGS_repetitions = "1";
     FLAGS_min_repetitions = 1;
 
+    const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
+    const auto rank = comm.rank();
+    const auto do_print = rank == 0;
+
     std::string header =
         "A benchmark for measuring Ginkgo's distributed solvers\n";
-    std::string format = example_config + R"(
+    std::string format = solver_example_config + R"(
   The matrix will either be read from an input file if the filename parameter
   is given, or generated as a stencil matrix.
   If the filename parameter is given, all processes will read the file and
@@ -98,10 +105,9 @@ int main(int argc, char* argv[])
   "<local_format>-<non_local_format>", where both "local_format" and
   "non_local_format" can be any of the recognized spmv formats.
 )";
-    initialize_argument_parsing(&argc, &argv, header, format);
-
-    const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
-    const auto rank = comm.rank();
+    std::string additional_json = R"(,"optimal":{"spmv":"csr-csr"})";
+    initialize_argument_parsing_matrix(&argc, &argv, header, format,
+                                       additional_json, do_print);
 
     auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get());
 
@@ -112,8 +118,8 @@ int main(int argc, char* argv[])
         "Running " + FLAGS_solvers + " with " +
         std::to_string(FLAGS_max_iters) + " iterations and residual goal of " +
         ss_rel_res_goal.str() + "\nThe number of right hand sides is " +
-        std::to_string(FLAGS_nrhs) + "\n";
-    if (rank == 0) {
+        std::to_string(FLAGS_nrhs);
+    if (do_print) {
         print_general_information(extra_information);
     }
 
@@ -134,17 +140,12 @@ int main(int argc, char* argv[])
   "optimal": {"spmv": "csr-csr"}]
 )"
                        : broadcast_json_input(get_input_stream(), comm);
-    rapidjson::Document test_cases;
-    test_cases.Parse(json_input.c_str());
-
-    if (!test_cases.IsArray()) {
-        print_config_error_and_exit();
-    }
+    auto test_cases = json::parse(json_input);
 
-    run_solver_benchmarks(exec, get_mpi_timer(exec, comm, FLAGS_gpu_timer),
-                          test_cases, Generator(comm), rank == 0);
+    run_test_cases(SolverBenchmark<Generator>{Generator{comm}}, exec,
+                   get_mpi_timer(exec, comm, FLAGS_gpu_timer), test_cases);
 
-    if (rank == 0) {
-        std::cout << test_cases << std::endl;
+    if (do_print) {
+        std::cout << std::setw(4) << test_cases << std::endl;
     }
 }
diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp
index 9190c99dad0..b656102e5df 100644
--- a/benchmark/solver/solver.cpp
+++ b/benchmark/solver/solver.cpp
@@ -47,7 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "benchmark/solver/solver_common.hpp"
-#include "benchmark/utils/general.hpp"
+#include "benchmark/utils/general_matrix.hpp"
 #include "benchmark/utils/generator.hpp"
 
 
@@ -58,10 +58,12 @@ int main(int argc, char* argv[])
     FLAGS_min_repetitions = 1;
     std::string header =
         "A benchmark for measuring performance of Ginkgo's solvers.\n";
-    std::string format = example_config + R"(
+    std::string format = solver_example_config + R"(
   "optimal":"spmv" can be one of the recognized spmv formats
 )";
-    initialize_argument_parsing(&argc, &argv, header, format);
+    std::string additional_json = R"(,"optimal":{"spmv":"csr"})";
+    initialize_argument_parsing_matrix(&argc, &argv, header, format,
+                                       additional_json);
 
     std::stringstream ss_rel_res_goal;
     ss_rel_res_goal << std::scientific << FLAGS_rel_res_goal;
@@ -70,29 +72,24 @@ int main(int argc, char* argv[])
         "Running " + FLAGS_solvers + " with " +
         std::to_string(FLAGS_max_iters) + " iterations and residual goal of " +
         ss_rel_res_goal.str() + "\nThe number of right hand sides is " +
-        std::to_string(FLAGS_nrhs) + "\n";
+        std::to_string(FLAGS_nrhs);
     print_general_information(extra_information);
 
     auto exec = get_executor(FLAGS_gpu_timer);
 
-    rapidjson::Document test_cases;
+    json test_cases;
     if (!FLAGS_overhead) {
-        rapidjson::IStreamWrapper jcin(get_input_stream());
-        test_cases.ParseStream(jcin);
+        test_cases = json::parse(get_input_stream());
     } else {
         // Fake test case to run once
         auto overhead_json = std::string() +
                              " [{\"filename\": \"overhead.mtx\", \"optimal\": "
                              "{ \"spmv\": \"csr\"}}]";
-        test_cases.Parse(overhead_json.c_str());
+        test_cases = json::parse(overhead_json);
     }
 
-    if (!test_cases.IsArray()) {
-        print_config_error_and_exit();
-    }
-
-    run_solver_benchmarks(exec, get_timer(exec, FLAGS_gpu_timer), test_cases,
-                          SolverGenerator{}, true);
+    run_test_cases(SolverBenchmark<SolverGenerator>{SolverGenerator{}}, exec,
+                   get_timer(exec, FLAGS_gpu_timer), test_cases);
 
-    std::cout << test_cases << std::endl;
+    std::cout << std::setw(4) << test_cases << std::endl;
 }
diff --git a/benchmark/solver/solver_common.hpp b/benchmark/solver/solver_common.hpp
index 64190f8d968..a46cc188c50 100644
--- a/benchmark/solver/solver_common.hpp
+++ b/benchmark/solver/solver_common.hpp
@@ -36,9 +36,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "benchmark/utils/formats.hpp"
 #include "benchmark/utils/general.hpp"
+#include "benchmark/utils/general_matrix.hpp"
 #include "benchmark/utils/generator.hpp"
+#include "benchmark/utils/iteration_control.hpp"
 #include "benchmark/utils/loggers.hpp"
 #include "benchmark/utils/preconditioners.hpp"
+#include "benchmark/utils/runner.hpp"
 
 
 #ifdef GINKGO_BENCHMARK_ENABLE_TUNING
@@ -59,13 +62,13 @@ DEFINE_bool(
     rel_residual, false,
     "Use relative residual instead of residual reduction stopping criterion");
 
-DEFINE_string(
-    solvers, "cg",
-    "A comma-separated list of solvers to run. "
-    "Supported values are: bicgstab, bicg, cb_gmres_keep, "
-    "cb_gmres_reduce1, cb_gmres_reduce2, cb_gmres_integer, "
-    "cb_gmres_ireduce1, cb_gmres_ireduce2, cg, cgs, fcg, gmres, idr, "
-    "lower_trs, upper_trs, spd_direct, symm_direct, direct, overhead");
+DEFINE_string(solvers, "cg",
+              "A comma-separated list of solvers to run. "
+              "Supported values are: bicgstab, bicg, cb_gmres_keep, "
+              "cb_gmres_reduce1, cb_gmres_reduce2, cb_gmres_integer, "
+              "cb_gmres_ireduce1, cb_gmres_ireduce2, cg, cgs, fcg, gmres, idr, "
+              "lower_trs, upper_trs, spd_direct, symm_direct, "
+              "near_symm_direct, direct, overhead");
 
 DEFINE_uint32(
     nrhs, 1,
@@ -107,7 +110,7 @@ DEFINE_bool(overhead, false,
             "If set, uses dummy data to benchmark Ginkgo overhead");
 
 
-std::string example_config = R"(
+std::string solver_example_config = R"(
   [
     {"filename": "my_file.mtx", "optimal": {"spmv": "ell-csr"},
      "rhs": "my_file_rhs.mtx"},
@@ -119,28 +122,6 @@ std::string example_config = R"(
 )";
 
 
-// input validation
-[[noreturn]] void print_config_error_and_exit()
-{
-    std::cerr << "Input has to be a JSON array of solver configurations:\n"
-              << example_config << std::endl;
-    std::exit(1);
-}
-
-
-void validate_option_object(const rapidjson::Value& value)
-{
-    if (!value.IsObject() ||
-        !((value.HasMember("size") && value.HasMember("stencil") &&
-           value["size"].IsInt64() && value["stencil"].IsString()) ||
-          (value.HasMember("filename") && value["filename"].IsString())) ||
-        (!value.HasMember("optimal") && !value["optimal"].HasMember("spmv") &&
-         !value["optimal"]["spmv"].IsString())) {
-        print_config_error_and_exit();
-    }
-}
-
-
 std::shared_ptr<const gko::stop::CriterionFactory> create_criterion(
     std::shared_ptr<const gko::Executor> exec, std::uint32_t max_iters)
 {
@@ -259,21 +240,26 @@ std::unique_ptr<gko::LinOpFactory> generate_solver(
         return gko::experimental::solver::Direct<etype, itype>::build()
             .with_factorization(
                 gko::experimental::factorization::Cholesky<etype,
-                                                           itype>::build()
-                    .on(exec))
+                                                           itype>::build())
             .on(exec);
     } else if (description == "symm_direct") {
         return gko::experimental::solver::Direct<etype, itype>::build()
             .with_factorization(
                 gko::experimental::factorization::Lu<etype, itype>::build()
-                    .with_symmetric_sparsity(true)
-                    .on(exec))
+                    .with_symbolic_algorithm(gko::experimental::factorization::
+                                                 symbolic_type::symmetric))
+            .on(exec);
+    } else if (description == "near_symm_direct") {
+        return gko::experimental::solver::Direct<etype, itype>::build()
+            .with_factorization(
+                gko::experimental::factorization::Lu<etype, itype>::build()
+                    .with_symbolic_algorithm(gko::experimental::factorization::
+                                                 symbolic_type::near_symmetric))
             .on(exec);
     } else if (description == "direct") {
         return gko::experimental::solver::Direct<etype, itype>::build()
             .with_factorization(
-                gko::experimental::factorization::Lu<etype, itype>::build().on(
-                    exec))
+                gko::experimental::factorization::Lu<etype, itype>::build())
             .on(exec);
     } else if (description == "overhead") {
         return add_criteria_precond_finalize<gko::Overhead<etype>>(
@@ -284,21 +270,17 @@ std::unique_ptr<gko::LinOpFactory> generate_solver(
 }
 
 
-void write_precond_info(const gko::LinOp* precond,
-                        rapidjson::Value& precond_info,
-                        rapidjson::MemoryPoolAllocator<>& allocator)
+void write_precond_info(const gko::LinOp* precond, json& precond_info)
 {
     if (const auto jacobi =
             dynamic_cast<const gko::preconditioner::Jacobi<etype>*>(precond)) {
         // extract block sizes
         const auto bdata =
             jacobi->get_parameters().block_pointers.get_const_data();
-        add_or_set_member(precond_info, "block_sizes",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
+        precond_info["block_sizes"] = json::array();
         const auto nblocks = jacobi->get_num_blocks();
         for (auto i = decltype(nblocks){0}; i < nblocks; ++i) {
-            precond_info["block_sizes"].PushBack(bdata[i + 1] - bdata[i],
-                                                 allocator);
+            precond_info["block_sizes"].push_back(bdata[i + 1] - bdata[i]);
         }
 
         // extract block precisions
@@ -306,24 +288,19 @@ void write_precond_info(const gko::LinOp* precond,
             jacobi->get_parameters()
                 .storage_optimization.block_wise.get_const_data();
         if (pdata) {
-            add_or_set_member(precond_info, "block_precisions",
-                              rapidjson::Value(rapidjson::kArrayType),
-                              allocator);
+            precond_info["block_precisions"] = json::array();
             for (auto i = decltype(nblocks){0}; i < nblocks; ++i) {
-                precond_info["block_precisions"].PushBack(
-                    static_cast<int>(pdata[i]), allocator);
+                precond_info["block_precisions"].push_back(
+                    static_cast<int>(pdata[i]));
             }
         }
 
         // extract condition numbers
         const auto cdata = jacobi->get_conditioning();
         if (cdata) {
-            add_or_set_member(precond_info, "block_conditioning",
-                              rapidjson::Value(rapidjson::kArrayType),
-                              allocator);
+            precond_info["block_conditioning"] = json::array();
             for (auto i = decltype(nblocks){0}; i < nblocks; ++i) {
-                precond_info["block_conditioning"].PushBack(cdata[i],
-                                                            allocator);
+                precond_info["block_conditioning"].push_back(cdata[i]);
             }
         }
     }
@@ -335,10 +312,10 @@ struct SolverGenerator : DefaultSystemGenerator<> {
 
     std::unique_ptr<Vec> generate_rhs(std::shared_ptr<const gko::Executor> exec,
                                       const gko::LinOp* system_matrix,
-                                      rapidjson::Value& config) const
+                                      json& config) const
     {
-        if (config.HasMember("rhs")) {
-            std::ifstream rhs_fd{config["rhs"].GetString()};
+        if (config.contains("rhs")) {
+            std::ifstream rhs_fd{config["rhs"].get<std::string>()};
             return gko::read<Vec>(rhs_fd, std::move(exec));
         } else {
             gko::dim<2> vec_size{system_matrix->get_size()[0], FLAGS_nrhs};
@@ -399,70 +376,143 @@ struct SolverGenerator : DefaultSystemGenerator<> {
 };
 
 
-template <typename VectorType>
-void solve_system(const std::string& solver_name,
-                  const std::string& precond_name,
-                  const char* precond_solver_name,
-                  std::shared_ptr<gko::Executor> exec,
-                  std::shared_ptr<Timer> timer,
-                  std::shared_ptr<const gko::LinOp> system_matrix,
-                  const VectorType* b, const VectorType* x,
-                  rapidjson::Value& test_case,
-                  rapidjson::MemoryPoolAllocator<>& allocator)
-{
-    try {
-        auto& solver_case = test_case["solver"];
-        if (!FLAGS_overwrite && solver_case.HasMember(precond_solver_name)) {
-            return;
+template <typename Generator>
+struct solver_benchmark_state {
+    using Vec = typename Generator::Vec;
+    std::shared_ptr<gko::LinOp> system_matrix;
+    std::unique_ptr<Vec> b;
+    std::unique_ptr<Vec> x;
+};
+
+
+template <typename Generator>
+struct SolverBenchmark : Benchmark<solver_benchmark_state<Generator>> {
+    std::string name;
+    std::vector<std::string> precond_solvers;
+    std::map<std::string, std::pair<std::string, std::string>> decoder;
+    Generator generator;
+
+    SolverBenchmark(Generator generator) : name{"solver"}, generator{generator}
+    {
+        auto solvers = split(FLAGS_solvers, ',');
+        auto preconds = split(FLAGS_preconditioners, ',');
+        for (const auto& s : solvers) {
+            for (const auto& p : preconds) {
+                precond_solvers.push_back(s + (p == "none" ? "" : "-" + p));
+                decoder[precond_solvers.back()] = {s, p};
+            }
+        }
+    }
+
+    const std::string& get_name() const override { return name; }
+
+    const std::vector<std::string>& get_operations() const override
+    {
+        return precond_solvers;
+    }
+
+    bool should_print() const override { return true; }
+
+    std::string get_example_config() const override
+    {
+        return solver_example_config;
+    }
+
+    bool validate_config(const json& value) const override
+    {
+        return generator.validate_config(value) &&
+               (value.contains("optimal") &&
+                value["optimal"].contains("spmv") &&
+                value["optimal"]["spmv"].is_string());
+    }
+
+    std::string describe_config(const json& test_case) const override
+    {
+        return Generator::describe_config(test_case);
+    }
+
+    solver_benchmark_state<Generator> setup(std::shared_ptr<gko::Executor> exec,
+                                            json& test_case) const override
+    {
+        solver_benchmark_state<Generator> state;
+
+        if (FLAGS_overhead) {
+            state.system_matrix = generator.initialize({1.0}, exec);
+            state.b = generator.initialize(
+                {std::numeric_limits<rc_etype>::quiet_NaN()}, exec);
+            state.x = generator.initialize({0.0}, exec);
+        } else {
+            auto data = generator.generate_matrix_data(test_case);
+            auto permutation = reorder(data, test_case);
+
+            state.system_matrix = generator.generate_matrix_with_format(
+                exec, test_case["optimal"]["spmv"].get<std::string>(), data);
+            state.b = generator.generate_rhs(exec, state.system_matrix.get(),
+                                             test_case);
+            if (permutation) {
+                permute(state.b, permutation.get());
+            }
+            state.x = generator.generate_initial_guess(
+                exec, state.system_matrix.get(), state.b.get());
         }
 
-        add_or_set_member(solver_case, precond_solver_name,
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
-        auto& solver_json = solver_case[precond_solver_name];
-        add_or_set_member(solver_json, "recurrent_residuals",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
-        add_or_set_member(solver_json, "true_residuals",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
-        add_or_set_member(solver_json, "implicit_residuals",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
-        add_or_set_member(solver_json, "iteration_timestamps",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
-        if (b->get_size()[1] == 1 && !FLAGS_overhead) {
-            auto rhs_norm = compute_norm2(b);
-            add_or_set_member(solver_json, "rhs_norm", rhs_norm, allocator);
+        std::clog << "Matrix is of size (" << state.system_matrix->get_size()[0]
+                  << ", " << state.system_matrix->get_size()[1] << ")"
+                  << std::endl;
+        test_case["rows"] = state.system_matrix->get_size()[0];
+        test_case["cols"] = state.system_matrix->get_size()[1];
+        return state;
+    }
+
+
+    void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
+             annotate_functor annotate,
+             solver_benchmark_state<Generator>& state,
+             const std::string& encoded_solver_name,
+             json& solver_case) const override
+    {
+        const auto decoded_pair = decoder.at(encoded_solver_name);
+        auto& solver_name = decoded_pair.first;
+        auto& precond_name = decoded_pair.second;
+        solver_case["recurrent_residuals"] = json::array();
+        solver_case["true_residuals"] = json::array();
+        solver_case["implicit_residuals"] = json::array();
+        solver_case["iteration_timestamps"] = json::array();
+        if (state.b->get_size()[1] == 1 && !FLAGS_overhead) {
+            auto rhs_norm = compute_norm2(state.b.get());
+            solver_case["rhs_norm"] = rhs_norm;
         }
         for (auto stage : {"generate", "apply"}) {
-            add_or_set_member(solver_json, stage,
-                              rapidjson::Value(rapidjson::kObjectType),
-                              allocator);
-            add_or_set_member(solver_json[stage], "components",
-                              rapidjson::Value(rapidjson::kObjectType),
-                              allocator);
+            solver_case[stage] = json::object();
+            solver_case[stage]["components"] = json::object();
         }
 
         IterationControl ic{timer};
 
         // warm run
         std::shared_ptr<gko::LinOp> solver;
-        for (auto _ : ic.warmup_run()) {
-            auto x_clone = clone(x);
-            auto precond = precond_factory.at(precond_name)(exec);
-            solver = generate_solver(exec, give(precond), solver_name,
-                                     FLAGS_warmup_max_iters)
-                         ->generate(system_matrix);
-            solver->apply(b, x_clone);
-            exec->synchronize();
+        {
+            auto range = annotate("warmup", FLAGS_warmup > 0);
+            for (auto _ : ic.warmup_run()) {
+                auto x_clone = clone(state.x);
+                auto precond = precond_factory.at(precond_name)(exec);
+                solver = generate_solver(exec, give(precond), solver_name,
+                                         FLAGS_warmup_max_iters)
+                             ->generate(state.system_matrix);
+                solver->apply(state.b, x_clone);
+                exec->synchronize();
+            }
         }
 
         // detail run
         if (FLAGS_detailed && !FLAGS_overhead) {
             // slow run, get the time of each functions
-            auto x_clone = clone(x);
+            auto x_clone = clone(state.x);
 
             {
                 auto gen_logger = create_operations_logger(
                     FLAGS_gpu_timer, FLAGS_nested_names, exec,
-                    solver_json["generate"]["components"], allocator, 1);
+                    solver_case["generate"]["components"], 1);
                 exec->add_logger(gen_logger);
                 if (exec != exec->get_master()) {
                     exec->get_master()->add_logger(gen_logger);
@@ -471,7 +521,7 @@ void solve_system(const std::string& solver_name,
                 auto precond = precond_factory.at(precond_name)(exec);
                 solver = generate_solver(exec, give(precond), solver_name,
                                          FLAGS_max_iters)
-                             ->generate(system_matrix);
+                             ->generate(state.system_matrix);
 
                 exec->remove_logger(gen_logger);
                 if (exec != exec->get_master()) {
@@ -481,25 +531,22 @@ void solve_system(const std::string& solver_name,
 
             if (auto prec =
                     dynamic_cast<const gko::Preconditionable*>(solver.get())) {
-                add_or_set_member(solver_json, "preconditioner",
-                                  rapidjson::Value(rapidjson::kObjectType),
-                                  allocator);
+                solver_case["preconditioner"] = json::object();
                 write_precond_info(
                     clone(exec->get_master(), prec->get_preconditioner()).get(),
-                    solver_json["preconditioner"], allocator);
+                    solver_case["preconditioner"]);
             }
 
             {
                 auto apply_logger = create_operations_logger(
                     FLAGS_gpu_timer, FLAGS_nested_names, exec,
-                    solver_json["apply"]["components"], allocator, 1);
+                    solver_case["apply"]["components"], 1);
                 exec->add_logger(apply_logger);
                 if (exec != exec->get_master()) {
                     exec->get_master()->add_logger(apply_logger);
                 }
 
-
-                solver->apply(b, x_clone);
+                solver->apply(state.b, x_clone);
 
                 exec->remove_logger(apply_logger);
                 if (exec != exec->get_master()) {
@@ -508,17 +555,18 @@ void solve_system(const std::string& solver_name,
             }
 
             // slow run, gets the recurrent and true residuals of each iteration
-            if (b->get_size()[1] == 1) {
-                x_clone = clone(x);
+            if (state.b->get_size()[1] == 1) {
+                x_clone = clone(state.x);
                 auto res_logger = std::make_shared<ResidualLogger<etype>>(
-                    system_matrix, b, solver_json["recurrent_residuals"],
-                    solver_json["true_residuals"],
-                    solver_json["implicit_residuals"],
-                    solver_json["iteration_timestamps"], allocator);
+                    state.system_matrix, state.b,
+                    solver_case["recurrent_residuals"],
+                    solver_case["true_residuals"],
+                    solver_case["implicit_residuals"],
+                    solver_case["iteration_timestamps"]);
                 solver->add_logger(res_logger);
-                solver->apply(b, x_clone);
+                solver->apply(state.b, x_clone);
                 if (!res_logger->has_implicit_res_norms()) {
-                    solver_json.RemoveMember("implicit_residuals");
+                    solver_case.erase("implicit_residuals");
                 }
             }
             exec->synchronize();
@@ -528,16 +576,17 @@ void solve_system(const std::string& solver_name,
         auto it_logger = std::make_shared<IterationLogger>();
         auto generate_timer = get_timer(exec, FLAGS_gpu_timer);
         auto apply_timer = ic.get_timer();
-        auto x_clone = clone(x);
+        auto x_clone = clone(state.x);
         for (auto status : ic.run(false)) {
-            x_clone = clone(x);
+            auto range = annotate("repetition");
+            x_clone = clone(state.x);
 
             exec->synchronize();
             generate_timer->tic();
             auto precond = precond_factory.at(precond_name)(exec);
             solver = generate_solver(exec, give(precond), solver_name,
                                      FLAGS_max_iters)
-                         ->generate(system_matrix);
+                         ->generate(state.system_matrix);
             generate_timer->toc();
 
             exec->synchronize();
@@ -545,164 +594,33 @@ void solve_system(const std::string& solver_name,
                 solver->add_logger(it_logger);
             }
             apply_timer->tic();
-            solver->apply(b, x_clone);
+            solver->apply(state.b, x_clone);
             apply_timer->toc();
             if (ic.get_num_repetitions() == 0) {
                 solver->remove_logger(it_logger);
             }
         }
-        it_logger->write_data(solver_json["apply"], allocator);
+        it_logger->write_data(solver_case["apply"]);
 
-        if (b->get_size()[1] == 1 && !FLAGS_overhead) {
+        if (state.b->get_size()[1] == 1 && !FLAGS_overhead) {
             // a solver is considered direct if it didn't log any iterations
-            if (solver_json["apply"].HasMember("iterations") &&
-                solver_json["apply"]["iterations"].GetInt() == 0) {
-                auto error =
-                    compute_direct_error(solver.get(), b, x_clone.get());
-                add_or_set_member(solver_json, "forward_error", error,
-                                  allocator);
-            }
-            auto residual =
-                compute_residual_norm(system_matrix.get(), b, x_clone.get());
-            add_or_set_member(solver_json, "residual_norm", residual,
-                              allocator);
-        }
-        add_or_set_member(solver_json["generate"], "time",
-                          generate_timer->compute_time(FLAGS_timer_method),
-                          allocator);
-        add_or_set_member(solver_json["apply"], "time",
-                          apply_timer->compute_time(FLAGS_timer_method),
-                          allocator);
-        add_or_set_member(solver_json, "repetitions",
-                          apply_timer->get_num_repetitions(), allocator);
-
-        // compute and write benchmark data
-        add_or_set_member(solver_json, "completed", true, allocator);
-    } catch (const std::exception& e) {
-        add_or_set_member(test_case["solver"][precond_solver_name], "completed",
-                          false, allocator);
-        if (FLAGS_keep_errors) {
-            rapidjson::Value msg_value;
-            msg_value.SetString(e.what(), allocator);
-            add_or_set_member(test_case["solver"][precond_solver_name], "error",
-                              msg_value, allocator);
-        }
-        std::cerr << "Error when processing test case " << test_case << "\n"
-                  << "what(): " << e.what() << std::endl;
-    }
-}
-
-
-template <typename SystemGenerator>
-void run_solver_benchmarks(std::shared_ptr<gko::Executor> exec,
-                           std::shared_ptr<Timer> timer,
-                           rapidjson::Document& test_cases,
-                           const SystemGenerator& system_generator,
-                           bool do_print)
-{
-    auto solvers = split(FLAGS_solvers, ',');
-    auto preconds = split(FLAGS_preconditioners, ',');
-    std::vector<std::string> precond_solvers;
-    for (const auto& s : solvers) {
-        for (const auto& p : preconds) {
-            precond_solvers.push_back(s + (p == "none" ? "" : "-" + p));
-        }
-    }
-
-    auto& allocator = test_cases.GetAllocator();
-    auto profiler_hook = create_profiler_hook(exec);
-    if (profiler_hook) {
-        exec->add_logger(profiler_hook);
-    }
-    auto annotate = annotate_functor{profiler_hook};
-
-    for (auto& test_case : test_cases.GetArray()) {
-        try {
-            // set up benchmark
-            validate_option_object(test_case);
-            if (!test_case.HasMember("solver")) {
-                test_case.AddMember("solver",
-                                    rapidjson::Value(rapidjson::kObjectType),
-                                    allocator);
-            }
-            auto& solver_case = test_case["solver"];
-            if (!FLAGS_overwrite &&
-                all_of(begin(precond_solvers), end(precond_solvers),
-                       [&solver_case](const std::string& s) {
-                           return solver_case.HasMember(s.c_str());
-                       })) {
-                continue;
-            }
-            // annotate the test case
-            auto test_case_range =
-                annotate(system_generator.describe_config(test_case));
-
-            if (do_print) {
-                std::clog << "Running test case: " << test_case << std::endl;
-            }
-
-            using Vec = typename SystemGenerator::Vec;
-            std::shared_ptr<gko::LinOp> system_matrix;
-            std::unique_ptr<Vec> b;
-            std::unique_ptr<Vec> x;
-            if (FLAGS_overhead) {
-                system_matrix = system_generator.initialize({1.0}, exec);
-                b = system_generator.initialize(
-                    {std::numeric_limits<rc_etype>::quiet_NaN()}, exec);
-                x = system_generator.initialize({0.0}, exec);
-            } else {
-                system_matrix =
-                    system_generator.generate_matrix_with_optimal_format(
-                        exec, test_case);
-                b = system_generator.generate_rhs(exec, system_matrix.get(),
-                                                  test_case);
-                x = system_generator.generate_initial_guess(
-                    exec, system_matrix.get(), b.get());
-            }
-
-            if (do_print) {
-                std::clog << "Matrix is of size ("
-                          << system_matrix->get_size()[0] << ", "
-                          << system_matrix->get_size()[1] << ")" << std::endl;
-            }
-            add_or_set_member(test_case, "size", system_matrix->get_size()[0],
-                              allocator);
-            auto precond_solver_name = begin(precond_solvers);
-            for (const auto& solver_name : solvers) {
-                auto solver_range = annotate(solver_name.c_str());
-                for (const auto& precond_name : preconds) {
-                    if (do_print) {
-                        std::clog
-                            << "\tRunning solver: " << *precond_solver_name
-                            << std::endl;
-                    }
-                    {
-                        auto precond_range = annotate(precond_name.c_str());
-                        solve_system(solver_name, precond_name,
-                                     precond_solver_name->c_str(), exec, timer,
-                                     system_matrix, b.get(), x.get(), test_case,
-                                     allocator);
-                    }
-                    if (do_print) {
-                        backup_results(test_cases);
-                    }
-                    ++precond_solver_name;
-                }
-            }
-        } catch (const std::exception& e) {
-            std::cerr << "Error setting up solver, what(): " << e.what()
-                      << std::endl;
-            if (FLAGS_keep_errors) {
-                rapidjson::Value msg_value;
-                msg_value.SetString(e.what(), allocator);
-                add_or_set_member(test_case, "error", msg_value, allocator);
+            if (solver_case["apply"].contains("iterations") &&
+                solver_case["apply"]["iterations"].get<gko::int64>() == 0) {
+                auto error = compute_direct_error(solver.get(), state.b.get(),
+                                                  x_clone.get());
+                solver_case["forward_error"] = error;
             }
+            auto residual = compute_residual_norm(state.system_matrix.get(),
+                                                  state.b.get(), x_clone.get());
+            solver_case["residual_norm"] = residual;
         }
+        solver_case["generate"]["time"] =
+            generate_timer->compute_time(FLAGS_timer_method);
+        solver_case["apply"]["time"] =
+            apply_timer->compute_time(FLAGS_timer_method);
+        solver_case["repetitions"] = apply_timer->get_num_repetitions();
     }
-    if (profiler_hook) {
-        exec->remove_logger(profiler_hook);
-    }
-}
+};
 
 
 #endif  // GINKGO_BENCHMARK_SOLVER_SOLVER_COMMON_HPP
diff --git a/benchmark/sparse_blas/operations.cpp b/benchmark/sparse_blas/operations.cpp
index 6a817a67c0d..f8d93f6a2c0 100644
--- a/benchmark/sparse_blas/operations.cpp
+++ b/benchmark/sparse_blas/operations.cpp
@@ -38,7 +38,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "benchmark/sparse_blas/operations.hpp"
-#include "benchmark/utils/json.hpp"
 #include "core/factorization/elimination_forest.hpp"
 #include "core/factorization/symbolic.hpp"
 #include "core/matrix/csr_kernels.hpp"
@@ -632,11 +631,40 @@ class SymbolicLuOperation : public BenchmarkOperation {
 
     void run() override { gko::factorization::symbolic_lu(mtx_, result_); }
 
-    void write_stats(rapidjson::Value& object,
-                     rapidjson::MemoryPoolAllocator<>& allocator) override
+    void write_stats(json& object) override
     {
-        add_or_set_member(object, "factor_nonzeros",
-                          result_->get_num_stored_elements(), allocator);
+        object["factor_nonzeros"] = result_->get_num_stored_elements();
+    }
+
+private:
+    const Mtx* mtx_;
+    std::unique_ptr<Mtx> result_;
+};
+
+
+class SymbolicLuNearSymmOperation : public BenchmarkOperation {
+public:
+    explicit SymbolicLuNearSymmOperation(const Mtx* mtx) : mtx_{mtx}, result_{}
+    {}
+
+    std::pair<bool, double> validate() const override
+    {
+        return std::make_pair(
+            validate_symbolic_factorization(mtx_, result_.get()), 0.0);
+    }
+
+    gko::size_type get_flops() const override { return 0; }
+
+    gko::size_type get_memory() const override { return 0; }
+
+    void run() override
+    {
+        gko::factorization::symbolic_lu_near_symm(mtx_, result_);
+    }
+
+    void write_stats(json& object) override
+    {
+        object["factor_nonzeros"] = result_->get_num_stored_elements();
     }
 
 private:
@@ -680,11 +708,9 @@ class SymbolicCholeskyOperation : public BenchmarkOperation {
                                               forest_);
     }
 
-    void write_stats(rapidjson::Value& object,
-                     rapidjson::MemoryPoolAllocator<>& allocator) override
+    void write_stats(json& object) override
     {
-        add_or_set_member(object, "factor_nonzeros",
-                          result_->get_num_stored_elements(), allocator);
+        object["factor_nonzeros"] = result_->get_num_stored_elements();
     }
 
 private:
@@ -695,6 +721,106 @@ class SymbolicCholeskyOperation : public BenchmarkOperation {
 };
 
 
+class ReorderRcmOperation : public BenchmarkOperation {
+    using reorder_type = gko::experimental::reorder::Rcm<itype>;
+    using permute_type = gko::matrix::Permutation<itype>;
+
+public:
+    explicit ReorderRcmOperation(const Mtx* mtx)
+        : mtx_{mtx->clone()},
+          factory_{reorder_type::build().on(mtx->get_executor())}
+    {}
+
+    std::pair<bool, double> validate() const override
+    {
+        // validating RCM correctness is hard, let's leave it out for now
+        return {true, 0.0};
+    }
+
+    gko::size_type get_flops() const override { return 0; }
+
+    gko::size_type get_memory() const override { return 0; }
+
+    void prepare() override {}
+
+    void run() override { reorder_ = factory_->generate(mtx_); }
+
+private:
+    std::shared_ptr<Mtx> mtx_;
+    std::unique_ptr<reorder_type> factory_;
+    std::unique_ptr<permute_type> reorder_;
+};
+
+
+#if GKO_HAVE_METIS
+
+
+class ReorderNestedDissectionOperation : public BenchmarkOperation {
+    using factory_type =
+        gko::experimental::reorder::NestedDissection<etype, itype>;
+    using reorder_type = gko::matrix::Permutation<itype>;
+
+public:
+    explicit ReorderNestedDissectionOperation(const Mtx* mtx)
+        : mtx_{mtx->clone()},
+          factory_{factory_type::build().on(mtx->get_executor())}
+    {}
+
+    std::pair<bool, double> validate() const override
+    {
+        // validating ND correctness is hard, let's leave it out for now
+        return {true, 0.0};
+    }
+
+    gko::size_type get_flops() const override { return 0; }
+
+    gko::size_type get_memory() const override { return 0; }
+
+    void prepare() override {}
+
+    void run() override { reorder_ = factory_->generate(mtx_); }
+
+private:
+    std::shared_ptr<Mtx> mtx_;
+    std::unique_ptr<factory_type> factory_;
+    std::unique_ptr<reorder_type> reorder_;
+};
+
+
+#endif
+
+
+class ReorderApproxMinDegOperation : public BenchmarkOperation {
+    using factory_type = gko::experimental::reorder::Amd<itype>;
+    using reorder_type = gko::matrix::Permutation<itype>;
+
+public:
+    explicit ReorderApproxMinDegOperation(const Mtx* mtx)
+        : mtx_{mtx->clone()},
+          factory_{factory_type::build().on(mtx->get_executor())}
+    {}
+
+    std::pair<bool, double> validate() const override
+    {
+        // validating AMD correctness is hard, let's leave it out for now
+        return {true, 0.0};
+    }
+
+    gko::size_type get_flops() const override { return 0; }
+
+    gko::size_type get_memory() const override { return 0; }
+
+    void prepare() override {}
+
+    void run() override { reorder_ = factory_->generate(mtx_); }
+
+private:
+    std::shared_ptr<Mtx> mtx_;
+    std::unique_ptr<factory_type> factory_;
+    std::unique_ptr<reorder_type> reorder_;
+};
+
+
 const std::map<std::string,
                std::function<std::unique_ptr<BenchmarkOperation>(const Mtx*)>>
     operation_map{
@@ -722,12 +848,33 @@ const std::map<std::string,
          [](const Mtx* mtx) {
              return std::make_unique<SymbolicLuOperation>(mtx);
          }},
+        {"symbolic_lu_near_symm",
+         [](const Mtx* mtx) {
+             return std::make_unique<SymbolicLuNearSymmOperation>(mtx);
+         }},
         {"symbolic_cholesky",
          [](const Mtx* mtx) {
              return std::make_unique<SymbolicCholeskyOperation>(mtx, false);
          }},
-        {"symbolic_cholesky_symmetric", [](const Mtx* mtx) {
+        {"symbolic_cholesky_symmetric",
+         [](const Mtx* mtx) {
              return std::make_unique<SymbolicCholeskyOperation>(mtx, true);
+         }},
+        {"reorder_rcm",
+         [](const Mtx* mtx) {
+             return std::make_unique<ReorderRcmOperation>(mtx);
+         }},
+        {"reorder_amd",
+         [](const Mtx* mtx) {
+             return std::make_unique<ReorderApproxMinDegOperation>(mtx);
+         }},
+        {"reorder_nd",
+         [](const Mtx* mtx) -> std::unique_ptr<BenchmarkOperation> {
+#if GKO_HAVE_METIS
+             return std::make_unique<ReorderNestedDissectionOperation>(mtx);
+#else
+             GKO_NOT_COMPILED(METIS);
+#endif
          }}};
 
 
diff --git a/benchmark/sparse_blas/operations.hpp b/benchmark/sparse_blas/operations.hpp
index 99cf72b8e59..48034eb8a1f 100644
--- a/benchmark/sparse_blas/operations.hpp
+++ b/benchmark/sparse_blas/operations.hpp
@@ -36,9 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <tuple>
 
 
-#include <rapidjson/document.h>
-
-
+#include "benchmark/utils/json.hpp"
 #include "benchmark/utils/types.hpp"
 
 
@@ -79,9 +77,7 @@ class BenchmarkOperation {
     /**
      * Allows the operation to write arbitrary information to the JSON output.
      */
-    virtual void write_stats(rapidjson::Value& object,
-                             rapidjson::MemoryPoolAllocator<>& allocator)
-    {}
+    virtual void write_stats(json& object) {}
 };
 
 
diff --git a/benchmark/sparse_blas/sparse_blas.cpp b/benchmark/sparse_blas/sparse_blas.cpp
index 4fb06d2a4a0..d1dc67f8d2d 100644
--- a/benchmark/sparse_blas/sparse_blas.cpp
+++ b/benchmark/sparse_blas/sparse_blas.cpp
@@ -45,9 +45,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "benchmark/sparse_blas/operations.hpp"
-#include "benchmark/utils/general.hpp"
+#include "benchmark/utils/general_matrix.hpp"
 #include "benchmark/utils/generator.hpp"
-#include "benchmark/utils/spmv_validation.hpp"
+#include "benchmark/utils/iteration_control.hpp"
+#include "benchmark/utils/runner.hpp"
 #include "benchmark/utils/types.hpp"
 #include "core/test/utils/matrix_generator.hpp"
 
@@ -57,92 +58,129 @@ const auto benchmark_name = "sparse_blas";
 
 using mat_data = gko::matrix_data<etype, itype>;
 
-DEFINE_string(
-    operations, "spgemm,spgeam,transpose",
+const char* operations_string =
     "Comma-separated list of operations to be benchmarked. Can be "
     "spgemm, spgeam, transpose, sort, is_sorted, generate_lookup, "
-    "lookup, symbolic_lu, symbolic_cholesky, symbolic_cholesky_symmetric");
+    "lookup, symbolic_lu, symbolic_lu_near_symm, symbolic_cholesky, "
+    "symbolic_cholesky_symmetric, reorder_rcm, "
+#if GKO_HAVE_METIS
+    "reorder_nd, "
+#endif
+    "reorder_amd";
+
+DEFINE_string(operations, "spgemm,spgeam,transpose", operations_string);
 
 DEFINE_bool(validate, false,
             "Check for correct sparsity pattern and compute the L2 norm "
             "against the ReferenceExecutor solution.");
 
 
-void apply_sparse_blas(const char* operation_name,
-                       std::shared_ptr<gko::Executor> exec, const Mtx* mtx,
-                       rapidjson::Value& test_case,
-                       rapidjson::MemoryPoolAllocator<>& allocator)
-{
-    try {
-        add_or_set_member(test_case, operation_name,
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
+using Generator = DefaultSystemGenerator<>;
+
+
+struct SparseBlasBenchmark : Benchmark<std::unique_ptr<Mtx>> {
+    std::string name;
+    std::vector<std::string> operations;
+
+    SparseBlasBenchmark()
+        : name{"sparse_blas"}, operations{split(FLAGS_operations)}
+    {}
+
+    const std::string& get_name() const override { return name; }
+
+    const std::vector<std::string>& get_operations() const override
+    {
+        return operations;
+    }
+
+    bool should_print() const override { return true; }
+
+    bool validate_config(const json& value) const override
+    {
+        return Generator::validate_config(value);
+    }
+
+    std::string get_example_config() const override
+    {
+        return Generator::get_example_config();
+    }
+
+    std::string describe_config(const json& test_case) const override
+    {
+        return Generator::describe_config(test_case);
+    }
+
+    std::unique_ptr<Mtx> setup(std::shared_ptr<gko::Executor> exec,
+                               json& test_case) const override
+    {
+        auto data = Generator::generate_matrix_data(test_case);
+        reorder(data, test_case);
+        std::clog << "Matrix is of size (" << data.size[0] << ", "
+                  << data.size[1] << "), " << data.nonzeros.size() << std::endl;
+        test_case["rows"] = data.size[0];
+        test_case["cols"] = data.size[1];
+        test_case["nonzeros"] = data.nonzeros.size();
+
+        auto mtx = Mtx::create(exec, data.size, data.nonzeros.size());
+        mtx->read(data);
+        return mtx;
+    }
+
 
-        auto op = get_operation(operation_name, mtx);
+    void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
+             annotate_functor annotate, std::unique_ptr<Mtx>& mtx,
+             const std::string& operation_name,
+             json& operation_case) const override
+    {
+        auto op = get_operation(operation_name, mtx.get());
 
-        auto timer = get_timer(exec, FLAGS_gpu_timer);
         IterationControl ic(timer);
 
         // warm run
-        for (auto _ : ic.warmup_run()) {
-            op->prepare();
-            exec->synchronize();
-            op->run();
-            exec->synchronize();
+        {
+            auto range = annotate("warmup", FLAGS_warmup > 0);
+            for (auto _ : ic.warmup_run()) {
+                op->prepare();
+                exec->synchronize();
+                op->run();
+                exec->synchronize();
+            }
         }
 
         // timed run
         op->prepare();
         for (auto _ : ic.run()) {
+            auto range = annotate("repetition");
             op->run();
         }
         const auto runtime = ic.compute_time(FLAGS_timer_method);
         const auto flops = static_cast<double>(op->get_flops());
         const auto mem = static_cast<double>(op->get_memory());
         const auto repetitions = ic.get_num_repetitions();
-        add_or_set_member(test_case[operation_name], "time", runtime,
-                          allocator);
-        add_or_set_member(test_case[operation_name], "flops", flops / runtime,
-                          allocator);
-        add_or_set_member(test_case[operation_name], "bandwidth", mem / runtime,
-                          allocator);
-        add_or_set_member(test_case[operation_name], "repetitions", repetitions,
-                          allocator);
+        operation_case["time"] = runtime;
+        operation_case["flops"] = flops / runtime;
+        operation_case["bandwidth"] = mem / runtime;
+        operation_case["repetitions"] = repetitions;
 
         if (FLAGS_validate) {
             auto validation_result = op->validate();
-            add_or_set_member(test_case[operation_name], "correct",
-                              validation_result.first, allocator);
-            add_or_set_member(test_case[operation_name], "error",
-                              validation_result.second, allocator);
+            operation_case["correct"] = validation_result.first;
+            operation_case["error"] = validation_result.second;
         }
         if (FLAGS_detailed) {
-            add_or_set_member(test_case[operation_name], "components",
-                              rapidjson::Value(rapidjson::kObjectType),
-                              allocator);
+            operation_case["components"] = json::object();
             auto gen_logger = create_operations_logger(
                 FLAGS_gpu_timer, FLAGS_nested_names, exec,
-                test_case[operation_name]["components"], allocator, 1);
+                operation_case["components"], repetitions);
             exec->add_logger(gen_logger);
-            op->run();
+            for (unsigned i = 0; i < repetitions; i++) {
+                op->run();
+            }
             exec->remove_logger(gen_logger);
         }
-        op->write_stats(test_case[operation_name], allocator);
-
-        add_or_set_member(test_case[operation_name], "completed", true,
-                          allocator);
-    } catch (const std::exception& e) {
-        add_or_set_member(test_case[operation_name], "completed", false,
-                          allocator);
-        if (FLAGS_keep_errors) {
-            rapidjson::Value msg_value;
-            msg_value.SetString(e.what(), allocator);
-            add_or_set_member(test_case[operation_name], "error", msg_value,
-                              allocator);
-        }
-        std::cerr << "Error when processing test case " << test_case << "\n"
-                  << "what(): " << e.what() << std::endl;
+        op->write_stats(operation_case);
     }
-}
+};
 
 
 int main(int argc, char* argv[])
@@ -150,86 +188,18 @@ int main(int argc, char* argv[])
     std::string header =
         "A benchmark for measuring performance of Ginkgo's sparse BLAS "
         "operations.\n";
-    std::string format = example_config;
-    initialize_argument_parsing(&argc, &argv, header, format);
+    std::string format = Generator::get_example_config();
+    initialize_argument_parsing_matrix(&argc, &argv, header, format);
 
     auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
 
-    rapidjson::IStreamWrapper jcin(get_input_stream());
-    rapidjson::Document test_cases;
-    test_cases.ParseStream(jcin);
-    if (!test_cases.IsArray()) {
-        print_config_error_and_exit();
-    }
+    auto test_cases = json::parse(get_input_stream());
 
     std::string extra_information = "The operations are " + FLAGS_operations;
     print_general_information(extra_information);
 
-    auto& allocator = test_cases.GetAllocator();
-    auto profiler_hook = create_profiler_hook(exec);
-    if (profiler_hook) {
-        exec->add_logger(profiler_hook);
-    }
-    auto annotate = annotate_functor{profiler_hook};
-
-    auto operations = split(FLAGS_operations, ',');
-
-    DefaultSystemGenerator<> generator{};
-
-    for (auto& test_case : test_cases.GetArray()) {
-        try {
-            // set up benchmark
-            validate_option_object(test_case);
-            if (!test_case.HasMember(benchmark_name)) {
-                test_case.AddMember(rapidjson::Value(benchmark_name, allocator),
-                                    rapidjson::Value(rapidjson::kObjectType),
-                                    allocator);
-            }
-            auto& sp_blas_case = test_case[benchmark_name];
-            std::clog << "Running test case: " << test_case << std::endl;
-            auto data = generator.generate_matrix_data(test_case);
-            data.ensure_row_major_order();
-            std::clog << "Matrix is of size (" << data.size[0] << ", "
-                      << data.size[1] << "), " << data.nonzeros.size()
-                      << std::endl;
-            add_or_set_member(test_case, "rows", data.size[0], allocator);
-            add_or_set_member(test_case, "cols", data.size[1], allocator);
-            add_or_set_member(test_case, "nonzeros", data.nonzeros.size(),
-                              allocator);
-
-            auto mtx = Mtx::create(exec, data.size, data.nonzeros.size());
-            mtx->read(data);
-            // annotate the test case
-            auto test_case_range =
-                annotate(generator.describe_config(test_case));
-            for (const auto& operation_name : operations) {
-                if (FLAGS_overwrite ||
-                    !sp_blas_case.HasMember(operation_name.c_str())) {
-                    {
-                        auto operation_range = annotate(operation_name.c_str());
-                        apply_sparse_blas(operation_name.c_str(), exec,
-                                          mtx.get(), sp_blas_case, allocator);
-                    }
-                    std::clog << "Current state:" << std::endl
-                              << test_cases << std::endl;
-                    backup_results(test_cases);
-                }
-            }
-            // write the output if we have no strategies
-            backup_results(test_cases);
-        } catch (const std::exception& e) {
-            std::cerr << "Error setting up matrix data, what(): " << e.what()
-                      << std::endl;
-            if (FLAGS_keep_errors) {
-                rapidjson::Value msg_value;
-                msg_value.SetString(e.what(), allocator);
-                add_or_set_member(test_case, "error", msg_value, allocator);
-            }
-        }
-    }
-    if (profiler_hook) {
-        exec->remove_logger(profiler_hook);
-    }
+    run_test_cases(SparseBlasBenchmark{}, exec,
+                   get_timer(exec, FLAGS_gpu_timer), test_cases);
 
-    std::cout << test_cases << std::endl;
+    std::cout << std::setw(4) << test_cases << std::endl;
 }
diff --git a/benchmark/spmv/distributed/CMakeLists.txt b/benchmark/spmv/distributed/CMakeLists.txt
index cadde3eea34..4322dd70e90 100644
--- a/benchmark/spmv/distributed/CMakeLists.txt
+++ b/benchmark/spmv/distributed/CMakeLists.txt
@@ -1 +1 @@
-ginkgo_add_typed_benchmark_executables(spmv-distributed "YES" spmv.cpp)
+ginkgo_add_typed_benchmark_executables(spmv_distributed "YES" spmv.cpp)
diff --git a/benchmark/spmv/distributed/spmv.cpp b/benchmark/spmv/distributed/spmv.cpp
index 3c2986846b3..d3925dabcf2 100644
--- a/benchmark/spmv/distributed/spmv.cpp
+++ b/benchmark/spmv/distributed/spmv.cpp
@@ -43,8 +43,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <typeinfo>
 
 
+#define GKO_BENCHMARK_DISTRIBUTED
+
+
 #include "benchmark/spmv/spmv_common.hpp"
-#include "benchmark/utils/general.hpp"
+#include "benchmark/utils/general_matrix.hpp"
 #include "benchmark/utils/generator.hpp"
 #include "benchmark/utils/timer.hpp"
 #include "benchmark/utils/types.hpp"
@@ -58,38 +61,7 @@ DEFINE_string(non_local_formats, "csr",
               "run. See the 'formats' option for a list of supported versions");
 
 
-std::string example_config = R"(
-  [
-    {"size": 100, "stencil": "7pt", "comm_pattern": "stencil"},
-    {"filename": "my_file.mtx"}
-  ]
-)";
-
-
-[[noreturn]] void print_config_error_and_exit()
-{
-    std::cerr << "Input has to be a JSON array of matrix configurations:\n"
-              << example_config << std::endl;
-    std::exit(1);
-}
-
-
-struct Generator : DistributedDefaultSystemGenerator<DefaultSystemGenerator<>> {
-    Generator(gko::experimental::mpi::communicator comm)
-        : DistributedDefaultSystemGenerator<DefaultSystemGenerator<>>{
-              std::move(comm), {}}
-    {}
-
-    void validate_options(const rapidjson::Value& options) const
-    {
-        if (!options.IsObject() ||
-            !((options.HasMember("size") && options.HasMember("stencil") &&
-               options.HasMember("comm_pattern")) ||
-              options.HasMember("filename"))) {
-            print_config_error_and_exit();
-        }
-    }
-};
+using Generator = DistributedDefaultSystemGenerator<DefaultSystemGenerator<>>;
 
 
 int main(int argc, char* argv[])
@@ -98,18 +70,19 @@ int main(int argc, char* argv[])
 
     const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
     const auto rank = comm.rank();
+    const auto do_print = rank == 0;
 
     std::string header =
         "A benchmark for measuring performance of Ginkgo's spmv.\n";
-    std::string format = example_config;
-    initialize_argument_parsing(&argc, &argv, header, format);
-
-    if (rank == 0) {
-        std::string extra_information = "The formats are [" +
-                                        FLAGS_local_formats + "]x[" +
-                                        FLAGS_non_local_formats + "]\n" +
-                                        "The number of right hand sides is " +
-                                        std::to_string(FLAGS_nrhs) + "\n";
+    std::string format = Generator::get_example_config();
+    initialize_argument_parsing_matrix(&argc, &argv, header, format, "",
+                                       do_print);
+
+    if (do_print) {
+        std::string extra_information =
+            "The formats are [" + FLAGS_local_formats + "]x[" +
+            FLAGS_non_local_formats + "]\n" +
+            "The number of right hand sides is " + std::to_string(FLAGS_nrhs);
         print_general_information(extra_information);
     }
 
@@ -125,16 +98,13 @@ int main(int argc, char* argv[])
     }
 
     std::string json_input = broadcast_json_input(get_input_stream(), comm);
-    rapidjson::Document test_cases;
-    test_cases.Parse(json_input.c_str());
-    if (!test_cases.IsArray()) {
-        print_config_error_and_exit();
-    }
+    auto test_cases = json::parse(json_input);
 
-    run_spmv_benchmark(exec, test_cases, formats, Generator{comm},
-                       get_mpi_timer(exec, comm, FLAGS_gpu_timer), rank == 0);
+    run_test_cases(SpmvBenchmark<Generator>{Generator{comm}, formats, do_print},
+                   exec, get_mpi_timer(exec, comm, FLAGS_gpu_timer),
+                   test_cases);
 
-    if (rank == 0) {
-        std::cout << test_cases << std::endl;
+    if (do_print) {
+        std::cout << std::setw(4) << test_cases << std::endl;
     }
 }
diff --git a/benchmark/spmv/spmv.cpp b/benchmark/spmv/spmv.cpp
index df000cecd47..abd1b783019 100644
--- a/benchmark/spmv/spmv.cpp
+++ b/benchmark/spmv/spmv.cpp
@@ -39,50 +39,31 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "benchmark/spmv/spmv_common.hpp"
 #include "benchmark/utils/formats.hpp"
-#include "benchmark/utils/general.hpp"
+#include "benchmark/utils/general_matrix.hpp"
 #include "benchmark/utils/generator.hpp"
-#include "benchmark/utils/spmv_validation.hpp"
 
 
-struct Generator : DefaultSystemGenerator<> {
-    void validate_options(const rapidjson::Value& options) const
-    {
-        if (!options.IsObject() ||
-            !((options.HasMember("size") && options.HasMember("stencil")) ||
-              options.HasMember("filename"))) {
-            std::cerr
-                << "Input has to be a JSON array of matrix configurations:\n"
-                << example_config << std::endl;
-            std::exit(1);
-        }
-    }
-};
+using Generator = DefaultSystemGenerator<>;
 
 
 int main(int argc, char* argv[])
 {
     std::string header =
         "A benchmark for measuring performance of Ginkgo's spmv.\n";
-    std::string format = example_config;
-    initialize_argument_parsing(&argc, &argv, header, format);
+    std::string format = Generator::get_example_config();
+    initialize_argument_parsing_matrix(&argc, &argv, header, format);
 
     std::string extra_information = "The formats are " + FLAGS_formats +
                                     "\nThe number of right hand sides is " +
-                                    std::to_string(FLAGS_nrhs) + "\n";
+                                    std::to_string(FLAGS_nrhs);
     print_general_information(extra_information);
 
     auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
-    auto formats = split(FLAGS_formats, ',');
 
-    rapidjson::IStreamWrapper jcin(get_input_stream());
-    rapidjson::Document test_cases;
-    test_cases.ParseStream(jcin);
-    if (!test_cases.IsArray()) {
-        print_config_error_and_exit();
-    }
+    auto test_cases = json::parse(get_input_stream());
 
-    run_spmv_benchmark(exec, test_cases, formats, Generator{},
-                       get_timer(exec, FLAGS_gpu_timer), true);
+    run_test_cases(SpmvBenchmark<Generator>{Generator{}, split(FLAGS_formats)},
+                   exec, get_timer(exec, FLAGS_gpu_timer), test_cases);
 
-    std::cout << test_cases << std::endl;
+    std::cout << std::setw(4) << test_cases << std::endl;
 }
diff --git a/benchmark/spmv/spmv_common.hpp b/benchmark/spmv/spmv_common.hpp
index 4c40f1b9a7b..1d43e3ed327 100644
--- a/benchmark/spmv/spmv_common.hpp
+++ b/benchmark/spmv/spmv_common.hpp
@@ -36,7 +36,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "benchmark/utils/formats.hpp"
 #include "benchmark/utils/general.hpp"
+#include "benchmark/utils/general_matrix.hpp"
+#include "benchmark/utils/iteration_control.hpp"
 #include "benchmark/utils/loggers.hpp"
+#include "benchmark/utils/runner.hpp"
 #include "benchmark/utils/timer.hpp"
 #include "benchmark/utils/types.hpp"
 #ifdef GINKGO_BENCHMARK_ENABLE_TUNING
@@ -48,57 +51,123 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 DEFINE_uint32(nrhs, 1, "The number of right hand sides");
 
 
-// This function supposes that management of `FLAGS_overwrite` is done before
-// calling it
-template <typename Generator, typename VectorType, typename IndexType>
-void apply_spmv(const char* format_name, std::shared_ptr<gko::Executor> exec,
-                const Generator& generator, std::shared_ptr<Timer> timer,
-                const gko::matrix_data<etype, IndexType>& data,
-                const VectorType* b, const VectorType* x,
-                const VectorType* answer, rapidjson::Value& test_case,
-                rapidjson::MemoryPoolAllocator<>& allocator)
-{
-    try {
-        auto& spmv_case = test_case["spmv"];
-        add_or_set_member(spmv_case, format_name,
-                          rapidjson::Value(rapidjson::kObjectType), allocator);
+template <typename Generator>
+struct spmv_benchmark_state {
+    gko::matrix_data<etype, typename Generator::index_type> data;
+    std::unique_ptr<typename Generator::Vec> x;
+    std::unique_ptr<typename Generator::Vec> b;
+    std::unique_ptr<typename Generator::Vec> answer;
+};
+
+
+template <typename Generator>
+struct SpmvBenchmark : Benchmark<spmv_benchmark_state<Generator>> {
+    using Vec = typename Generator::Vec;
+    std::string name;
+    std::vector<std::string> formats;
+    bool do_print;
+    Generator generator;
+
+    SpmvBenchmark(Generator generator, std::vector<std::string> formats,
+                  bool do_print = true)
+        : name{"spmv"},
+          formats{std::move(formats)},
+          generator{generator},
+          do_print{do_print}
+    {}
+
+    const std::string& get_name() const override { return name; }
+
+    const std::vector<std::string>& get_operations() const override
+    {
+        return formats;
+    }
+
+    bool should_print() const override { return do_print; }
+
+    std::string get_example_config() const override
+    {
+        return generator.get_example_config();
+    }
+
+    bool validate_config(const json& test_case) const override
+    {
+        return generator.validate_config(test_case);
+    }
+
+    std::string describe_config(const json& test_case) const override
+    {
+        return generator.describe_config(test_case);
+    }
+
+    spmv_benchmark_state<Generator> setup(std::shared_ptr<gko::Executor> exec,
+                                          json& test_case) const override
+    {
+        spmv_benchmark_state<Generator> state;
+        state.data = generator.generate_matrix_data(test_case);
+        reorder(state.data, test_case);
+
+        auto nrhs = FLAGS_nrhs;
+        state.b = generator.create_multi_vector_random(
+            exec, gko::dim<2>{state.data.size[1], nrhs});
+        state.x = generator.create_multi_vector_random(
+            exec, gko::dim<2>{state.data.size[0], nrhs});
+        if (do_print) {
+            std::clog << "Matrix is of size (" << state.data.size[0] << ", "
+                      << state.data.size[1] << "), "
+                      << state.data.nonzeros.size() << std::endl;
+        }
+        test_case["rows"] = state.data.size[0];
+        test_case["cols"] = state.data.size[1];
+        test_case["nonzeros"] = state.data.nonzeros.size();
+        if (FLAGS_detailed) {
+            state.answer = gko::clone(state.x);
+            auto system_matrix =
+                generator.generate_matrix_with_default_format(exec, state.data);
+            exec->synchronize();
+            system_matrix->apply(state.b, state.answer);
+            exec->synchronize();
+        }
+        return state;
+    }
 
+    void run(std::shared_ptr<gko::Executor> exec, std::shared_ptr<Timer> timer,
+             annotate_functor annotate, spmv_benchmark_state<Generator>& state,
+             const std::string& format_name, json& format_case) const override
+    {
         auto system_matrix = generator.generate_matrix_with_format(
-            exec, format_name, data, &spmv_case[format_name], &allocator);
+            exec, format_name, state.data, &format_case);
 
         // check the residual
         if (FLAGS_detailed) {
-            auto x_clone = clone(x);
+            auto x_clone = clone(state.x);
             exec->synchronize();
-            system_matrix->apply(b, x_clone);
+            system_matrix->apply(state.b, x_clone);
             exec->synchronize();
             auto max_relative_norm2 =
-                compute_max_relative_norm2(x_clone.get(), answer);
-            add_or_set_member(spmv_case[format_name], "max_relative_norm2",
-                              max_relative_norm2, allocator);
+                compute_max_relative_norm2(x_clone.get(), state.answer.get());
+            format_case["max_relative_norm2"] = max_relative_norm2;
         }
 
         IterationControl ic{timer};
         // warm run
-        for (auto _ : ic.warmup_run()) {
-            auto x_clone = clone(x);
-            exec->synchronize();
-            system_matrix->apply(b, x_clone);
-            exec->synchronize();
+        {
+            auto range = annotate("warmup", FLAGS_warmup > 0);
+            for (auto _ : ic.warmup_run()) {
+                auto x_clone = clone(state.x);
+                exec->synchronize();
+                system_matrix->apply(state.b, x_clone);
+                exec->synchronize();
+            }
         }
 
         // tuning run
 #ifdef GINKGO_BENCHMARK_ENABLE_TUNING
         auto& format_case = spmv_case[format_name];
-        if (!format_case.HasMember("tuning")) {
-            format_case.AddMember(
-                "tuning", rapidjson::Value(rapidjson::kObjectType), allocator);
-        }
+        format_case["tuning"] = json::object();
         auto& tuning_case = format_case["tuning"];
-        add_or_set_member(tuning_case, "time",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
-        add_or_set_member(tuning_case, "values",
-                          rapidjson::Value(rapidjson::kArrayType), allocator);
+        tuning_case["time"] = json::array();
+        tuning_case["values"] = json::array();
 
         // Enable tuning for this portion of code
         gko::_tuning_flag = true;
@@ -112,13 +181,13 @@ void apply_spmv(const char* format_name, std::shared_ptr<gko::Executor> exec,
             gko::_tuned_value = val;
             auto tuning_timer = get_timer(exec, FLAGS_gpu_timer);
             IterationControl ic_tuning{tuning_timer};
-            auto x_clone = clone(x);
+            auto x_clone = clone(state.x);
             for (auto _ : ic_tuning.run()) {
-                system_matrix->apply(b, x_clone);
+                system_matrix->apply(state.b, x_clone);
             }
-            tuning_case["time"].PushBack(
-                ic_tuning.compute_time(FLAGS_timer_method), allocator);
-            tuning_case["values"].PushBack(val, allocator);
+            tuning_case["time"].push_back(
+                ic_tuning.compute_time(FLAGS_timer_method));
+            tuning_case["values"].push_back(val);
         }
         // We put back the flag to false to use the default (non-tuned) values
         // for the following
@@ -126,141 +195,39 @@ void apply_spmv(const char* format_name, std::shared_ptr<gko::Executor> exec,
 #endif  // GINKGO_BENCHMARK_ENABLE_TUNING
 
         // timed run
-        auto x_clone = clone(x);
+        auto x_clone = clone(state.x);
         for (auto _ : ic.run()) {
-            system_matrix->apply(b, x_clone);
-        }
-        add_or_set_member(spmv_case[format_name], "time",
-                          ic.compute_time(FLAGS_timer_method), allocator);
-        add_or_set_member(spmv_case[format_name], "repetitions",
-                          ic.get_num_repetitions(), allocator);
-
-        // compute and write benchmark data
-        add_or_set_member(spmv_case[format_name], "completed", true, allocator);
-    } catch (const std::exception& e) {
-        add_or_set_member(test_case["spmv"][format_name], "completed", false,
-                          allocator);
-        if (FLAGS_keep_errors) {
-            rapidjson::Value msg_value;
-            msg_value.SetString(e.what(), allocator);
-            add_or_set_member(test_case["spmv"][format_name], "error",
-                              msg_value, allocator);
+            auto range = annotate("repetition");
+            system_matrix->apply(state.b, x_clone);
         }
-        std::cerr << "Error when processing test case " << test_case << "\n"
-                  << "what(): " << e.what() << std::endl;
-    }
-}
-
-
-template <typename SystemGenerator>
-void run_spmv_benchmark(std::shared_ptr<gko::Executor> exec,
-                        rapidjson::Document& test_cases,
-                        const std::vector<std::string> formats,
-                        const SystemGenerator& system_generator,
-                        std::shared_ptr<Timer> timer, bool do_print)
-{
-    auto& allocator = test_cases.GetAllocator();
-    auto profiler_hook = create_profiler_hook(exec);
-    if (profiler_hook) {
-        exec->add_logger(profiler_hook);
+        format_case["time"] = ic.compute_time(FLAGS_timer_method);
+        format_case["repetitions"] = ic.get_num_repetitions();
     }
-    auto annotate = annotate_functor{profiler_hook};
-
-    for (auto& test_case : test_cases.GetArray()) {
-        try {
-            // set up benchmark
-            system_generator.validate_options(test_case);
-            if (!test_case.HasMember("spmv")) {
-                test_case.AddMember("spmv",
-                                    rapidjson::Value(rapidjson::kObjectType),
-                                    allocator);
-            }
-            auto& spmv_case = test_case["spmv"];
-            if (!FLAGS_overwrite &&
-                all_of(begin(formats), end(formats),
-                       [&spmv_case](const std::string& s) {
-                           return spmv_case.HasMember(s.c_str());
-                       })) {
-                continue;
-            }
-            if (do_print) {
-                std::clog << "Running test case: " << test_case << std::endl;
-            }
-            // annotate the test case
-            auto test_case_range =
-                annotate(system_generator.describe_config(test_case));
-
-            auto data = system_generator.generate_matrix_data(test_case);
-
-            auto nrhs = FLAGS_nrhs;
-            auto b = system_generator.create_multi_vector_random(
-                exec, gko::dim<2>{data.size[1], nrhs});
-            auto x = system_generator.create_multi_vector_random(
-                exec, gko::dim<2>{data.size[0], nrhs});
-            if (do_print) {
-                std::clog << "Matrix is of size (" << data.size[0] << ", "
-                          << data.size[1] << ")" << std::endl;
-            }
-            add_or_set_member(test_case, "size", data.size[0], allocator);
-            add_or_set_member(test_case, "nnz", data.nonzeros.size(),
-                              allocator);
-            auto best_performance = std::numeric_limits<double>::max();
-            if (!test_case.HasMember("optimal")) {
-                test_case.AddMember("optimal",
-                                    rapidjson::Value(rapidjson::kObjectType),
-                                    allocator);
-            }
 
-            // Compute the result from ginkgo::coo as the correct answer
-            auto answer = gko::clone(x);
-            if (FLAGS_detailed) {
-                auto system_matrix =
-                    system_generator.generate_matrix_with_default_format(exec,
-                                                                         data);
-                exec->synchronize();
-                system_matrix->apply(b, answer);
-                exec->synchronize();
-            }
-            for (const auto& format_name : formats) {
-                {
-                    auto format_range = annotate(format_name.c_str());
-                    apply_spmv(format_name.c_str(), exec, system_generator,
-                               timer, data, b.get(), x.get(), answer.get(),
-                               test_case, allocator);
-                }
-                if (do_print) {
-                    std::clog << "Current state:" << std::endl
-                              << test_cases << std::endl;
-                }
-                if (spmv_case[format_name.c_str()]["completed"].GetBool()) {
-                    auto performance =
-                        spmv_case[format_name.c_str()]["time"].GetDouble();
-                    if (performance < best_performance) {
-                        best_performance = performance;
-                        add_or_set_member(
-                            test_case["optimal"], "spmv",
-                            rapidjson::Value(format_name.c_str(), allocator)
-                                .Move(),
-                            allocator);
-                    }
-                }
-                if (do_print) {
-                    backup_results(test_cases);
+    void postprocess(json& test_case) const override
+    {
+        if (!test_case.contains("optimal")) {
+            test_case["optimal"] = json::object();
+        }
+        auto best_time = std::numeric_limits<double>::max();
+        std::string best_format;
+        // find the fastest among all formats we tested
+        for (const auto& format : formats) {
+            auto& format_case = test_case[name][format];
+            if (format_case.contains("completed") &&
+                format_case["completed"].template get<bool>()) {
+                auto time = format_case["time"];
+                if (time < best_time) {
+                    best_time = time;
+                    best_format = format;
                 }
             }
-        } catch (const std::exception& e) {
-            std::cerr << "Error setting up matrix data, what(): " << e.what()
-                      << std::endl;
-            if (FLAGS_keep_errors) {
-                rapidjson::Value msg_value;
-                msg_value.SetString(e.what(), allocator);
-                add_or_set_member(test_case, "error", msg_value, allocator);
-            }
+        }
+        if (!best_format.empty()) {
+            test_case["optimal"][name] = best_format;
         }
     }
-    if (profiler_hook) {
-        exec->remove_logger(profiler_hook);
-    }
-}
+};
+
 
 #endif  // GINKGO_BENCHMARK_SPMV_SPMV_COMMON_HPP
diff --git a/benchmark/test/CMakeLists.txt b/benchmark/test/CMakeLists.txt
new file mode 100644
index 00000000000..2f43b6eaf71
--- /dev/null
+++ b/benchmark/test/CMakeLists.txt
@@ -0,0 +1,28 @@
+find_package(Python3 COMPONENTS Interpreter REQUIRED)
+function(add_benchmark_test test_name)
+    configure_file(${test_name}.py ${test_name}.py COPYONLY)
+    add_test(NAME benchmark_${test_name}
+             COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.py $<TARGET_FILE:${test_name}>
+             WORKING_DIRECTORY "$<TARGET_FILE_DIR:ginkgo>")
+    set(regenerate_target benchmark_test_${test_name}_regenerate)
+    add_custom_target(${regenerate_target}
+                      COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/${test_name}.py $<TARGET_FILE:${test_name}> --generate
+                      COMMENT "Regenerating reference output for ${test_name}"
+                      WORKING_DIRECTORY "$<TARGET_FILE_DIR:ginkgo>")
+    add_dependencies(${regenerate_target} ${test_name})
+    add_dependencies(benchmark_test_regenerate ${regenerate_target})
+endfunction()
+add_custom_target(benchmark_test_regenerate)
+configure_file(test_framework.py.in test_framework.py @ONLY)
+add_benchmark_test(blas)
+add_benchmark_test(conversion)
+add_benchmark_test(matrix_statistics)
+add_benchmark_test(preconditioner)
+add_benchmark_test(solver)
+add_benchmark_test(sparse_blas)
+add_benchmark_test(spmv)
+if (GINKGO_BUILD_MPI)
+    add_benchmark_test(multi_vector_distributed)
+    add_benchmark_test(spmv_distributed)
+    add_benchmark_test(solver_distributed)
+endif()
diff --git a/benchmark/test/blas.py b/benchmark/test/blas.py
new file mode 100755
index 00000000000..ff5bddc5d08
--- /dev/null
+++ b/benchmark/test/blas.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+import test_framework
+
+# check that all input modes work:
+# parameter
+test_framework.compare_output(
+    ["-input", '[{"n": 100}]'],
+    expected_stdout="blas.simple.stdout",
+    expected_stderr="blas.simple.stderr",
+)
+
+# stdin
+test_framework.compare_output(
+    [],
+    expected_stdout="blas.simple.stdout",
+    expected_stderr="blas.simple.stderr",
+    stdin='[{"n": 100}]',
+)
+
+# file
+test_framework.compare_output(
+    ["-input", str(test_framework.sourcepath / "input.blas.json")],
+    expected_stdout="blas.simple.stdout",
+    expected_stderr="blas.simple.stderr",
+)
+
+# profiler annotations
+test_framework.compare_output(
+    ["-input", '[{"n": 100}]', "-profile", "-profiler_hook", "debug"],
+    expected_stdout="blas.profile.stdout",
+    expected_stderr="blas.profile.stderr",
+)
diff --git a/benchmark/test/conversion.py b/benchmark/test/conversion.py
new file mode 100755
index 00000000000..2eada100731
--- /dev/null
+++ b/benchmark/test/conversion.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+import test_framework
+
+# check that all input modes work:
+# parameter
+test_framework.compare_output(
+    ["-input", '[{"size": 100, "stencil": "7pt"}]', "-formats", "coo,csr"],
+    expected_stdout="conversion.simple.stdout",
+    expected_stderr="conversion.simple.stderr",
+)
+
+# stdin
+test_framework.compare_output(
+    ["-formats", "coo,csr"],
+    expected_stdout="conversion.simple.stdout",
+    expected_stderr="conversion.simple.stderr",
+    stdin='[{"size": 100, "stencil": "7pt"}]',
+)
+
+# input file
+test_framework.compare_output(
+    [
+        "-input",
+        str(test_framework.sourcepath / "input.mtx.json"),
+        "-formats",
+        "coo,csr",
+    ],
+    expected_stdout="conversion.simple.stdout",
+    expected_stderr="conversion.simple.stderr",
+)
+
+# input matrixfile
+test_framework.compare_output(
+    [
+        "-input_matrix",
+        str(test_framework.matrixpath),
+        "-formats",
+        "coo,csr",
+    ],
+    expected_stdout="conversion.matrix.stdout",
+    expected_stderr="conversion.matrix.stderr",
+)
+
+# check that all conversions work
+test_framework.compare_output(
+    [
+        "-input",
+        '[{"size": 100, "stencil": "7pt"}]',
+        "-formats",
+        "coo,csr,ell,sellp,hybrid",
+    ],
+    expected_stdout="conversion.all.stdout",
+    expected_stderr="conversion.all.stderr",
+)
+
+# profiler annotations
+test_framework.compare_output(
+    [
+        "-input",
+        '[{"size": 100, "stencil": "7pt"}]',
+        "-formats",
+        "coo,csr",
+        "-profile",
+        "-profiler_hook",
+        "debug",
+    ],
+    expected_stdout="conversion.profile.stdout",
+    expected_stderr="conversion.profile.stderr",
+)
diff --git a/benchmark/test/input.blas.json b/benchmark/test/input.blas.json
new file mode 100644
index 00000000000..fe366aa6fa0
--- /dev/null
+++ b/benchmark/test/input.blas.json
@@ -0,0 +1,5 @@
+[
+    {
+        "n": 100
+    }
+]
\ No newline at end of file
diff --git a/benchmark/test/input.distributed_mtx.json b/benchmark/test/input.distributed_mtx.json
new file mode 100644
index 00000000000..aca115179e6
--- /dev/null
+++ b/benchmark/test/input.distributed_mtx.json
@@ -0,0 +1,7 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "comm_pattern": "stencil"
+    }
+]
\ No newline at end of file
diff --git a/benchmark/test/input.distributed_solver.json b/benchmark/test/input.distributed_solver.json
new file mode 100644
index 00000000000..16efbf03fba
--- /dev/null
+++ b/benchmark/test/input.distributed_solver.json
@@ -0,0 +1,10 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "comm_pattern": "stencil",
+        "optimal": {
+            "spmv": "csr-csr"
+        }
+    }
+]
\ No newline at end of file
diff --git a/benchmark/test/input.mtx.json b/benchmark/test/input.mtx.json
new file mode 100644
index 00000000000..fdeb10c8eee
--- /dev/null
+++ b/benchmark/test/input.mtx.json
@@ -0,0 +1,6 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt"
+    }
+]
\ No newline at end of file
diff --git a/benchmark/test/input.solver.json b/benchmark/test/input.solver.json
new file mode 100644
index 00000000000..0183700dfe8
--- /dev/null
+++ b/benchmark/test/input.solver.json
@@ -0,0 +1,9 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "optimal": {
+            "spmv": "csr"
+        }
+    }
+]
\ No newline at end of file
diff --git a/benchmark/test/matrix_statistics.py b/benchmark/test/matrix_statistics.py
new file mode 100755
index 00000000000..6e4d8b1d2f5
--- /dev/null
+++ b/benchmark/test/matrix_statistics.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+import test_framework
+
+# check that all input modes work:
+# parameter
+test_framework.compare_output(
+    ["-input", '[{"size": 100, "stencil": "7pt"}]'],
+    expected_stdout="matrix_statistics.simple.stdout",
+    expected_stderr="matrix_statistics.simple.stderr",
+)
+
+# stdin
+test_framework.compare_output(
+    [],
+    expected_stdout="matrix_statistics.simple.stdout",
+    expected_stderr="matrix_statistics.simple.stderr",
+    stdin='[{"size": 100, "stencil": "7pt"}]',
+)
+
+# input file
+test_framework.compare_output(
+    ["-input", str(test_framework.sourcepath / "input.mtx.json")],
+    expected_stdout="matrix_statistics.simple.stdout",
+    expected_stderr="matrix_statistics.simple.stderr",
+)
+
+# input matrix file
+test_framework.compare_output(
+    ["-input_matrix", str(test_framework.matrixpath)],
+    expected_stdout="matrix_statistics.matrix.stdout",
+    expected_stderr="matrix_statistics.matrix.stderr",
+)
diff --git a/benchmark/test/multi_vector_distributed.py b/benchmark/test/multi_vector_distributed.py
new file mode 100644
index 00000000000..c62cb8ebd17
--- /dev/null
+++ b/benchmark/test/multi_vector_distributed.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+import test_framework
+
+# check that all input modes work:
+# parameter
+test_framework.compare_output_distributed(
+    ["-input", '[{"n": 100}]'],
+    expected_stdout="multi_vector_distributed.simple.stdout",
+    expected_stderr="multi_vector_distributed.simple.stderr",
+    num_procs=3,
+)
+
+# stdin
+test_framework.compare_output_distributed(
+    [],
+    expected_stdout="multi_vector_distributed.simple.stdout",
+    expected_stderr="multi_vector_distributed.simple.stderr",
+    stdin='[{"n": 100}]',
+    num_procs=3,
+)
+
+# file
+test_framework.compare_output_distributed(
+    ["-input", str(test_framework.sourcepath / "input.blas.json")],
+    expected_stdout="multi_vector_distributed.simple.stdout",
+    expected_stderr="multi_vector_distributed.simple.stderr",
+    num_procs=3,
+)
+
+# profiler annotations
+test_framework.compare_output_distributed(
+    ["-input", '[{"n": 100}]', "-profile", "-profiler_hook", "debug"],
+    expected_stdout="multi_vector_distributed.profile.stdout",
+    expected_stderr="multi_vector_distributed.profile.stderr",
+    num_procs=3,
+)
diff --git a/benchmark/test/preconditioner.py b/benchmark/test/preconditioner.py
new file mode 100755
index 00000000000..7226964dd05
--- /dev/null
+++ b/benchmark/test/preconditioner.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+import test_framework
+
+# check that all input modes work:
+# parameter
+test_framework.compare_output(
+    ["-input", '[{"size": 100, "stencil": "7pt"}]'],
+    expected_stdout="preconditioner.simple.stdout",
+    expected_stderr="preconditioner.simple.stderr",
+)
+
+# stdin
+test_framework.compare_output(
+    [],
+    expected_stdout="preconditioner.simple.stdout",
+    expected_stderr="preconditioner.simple.stderr",
+    stdin='[{"size": 100, "stencil": "7pt"}]',
+)
+
+# input file
+test_framework.compare_output(
+    ["-input", str(test_framework.sourcepath / "input.mtx.json")],
+    expected_stdout="preconditioner.simple.stdout",
+    expected_stderr="preconditioner.simple.stderr",
+)
+
+# input matrix file
+test_framework.compare_output(
+    ["-input_matrix", str(test_framework.matrixpath)],
+    expected_stdout="preconditioner.matrix.stdout",
+    expected_stderr="preconditioner.matrix.stderr",
+)
+
+# profiler annotations
+test_framework.compare_output(
+    [
+        "-input",
+        '[{"size": 100, "stencil": "7pt"}]',
+        "-profile",
+        "-profiler_hook",
+        "debug",
+    ],
+    expected_stdout="preconditioner.profile.stdout",
+    expected_stderr="preconditioner.profile.stderr",
+)
+
+# stdin
+test_framework.compare_output(
+    ["-reorder", "amd"],
+    expected_stdout="preconditioner.reordered.stdout",
+    expected_stderr="preconditioner.reordered.stderr",
+    stdin='[{"size": 100, "stencil": "7pt"}]',
+)
diff --git a/benchmark/test/reference/blas.profile.stderr b/benchmark/test/reference/blas.profile.stderr
new file mode 100644
index 00000000000..e156d489be3
--- /dev/null
+++ b/benchmark/test/reference/blas.profile.stderr
@@ -0,0 +1,42 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 0 warm iterations and 1 running iterations
+The random seed for right hand sides is 42
+The operations are copy,axpy,scal
+Running test case n = 100 
+DEBUG: begin n = 100 
+	Running blas: copy
+DEBUG: begin copy
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin repetition
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   repetition
+DEBUG: end   copy
+	Running blas: axpy
+DEBUG: begin axpy
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin repetition
+DEBUG: begin dense::add_scaled
+DEBUG: end   dense::add_scaled
+DEBUG: end   repetition
+DEBUG: end   axpy
+	Running blas: scal
+DEBUG: begin scal
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin repetition
+DEBUG: begin dense::scale
+DEBUG: end   dense::scale
+DEBUG: end   repetition
+DEBUG: end   scal
+DEBUG: end   n = 100 
diff --git a/benchmark/test/reference/blas.profile.stdout b/benchmark/test/reference/blas.profile.stdout
new file mode 100644
index 00000000000..209e115b557
--- /dev/null
+++ b/benchmark/test/reference/blas.profile.stdout
@@ -0,0 +1,28 @@
+[
+    {
+        "n": 100,
+        "blas": {
+            "copy": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 1,
+                "completed": true
+            },
+            "axpy": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 1,
+                "completed": true
+            },
+            "scal": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 1,
+                "completed": true
+            }
+        }
+    }
+]
diff --git a/benchmark/test/reference/blas.simple.stderr b/benchmark/test/reference/blas.simple.stderr
new file mode 100644
index 00000000000..7c5e66b9188
--- /dev/null
+++ b/benchmark/test/reference/blas.simple.stderr
@@ -0,0 +1,10 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The operations are copy,axpy,scal
+Running test case n = 100 
+	Running blas: copy
+	Running blas: axpy
+	Running blas: scal
diff --git a/benchmark/test/reference/blas.simple.stdout b/benchmark/test/reference/blas.simple.stdout
new file mode 100644
index 00000000000..54745d81104
--- /dev/null
+++ b/benchmark/test/reference/blas.simple.stdout
@@ -0,0 +1,28 @@
+[
+    {
+        "n": 100,
+        "blas": {
+            "copy": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "axpy": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "scal": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "completed": true
+            }
+        }
+    }
+]
diff --git a/benchmark/test/reference/conversion.all.stderr b/benchmark/test/reference/conversion.all.stderr
new file mode 100644
index 00000000000..37c88fd8b86
--- /dev/null
+++ b/benchmark/test/reference/conversion.all.stderr
@@ -0,0 +1,21 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The formats are coo,csr,ell,sellp,hybrid
+Running test case stencil(100, 7pt)
+Matrix is of size (125, 125), 725
+	Running conversion: coo-read
+	Running conversion: coo-csr
+	Running conversion: csr-read
+	Running conversion: csr-coo
+	Running conversion: csr-ell
+	Running conversion: csr-sellp
+	Running conversion: csr-hybrid
+	Running conversion: ell-read
+	Running conversion: ell-csr
+	Running conversion: sellp-read
+	Running conversion: sellp-csr
+	Running conversion: hybrid-read
+	Running conversion: hybrid-csr
diff --git a/benchmark/test/reference/conversion.all.stdout b/benchmark/test/reference/conversion.all.stdout
new file mode 100644
index 00000000000..e7a5b8f0f51
--- /dev/null
+++ b/benchmark/test/reference/conversion.all.stdout
@@ -0,0 +1,76 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "conversion": {
+            "coo-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "coo-csr": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "csr-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "csr-coo": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "csr-ell": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "csr-sellp": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "csr-hybrid": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "ell-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "ell-csr": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "sellp-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "sellp-csr": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "hybrid-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "hybrid-csr": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
+    }
+]
diff --git a/benchmark/test/reference/conversion.matrix.stderr b/benchmark/test/reference/conversion.matrix.stderr
new file mode 100644
index 00000000000..c828fe11267
--- /dev/null
+++ b/benchmark/test/reference/conversion.matrix.stderr
@@ -0,0 +1,12 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The formats are coo,csr
+Running test case <filename>
+Matrix is of size (36, 36), 208
+	Running conversion: coo-read
+	Running conversion: coo-csr
+	Running conversion: csr-read
+	Running conversion: csr-coo
diff --git a/benchmark/test/reference/conversion.matrix.stdout b/benchmark/test/reference/conversion.matrix.stdout
new file mode 100644
index 00000000000..8489e4b30b4
--- /dev/null
+++ b/benchmark/test/reference/conversion.matrix.stdout
@@ -0,0 +1,30 @@
+[
+    {
+        "filename": "",
+        "conversion": {
+            "coo-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "coo-csr": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "csr-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "csr-coo": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            }
+        },
+        "rows": 36,
+        "cols": 36,
+        "nonzeros": 208
+    }
+]
diff --git a/benchmark/test/reference/conversion.profile.stderr b/benchmark/test/reference/conversion.profile.stderr
new file mode 100644
index 00000000000..417c7bd71e7
--- /dev/null
+++ b/benchmark/test/reference/conversion.profile.stderr
@@ -0,0 +1,82 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 0 warm iterations and 1 running iterations
+The random seed for right hand sides is 42
+The formats are coo,csr
+Running test case stencil(100, 7pt)
+Matrix is of size (125, 125), 725
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
+DEBUG: begin stencil(100, 7pt)
+	Running conversion: coo-read
+DEBUG: begin coo-read
+DEBUG: begin repetition
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: end   repetition
+DEBUG: end   coo-read
+	Running conversion: coo-csr
+DEBUG: begin coo-csr
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin repetition
+DEBUG: begin copy(<typename>)
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin components::convert_idxs_to_ptrs
+DEBUG: end   components::convert_idxs_to_ptrs
+DEBUG: end   copy(<typename>)
+DEBUG: end   repetition
+DEBUG: end   coo-csr
+	Running conversion: csr-read
+DEBUG: begin csr-read
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin repetition
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin components::convert_idxs_to_ptrs
+DEBUG: end   components::convert_idxs_to_ptrs
+DEBUG: end   repetition
+DEBUG: end   csr-read
+	Running conversion: csr-coo
+DEBUG: begin csr-coo
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin components::convert_idxs_to_ptrs
+DEBUG: end   components::convert_idxs_to_ptrs
+DEBUG: begin repetition
+DEBUG: begin copy(<typename>)
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin components::convert_ptrs_to_idxs
+DEBUG: end   components::convert_ptrs_to_idxs
+DEBUG: end   copy(<typename>)
+DEBUG: end   repetition
+DEBUG: end   csr-coo
+DEBUG: end   stencil(100, 7pt)
diff --git a/benchmark/test/reference/conversion.profile.stdout b/benchmark/test/reference/conversion.profile.stdout
new file mode 100644
index 00000000000..907eac5b951
--- /dev/null
+++ b/benchmark/test/reference/conversion.profile.stdout
@@ -0,0 +1,31 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "conversion": {
+            "coo-read": {
+                "time": 1.0,
+                "repetitions": 1,
+                "completed": true
+            },
+            "coo-csr": {
+                "time": 1.0,
+                "repetitions": 1,
+                "completed": true
+            },
+            "csr-read": {
+                "time": 1.0,
+                "repetitions": 1,
+                "completed": true
+            },
+            "csr-coo": {
+                "time": 1.0,
+                "repetitions": 1,
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
+    }
+]
diff --git a/benchmark/test/reference/conversion.simple.stderr b/benchmark/test/reference/conversion.simple.stderr
new file mode 100644
index 00000000000..317330a2334
--- /dev/null
+++ b/benchmark/test/reference/conversion.simple.stderr
@@ -0,0 +1,12 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The formats are coo,csr
+Running test case stencil(100, 7pt)
+Matrix is of size (125, 125), 725
+	Running conversion: coo-read
+	Running conversion: coo-csr
+	Running conversion: csr-read
+	Running conversion: csr-coo
diff --git a/benchmark/test/reference/conversion.simple.stdout b/benchmark/test/reference/conversion.simple.stdout
new file mode 100644
index 00000000000..91b69b8a248
--- /dev/null
+++ b/benchmark/test/reference/conversion.simple.stdout
@@ -0,0 +1,31 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "conversion": {
+            "coo-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "coo-csr": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "csr-read": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "csr-coo": {
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
+    }
+]
diff --git a/benchmark/test/reference/distributed_solver.matrix.stderr b/benchmark/test/reference/distributed_solver.matrix.stderr
new file mode 100644
index 00000000000..fe739a2b773
--- /dev/null
+++ b/benchmark/test/reference/distributed_solver.matrix.stderr
@@ -0,0 +1,10 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 1 running iterations
+The random seed for right hand sides is 42
+Running cg with 1000 iterations and residual goal of 1.000000e-06
+The number of right hand sides is 1
+Running test case <filename>
+Matrix is of size (36, 36)
+	Running solver: cg
diff --git a/benchmark/test/reference/distributed_solver.matrix.stdout b/benchmark/test/reference/distributed_solver.matrix.stdout
new file mode 100644
index 00000000000..67ac333bec5
--- /dev/null
+++ b/benchmark/test/reference/distributed_solver.matrix.stdout
@@ -0,0 +1,57 @@
+[
+    {
+        "filename": "",
+        "optimal": {
+            "spmv": "csr-csr"
+        },
+        "solver": {
+            "cg": {
+                "recurrent_residuals": [],
+                "true_residuals": [],
+                "implicit_residuals": [],
+                "iteration_timestamps": [],
+                "rhs_norm": 1.0,
+                "generate": {
+                    "components": {
+                        "generate(<typename>)": 1.0,
+                        "free": 1.0,
+                        "overhead": 1.0
+                    },
+                    "time": 1.0
+                },
+                "apply": {
+                    "components": {
+                        "apply(<typename>)": 1.0,
+                        "iteration": 1.0,
+                        "allocate": 1.0,
+                        "dense::fill": 1.0,
+                        "cg::initialize": 1.0,
+                        "advanced_apply(<typename>)": 1.0,
+                        "dense::row_gather": 1.0,
+                        "csr::advanced_spmv": 1.0,
+                        "dense::compute_squared_norm2": 1.0,
+                        "dense::compute_sqrt": 1.0,
+                        "copy(<typename>)": 1.0,
+                        "dense::copy": 1.0,
+                        "dense::compute_conj_dot_dispatch": 1.0,
+                        "check(<typename>)": 1.0,
+                        "residual_norm::residual_norm": 1.0,
+                        "cg::step_1": 1.0,
+                        "csr::spmv": 1.0,
+                        "cg::step_2": 1.0,
+                        "free": 1.0,
+                        "overhead": 1.0
+                    },
+                    "iterations": 27,
+                    "time": 1.0
+                },
+                "preconditioner": {},
+                "residual_norm": 1.0,
+                "repetitions": 1,
+                "completed": true
+            }
+        },
+        "rows": 36,
+        "cols": 36
+    }
+]
diff --git a/benchmark/test/reference/distributed_solver.profile.stderr b/benchmark/test/reference/distributed_solver.profile.stderr
new file mode 100644
index 00000000000..ade54da3089
--- /dev/null
+++ b/benchmark/test/reference/distributed_solver.profile.stderr
@@ -0,0 +1,448 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 0 warm iterations and 1 running iterations
+The random seed for right hand sides is 42
+Running cg with 1000 iterations and residual goal of 1.000000e-06
+The number of right hand sides is 1
+Running test case stencil(100, 7pt, stencil)
+DEBUG: begin partition::build_ranges_from_global_size
+DEBUG: end   partition::build_ranges_from_global_size
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin partition::build_from_contiguous
+DEBUG: end   partition::build_from_contiguous
+DEBUG: begin partition::build_starting_indices
+DEBUG: end   partition::build_starting_indices
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin copy(<typename>)
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: end   copy(<typename>)
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin copy(<typename>)
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: end   copy(<typename>)
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
+DEBUG: begin distributed_matrix::build_local_nonlocal
+DEBUG: end   distributed_matrix::build_local_nonlocal
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin components::convert_idxs_to_ptrs
+DEBUG: end   components::convert_idxs_to_ptrs
+DEBUG: begin components::convert_idxs_to_ptrs
+DEBUG: end   components::convert_idxs_to_ptrs
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill_in_matrix_data
+DEBUG: end   dense::fill_in_matrix_data
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+Matrix is of size (125, 125)
+DEBUG: begin stencil(100, 7pt, stencil)
+	Running solver: cg
+DEBUG: begin cg
+DEBUG: begin dense::compute_squared_norm2
+DEBUG: end   dense::compute_squared_norm2
+DEBUG: begin dense::compute_sqrt
+DEBUG: end   dense::compute_sqrt
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: begin repetition
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: begin generate(<typename>)
+DEBUG: begin generate(<typename>)
+DEBUG: end   generate(<typename>)
+DEBUG: end   generate(<typename>)
+DEBUG: begin apply(<typename>)
+DEBUG: begin iteration
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin cg::initialize
+DEBUG: end   cg::initialize
+DEBUG: begin advanced_apply(<typename>)
+DEBUG: begin dense::row_gather
+DEBUG: end   dense::row_gather
+DEBUG: begin advanced_apply(<typename>)
+DEBUG: begin csr::advanced_spmv
+DEBUG: end   csr::advanced_spmv
+DEBUG: end   advanced_apply(<typename>)
+DEBUG: begin advanced_apply(<typename>)
+DEBUG: begin csr::advanced_spmv
+DEBUG: end   csr::advanced_spmv
+DEBUG: end   advanced_apply(<typename>)
+DEBUG: end   advanced_apply(<typename>)
+DEBUG: begin dense::compute_squared_norm2
+DEBUG: end   dense::compute_squared_norm2
+DEBUG: begin dense::compute_sqrt
+DEBUG: end   dense::compute_sqrt
+DEBUG: begin apply(<typename>)
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: begin dense::compute_squared_norm2
+DEBUG: end   dense::compute_squared_norm2
+DEBUG: begin dense::compute_sqrt
+DEBUG: end   dense::compute_sqrt
+DEBUG: begin residual_norm::residual_norm
+DEBUG: end   residual_norm::residual_norm
+DEBUG: end   check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: begin cg::step_1
+DEBUG: end   cg::step_1
+DEBUG: begin apply(<typename>)
+DEBUG: begin dense::row_gather
+DEBUG: end   dense::row_gather
+DEBUG: begin apply(<typename>)
+DEBUG: begin csr::spmv
+DEBUG: end   csr::spmv
+DEBUG: end   apply(<typename>)
+DEBUG: begin advanced_apply(<typename>)
+DEBUG: begin csr::advanced_spmv
+DEBUG: end   csr::advanced_spmv
+DEBUG: end   advanced_apply(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin cg::step_2
+DEBUG: end   cg::step_2
+DEBUG: begin apply(<typename>)
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: begin dense::compute_squared_norm2
+DEBUG: end   dense::compute_squared_norm2
+DEBUG: begin dense::compute_sqrt
+DEBUG: end   dense::compute_sqrt
+DEBUG: begin residual_norm::residual_norm
+DEBUG: end   residual_norm::residual_norm
+DEBUG: end   check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   iteration
+DEBUG: begin iteration
+DEBUG: begin cg::step_1
+DEBUG: end   cg::step_1
+DEBUG: begin apply(<typename>)
+DEBUG: begin dense::row_gather
+DEBUG: end   dense::row_gather
+DEBUG: begin apply(<typename>)
+DEBUG: begin csr::spmv
+DEBUG: end   csr::spmv
+DEBUG: end   apply(<typename>)
+DEBUG: begin advanced_apply(<typename>)
+DEBUG: begin csr::advanced_spmv
+DEBUG: end   csr::advanced_spmv
+DEBUG: end   advanced_apply(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin cg::step_2
+DEBUG: end   cg::step_2
+DEBUG: begin apply(<typename>)
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: begin dense::compute_squared_norm2
+DEBUG: end   dense::compute_squared_norm2
+DEBUG: begin dense::compute_sqrt
+DEBUG: end   dense::compute_sqrt
+DEBUG: begin residual_norm::residual_norm
+DEBUG: end   residual_norm::residual_norm
+DEBUG: end   check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   iteration
+DEBUG: begin iteration
+DEBUG: begin cg::step_1
+DEBUG: end   cg::step_1
+DEBUG: begin apply(<typename>)
+DEBUG: begin dense::row_gather
+DEBUG: end   dense::row_gather
+DEBUG: begin apply(<typename>)
+DEBUG: begin csr::spmv
+DEBUG: end   csr::spmv
+DEBUG: end   apply(<typename>)
+DEBUG: begin advanced_apply(<typename>)
+DEBUG: begin csr::advanced_spmv
+DEBUG: end   csr::advanced_spmv
+DEBUG: end   advanced_apply(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin cg::step_2
+DEBUG: end   cg::step_2
+DEBUG: begin apply(<typename>)
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: begin dense::compute_squared_norm2
+DEBUG: end   dense::compute_squared_norm2
+DEBUG: begin dense::compute_sqrt
+DEBUG: end   dense::compute_sqrt
+DEBUG: begin residual_norm::residual_norm
+DEBUG: end   residual_norm::residual_norm
+DEBUG: end   check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   iteration
+DEBUG: begin iteration
+DEBUG: begin cg::step_1
+DEBUG: end   cg::step_1
+DEBUG: begin apply(<typename>)
+DEBUG: begin dense::row_gather
+DEBUG: end   dense::row_gather
+DEBUG: begin apply(<typename>)
+DEBUG: begin csr::spmv
+DEBUG: end   csr::spmv
+DEBUG: end   apply(<typename>)
+DEBUG: begin advanced_apply(<typename>)
+DEBUG: begin csr::advanced_spmv
+DEBUG: end   csr::advanced_spmv
+DEBUG: end   advanced_apply(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin cg::step_2
+DEBUG: end   cg::step_2
+DEBUG: begin apply(<typename>)
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: begin dense::compute_squared_norm2
+DEBUG: end   dense::compute_squared_norm2
+DEBUG: begin dense::compute_sqrt
+DEBUG: end   dense::compute_sqrt
+DEBUG: begin residual_norm::residual_norm
+DEBUG: end   residual_norm::residual_norm
+DEBUG: end   check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   iteration
+DEBUG: begin iteration
+DEBUG: begin cg::step_1
+DEBUG: end   cg::step_1
+DEBUG: begin apply(<typename>)
+DEBUG: begin dense::row_gather
+DEBUG: end   dense::row_gather
+DEBUG: begin apply(<typename>)
+DEBUG: begin csr::spmv
+DEBUG: end   csr::spmv
+DEBUG: end   apply(<typename>)
+DEBUG: begin advanced_apply(<typename>)
+DEBUG: begin csr::advanced_spmv
+DEBUG: end   csr::advanced_spmv
+DEBUG: end   advanced_apply(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin cg::step_2
+DEBUG: end   cg::step_2
+DEBUG: begin apply(<typename>)
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: begin dense::compute_squared_norm2
+DEBUG: end   dense::compute_squared_norm2
+DEBUG: begin dense::compute_sqrt
+DEBUG: end   dense::compute_sqrt
+DEBUG: begin residual_norm::residual_norm
+DEBUG: end   residual_norm::residual_norm
+DEBUG: end   check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   iteration
+DEBUG: begin iteration
+DEBUG: begin cg::step_1
+DEBUG: end   cg::step_1
+DEBUG: begin apply(<typename>)
+DEBUG: begin dense::row_gather
+DEBUG: end   dense::row_gather
+DEBUG: begin apply(<typename>)
+DEBUG: begin csr::spmv
+DEBUG: end   csr::spmv
+DEBUG: end   apply(<typename>)
+DEBUG: begin advanced_apply(<typename>)
+DEBUG: begin csr::advanced_spmv
+DEBUG: end   csr::advanced_spmv
+DEBUG: end   advanced_apply(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin cg::step_2
+DEBUG: end   cg::step_2
+DEBUG: begin apply(<typename>)
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: begin dense::compute_squared_norm2
+DEBUG: end   dense::compute_squared_norm2
+DEBUG: begin dense::compute_sqrt
+DEBUG: end   dense::compute_sqrt
+DEBUG: begin residual_norm::residual_norm
+DEBUG: end   residual_norm::residual_norm
+DEBUG: end   check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   iteration
+DEBUG: begin iteration
+DEBUG: begin cg::step_1
+DEBUG: end   cg::step_1
+DEBUG: begin apply(<typename>)
+DEBUG: begin dense::row_gather
+DEBUG: end   dense::row_gather
+DEBUG: begin apply(<typename>)
+DEBUG: begin csr::spmv
+DEBUG: end   csr::spmv
+DEBUG: end   apply(<typename>)
+DEBUG: begin advanced_apply(<typename>)
+DEBUG: begin csr::advanced_spmv
+DEBUG: end   csr::advanced_spmv
+DEBUG: end   advanced_apply(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin cg::step_2
+DEBUG: end   cg::step_2
+DEBUG: begin apply(<typename>)
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: begin dense::compute_squared_norm2
+DEBUG: end   dense::compute_squared_norm2
+DEBUG: begin dense::compute_sqrt
+DEBUG: end   dense::compute_sqrt
+DEBUG: begin residual_norm::residual_norm
+DEBUG: end   residual_norm::residual_norm
+DEBUG: end   check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   iteration
+DEBUG: end   apply(<typename>)
+DEBUG: end   repetition
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: begin advanced_apply(<typename>)
+DEBUG: begin dense::row_gather
+DEBUG: end   dense::row_gather
+DEBUG: begin advanced_apply(<typename>)
+DEBUG: begin csr::advanced_spmv
+DEBUG: end   csr::advanced_spmv
+DEBUG: end   advanced_apply(<typename>)
+DEBUG: begin advanced_apply(<typename>)
+DEBUG: begin csr::advanced_spmv
+DEBUG: end   csr::advanced_spmv
+DEBUG: end   advanced_apply(<typename>)
+DEBUG: end   advanced_apply(<typename>)
+DEBUG: begin dense::compute_squared_norm2
+DEBUG: end   dense::compute_squared_norm2
+DEBUG: begin dense::compute_sqrt
+DEBUG: end   dense::compute_sqrt
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: end   cg
+DEBUG: end   stencil(100, 7pt, stencil)
diff --git a/benchmark/test/reference/distributed_solver.profile.stdout b/benchmark/test/reference/distributed_solver.profile.stdout
new file mode 100644
index 00000000000..0a844879c4f
--- /dev/null
+++ b/benchmark/test/reference/distributed_solver.profile.stdout
@@ -0,0 +1,33 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "comm_pattern": "stencil",
+        "optimal": {
+            "spmv": "csr-csr"
+        },
+        "solver": {
+            "cg": {
+                "recurrent_residuals": [],
+                "true_residuals": [],
+                "implicit_residuals": [],
+                "iteration_timestamps": [],
+                "rhs_norm": 1.0,
+                "generate": {
+                    "components": {},
+                    "time": 1.0
+                },
+                "apply": {
+                    "components": {},
+                    "iterations": 7,
+                    "time": 1.0
+                },
+                "residual_norm": 1.0,
+                "repetitions": 1,
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125
+    }
+]
diff --git a/benchmark/test/reference/distributed_solver.simple.stderr b/benchmark/test/reference/distributed_solver.simple.stderr
new file mode 100644
index 00000000000..02c580674b3
--- /dev/null
+++ b/benchmark/test/reference/distributed_solver.simple.stderr
@@ -0,0 +1,10 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 1 running iterations
+The random seed for right hand sides is 42
+Running cg with 1000 iterations and residual goal of 1.000000e-06
+The number of right hand sides is 1
+Running test case stencil(100, 7pt, stencil)
+Matrix is of size (125, 125)
+	Running solver: cg
diff --git a/benchmark/test/reference/distributed_solver.simple.stdout b/benchmark/test/reference/distributed_solver.simple.stdout
new file mode 100644
index 00000000000..458115e6ab2
--- /dev/null
+++ b/benchmark/test/reference/distributed_solver.simple.stdout
@@ -0,0 +1,59 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "comm_pattern": "stencil",
+        "optimal": {
+            "spmv": "csr-csr"
+        },
+        "solver": {
+            "cg": {
+                "recurrent_residuals": [],
+                "true_residuals": [],
+                "implicit_residuals": [],
+                "iteration_timestamps": [],
+                "rhs_norm": 1.0,
+                "generate": {
+                    "components": {
+                        "generate(<typename>)": 1.0,
+                        "free": 1.0,
+                        "overhead": 1.0
+                    },
+                    "time": 1.0
+                },
+                "apply": {
+                    "components": {
+                        "apply(<typename>)": 1.0,
+                        "iteration": 1.0,
+                        "allocate": 1.0,
+                        "dense::fill": 1.0,
+                        "cg::initialize": 1.0,
+                        "advanced_apply(<typename>)": 1.0,
+                        "dense::row_gather": 1.0,
+                        "csr::advanced_spmv": 1.0,
+                        "dense::compute_squared_norm2": 1.0,
+                        "dense::compute_sqrt": 1.0,
+                        "copy(<typename>)": 1.0,
+                        "dense::copy": 1.0,
+                        "dense::compute_conj_dot_dispatch": 1.0,
+                        "check(<typename>)": 1.0,
+                        "residual_norm::residual_norm": 1.0,
+                        "cg::step_1": 1.0,
+                        "csr::spmv": 1.0,
+                        "cg::step_2": 1.0,
+                        "free": 1.0,
+                        "overhead": 1.0
+                    },
+                    "iterations": 7,
+                    "time": 1.0
+                },
+                "preconditioner": {},
+                "residual_norm": 1.0,
+                "repetitions": 1,
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125
+    }
+]
diff --git a/benchmark/test/reference/matrix_statistics.matrix.stderr b/benchmark/test/reference/matrix_statistics.matrix.stderr
new file mode 100644
index 00000000000..b25e792459a
--- /dev/null
+++ b/benchmark/test/reference/matrix_statistics.matrix.stderr
@@ -0,0 +1,4 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running test case <filename>
+Matrix is of size (36, 36), 208
diff --git a/benchmark/test/reference/matrix_statistics.matrix.stdout b/benchmark/test/reference/matrix_statistics.matrix.stdout
new file mode 100644
index 00000000000..f5eba9461f7
--- /dev/null
+++ b/benchmark/test/reference/matrix_statistics.matrix.stdout
@@ -0,0 +1,39 @@
+[
+    {
+        "filename": "",
+        "problem": {
+            "rows": 36,
+            "columns": 36,
+            "nonzeros": 208,
+            "row_distribution": {
+                "min": 4,
+                "q1": 4.5,
+                "median": 6.0,
+                "q3": 7.0,
+                "max": 9,
+                "mean": 5.777777777777778,
+                "variance": 2.061728395061728,
+                "skewness": 0.3366362745126052,
+                "kurtosis": 2.0507009932231366,
+                "hyperskewness": 1.9165991338199193,
+                "hyperflatness": 6.0545648993883665
+            },
+            "col_distribution": {
+                "min": 4,
+                "q1": 4.5,
+                "median": 6.0,
+                "q3": 7.0,
+                "max": 9,
+                "mean": 5.777777777777778,
+                "variance": 2.061728395061728,
+                "skewness": 0.3366362745126052,
+                "kurtosis": 2.0507009932231366,
+                "hyperskewness": 1.9165991338199193,
+                "hyperflatness": 6.0545648993883665
+            }
+        },
+        "rows": 36,
+        "cols": 36,
+        "nonzeros": 208
+    }
+]
diff --git a/benchmark/test/reference/matrix_statistics.simple.stderr b/benchmark/test/reference/matrix_statistics.simple.stderr
new file mode 100644
index 00000000000..06e12e1159e
--- /dev/null
+++ b/benchmark/test/reference/matrix_statistics.simple.stderr
@@ -0,0 +1,4 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running test case stencil(100, 7pt)
+Matrix is of size (125, 125), 725
diff --git a/benchmark/test/reference/matrix_statistics.simple.stdout b/benchmark/test/reference/matrix_statistics.simple.stdout
new file mode 100644
index 00000000000..23124781a7d
--- /dev/null
+++ b/benchmark/test/reference/matrix_statistics.simple.stdout
@@ -0,0 +1,40 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "problem": {
+            "rows": 125,
+            "columns": 125,
+            "nonzeros": 725,
+            "row_distribution": {
+                "min": 4,
+                "q1": 5.0,
+                "median": 6.0,
+                "q3": 6.0,
+                "max": 7,
+                "mean": 5.8,
+                "variance": 0.7199999999999992,
+                "skewness": -0.23570226039551892,
+                "kurtosis": 2.388888888888889,
+                "hyperskewness": -1.741577812922432,
+                "hyperflatness": 7.762345679012379
+            },
+            "col_distribution": {
+                "min": 4,
+                "q1": 5.0,
+                "median": 6.0,
+                "q3": 6.0,
+                "max": 7,
+                "mean": 5.8,
+                "variance": 0.7199999999999992,
+                "skewness": -0.23570226039551892,
+                "kurtosis": 2.388888888888889,
+                "hyperskewness": -1.741577812922432,
+                "hyperflatness": 7.762345679012379
+            }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
+    }
+]
diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stderr b/benchmark/test/reference/multi_vector_distributed.profile.stderr
new file mode 100644
index 00000000000..29dc6b8d286
--- /dev/null
+++ b/benchmark/test/reference/multi_vector_distributed.profile.stderr
@@ -0,0 +1,132 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 0 warm iterations and 1 running iterations
+The random seed for right hand sides is 42
+The operations are copy,axpy,scal
+Running test case n = 100 
+DEBUG: begin n = 100 
+	Running blas: copy
+DEBUG: begin copy
+DEBUG: begin partition::build_ranges_from_global_size
+DEBUG: end   partition::build_ranges_from_global_size
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin partition::build_from_contiguous
+DEBUG: end   partition::build_from_contiguous
+DEBUG: begin partition::build_starting_indices
+DEBUG: end   partition::build_starting_indices
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin partition::build_ranges_from_global_size
+DEBUG: end   partition::build_ranges_from_global_size
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin partition::build_from_contiguous
+DEBUG: end   partition::build_from_contiguous
+DEBUG: begin partition::build_starting_indices
+DEBUG: end   partition::build_starting_indices
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin repetition
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   repetition
+DEBUG: end   copy
+	Running blas: axpy
+DEBUG: begin axpy
+DEBUG: begin partition::build_ranges_from_global_size
+DEBUG: end   partition::build_ranges_from_global_size
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin partition::build_from_contiguous
+DEBUG: end   partition::build_from_contiguous
+DEBUG: begin partition::build_starting_indices
+DEBUG: end   partition::build_starting_indices
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin partition::build_ranges_from_global_size
+DEBUG: end   partition::build_ranges_from_global_size
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin partition::build_from_contiguous
+DEBUG: end   partition::build_from_contiguous
+DEBUG: begin partition::build_starting_indices
+DEBUG: end   partition::build_starting_indices
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin repetition
+DEBUG: begin dense::add_scaled
+DEBUG: end   dense::add_scaled
+DEBUG: end   repetition
+DEBUG: end   axpy
+	Running blas: scal
+DEBUG: begin scal
+DEBUG: begin partition::build_ranges_from_global_size
+DEBUG: end   partition::build_ranges_from_global_size
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin partition::build_from_contiguous
+DEBUG: end   partition::build_from_contiguous
+DEBUG: begin partition::build_starting_indices
+DEBUG: end   partition::build_starting_indices
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin repetition
+DEBUG: begin dense::scale
+DEBUG: end   dense::scale
+DEBUG: end   repetition
+DEBUG: end   scal
+DEBUG: end   n = 100 
diff --git a/benchmark/test/reference/multi_vector_distributed.profile.stdout b/benchmark/test/reference/multi_vector_distributed.profile.stdout
new file mode 100644
index 00000000000..209e115b557
--- /dev/null
+++ b/benchmark/test/reference/multi_vector_distributed.profile.stdout
@@ -0,0 +1,28 @@
+[
+    {
+        "n": 100,
+        "blas": {
+            "copy": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 1,
+                "completed": true
+            },
+            "axpy": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 1,
+                "completed": true
+            },
+            "scal": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 1,
+                "completed": true
+            }
+        }
+    }
+]
diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stderr b/benchmark/test/reference/multi_vector_distributed.simple.stderr
new file mode 100644
index 00000000000..7c5e66b9188
--- /dev/null
+++ b/benchmark/test/reference/multi_vector_distributed.simple.stderr
@@ -0,0 +1,10 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The operations are copy,axpy,scal
+Running test case n = 100 
+	Running blas: copy
+	Running blas: axpy
+	Running blas: scal
diff --git a/benchmark/test/reference/multi_vector_distributed.simple.stdout b/benchmark/test/reference/multi_vector_distributed.simple.stdout
new file mode 100644
index 00000000000..54745d81104
--- /dev/null
+++ b/benchmark/test/reference/multi_vector_distributed.simple.stdout
@@ -0,0 +1,28 @@
+[
+    {
+        "n": 100,
+        "blas": {
+            "copy": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "axpy": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "completed": true
+            },
+            "scal": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "completed": true
+            }
+        }
+    }
+]
diff --git a/benchmark/test/reference/preconditioner.matrix.stderr b/benchmark/test/reference/preconditioner.matrix.stderr
new file mode 100644
index 00000000000..82212a3d2c4
--- /dev/null
+++ b/benchmark/test/reference/preconditioner.matrix.stderr
@@ -0,0 +1,9 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+Running with preconditioners: none
+Running test case <filename>
+Matrix is of size (36, 36), 208
+	Running preconditioner: none
diff --git a/benchmark/test/reference/preconditioner.matrix.stdout b/benchmark/test/reference/preconditioner.matrix.stdout
new file mode 100644
index 00000000000..742ec55c41d
--- /dev/null
+++ b/benchmark/test/reference/preconditioner.matrix.stdout
@@ -0,0 +1,31 @@
+[
+    {
+        "filename": "",
+        "preconditioner": {
+            "none": {
+                "generate": {
+                    "components": {
+                        "generate(<typename>)": 1.0,
+                        "overhead": 1.0
+                    },
+                    "time": 1.0,
+                    "repetitions": 10
+                },
+                "apply": {
+                    "components": {
+                        "apply(<typename>)": 1.0,
+                        "copy(<typename>)": 1.0,
+                        "dense::copy": 1.0,
+                        "overhead": 1.0
+                    },
+                    "time": 1.0,
+                    "repetitions": 10
+                },
+                "completed": true
+            }
+        },
+        "rows": 36,
+        "cols": 36,
+        "nonzeros": 208
+    }
+]
diff --git a/benchmark/test/reference/preconditioner.profile.stderr b/benchmark/test/reference/preconditioner.profile.stderr
new file mode 100644
index 00000000000..b90c5e44912
--- /dev/null
+++ b/benchmark/test/reference/preconditioner.profile.stderr
@@ -0,0 +1,47 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 0 warm iterations and 1 running iterations
+The random seed for right hand sides is 42
+Running with preconditioners: none
+Running test case stencil(100, 7pt)
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
+DEBUG: begin components::convert_idxs_to_ptrs
+DEBUG: end   components::convert_idxs_to_ptrs
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill_in_matrix_data
+DEBUG: end   dense::fill_in_matrix_data
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill_in_matrix_data
+DEBUG: end   dense::fill_in_matrix_data
+Matrix is of size (125, 125), 725
+DEBUG: begin stencil(100, 7pt)
+	Running preconditioner: none
+DEBUG: begin none
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: begin repetition generate
+DEBUG: begin generate(<typename>)
+DEBUG: end   generate(<typename>)
+DEBUG: end   repetition generate
+DEBUG: begin repetition apply
+DEBUG: begin apply(<typename>)
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: end   repetition apply
+DEBUG: end   none
+DEBUG: end   stencil(100, 7pt)
diff --git a/benchmark/test/reference/preconditioner.profile.stdout b/benchmark/test/reference/preconditioner.profile.stdout
new file mode 100644
index 00000000000..526349b55ad
--- /dev/null
+++ b/benchmark/test/reference/preconditioner.profile.stdout
@@ -0,0 +1,24 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "preconditioner": {
+            "none": {
+                "generate": {
+                    "components": {},
+                    "time": 1.0,
+                    "repetitions": 1
+                },
+                "apply": {
+                    "components": {},
+                    "time": 1.0,
+                    "repetitions": 1
+                },
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
+    }
+]
diff --git a/benchmark/test/reference/preconditioner.reordered.stderr b/benchmark/test/reference/preconditioner.reordered.stderr
new file mode 100644
index 00000000000..a0bec924a46
--- /dev/null
+++ b/benchmark/test/reference/preconditioner.reordered.stderr
@@ -0,0 +1,9 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+Running with preconditioners: none
+Running test case stencil(100, 7pt)
+Matrix is of size (125, 125), 725
+	Running preconditioner: none
diff --git a/benchmark/test/reference/preconditioner.reordered.stdout b/benchmark/test/reference/preconditioner.reordered.stdout
new file mode 100644
index 00000000000..51adfb3b58b
--- /dev/null
+++ b/benchmark/test/reference/preconditioner.reordered.stdout
@@ -0,0 +1,33 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "preconditioner": {
+            "none": {
+                "generate": {
+                    "components": {
+                        "generate(<typename>)": 1.0,
+                        "overhead": 1.0
+                    },
+                    "time": 1.0,
+                    "repetitions": 10
+                },
+                "apply": {
+                    "components": {
+                        "apply(<typename>)": 1.0,
+                        "copy(<typename>)": 1.0,
+                        "dense::copy": 1.0,
+                        "overhead": 1.0
+                    },
+                    "time": 1.0,
+                    "repetitions": 10
+                },
+                "completed": true
+            }
+        },
+        "reordered": "amd",
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
+    }
+]
diff --git a/benchmark/test/reference/preconditioner.simple.stderr b/benchmark/test/reference/preconditioner.simple.stderr
new file mode 100644
index 00000000000..a0bec924a46
--- /dev/null
+++ b/benchmark/test/reference/preconditioner.simple.stderr
@@ -0,0 +1,9 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+Running with preconditioners: none
+Running test case stencil(100, 7pt)
+Matrix is of size (125, 125), 725
+	Running preconditioner: none
diff --git a/benchmark/test/reference/preconditioner.simple.stdout b/benchmark/test/reference/preconditioner.simple.stdout
new file mode 100644
index 00000000000..ed567dcbb13
--- /dev/null
+++ b/benchmark/test/reference/preconditioner.simple.stdout
@@ -0,0 +1,32 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "preconditioner": {
+            "none": {
+                "generate": {
+                    "components": {
+                        "generate(<typename>)": 1.0,
+                        "overhead": 1.0
+                    },
+                    "time": 1.0,
+                    "repetitions": 10
+                },
+                "apply": {
+                    "components": {
+                        "apply(<typename>)": 1.0,
+                        "copy(<typename>)": 1.0,
+                        "dense::copy": 1.0,
+                        "overhead": 1.0
+                    },
+                    "time": 1.0,
+                    "repetitions": 10
+                },
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
+    }
+]
diff --git a/benchmark/test/reference/solver.matrix.stderr b/benchmark/test/reference/solver.matrix.stderr
new file mode 100644
index 00000000000..fe739a2b773
--- /dev/null
+++ b/benchmark/test/reference/solver.matrix.stderr
@@ -0,0 +1,10 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 1 running iterations
+The random seed for right hand sides is 42
+Running cg with 1000 iterations and residual goal of 1.000000e-06
+The number of right hand sides is 1
+Running test case <filename>
+Matrix is of size (36, 36)
+	Running solver: cg
diff --git a/benchmark/test/reference/solver.matrix.stdout b/benchmark/test/reference/solver.matrix.stdout
new file mode 100644
index 00000000000..594a3887921
--- /dev/null
+++ b/benchmark/test/reference/solver.matrix.stdout
@@ -0,0 +1,55 @@
+[
+    {
+        "filename": "",
+        "optimal": {
+            "spmv": "csr"
+        },
+        "solver": {
+            "cg": {
+                "recurrent_residuals": [],
+                "true_residuals": [],
+                "implicit_residuals": [],
+                "iteration_timestamps": [],
+                "rhs_norm": 1.0,
+                "generate": {
+                    "components": {
+                        "generate(<typename>)": 1.0,
+                        "free": 1.0,
+                        "overhead": 1.0
+                    },
+                    "time": 1.0
+                },
+                "apply": {
+                    "components": {
+                        "apply(<typename>)": 1.0,
+                        "iteration": 1.0,
+                        "allocate": 1.0,
+                        "dense::fill": 1.0,
+                        "cg::initialize": 1.0,
+                        "advanced_apply(<typename>)": 1.0,
+                        "csr::advanced_spmv": 1.0,
+                        "dense::compute_norm2_dispatch": 1.0,
+                        "copy(<typename>)": 1.0,
+                        "dense::copy": 1.0,
+                        "dense::compute_conj_dot_dispatch": 1.0,
+                        "check(<typename>)": 1.0,
+                        "residual_norm::residual_norm": 1.0,
+                        "cg::step_1": 1.0,
+                        "csr::spmv": 1.0,
+                        "cg::step_2": 1.0,
+                        "free": 1.0,
+                        "overhead": 1.0
+                    },
+                    "iterations": 27,
+                    "time": 1.0
+                },
+                "preconditioner": {},
+                "residual_norm": 1.0,
+                "repetitions": 1,
+                "completed": true
+            }
+        },
+        "rows": 36,
+        "cols": 36
+    }
+]
diff --git a/benchmark/test/reference/solver.profile.stderr b/benchmark/test/reference/solver.profile.stderr
new file mode 100644
index 00000000000..0f972f0aec8
--- /dev/null
+++ b/benchmark/test/reference/solver.profile.stderr
@@ -0,0 +1,300 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 0 warm iterations and 1 running iterations
+The random seed for right hand sides is 42
+Running cg with 1000 iterations and residual goal of 1.000000e-06
+The number of right hand sides is 1
+Running test case stencil(100, 7pt)
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
+DEBUG: begin components::convert_idxs_to_ptrs
+DEBUG: end   components::convert_idxs_to_ptrs
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill_in_matrix_data
+DEBUG: end   dense::fill_in_matrix_data
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+Matrix is of size (125, 125)
+DEBUG: begin stencil(100, 7pt)
+	Running solver: cg
+DEBUG: begin cg
+DEBUG: begin dense::compute_norm2_dispatch
+DEBUG: end   dense::compute_norm2_dispatch
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: begin repetition
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: begin generate(<typename>)
+DEBUG: begin generate(<typename>)
+DEBUG: end   generate(<typename>)
+DEBUG: end   generate(<typename>)
+DEBUG: begin apply(<typename>)
+DEBUG: begin iteration
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin cg::initialize
+DEBUG: end   cg::initialize
+DEBUG: begin advanced_apply(<typename>)
+DEBUG: begin csr::advanced_spmv
+DEBUG: end   csr::advanced_spmv
+DEBUG: end   advanced_apply(<typename>)
+DEBUG: begin dense::compute_norm2_dispatch
+DEBUG: end   dense::compute_norm2_dispatch
+DEBUG: begin apply(<typename>)
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: begin dense::compute_norm2_dispatch
+DEBUG: end   dense::compute_norm2_dispatch
+DEBUG: begin residual_norm::residual_norm
+DEBUG: end   residual_norm::residual_norm
+DEBUG: end   check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: begin cg::step_1
+DEBUG: end   cg::step_1
+DEBUG: begin apply(<typename>)
+DEBUG: begin csr::spmv
+DEBUG: end   csr::spmv
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin cg::step_2
+DEBUG: end   cg::step_2
+DEBUG: begin apply(<typename>)
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: begin dense::compute_norm2_dispatch
+DEBUG: end   dense::compute_norm2_dispatch
+DEBUG: begin residual_norm::residual_norm
+DEBUG: end   residual_norm::residual_norm
+DEBUG: end   check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   iteration
+DEBUG: begin iteration
+DEBUG: begin cg::step_1
+DEBUG: end   cg::step_1
+DEBUG: begin apply(<typename>)
+DEBUG: begin csr::spmv
+DEBUG: end   csr::spmv
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin cg::step_2
+DEBUG: end   cg::step_2
+DEBUG: begin apply(<typename>)
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: begin dense::compute_norm2_dispatch
+DEBUG: end   dense::compute_norm2_dispatch
+DEBUG: begin residual_norm::residual_norm
+DEBUG: end   residual_norm::residual_norm
+DEBUG: end   check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   iteration
+DEBUG: begin iteration
+DEBUG: begin cg::step_1
+DEBUG: end   cg::step_1
+DEBUG: begin apply(<typename>)
+DEBUG: begin csr::spmv
+DEBUG: end   csr::spmv
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin cg::step_2
+DEBUG: end   cg::step_2
+DEBUG: begin apply(<typename>)
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: begin dense::compute_norm2_dispatch
+DEBUG: end   dense::compute_norm2_dispatch
+DEBUG: begin residual_norm::residual_norm
+DEBUG: end   residual_norm::residual_norm
+DEBUG: end   check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   iteration
+DEBUG: begin iteration
+DEBUG: begin cg::step_1
+DEBUG: end   cg::step_1
+DEBUG: begin apply(<typename>)
+DEBUG: begin csr::spmv
+DEBUG: end   csr::spmv
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin cg::step_2
+DEBUG: end   cg::step_2
+DEBUG: begin apply(<typename>)
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: begin dense::compute_norm2_dispatch
+DEBUG: end   dense::compute_norm2_dispatch
+DEBUG: begin residual_norm::residual_norm
+DEBUG: end   residual_norm::residual_norm
+DEBUG: end   check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   iteration
+DEBUG: begin iteration
+DEBUG: begin cg::step_1
+DEBUG: end   cg::step_1
+DEBUG: begin apply(<typename>)
+DEBUG: begin csr::spmv
+DEBUG: end   csr::spmv
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin cg::step_2
+DEBUG: end   cg::step_2
+DEBUG: begin apply(<typename>)
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: begin dense::compute_norm2_dispatch
+DEBUG: end   dense::compute_norm2_dispatch
+DEBUG: begin residual_norm::residual_norm
+DEBUG: end   residual_norm::residual_norm
+DEBUG: end   check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   iteration
+DEBUG: begin iteration
+DEBUG: begin cg::step_1
+DEBUG: end   cg::step_1
+DEBUG: begin apply(<typename>)
+DEBUG: begin csr::spmv
+DEBUG: end   csr::spmv
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin cg::step_2
+DEBUG: end   cg::step_2
+DEBUG: begin apply(<typename>)
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: begin dense::compute_norm2_dispatch
+DEBUG: end   dense::compute_norm2_dispatch
+DEBUG: begin residual_norm::residual_norm
+DEBUG: end   residual_norm::residual_norm
+DEBUG: end   check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   iteration
+DEBUG: begin iteration
+DEBUG: begin cg::step_1
+DEBUG: end   cg::step_1
+DEBUG: begin apply(<typename>)
+DEBUG: begin csr::spmv
+DEBUG: end   csr::spmv
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin cg::step_2
+DEBUG: end   cg::step_2
+DEBUG: begin apply(<typename>)
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: begin dense::compute_conj_dot_dispatch
+DEBUG: end   dense::compute_conj_dot_dispatch
+DEBUG: begin check(<typename>)
+DEBUG: begin check(<typename>)
+DEBUG: begin dense::compute_norm2_dispatch
+DEBUG: end   dense::compute_norm2_dispatch
+DEBUG: begin residual_norm::residual_norm
+DEBUG: end   residual_norm::residual_norm
+DEBUG: end   check(<typename>)
+DEBUG: end   check(<typename>)
+DEBUG: end   iteration
+DEBUG: end   apply(<typename>)
+DEBUG: end   repetition
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: begin advanced_apply(<typename>)
+DEBUG: begin csr::advanced_spmv
+DEBUG: end   csr::advanced_spmv
+DEBUG: end   advanced_apply(<typename>)
+DEBUG: begin dense::compute_norm2_dispatch
+DEBUG: end   dense::compute_norm2_dispatch
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: end   cg
+DEBUG: end   stencil(100, 7pt)
diff --git a/benchmark/test/reference/solver.profile.stdout b/benchmark/test/reference/solver.profile.stdout
new file mode 100644
index 00000000000..c132ed1a572
--- /dev/null
+++ b/benchmark/test/reference/solver.profile.stdout
@@ -0,0 +1,32 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "optimal": {
+            "spmv": "csr"
+        },
+        "solver": {
+            "cg": {
+                "recurrent_residuals": [],
+                "true_residuals": [],
+                "implicit_residuals": [],
+                "iteration_timestamps": [],
+                "rhs_norm": 1.0,
+                "generate": {
+                    "components": {},
+                    "time": 1.0
+                },
+                "apply": {
+                    "components": {},
+                    "iterations": 7,
+                    "time": 1.0
+                },
+                "residual_norm": 1.0,
+                "repetitions": 1,
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125
+    }
+]
diff --git a/benchmark/test/reference/solver.reordered.stderr b/benchmark/test/reference/solver.reordered.stderr
new file mode 100644
index 00000000000..b133e6bfc57
--- /dev/null
+++ b/benchmark/test/reference/solver.reordered.stderr
@@ -0,0 +1,10 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 1 running iterations
+The random seed for right hand sides is 42
+Running cg with 1000 iterations and residual goal of 1.000000e-06
+The number of right hand sides is 1
+Running test case stencil(100, 7pt)
+Matrix is of size (125, 125)
+	Running solver: cg
diff --git a/benchmark/test/reference/solver.reordered.stdout b/benchmark/test/reference/solver.reordered.stdout
new file mode 100644
index 00000000000..c1b826ae3fc
--- /dev/null
+++ b/benchmark/test/reference/solver.reordered.stdout
@@ -0,0 +1,57 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "optimal": {
+            "spmv": "csr"
+        },
+        "solver": {
+            "cg": {
+                "recurrent_residuals": [],
+                "true_residuals": [],
+                "implicit_residuals": [],
+                "iteration_timestamps": [],
+                "rhs_norm": 1.0,
+                "generate": {
+                    "components": {
+                        "generate(<typename>)": 1.0,
+                        "free": 1.0,
+                        "overhead": 1.0
+                    },
+                    "time": 1.0
+                },
+                "apply": {
+                    "components": {
+                        "apply(<typename>)": 1.0,
+                        "iteration": 1.0,
+                        "allocate": 1.0,
+                        "dense::fill": 1.0,
+                        "cg::initialize": 1.0,
+                        "advanced_apply(<typename>)": 1.0,
+                        "csr::advanced_spmv": 1.0,
+                        "dense::compute_norm2_dispatch": 1.0,
+                        "copy(<typename>)": 1.0,
+                        "dense::copy": 1.0,
+                        "dense::compute_conj_dot_dispatch": 1.0,
+                        "check(<typename>)": 1.0,
+                        "residual_norm::residual_norm": 1.0,
+                        "cg::step_1": 1.0,
+                        "csr::spmv": 1.0,
+                        "cg::step_2": 1.0,
+                        "free": 1.0,
+                        "overhead": 1.0
+                    },
+                    "iterations": 7,
+                    "time": 1.0
+                },
+                "preconditioner": {},
+                "residual_norm": 1.0,
+                "repetitions": 1,
+                "completed": true
+            }
+        },
+        "reordered": "amd",
+        "rows": 125,
+        "cols": 125
+    }
+]
diff --git a/benchmark/test/reference/solver.simple.stderr b/benchmark/test/reference/solver.simple.stderr
new file mode 100644
index 00000000000..b133e6bfc57
--- /dev/null
+++ b/benchmark/test/reference/solver.simple.stderr
@@ -0,0 +1,10 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 1 running iterations
+The random seed for right hand sides is 42
+Running cg with 1000 iterations and residual goal of 1.000000e-06
+The number of right hand sides is 1
+Running test case stencil(100, 7pt)
+Matrix is of size (125, 125)
+	Running solver: cg
diff --git a/benchmark/test/reference/solver.simple.stdout b/benchmark/test/reference/solver.simple.stdout
new file mode 100644
index 00000000000..0ee0e4b9a4b
--- /dev/null
+++ b/benchmark/test/reference/solver.simple.stdout
@@ -0,0 +1,56 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "optimal": {
+            "spmv": "csr"
+        },
+        "solver": {
+            "cg": {
+                "recurrent_residuals": [],
+                "true_residuals": [],
+                "implicit_residuals": [],
+                "iteration_timestamps": [],
+                "rhs_norm": 1.0,
+                "generate": {
+                    "components": {
+                        "generate(<typename>)": 1.0,
+                        "free": 1.0,
+                        "overhead": 1.0
+                    },
+                    "time": 1.0
+                },
+                "apply": {
+                    "components": {
+                        "apply(<typename>)": 1.0,
+                        "iteration": 1.0,
+                        "allocate": 1.0,
+                        "dense::fill": 1.0,
+                        "cg::initialize": 1.0,
+                        "advanced_apply(<typename>)": 1.0,
+                        "csr::advanced_spmv": 1.0,
+                        "dense::compute_norm2_dispatch": 1.0,
+                        "copy(<typename>)": 1.0,
+                        "dense::copy": 1.0,
+                        "dense::compute_conj_dot_dispatch": 1.0,
+                        "check(<typename>)": 1.0,
+                        "residual_norm::residual_norm": 1.0,
+                        "cg::step_1": 1.0,
+                        "csr::spmv": 1.0,
+                        "cg::step_2": 1.0,
+                        "free": 1.0,
+                        "overhead": 1.0
+                    },
+                    "iterations": 7,
+                    "time": 1.0
+                },
+                "preconditioner": {},
+                "residual_norm": 1.0,
+                "repetitions": 1,
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125
+    }
+]
diff --git a/benchmark/test/reference/sparse_blas.matrix.stderr b/benchmark/test/reference/sparse_blas.matrix.stderr
new file mode 100644
index 00000000000..cbd08e1d21e
--- /dev/null
+++ b/benchmark/test/reference/sparse_blas.matrix.stderr
@@ -0,0 +1,9 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The operations are transpose
+Running test case <filename>
+Matrix is of size (36, 36), 208
+	Running sparse_blas: transpose
diff --git a/benchmark/test/reference/sparse_blas.matrix.stdout b/benchmark/test/reference/sparse_blas.matrix.stdout
new file mode 100644
index 00000000000..a50fa1159d9
--- /dev/null
+++ b/benchmark/test/reference/sparse_blas.matrix.stdout
@@ -0,0 +1,24 @@
+[
+    {
+        "filename": "",
+        "sparse_blas": {
+            "transpose": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "components": {
+                    "allocate": 1.0,
+                    "components::fill_array": 1.0,
+                    "csr::transpose": 1.0,
+                    "free": 1.0,
+                    "overhead": 1.0
+                },
+                "completed": true
+            }
+        },
+        "rows": 36,
+        "cols": 36,
+        "nonzeros": 208
+    }
+]
diff --git a/benchmark/test/reference/sparse_blas.profile.stderr b/benchmark/test/reference/sparse_blas.profile.stderr
new file mode 100644
index 00000000000..e8376ca2713
--- /dev/null
+++ b/benchmark/test/reference/sparse_blas.profile.stderr
@@ -0,0 +1,25 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 0 warm iterations and 1 running iterations
+The random seed for right hand sides is 42
+The operations are transpose
+Running test case stencil(100, 7pt)
+Matrix is of size (125, 125), 725
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
+DEBUG: begin components::convert_idxs_to_ptrs
+DEBUG: end   components::convert_idxs_to_ptrs
+DEBUG: begin stencil(100, 7pt)
+	Running sparse_blas: transpose
+DEBUG: begin transpose
+DEBUG: begin repetition
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin csr::transpose
+DEBUG: end   csr::transpose
+DEBUG: end   repetition
+DEBUG: end   transpose
+DEBUG: end   stencil(100, 7pt)
diff --git a/benchmark/test/reference/sparse_blas.profile.stdout b/benchmark/test/reference/sparse_blas.profile.stdout
new file mode 100644
index 00000000000..45cb7e2638a
--- /dev/null
+++ b/benchmark/test/reference/sparse_blas.profile.stdout
@@ -0,0 +1,18 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "sparse_blas": {
+            "transpose": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 1,
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
+    }
+]
diff --git a/benchmark/test/reference/sparse_blas.reordered.stderr b/benchmark/test/reference/sparse_blas.reordered.stderr
new file mode 100644
index 00000000000..2a7bd2a6665
--- /dev/null
+++ b/benchmark/test/reference/sparse_blas.reordered.stderr
@@ -0,0 +1,9 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The operations are symbolic_cholesky
+Running test case stencil(100, 7pt)
+Matrix is of size (125, 125), 725
+	Running sparse_blas: symbolic_cholesky
diff --git a/benchmark/test/reference/sparse_blas.reordered.stdout b/benchmark/test/reference/sparse_blas.reordered.stdout
new file mode 100644
index 00000000000..b5fc8998be0
--- /dev/null
+++ b/benchmark/test/reference/sparse_blas.reordered.stdout
@@ -0,0 +1,32 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "sparse_blas": {
+            "symbolic_cholesky": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "components": {
+                    "compute_elim_forest": 1.0,
+                    "allocate": 1.0,
+                    "free": 1.0,
+                    "components::fill_array": 1.0,
+                    "cholesky::symbolic_count": 1.0,
+                    "components::prefix_sum_nonnegative": 1.0,
+                    "copy": 1.0,
+                    "cholesky::symbolic_factorize": 1.0,
+                    "csr::sort_by_column_index": 1.0,
+                    "overhead": 1.0
+                },
+                "factor_nonzeros": 1324,
+                "completed": true
+            }
+        },
+        "reordered": "amd",
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
+    }
+]
diff --git a/benchmark/test/reference/sparse_blas.simple.stderr b/benchmark/test/reference/sparse_blas.simple.stderr
new file mode 100644
index 00000000000..21c2241c6a5
--- /dev/null
+++ b/benchmark/test/reference/sparse_blas.simple.stderr
@@ -0,0 +1,9 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The operations are transpose
+Running test case stencil(100, 7pt)
+Matrix is of size (125, 125), 725
+	Running sparse_blas: transpose
diff --git a/benchmark/test/reference/sparse_blas.simple.stdout b/benchmark/test/reference/sparse_blas.simple.stdout
new file mode 100644
index 00000000000..a44f4f189b2
--- /dev/null
+++ b/benchmark/test/reference/sparse_blas.simple.stdout
@@ -0,0 +1,25 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "sparse_blas": {
+            "transpose": {
+                "time": 1.0,
+                "flops": 1.0,
+                "bandwidth": 1.0,
+                "repetitions": 10,
+                "components": {
+                    "allocate": 1.0,
+                    "components::fill_array": 1.0,
+                    "csr::transpose": 1.0,
+                    "free": 1.0,
+                    "overhead": 1.0
+                },
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725
+    }
+]
diff --git a/benchmark/test/reference/spmv.matrix.stderr b/benchmark/test/reference/spmv.matrix.stderr
new file mode 100644
index 00000000000..a184b39b9fd
--- /dev/null
+++ b/benchmark/test/reference/spmv.matrix.stderr
@@ -0,0 +1,10 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The formats are coo
+The number of right hand sides is 1
+Running test case <filename>
+Matrix is of size (36, 36), 208
+	Running spmv: coo
diff --git a/benchmark/test/reference/spmv.matrix.stdout b/benchmark/test/reference/spmv.matrix.stdout
new file mode 100644
index 00000000000..ea5927ba148
--- /dev/null
+++ b/benchmark/test/reference/spmv.matrix.stdout
@@ -0,0 +1,20 @@
+[
+    {
+        "filename": "",
+        "spmv": {
+            "coo": {
+                "storage": 3328,
+                "max_relative_norm2": 1.0,
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            }
+        },
+        "rows": 36,
+        "cols": 36,
+        "nonzeros": 208,
+        "optimal": {
+            "spmv": "coo"
+        }
+    }
+]
diff --git a/benchmark/test/reference/spmv.profile.stderr b/benchmark/test/reference/spmv.profile.stderr
new file mode 100644
index 00000000000..dff3b58a9dd
--- /dev/null
+++ b/benchmark/test/reference/spmv.profile.stderr
@@ -0,0 +1,38 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 0 warm iterations and 1 running iterations
+The random seed for right hand sides is 42
+The formats are coo
+The number of right hand sides is 1
+Running test case stencil(100, 7pt)
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill_in_matrix_data
+DEBUG: end   dense::fill_in_matrix_data
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill_in_matrix_data
+DEBUG: end   dense::fill_in_matrix_data
+Matrix is of size (125, 125), 725
+DEBUG: begin stencil(100, 7pt)
+	Running spmv: coo
+DEBUG: begin coo
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: begin repetition
+DEBUG: begin apply(<typename>)
+DEBUG: begin coo::spmv
+DEBUG: end   coo::spmv
+DEBUG: end   apply(<typename>)
+DEBUG: end   repetition
+DEBUG: end   coo
+DEBUG: end   stencil(100, 7pt)
diff --git a/benchmark/test/reference/spmv.profile.stdout b/benchmark/test/reference/spmv.profile.stdout
new file mode 100644
index 00000000000..6e4701af719
--- /dev/null
+++ b/benchmark/test/reference/spmv.profile.stdout
@@ -0,0 +1,20 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "spmv": {
+            "coo": {
+                "storage": 11600,
+                "time": 1.0,
+                "repetitions": 1,
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725,
+        "optimal": {
+            "spmv": "coo"
+        }
+    }
+]
diff --git a/benchmark/test/reference/spmv.reordered.stderr b/benchmark/test/reference/spmv.reordered.stderr
new file mode 100644
index 00000000000..07044cc70f8
--- /dev/null
+++ b/benchmark/test/reference/spmv.reordered.stderr
@@ -0,0 +1,10 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The formats are coo
+The number of right hand sides is 1
+Running test case stencil(100, 7pt)
+Matrix is of size (125, 125), 725
+	Running spmv: coo
diff --git a/benchmark/test/reference/spmv.reordered.stdout b/benchmark/test/reference/spmv.reordered.stdout
new file mode 100644
index 00000000000..5404235cdf7
--- /dev/null
+++ b/benchmark/test/reference/spmv.reordered.stdout
@@ -0,0 +1,22 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "spmv": {
+            "coo": {
+                "storage": 11600,
+                "max_relative_norm2": 1.0,
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            }
+        },
+        "reordered": "amd",
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725,
+        "optimal": {
+            "spmv": "coo"
+        }
+    }
+]
diff --git a/benchmark/test/reference/spmv.simple.stderr b/benchmark/test/reference/spmv.simple.stderr
new file mode 100644
index 00000000000..07044cc70f8
--- /dev/null
+++ b/benchmark/test/reference/spmv.simple.stderr
@@ -0,0 +1,10 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The formats are coo
+The number of right hand sides is 1
+Running test case stencil(100, 7pt)
+Matrix is of size (125, 125), 725
+	Running spmv: coo
diff --git a/benchmark/test/reference/spmv.simple.stdout b/benchmark/test/reference/spmv.simple.stdout
new file mode 100644
index 00000000000..38f2598c616
--- /dev/null
+++ b/benchmark/test/reference/spmv.simple.stdout
@@ -0,0 +1,21 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "spmv": {
+            "coo": {
+                "storage": 11600,
+                "max_relative_norm2": 1.0,
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            }
+        },
+        "rows": 125,
+        "cols": 125,
+        "nonzeros": 725,
+        "optimal": {
+            "spmv": "coo"
+        }
+    }
+]
diff --git a/benchmark/test/reference/spmv_distributed.profile.stderr b/benchmark/test/reference/spmv_distributed.profile.stderr
new file mode 100644
index 00000000000..4cd21d00758
--- /dev/null
+++ b/benchmark/test/reference/spmv_distributed.profile.stderr
@@ -0,0 +1,140 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 0 warm iterations and 1 running iterations
+The random seed for right hand sides is 42
+The formats are [csr]x[csr]
+The number of right hand sides is 1
+Running test case stencil(100, 7pt, stencil)
+DEBUG: begin partition::build_ranges_from_global_size
+DEBUG: end   partition::build_ranges_from_global_size
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin partition::build_from_contiguous
+DEBUG: end   partition::build_from_contiguous
+DEBUG: begin partition::build_starting_indices
+DEBUG: end   partition::build_starting_indices
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill_in_matrix_data
+DEBUG: end   dense::fill_in_matrix_data
+DEBUG: begin partition::build_ranges_from_global_size
+DEBUG: end   partition::build_ranges_from_global_size
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin partition::build_from_contiguous
+DEBUG: end   partition::build_from_contiguous
+DEBUG: begin partition::build_starting_indices
+DEBUG: end   partition::build_starting_indices
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin dense::fill_in_matrix_data
+DEBUG: end   dense::fill_in_matrix_data
+Matrix is of size (81, 81), 144
+DEBUG: begin stencil(100, 7pt, stencil)
+	Running spmv: csr-csr
+DEBUG: begin csr-csr
+DEBUG: begin partition::build_ranges_from_global_size
+DEBUG: end   partition::build_ranges_from_global_size
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin partition::build_from_contiguous
+DEBUG: end   partition::build_from_contiguous
+DEBUG: begin partition::build_starting_indices
+DEBUG: end   partition::build_starting_indices
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin copy(<typename>)
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: end   copy(<typename>)
+DEBUG: begin components::fill_array
+DEBUG: end   components::fill_array
+DEBUG: begin copy(<typename>)
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: end   copy(<typename>)
+DEBUG: begin dense::fill
+DEBUG: end   dense::fill
+DEBUG: begin components::aos_to_soa
+DEBUG: end   components::aos_to_soa
+DEBUG: begin distributed_matrix::build_local_nonlocal
+DEBUG: end   distributed_matrix::build_local_nonlocal
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin components::convert_idxs_to_ptrs
+DEBUG: end   components::convert_idxs_to_ptrs
+DEBUG: begin components::convert_idxs_to_ptrs
+DEBUG: end   components::convert_idxs_to_ptrs
+DEBUG: begin copy
+DEBUG: end   copy
+DEBUG: begin copy(<typename>)
+DEBUG: begin dense::copy
+DEBUG: end   dense::copy
+DEBUG: end   copy(<typename>)
+DEBUG: begin repetition
+DEBUG: begin apply(<typename>)
+DEBUG: begin dense::row_gather
+DEBUG: end   dense::row_gather
+DEBUG: begin apply(<typename>)
+DEBUG: begin csr::spmv
+DEBUG: end   csr::spmv
+DEBUG: end   apply(<typename>)
+DEBUG: begin advanced_apply(<typename>)
+DEBUG: begin csr::advanced_spmv
+DEBUG: end   csr::advanced_spmv
+DEBUG: end   advanced_apply(<typename>)
+DEBUG: end   apply(<typename>)
+DEBUG: end   repetition
+DEBUG: end   csr-csr
+DEBUG: end   stencil(100, 7pt, stencil)
diff --git a/benchmark/test/reference/spmv_distributed.profile.stdout b/benchmark/test/reference/spmv_distributed.profile.stdout
new file mode 100644
index 00000000000..bbef87d0b89
--- /dev/null
+++ b/benchmark/test/reference/spmv_distributed.profile.stdout
@@ -0,0 +1,21 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "comm_pattern": "stencil",
+        "spmv": {
+            "csr-csr": {
+                "storage": 6420,
+                "time": 1.0,
+                "repetitions": 1,
+                "completed": true
+            }
+        },
+        "rows": 81,
+        "cols": 81,
+        "nonzeros": 144,
+        "optimal": {
+            "spmv": "csr-csr"
+        }
+    }
+]
diff --git a/benchmark/test/reference/spmv_distributed.simple.stderr b/benchmark/test/reference/spmv_distributed.simple.stderr
new file mode 100644
index 00000000000..7d59e4f4190
--- /dev/null
+++ b/benchmark/test/reference/spmv_distributed.simple.stderr
@@ -0,0 +1,10 @@
+This is Ginkgo 1.7.0 (master)
+    running with core module 1.7.0 (master)
+Running on reference(0)
+Running with 2 warm iterations and 10 running iterations
+The random seed for right hand sides is 42
+The formats are [csr]x[csr]
+The number of right hand sides is 1
+Running test case stencil(100, 7pt, stencil)
+Matrix is of size (81, 81), 144
+	Running spmv: csr-csr
diff --git a/benchmark/test/reference/spmv_distributed.simple.stdout b/benchmark/test/reference/spmv_distributed.simple.stdout
new file mode 100644
index 00000000000..77bdef168d3
--- /dev/null
+++ b/benchmark/test/reference/spmv_distributed.simple.stdout
@@ -0,0 +1,22 @@
+[
+    {
+        "size": 100,
+        "stencil": "7pt",
+        "comm_pattern": "stencil",
+        "spmv": {
+            "csr-csr": {
+                "storage": 6420,
+                "max_relative_norm2": 1.0,
+                "time": 1.0,
+                "repetitions": 10,
+                "completed": true
+            }
+        },
+        "rows": 81,
+        "cols": 81,
+        "nonzeros": 144,
+        "optimal": {
+            "spmv": "csr-csr"
+        }
+    }
+]
diff --git a/benchmark/test/solver.py b/benchmark/test/solver.py
new file mode 100755
index 00000000000..5dd1d840a4e
--- /dev/null
+++ b/benchmark/test/solver.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+import test_framework
+
+# check that all input modes work:
+# parameter
+test_framework.compare_output(
+    ["-input", '[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]'],
+    expected_stdout="solver.simple.stdout",
+    expected_stderr="solver.simple.stderr",
+)
+
+# stdin
+test_framework.compare_output(
+    [],
+    expected_stdout="solver.simple.stdout",
+    expected_stderr="solver.simple.stderr",
+    stdin='[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]',
+)
+
+# input file
+test_framework.compare_output(
+    ["-input", str(test_framework.sourcepath / "input.solver.json")],
+    expected_stdout="solver.simple.stdout",
+    expected_stderr="solver.simple.stderr",
+)
+
+# input matrix file
+test_framework.compare_output(
+    ["-input_matrix", str(test_framework.matrixpath)],
+    expected_stdout="solver.matrix.stdout",
+    expected_stderr="solver.matrix.stderr",
+)
+
+# profiler annotations
+test_framework.compare_output(
+    [
+        "-input",
+        '[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]',
+        "-profile",
+        "-profiler_hook",
+        "debug",
+    ],
+    expected_stdout="solver.profile.stdout",
+    expected_stderr="solver.profile.stderr",
+)
+
+# reordering
+test_framework.compare_output(
+    ["-reorder", "amd"],
+    expected_stdout="solver.reordered.stdout",
+    expected_stderr="solver.reordered.stderr",
+    stdin='[{"size": 100, "stencil": "7pt", "optimal": {"spmv": "csr"}}]',
+)
diff --git a/benchmark/test/solver_distributed.py b/benchmark/test/solver_distributed.py
new file mode 100644
index 00000000000..54bbb030077
--- /dev/null
+++ b/benchmark/test/solver_distributed.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+import test_framework
+
+# check that all input modes work:
+# parameter
+test_framework.compare_output(
+    [
+        "-input",
+        '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]',
+    ],
+    expected_stdout="distributed_solver.simple.stdout",
+    expected_stderr="distributed_solver.simple.stderr",
+)
+
+# stdin
+test_framework.compare_output(
+    [],
+    expected_stdout="distributed_solver.simple.stdout",
+    expected_stderr="distributed_solver.simple.stderr",
+    stdin='[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]',
+)
+
+# input file
+test_framework.compare_output(
+    ["-input", str(test_framework.sourcepath / "input.distributed_solver.json")],
+    expected_stdout="distributed_solver.simple.stdout",
+    expected_stderr="distributed_solver.simple.stderr",
+)
+
+# input matrix file
+test_framework.compare_output(
+    ["-input_matrix", str(test_framework.matrixpath)],
+    expected_stdout="distributed_solver.matrix.stdout",
+    expected_stderr="distributed_solver.matrix.stderr",
+)
+
+# profiler annotations
+test_framework.compare_output(
+    [
+        "-input",
+        '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil", "optimal": {"spmv": "csr-csr"}}]',
+        "-profile",
+        "-profiler_hook",
+        "debug",
+    ],
+    expected_stdout="distributed_solver.profile.stdout",
+    expected_stderr="distributed_solver.profile.stderr",
+)
diff --git a/benchmark/test/sparse_blas.py b/benchmark/test/sparse_blas.py
new file mode 100755
index 00000000000..8e6cda3c9bd
--- /dev/null
+++ b/benchmark/test/sparse_blas.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+import test_framework
+
+# check that all input modes work:
+# parameter
+test_framework.compare_output(
+    ["-operations", "transpose", "-input",
+        '[{"size": 100, "stencil": "7pt"}]'],
+    expected_stdout="sparse_blas.simple.stdout",
+    expected_stderr="sparse_blas.simple.stderr",
+)
+
+# stdin
+test_framework.compare_output(
+    ["-operations", "transpose"],
+    expected_stdout="sparse_blas.simple.stdout",
+    expected_stderr="sparse_blas.simple.stderr",
+    stdin='[{"size": 100, "stencil": "7pt"}]',
+)
+
+# input file
+test_framework.compare_output(
+    [
+        "-operations",
+        "transpose",
+        "-input",
+        str(test_framework.sourcepath / "input.mtx.json"),
+    ],
+    expected_stdout="sparse_blas.simple.stdout",
+    expected_stderr="sparse_blas.simple.stderr",
+)
+
+# input matrix file
+test_framework.compare_output(
+    [
+        "-operations",
+        "transpose",
+        "-input_matrix",
+        str(test_framework.matrixpath),
+    ],
+    expected_stdout="sparse_blas.matrix.stdout",
+    expected_stderr="sparse_blas.matrix.stderr",
+)
+
+# profiler annotations (transpose has the smallest number of allocations)
+test_framework.compare_output(
+    [
+        "-operations",
+        "transpose",
+        "-input",
+        '[{"size": 100, "stencil": "7pt"}]',
+        "-profile",
+        "-profiler_hook",
+        "debug",
+    ],
+    expected_stdout="sparse_blas.profile.stdout",
+    expected_stderr="sparse_blas.profile.stderr",
+)
+
+# reordering
+test_framework.compare_output(
+    ["-operations", "symbolic_cholesky", "-reorder", "amd"],
+    expected_stdout="sparse_blas.reordered.stdout",
+    expected_stderr="sparse_blas.reordered.stderr",
+    stdin='[{"size": 100, "stencil": "7pt"}]',
+)
diff --git a/benchmark/test/spmv.py b/benchmark/test/spmv.py
new file mode 100755
index 00000000000..f6f4a4b5c39
--- /dev/null
+++ b/benchmark/test/spmv.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+import test_framework
+
+# check that all input modes work:
+# parameter
+test_framework.compare_output(
+    ["-input", '[{"size": 100, "stencil": "7pt"}]'],
+    expected_stdout="spmv.simple.stdout",
+    expected_stderr="spmv.simple.stderr",
+)
+
+# stdin
+test_framework.compare_output(
+    [],
+    expected_stdout="spmv.simple.stdout",
+    expected_stderr="spmv.simple.stderr",
+    stdin='[{"size": 100, "stencil": "7pt"}]',
+)
+
+# input file
+test_framework.compare_output(
+    ["-input", str(test_framework.sourcepath / "input.mtx.json")],
+    expected_stdout="spmv.simple.stdout",
+    expected_stderr="spmv.simple.stderr",
+)
+
+# input matrix file
+test_framework.compare_output(
+    ["-input_matrix", str(test_framework.matrixpath)],
+    expected_stdout="spmv.matrix.stdout",
+    expected_stderr="spmv.matrix.stderr",
+)
+
+# profiler annotations
+test_framework.compare_output(
+    [
+        "-input",
+        '[{"size": 100, "stencil": "7pt"}]',
+        "-profile",
+        "-profiler_hook",
+        "debug",
+    ],
+    expected_stdout="spmv.profile.stdout",
+    expected_stderr="spmv.profile.stderr",
+)
+
+# stdin
+test_framework.compare_output(
+    ["-reorder", "amd"],
+    expected_stdout="spmv.reordered.stdout",
+    expected_stderr="spmv.reordered.stderr",
+    stdin='[{"size": 100, "stencil": "7pt"}]',
+)
diff --git a/benchmark/test/spmv_distributed.py b/benchmark/test/spmv_distributed.py
new file mode 100644
index 00000000000..356db48459e
--- /dev/null
+++ b/benchmark/test/spmv_distributed.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+import test_framework
+
+# check that all input modes work:
+# parameter
+test_framework.compare_output_distributed(
+    ["-input", '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]'],
+    expected_stdout="spmv_distributed.simple.stdout",
+    expected_stderr="spmv_distributed.simple.stderr",
+    num_procs=3,
+)
+
+# stdin
+test_framework.compare_output_distributed(
+    [],
+    expected_stdout="spmv_distributed.simple.stdout",
+    expected_stderr="spmv_distributed.simple.stderr",
+    num_procs=3,
+    stdin='[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]',
+)
+
+# input file
+test_framework.compare_output_distributed(
+    ["-input", str(test_framework.sourcepath / "input.distributed_mtx.json")],
+    expected_stdout="spmv_distributed.simple.stdout",
+    expected_stderr="spmv_distributed.simple.stderr",
+    num_procs=3,
+)
+
+# profiler annotations
+test_framework.compare_output_distributed(
+    [
+        "-input",
+        '[{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}]',
+        "-profile",
+        "-profiler_hook",
+        "debug",
+    ],
+    expected_stdout="spmv_distributed.profile.stdout",
+    expected_stderr="spmv_distributed.profile.stderr",
+    num_procs=3,
+)
diff --git a/benchmark/test/test_framework.py.in b/benchmark/test/test_framework.py.in
new file mode 100644
index 00000000000..62c4293e7c0
--- /dev/null
+++ b/benchmark/test/test_framework.py.in
@@ -0,0 +1,233 @@
+#!/usr/bin/env python3
+import subprocess
+import difflib
+import json
+from typing import List, Tuple
+import re
+import pathlib
+import sys
+
+sourcepath = pathlib.Path("@CMAKE_CURRENT_SOURCE_DIR@")
+binpath = pathlib.Path("@PROJECT_BINARY_DIR@")
+matrixpath = pathlib.Path("@PROJECT_BINARY_DIR@/matrices/test/ani1.mtx")
+generate = False
+if len(sys.argv) > 2 and sys.argv[2] == "--generate":
+    generate = True
+denumberify_paths = [
+    "time",
+    "bandwidth",
+    "flops",
+    "components",
+    "residual_norm",
+    "rhs_norm",
+    "max_relative_norm2",
+]
+detypenameify_key_starts = [
+    "generate(", "apply(", "advanced_apply(", "copy(", "check("]
+empty_string_paths = ["filename"]
+empty_array_paths = [
+    "recurrent_residuals",
+    "true_residuals",
+    "implicit_residuals",
+    "iteration_timestamps",
+]
+
+
+def sanitize_json_key(key: str):
+    """Applies sanitation to a single key.
+
+    Strings that start with a name in detypenameify_key_starts will be truncated
+    """
+
+    for start in detypenameify_key_starts:
+        if key.startswith(start):
+            return start + "<typename>)"
+    return key
+
+
+def sanitize_json_value(key: str, value, sanitize_all: bool):
+    """Applies sanitation to a single key-value pair.
+
+    Strings with a key in empty_string_paths will be emptied
+    Numbers with a key in denumberify_paths will be set to 1.0
+    """
+
+    if key in empty_string_paths and isinstance(value, str):
+        return ""
+    if key in denumberify_paths and isinstance(value, float):
+        return 1.0
+    if key in denumberify_paths and isinstance(value, dict):
+        return sanitize_json(value, True)
+    if key in empty_array_paths and isinstance(value, list):
+        return []
+    return sanitize_json(value, sanitize_all)
+
+
+def sanitize_json(parsed_input, sanitize_all: bool = False):
+    """Removes non-deterministic parts of a parsed JSON input.
+
+    If sanitize_all is set to True, all nested float values will be set to 0.
+    Otherwise, only JSON object entries will be sanitized
+    using sanitize_json_key_value.
+    """
+
+    if isinstance(parsed_input, dict):
+        return {
+            sanitize_json_key(key): sanitize_json_value(key, value, sanitize_all)
+            for key, value in parsed_input.items()
+        }
+    elif isinstance(parsed_input, list):
+        return [sanitize_json(e, sanitize_all) for e in parsed_input]
+    elif sanitize_all and isinstance(parsed_input, float):
+        return 1.0
+    else:
+        return parsed_input
+
+
+def sanitize_json_text(input: str) -> List[str]:
+    """Sanitizes the given input JSON string.
+
+    The JSON values will be parsed and sanitized through sanitize_json(...)
+    and pretty-printed to replace the original JSON input.
+    """
+
+    result = json.dumps(sanitize_json(json.loads(input)), indent=4)
+    # json.dumps doesn't add a trailing newline
+    return result.splitlines() + [""]
+
+
+def sanitize_text(
+    input: str,
+    ignore_patterns: List[str],
+    replace_patterns: List[Tuple[str, str]],
+) -> List[str]:
+    """Sanitizes the given input string.
+
+    Every input line matching an entry from ignore_patterns will be removed.
+    Every line matching the first string in an entry from replace_patterns
+    will be replaced by the second string.
+    The output is guaranteed to end with an empty line.
+    """
+
+    lines = input.splitlines()
+    output_lines = []
+    patterns = [re.compile(pattern) for pattern in ignore_patterns]
+    for line in lines:
+        for pattern, replacement in replace_patterns:
+            line = re.sub(pattern, replacement, line)
+        keep = True
+        for compiled_pattern in patterns:
+            if re.match(compiled_pattern, line):
+                keep = False
+                break
+        if keep:
+            output_lines.append(line)
+    if len(output_lines) == 0 or output_lines[-1] != "":
+        output_lines.append("")
+    return output_lines
+
+
+def compare_output_impl(
+    args: List[str],
+    expected_stdout: str,
+    expected_stderr: str,
+    stdin: str,
+    launcher_flags: List[str],
+):
+    args = [sys.argv[1]] + args
+    expected_stdout = str(sourcepath / "reference" / expected_stdout)
+    expected_stderr = str(sourcepath / "reference" / expected_stderr)
+    result = subprocess.run(
+        args=launcher_flags + args,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        input=bytes(stdin, "utf-8"),
+    )
+    print(
+        "TEST: {}".format(
+            " ".join(["'{}'".format(arg) for arg in launcher_flags + args])
+        )
+    )
+    ignore_patterns = [
+        "    the .* module is",  # version numbers
+        "DEBUG: (begin|end  ) (allocate|free)",  # allocations
+    ]
+    typename_patterns = [
+        ("(apply|generate|check|copy|move)\([^())]*\)", "\\1(<typename>)"),
+        ("what\\(\\): .*", "what(): <removed>"),
+        (re.escape(str(matrixpath)), "<filename>"),
+    ]
+    if generate:
+        open(expected_stdout, "w").write(
+            "\n".join(sanitize_json_text(result.stdout.decode()))
+        )
+        open(expected_stderr, "w").write(
+            "\n".join(
+                sanitize_text(
+                    result.stderr.decode(),
+                    ignore_patterns=ignore_patterns,
+                    replace_patterns=typename_patterns,
+                )
+            )
+        )
+        print("GENERATED")
+        return
+    result_stdout_processed = sanitize_json_text(result.stdout.decode())
+    result_stderr_processed = sanitize_text(
+        result.stderr.decode(),
+        ignore_patterns=ignore_patterns,
+        replace_patterns=typename_patterns,
+    )
+    expected_stdout_processed = sanitize_json_text(
+        open(expected_stdout).read())
+    expected_stderr_processed = sanitize_text(
+        open(expected_stderr).read(),
+        ignore_patterns=ignore_patterns,
+        replace_patterns=typename_patterns,
+    )
+    failed = False
+    if result_stdout_processed != expected_stdout_processed:
+        print("FAIL: stdout differs")
+        print(
+            "\n".join(
+                difflib.unified_diff(
+                    expected_stdout_processed, result_stdout_processed)
+            )
+        )
+        failed = True
+    if result_stderr_processed != expected_stderr_processed:
+        print("FAIL: stderr differs")
+        print(
+            "\n".join(
+                difflib.unified_diff(
+                    expected_stderr_processed, result_stderr_processed)
+            )
+        )
+        failed = True
+    if failed:
+        exit(1)
+    print("PASS")
+
+
+def compare_output(
+    args: List[str], expected_stdout: str, expected_stderr: str, stdin: str = ""
+):
+    compare_output_impl(
+        args,
+        expected_stdout=expected_stdout,
+        expected_stderr=expected_stderr,
+        stdin=stdin,
+        launcher_flags=[],
+    )
+
+
+def compare_output_distributed(
+    args, expected_stdout, expected_stderr, num_procs, stdin=""
+):
+    compare_output_impl(
+        args,
+        expected_stdout,
+        expected_stderr,
+        stdin,
+        ["@MPIEXEC_EXECUTABLE@", "@MPIEXEC_NUMPROC_FLAG@", str(num_procs)],
+    )
diff --git a/benchmark/tools/mtx_to_binary.cpp b/benchmark/tools/mtx_to_binary.cpp
index 487687ff605..1d2f4f94e02 100644
--- a/benchmark/tools/mtx_to_binary.cpp
+++ b/benchmark/tools/mtx_to_binary.cpp
@@ -61,8 +61,8 @@ void process(const char* input, const char* output, bool validate)
         }
     }
     if (validate) {
-        std::ifstream ois(output, std::ios_base::in | std::ios_base::binary);
-        auto data2 = gko::read_binary_raw<ValueType, gko::int64>(ois);
+        std::ifstream is(output, std::ios_base::in | std::ios_base::binary);
+        auto data2 = gko::read_binary_raw<ValueType, gko::int64>(is);
         std::cerr << "Comparing against previously read data\n";
         if (data.size != data2.size) {
             throw GKO_STREAM_ERROR("Mismatching sizes!");
diff --git a/benchmark/utils/cuda_linops.cpp b/benchmark/utils/cuda_linops.cpp
index dd1dda5c774..e2221614d9c 100644
--- a/benchmark/utils/cuda_linops.cpp
+++ b/benchmark/utils/cuda_linops.cpp
@@ -438,9 +438,7 @@ class CusparseCsrEx
           trans_(CUSPARSE_OPERATION_NON_TRANSPOSE),
           buffer_(exec)
     {
-#ifdef ALLOWMP
         algmode_ = CUSPARSE_ALG_MERGE_PATH;
-#endif  // ALLOWMP
     }
 
 private:
diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp
index deecc4b530c..6b024b16d1c 100644
--- a/benchmark/utils/formats.hpp
+++ b/benchmark/utils/formats.hpp
@@ -78,8 +78,8 @@ std::string format_description =
     "     Irregular Sparse Matrices.\n"
     "csr: Compressed Sparse Row storage. Ginkgo implementation with\n"
     "     automatic strategy.\n"
-    "csrc: Ginkgo's CSR implementation with automatic stategy.\n"
-    "csri: Ginkgo's CSR implementation with inbalance strategy.\n"
+    "csrc: Ginkgo's CSR implementation with automatic strategy.\n"
+    "csri: Ginkgo's CSR implementation with imbalance strategy.\n"
     "csrm: Ginkgo's CSR implementation with merge_path strategy.\n"
     "csrs: Ginkgo's CSR implementation with sparselib strategy.\n"
     "ell: Ellpack format according to Bell and Garland: Efficient Sparse\n"
diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp
index 92c3e5c9b13..6012cb6c77b 100644
--- a/benchmark/utils/general.hpp
+++ b/benchmark/utils/general.hpp
@@ -41,10 +41,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <array>
 #include <fstream>
 #include <functional>
+#include <iomanip>
 #include <map>
 #include <ostream>
 #include <random>
 #include <sstream>
+#include <stdexcept>
 #include <string>
 #include <type_traits>
 #include <utility>
@@ -52,10 +54,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <gflags/gflags.h>
-#include <rapidjson/document.h>
-#include <rapidjson/istreamwrapper.h>
-#include <rapidjson/ostreamwrapper.h>
-#include <rapidjson/prettywriter.h>
+
+
+#include <ginkgo/core/base/memory.hpp>
 
 
 #include "benchmark/utils/json.hpp"
@@ -69,6 +70,10 @@ DEFINE_string(executor, "reference",
               "The executor used to run the benchmarks, one of: reference, "
               "omp, cuda, hip");
 
+DEFINE_string(allocator, "default",
+              "The allocator used in the executor. Only relevant for CUDA and "
+              "HIP executors, one of: default, async, host, unified");
+
 DEFINE_uint32(device_id, 0, "ID of the device where to run the code");
 
 DEFINE_bool(overwrite, false,
@@ -92,10 +97,6 @@ DEFINE_string(
 DEFINE_bool(detailed, true,
             "If set, performs several runs to obtain more detailed results");
 
-DEFINE_bool(keep_errors, true,
-            "If set, writes exception messages during the execution into the "
-            "JSON output");
-
 DEFINE_bool(nested_names, false, "If set, separately logs nested operations");
 
 DEFINE_bool(profile, false,
@@ -137,6 +138,9 @@ DEFINE_double(
     "is lower than or equal to 1, the timing region is always 1 repetition.");
 
 
+std::unique_ptr<std::istream> input_stream;
+
+
 /**
  * Parses arguments through gflags and initialize a documentation string.
  *
@@ -146,27 +150,32 @@ DEFINE_double(
  * @param format  the format of the benchmark input data
  */
 void initialize_argument_parsing(int* argc, char** argv[], std::string& header,
-                                 std::string& format)
+                                 std::string& format, bool do_print = true)
 {
-    std::ostringstream doc;
-    doc << header << "Usage: " << (*argv)[0] << " [options]\n"
-        << format
-        << "  The results are written on standard output, in the same "
-           "format,\n"
-        << "  but with test cases extended to include an additional member "
-           "\n"
-        << "  object for each benchmark run.\n"
-        << "  If run with a --backup flag, an intermediate result is "
-           "written \n"
-        << "  to a file in the same format. The backup file can be used as "
-           "\n"
-        << "  input to this test suite, and the benchmarking will \n"
-        << "  continue from the point where the backup file was created.";
-
-    gflags::SetUsageMessage(doc.str());
-    std::ostringstream ver;
-    ver << gko::version_info::get();
-    gflags::SetVersionString(ver.str());
+    if (do_print) {
+        std::ostringstream doc;
+        doc << header << "Usage: " << (*argv)[0] << " [options]\n"
+            << format
+            << "  The results are written on standard output, in the same "
+               "format,\n"
+            << "  but with test cases extended to include an additional member "
+               "\n"
+            << "  object for each benchmark run.\n"
+            << "  If run with a --backup flag, an intermediate result is "
+               "written \n"
+            << "  to a file in the same format. The backup file can be used as "
+               "\n"
+            << "  input to this test suite, and the benchmarking will \n"
+            << "  continue from the point where the backup file was created.";
+
+        gflags::SetUsageMessage(doc.str());
+        std::ostringstream ver;
+        ver << gko::version_info::get();
+        gflags::SetVersionString(ver.str());
+    } else {
+        gflags::SetUsageMessage("");
+        gflags::SetVersionString("");
+    }
     gflags::ParseCommandLineFlags(argc, argv, true);
     if (FLAGS_profile) {
         FLAGS_repetitions = "1";
@@ -176,10 +185,18 @@ void initialize_argument_parsing(int* argc, char** argv[], std::string& header,
             FLAGS_profiler_hook = "auto";
         }
     }
+    std::string input_str(FLAGS_input);
+    if (!input_str.empty()) {
+        if (input_str.back() == ']') {
+            input_stream = std::make_unique<std::stringstream>(input_str);
+        } else {
+            input_stream = std::make_unique<std::ifstream>(input_str);
+        }
+    }
 }
 
 /**
- * Print general benchmark informations using the common available parameters
+ * Print general benchmark information using the common available parameters
  *
  * @param extra  describes benchmark specific extra parameters to output
  */
@@ -187,20 +204,19 @@ void print_general_information(const std::string& extra)
 {
     std::clog << gko::version_info::get() << std::endl
               << "Running on " << FLAGS_executor << "(" << FLAGS_device_id
-              << ")" << std::endl
+              << ")\n"
               << "Running with " << FLAGS_warmup << " warm iterations and ";
     if (FLAGS_repetitions == "auto") {
         std::clog << "adaptively determined repetititions with "
                   << FLAGS_min_repetitions
                   << " <= rep <= " << FLAGS_max_repetitions
-                  << " and a minimal runtime of " << FLAGS_min_runtime << "s"
-                  << std::endl;
+                  << " and a minimal runtime of " << FLAGS_min_runtime << "s\n";
     } else {
-        std::clog << FLAGS_repetitions << " running iterations" << std::endl;
+        std::clog << FLAGS_repetitions << " running iterations\n";
     }
     std::clog << "The random seed for right hand sides is " << FLAGS_seed
-              << std::endl
-              << extra;
+              << '\n'
+              << extra << std::endl;
 }
 
 
@@ -229,32 +245,19 @@ std::shared_ptr<gko::log::ProfilerHook> create_profiler_hook(
 }
 
 
-struct owning_profiling_scope_guard {
-    std::string name;
-    gko::log::profiling_scope_guard guard;
-
-    owning_profiling_scope_guard() = default;
-
-    owning_profiling_scope_guard(std::string name_,
-                                 gko::log::ProfilerHook* profiler_hook)
-        : name(std::move(name_)), guard{profiler_hook->user_range(name.c_str())}
-    {}
-};
-
-
 struct annotate_functor {
-    owning_profiling_scope_guard operator()(std::string name) const
+    gko::log::profiling_scope_guard operator()(const char* name) const
     {
         if (profiler_hook) {
-            return owning_profiling_scope_guard{std::move(name),
-                                                profiler_hook.get()};
+            return profiler_hook->user_range(name);
         }
         return {};
     }
 
-    gko::log::profiling_scope_guard operator()(const char* name) const
+    gko::log::profiling_scope_guard operator()(const char* name,
+                                               bool should_annotate) const
     {
-        if (profiler_hook) {
+        if (profiler_hook && should_annotate) {
             return profiler_hook->user_range(name);
         }
         return {};
@@ -292,25 +295,15 @@ std::vector<std::string> split(const std::string& s, char delimiter = ',')
 // returns the stream to be used as input of the application
 std::istream& get_input_stream()
 {
-    static auto stream = []() -> std::unique_ptr<std::istream> {
-        std::string input_str(FLAGS_input);
-        if (input_str.empty()) {
-            return nullptr;
-        }
-        if (input_str.back() == ']') {
-            return std::make_unique<std::stringstream>(input_str);
-        }
-        return std::make_unique<std::ifstream>(input_str);
-    }();
-    if (stream) {
-        return *stream;
+    if (input_stream) {
+        return *input_stream;
     }
     return std::cin;
 }
 
 
 // backup generation
-void backup_results(rapidjson::Document& results)
+void backup_results(json& results)
 {
     static int next = 0;
     static auto filenames = []() -> std::array<std::string, 2> {
@@ -329,6 +322,40 @@ void backup_results(rapidjson::Document& results)
 }
 
 
+inline std::shared_ptr<gko::CudaAllocatorBase> create_cuda_allocator()
+{
+    std::string flag{FLAGS_allocator};
+    if (flag == "default") {
+        return std::make_shared<gko::CudaAllocator>();
+    } else if (flag == "async") {
+        return std::make_shared<gko::CudaAsyncAllocator>(nullptr);
+    } else if (flag == "unified") {
+        return std::make_shared<gko::CudaUnifiedAllocator>(FLAGS_device_id);
+    } else if (flag == "host") {
+        return std::make_shared<gko::CudaHostAllocator>(FLAGS_device_id);
+    } else {
+        throw std::runtime_error{"Unknown allocator type " + flag};
+    }
+}
+
+
+inline std::shared_ptr<gko::HipAllocatorBase> create_hip_allocator()
+{
+    std::string flag{FLAGS_allocator};
+    if (flag == "default") {
+        return std::make_shared<gko::HipAllocator>();
+    } else if (flag == "async") {
+        return std::make_shared<gko::HipAsyncAllocator>(nullptr);
+    } else if (flag == "unified") {
+        return std::make_shared<gko::HipUnifiedAllocator>(FLAGS_device_id);
+    } else if (flag == "host") {
+        return std::make_shared<gko::HipHostAllocator>(FLAGS_device_id);
+    } else {
+        throw std::runtime_error{"Unknown allocator type " + flag};
+    }
+}
+
+
 // executor mapping
 const std::map<std::string, std::function<std::shared_ptr<gko::Executor>(bool)>>
     executor_factory{
@@ -337,12 +364,14 @@ const std::map<std::string, std::function<std::shared_ptr<gko::Executor>(bool)>>
         {"cuda",
          [](bool) {
              return gko::CudaExecutor::create(FLAGS_device_id,
-                                              gko::OmpExecutor::create(), true);
+                                              gko::OmpExecutor::create(),
+                                              create_cuda_allocator());
          }},
         {"hip",
          [](bool) {
              return gko::HipExecutor::create(FLAGS_device_id,
-                                             gko::OmpExecutor::create(), true);
+                                             gko::OmpExecutor::create(),
+                                             create_hip_allocator());
          }},
         {"dpcpp", [](bool use_gpu_timer) {
              auto property = dpcpp_queue_property::in_order;
@@ -368,16 +397,17 @@ const std::map<std::string,
          [](MPI_Comm comm) {
              FLAGS_device_id = gko::experimental::mpi::map_rank_to_device_id(
                  comm, gko::CudaExecutor::get_num_devices());
-             return gko::CudaExecutor::create(
-                 FLAGS_device_id, gko::ReferenceExecutor::create(), false,
-                 gko::allocation_mode::device);
+             return gko::CudaExecutor::create(FLAGS_device_id,
+                                              gko::ReferenceExecutor::create(),
+                                              create_cuda_allocator());
          }},
         {"hip",
          [](MPI_Comm comm) {
              FLAGS_device_id = gko::experimental::mpi::map_rank_to_device_id(
                  comm, gko::HipExecutor::get_num_devices());
-             return gko::HipExecutor::create(
-                 FLAGS_device_id, gko::ReferenceExecutor::create(), true);
+             return gko::HipExecutor::create(FLAGS_device_id,
+                                             gko::ReferenceExecutor::create(),
+                                             create_hip_allocator());
          }},
         {"dpcpp", [](MPI_Comm comm) {
              if (gko::DpcppExecutor::get_num_devices("gpu")) {
@@ -530,279 +560,4 @@ gko::remove_complex<ValueType> compute_max_relative_norm2(
 }
 
 
-/**
- * A class for controlling the number warmup and timed iterations.
- *
- * The behavior is determined by the following flags
- * - 'repetitions' switch between fixed and adaptive number of iterations
- * - 'warmup' warmup iterations, applies in fixed and adaptive case
- * - 'min_repetitions' minimal number of repetitions (adaptive case)
- * - 'max_repetitions' maximal number of repetitions (adaptive case)
- * - 'min_runtime' minimal total runtime (adaptive case)
- * - 'repetition_growth_factor' controls the increase between two successive
- *   timings
- *
- * Usage:
- * `IterationControl` exposes the member functions:
- * - `warmup_run()`: controls run defined by `warmup` flag
- * - `run(bool)`: controls run defined by all other flags
- * - `get_timer()`: access to underlying timer
- * The first two methods return an object that is to be used in a range-based
- * for loop:
- * ```
- * IterationControl ic(get_timer(...));
- *
- * // warmup run always uses fixed number of iteration and does not issue
- * // timings
- * for(auto status: ic.warmup_run()){
- *   // execute benchmark
- * }
- * // run may use adaptive number of iterations (depending on cmd line flag)
- * // and issues timing (unless manage_timings is false)
- * for(auto status: ic.run(manage_timings [default is true])){
- *   if(! manage_timings) ic.get_timer->tic();
- *   // execute benchmark
- *   if(! manage_timings) ic.get_timer->toc();
- * }
- *
- * ```
- * At the beginning of both methods, the timer is reset.
- * The `status` object exposes the member
- * - `cur_it`, containing the current iteration number,
- * and the methods
- * - `is_finished`, checks if the benchmark is finished,
- */
-class IterationControl {
-    using IndexType = unsigned int;  //!< to be compatible with GFLAGS type
-
-    class run_control;
-
-public:
-    /**
-     * Creates an `IterationControl` object.
-     *
-     * Uses the commandline flags to setup the stopping criteria for the
-     * warmup and timed run.
-     *
-     * @param timer  the timer that is to be used for the timings
-     */
-    explicit IterationControl(const std::shared_ptr<Timer>& timer)
-    {
-        status_warmup_ = {TimerManager{timer, false}, FLAGS_warmup,
-                          FLAGS_warmup, 0., 0};
-        if (FLAGS_repetitions == "auto") {
-            status_run_ = {TimerManager{timer, true}, FLAGS_min_repetitions,
-                           FLAGS_max_repetitions, FLAGS_min_runtime};
-        } else {
-            const auto reps =
-                static_cast<unsigned int>(std::stoi(FLAGS_repetitions));
-            status_run_ = {TimerManager{timer, true}, reps, reps, 0., 0};
-        }
-    }
-
-    IterationControl() = default;
-    IterationControl(const IterationControl&) = default;
-    IterationControl(IterationControl&&) = default;
-
-    /**
-     * Creates iterable `run_control` object for the warmup run.
-     *
-     * This run uses always a fixed number of iterations.
-     */
-    run_control warmup_run()
-    {
-        status_warmup_.cur_it = 0;
-        status_warmup_.managed_timer.clear();
-        return run_control{&status_warmup_};
-    }
-
-    /**
-     * Creates iterable `run_control` object for the timed run.
-     *
-     * This run may be adaptive, depending on the commandline flags.
-     *
-     * @param manage_timings If true, the timer calls (`tic/toc`) are handled
-     * by the `run_control` object, otherwise they need to be executed outside
-     */
-    run_control run(bool manage_timings = true)
-    {
-        status_run_.cur_it = 0;
-        status_run_.managed_timer.clear();
-        status_run_.managed_timer.manage_timings = manage_timings;
-        return run_control{&status_run_};
-    }
-
-    std::shared_ptr<Timer> get_timer() const
-    {
-        return status_run_.managed_timer.timer;
-    }
-
-    /**
-     * Compute the time from the given statistical method
-     *
-     * @param method  the statistical method. If the timer does not have the
-     *                same iteration as the IterationControl, it can only use
-     *                average from the IterationControl.
-     *
-     * @return the statistical time
-     */
-    double compute_time(const std::string& method = "average") const
-    {
-        if (status_run_.managed_timer.timer->get_num_repetitions() ==
-            this->get_num_repetitions()) {
-            return status_run_.managed_timer.compute_time(method);
-        } else {
-            assert(method == "average");
-            return status_run_.managed_timer.get_total_time() /
-                   this->get_num_repetitions();
-        }
-    }
-
-    IndexType get_num_repetitions() const { return status_run_.cur_it; }
-
-private:
-    struct TimerManager {
-        std::shared_ptr<Timer> timer;
-        bool manage_timings = false;
-
-        void tic()
-        {
-            if (manage_timings) {
-                timer->tic();
-            }
-        }
-        void toc(unsigned int num = 1)
-        {
-            if (manage_timings) {
-                timer->toc(num);
-            }
-        }
-
-        void clear() { timer->clear(); }
-
-        double get_total_time() const { return timer->get_total_time(); }
-
-        double compute_time(const std::string& method = "average") const
-        {
-            return timer->compute_time(method);
-        }
-    };
-
-    /**
-     * Stores stopping criteria of the adaptive benchmark run as well as the
-     * current iteration number.
-     */
-    struct status {
-        TimerManager managed_timer{};
-
-        IndexType min_it = 0;
-        IndexType max_it = 0;
-        double max_runtime = 0.;
-
-        IndexType cur_it = 0;
-
-        /**
-         * checks if the adaptive run is complete
-         *
-         * the adaptive run is complete if:
-         * - the minimum number of iteration is reached
-         * - and either:
-         *   - the maximum number of repetitions is reached
-         *   - the total runtime is above the threshold
-         *
-         * @return completeness state of the adaptive run
-         */
-        bool is_finished() const
-        {
-            return cur_it >= min_it &&
-                   (cur_it >= max_it ||
-                    managed_timer.get_total_time() >= max_runtime);
-        }
-    };
-
-    /**
-     * Iterable class managing the benchmark iteration.
-     *
-     * Has to be used in a range-based for loop.
-     */
-    struct run_control {
-        struct iterator {
-            /**
-             * Increases the current iteration count and finishes timing if
-             * necessary.
-             *
-             * As `++it` is the last step of a for-loop, the managed_timer is
-             * stopped, if enough iterations have passed since the last timing.
-             * The interval between two timings is steadily increased to
-             * reduce the timing overhead.
-             */
-            iterator operator++()
-            {
-                cur_info->cur_it++;
-                if (cur_info->cur_it >= next_timing && !stopped) {
-                    cur_info->managed_timer.toc(
-                        static_cast<unsigned>(cur_info->cur_it - start_timing));
-                    stopped = true;
-                    next_timing = static_cast<IndexType>(std::ceil(
-                        next_timing * FLAGS_repetition_growth_factor));
-                    // If repetition_growth_factor <= 1, next_timing will be
-                    // next iteration.
-                    if (next_timing <= cur_info->cur_it) {
-                        next_timing = cur_info->cur_it + 1;
-                    }
-                }
-                return *this;
-            }
-
-            status operator*() const { return *cur_info; }
-
-            /**
-             * Checks if the benchmark is finished and handles timing, if
-             * necessary.
-             *
-             * As `begin != end` is the first step in a for-loop, the
-             * managed_timer is started, if it was previously stopped.
-             * Additionally, if the benchmark is complete and the managed_timer
-             * is still running it is stopped. (This may occur if the maximal
-             * number of repetitions is surpassed)
-             *
-             * Uses only the information from the `status` object, i.e.
-             * the right hand side is ignored.
-             *
-             * @return true if benchmark is not finished, else false
-             */
-            bool operator!=(const iterator&)
-            {
-                const bool is_finished = cur_info->is_finished();
-                if (!is_finished && stopped) {
-                    stopped = false;
-                    cur_info->managed_timer.tic();
-                    start_timing = cur_info->cur_it;
-                } else if (is_finished && !stopped) {
-                    cur_info->managed_timer.toc(
-                        static_cast<unsigned>(cur_info->cur_it - start_timing));
-                    stopped = true;
-                }
-                return !is_finished;
-            }
-
-            status* cur_info;
-            IndexType next_timing = 1;   //!< next iteration to stop timing
-            IndexType start_timing = 0;  //!< iteration for starting timing
-            bool stopped = true;
-        };
-
-        iterator begin() const { return iterator{info}; }
-
-        // not used, could potentially be used in c++17 as a sentinel
-        iterator end() const { return iterator{}; }
-
-        status* info;
-    };
-
-    status status_warmup_;
-    status status_run_;
-};
-
-
 #endif  // GKO_BENCHMARK_UTILS_GENERAL_HPP_
diff --git a/benchmark/utils/general_matrix.hpp b/benchmark/utils/general_matrix.hpp
new file mode 100644
index 00000000000..914684ce6e4
--- /dev/null
+++ b/benchmark/utils/general_matrix.hpp
@@ -0,0 +1,160 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_BENCHMARK_UTILS_GENERAL_MATRIX_HPP_
+#define GKO_BENCHMARK_UTILS_GENERAL_MATRIX_HPP_
+
+
+#include <ginkgo/ginkgo.hpp>
+
+
+#include <gflags/gflags.h>
+
+
+#include "benchmark/utils/general.hpp"
+#include "benchmark/utils/generator.hpp"
+
+
+std::string reordering_algorithm_desc =
+    "Reordering algorithm to apply to the input matrices:\n"
+    "    none - no reordering\n"
+    "    amd - Approximate Minimum Degree reordering algorithm\n"
+#if GKO_HAVE_METIS
+    "    nd - Nested Dissection reordering algorithm\n"
+#endif
+    "    rcm - Reverse Cuthill-McKee reordering algorithm\n"
+    "This is a preprocessing step whose runtime will not be included\n"
+    "in the measurements.";
+
+
+DEFINE_string(input_matrix, "",
+              "Filename of a matrix to be used as the single input. Overwrites "
+              "the value of the -input flag");
+
+
+#ifndef GKO_BENCHMARK_DISTRIBUTED
+DEFINE_string(reorder, "none", reordering_algorithm_desc.c_str());
+#endif
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<gko::matrix::Permutation<IndexType>> reorder(
+    gko::matrix_data<ValueType, IndexType>& data, json& test_case)
+{
+#ifndef GKO_BENCHMARK_DISTRIBUTED
+    if (FLAGS_reorder == "none") {
+        return nullptr;
+    }
+    using Csr = gko::matrix::Csr<ValueType, IndexType>;
+    auto ref = gko::ReferenceExecutor::create();
+    auto mtx = gko::share(Csr::create(ref));
+    mtx->read(data);
+    std::unique_ptr<gko::matrix::Permutation<IndexType>> perm;
+    if (FLAGS_reorder == "amd") {
+        perm = gko::experimental::reorder::Amd<IndexType>::build()
+                   .on(ref)
+                   ->generate(mtx);
+#if GKO_HAVE_METIS
+    } else if (FLAGS_reorder == "nd") {
+        perm = gko::experimental::reorder::NestedDissection<ValueType,
+                                                            IndexType>::build()
+                   .on(ref)
+                   ->generate(mtx);
+#endif
+    } else if (FLAGS_reorder == "rcm") {
+        perm = gko::experimental::reorder::Rcm<IndexType>::build()
+                   .on(ref)
+                   ->generate(mtx);
+    } else {
+        throw std::runtime_error{"Unknown reordering algorithm " +
+                                 FLAGS_reorder};
+    }
+    auto perm_arr =
+        gko::array<IndexType>::view(ref, data.size[0], perm->get_permutation());
+    gko::as<Csr>(mtx->permute(&perm_arr))->write(data);
+    test_case["reordered"] = FLAGS_reorder;
+    return perm;
+#else
+    // no reordering for distributed benchmarks
+    return nullptr;
+#endif
+}
+
+
+template <typename ValueType, typename IndexType>
+void permute(std::unique_ptr<gko::matrix::Dense<ValueType>>& vec,
+             gko::matrix::Permutation<IndexType>* perm)
+{
+    auto perm_arr = gko::array<IndexType>::view(
+        perm->get_executor(), perm->get_size()[0], perm->get_permutation());
+    vec = gko::as<gko::matrix::Dense<ValueType>>(vec->row_permute(&perm_arr));
+}
+
+
+template <typename ValueType, typename IndexType>
+void permute(
+    std::unique_ptr<gko::experimental::distributed::Vector<ValueType>>& vec,
+    gko::matrix::Permutation<IndexType>* perm)
+{}
+
+
+/**
+ * @copydoc initialize_argument_parsing
+ * @param additional_matrix_file_json  text to be appended to the
+ *                                     `{"filename":"..."}` JSON object that
+ *                                     will be used as input for the benchmark
+ *                                     if the `-input_matrix` flag is used.
+ */
+void initialize_argument_parsing_matrix(
+    int* argc, char** argv[], std::string& header, std::string& format,
+    std::string additional_matrix_file_json = "", bool do_print = true)
+{
+    initialize_argument_parsing(argc, argv, header, format, do_print);
+    std::string input_matrix_str{FLAGS_input_matrix};
+    if (!input_matrix_str.empty()) {
+        if (input_stream) {
+            std::cerr
+                << "-input and -input_matrix cannot be used simultaneously\n";
+            std::exit(1);
+        }
+        // create JSON for the filename via nlohmann_json to ensure the string
+        // is correctly escaped
+        auto json_template =
+            R"([{"filename":"")" + additional_matrix_file_json + "}]";
+        auto doc = json::parse(json_template);
+        doc[0]["filename"] = input_matrix_str;
+        input_stream = std::make_unique<std::stringstream>(doc.dump());
+    }
+}
+
+
+#endif  // GKO_BENCHMARK_UTILS_GENERAL_MATRIX_HPP_
diff --git a/benchmark/utils/generator.hpp b/benchmark/utils/generator.hpp
index 076d2954980..3491fb0fc2c 100644
--- a/benchmark/utils/generator.hpp
+++ b/benchmark/utils/generator.hpp
@@ -53,28 +53,48 @@ struct DefaultSystemGenerator {
     using Vec = vec<ValueType>;
 
     static gko::matrix_data<ValueType, IndexType> generate_matrix_data(
-        rapidjson::Value& config)
+        const json& config)
     {
-        if (config.HasMember("filename")) {
-            std::ifstream in(config["filename"].GetString());
-            return gko::read_generic_raw<ValueType, IndexType>(in);
-        } else if (config.HasMember("stencil")) {
-            return generate_stencil<ValueType, IndexType>(
-                config["stencil"].GetString(), config["size"].GetInt64());
+        gko::matrix_data<ValueType, IndexType> data;
+        if (config.contains("filename")) {
+            std::ifstream in(config["filename"].get<std::string>());
+            data = gko::read_generic_raw<ValueType, IndexType>(in);
+        } else if (config.contains("stencil")) {
+            data = generate_stencil<ValueType, IndexType>(
+                config["stencil"].get<std::string>(),
+                config["size"].get<gko::int64>());
         } else {
             throw std::runtime_error(
                 "No known way to generate matrix data found.");
         }
+        data.ensure_row_major_order();
+        return data;
     }
 
-    static std::string describe_config(rapidjson::Value& config)
+    static std::string get_example_config()
     {
-        if (config.HasMember("filename")) {
-            return config["filename"].GetString();
-        } else if (config.HasMember("stencil")) {
+        return json::
+            parse(R"([{"filename": "my_file.mtx"},{"filename": "my_file2.mtx"},{"size": 100, "stencil": "7pt"}])")
+                .dump(4);
+    }
+
+    static bool validate_config(const json& test_case)
+    {
+        return ((test_case.contains("size") && test_case.contains("stencil") &&
+                 test_case["size"].is_number_integer() &&
+                 test_case["stencil"].is_string()) ||
+                (test_case.contains("filename") &&
+                 test_case["filename"].is_string()));
+    }
+
+    static std::string describe_config(const json& config)
+    {
+        if (config.contains("filename")) {
+            return config["filename"].get<std::string>();
+        } else if (config.contains("stencil")) {
             std::stringstream ss;
-            ss << "stencil(" << config["size"].GetInt64() << ","
-               << config["stencil"].GetString() << ")";
+            ss << "stencil(" << config["size"].get<gko::int64>() << ", "
+               << config["stencil"].get<std::string>() << ")";
             return ss.str();
         } else {
             throw std::runtime_error("No known way to describe config.");
@@ -82,30 +102,30 @@ struct DefaultSystemGenerator {
     }
 
     static std::shared_ptr<gko::LinOp> generate_matrix_with_optimal_format(
-        std::shared_ptr<gko::Executor> exec, rapidjson::Value& config)
+        std::shared_ptr<gko::Executor> exec, json& config)
     {
         auto data = generate_matrix_data(config);
         return generate_matrix_with_format(
-            std::move(exec), config["optimal"]["spmv"].GetString(), data);
+            std::move(exec), config["optimal"]["spmv"].get<std::string>(),
+            data);
     }
 
     static std::shared_ptr<gko::LinOp> generate_matrix_with_format(
         std::shared_ptr<gko::Executor> exec, const std::string& format_name,
         const gko::matrix_data<ValueType, itype>& data,
-        rapidjson::Value* spmv_case = nullptr,
-        rapidjson::MemoryPoolAllocator<>* allocator = nullptr)
+        json* spmv_case = nullptr)
     {
         auto storage_logger = std::make_shared<StorageLogger>();
-        if (spmv_case && allocator) {
+        if (spmv_case) {
             exec->add_logger(storage_logger);
         }
 
         auto mtx =
             gko::share(::formats::matrix_factory(format_name, exec, data));
 
-        if (spmv_case && allocator) {
+        if (spmv_case) {
             exec->remove_logger(storage_logger);
-            storage_logger->write_data(*spmv_case, *allocator);
+            storage_logger->write_data(*spmv_case);
         }
 
         return mtx;
@@ -172,62 +192,79 @@ struct DistributedDefaultSystemGenerator {
     using Vec = dist_vec<value_type>;
 
     gko::matrix_data<value_type, index_type> generate_matrix_data(
-        rapidjson::Value& config) const
+        const json& config) const
     {
-        if (config.HasMember("filename")) {
-            std::ifstream in(config["filename"].GetString());
-            return gko::read_generic_raw<value_type, index_type>(in);
-        } else if (config.HasMember("stencil")) {
+        gko::matrix_data<value_type, index_type> data;
+        if (config.contains("filename")) {
+            std::ifstream in(config["filename"].get<std::string>());
+            data = gko::read_generic_raw<value_type, index_type>(in);
+        } else if (config.contains("stencil")) {
             auto local_size = static_cast<global_itype>(
-                config["size"].GetInt64() / comm.size());
-            return generate_stencil<value_type, index_type>(
-                config["stencil"].GetString(), comm, local_size,
-                config["comm_pattern"].GetString() == std::string("optimal"));
+                config["size"].get<gko::int64>() / comm.size());
+            data = generate_stencil<value_type, index_type>(
+                config["stencil"].get<std::string>(), comm, local_size,
+                config["comm_pattern"].get<std::string>() ==
+                    std::string("optimal"));
         } else {
             throw std::runtime_error(
                 "No known way to generate matrix data found.");
         }
+        data.ensure_row_major_order();
+        return data;
+    }
+
+    static std::string get_example_config()
+    {
+        return json::
+            parse(R"([{"size": 100, "stencil": "7pt", "comm_pattern": "stencil"}, {"filename": "my_file.mtx"}])")
+                .dump(4);
     }
 
-    std::string describe_config(rapidjson::Value& config) const
+    static bool validate_config(const json& test_case)
     {
-        if (config.HasMember("filename")) {
-            return config["filename"].GetString();
-        } else if (config.HasMember("stencil")) {
+        return ((test_case.contains("size") && test_case.contains("stencil") &&
+                 test_case.contains("comm_pattern") &&
+                 test_case["size"].is_number_integer() &&
+                 test_case["stencil"].is_string() &&
+                 test_case["comm_pattern"].is_string()) ||
+                (test_case.contains("filename") &&
+                 test_case["filename"].is_string()));
+    }
+
+    static std::string describe_config(const json& config)
+    {
+        if (config.contains("filename")) {
+            return config["filename"].get<std::string>();
+        } else if (config.contains("stencil")) {
             std::stringstream ss;
-            ss << "stencil(" << config["size"].GetInt64() << ","
-               << config["stencil"].GetString() << ","
-               << config["comm_pattern"].GetString() << ")";
+            ss << "stencil(" << config["size"].get<gko::int64>() << ", "
+               << config["stencil"].get<std::string>() << ", "
+               << config["comm_pattern"].get<std::string>() << ")";
             return ss.str();
         } else {
             throw std::runtime_error("No known way to describe config.");
         }
     }
 
-    std::shared_ptr<gko::LinOp> generate_matrix_with_optimal_format(
-        std::shared_ptr<gko::Executor> exec, rapidjson::Value& config) const
-    {
-        auto data = generate_matrix_data(config);
-        return generate_matrix_with_format(
-            std::move(exec), config["optimal"]["spmv"].GetString(), data);
-    }
-
     std::shared_ptr<gko::LinOp> generate_matrix_with_format(
         std::shared_ptr<gko::Executor> exec, const std::string& format_name,
         const gko::matrix_data<value_type, index_type>& data,
-        rapidjson::Value* spmv_case = nullptr,
-        rapidjson::MemoryPoolAllocator<>* allocator = nullptr) const
+        json* spmv_case = nullptr) const
     {
         auto part = gko::experimental::distributed::
             Partition<itype, global_itype>::build_from_global_size_uniform(
                 exec, comm.size(), static_cast<global_itype>(data.size[0]));
         auto formats = split(format_name, '-');
+        if (formats.size() != 2) {
+            throw std::runtime_error{"Invalid distributed format specifier " +
+                                     format_name};
+        }
 
         auto local_mat = formats::matrix_type_factory.at(formats[0])(exec);
         auto non_local_mat = formats::matrix_type_factory.at(formats[1])(exec);
 
         auto storage_logger = std::make_shared<StorageLogger>();
-        if (spmv_case && allocator) {
+        if (spmv_case) {
             exec->add_logger(storage_logger);
         }
 
@@ -235,9 +272,9 @@ struct DistributedDefaultSystemGenerator {
             exec, comm, local_mat, non_local_mat);
         dist_mat->read_distributed(data, part);
 
-        if (spmv_case && allocator) {
+        if (spmv_case) {
             exec->remove_logger(storage_logger);
-            storage_logger->write_data(comm, *spmv_case, *allocator);
+            storage_logger->write_data(comm, *spmv_case);
         }
 
         return dist_mat;
diff --git a/benchmark/utils/hip_linops.hip.cpp b/benchmark/utils/hip_linops.hip.cpp
index 627dfad980e..c8664778e02 100644
--- a/benchmark/utils/hip_linops.hip.cpp
+++ b/benchmark/utils/hip_linops.hip.cpp
@@ -36,9 +36,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <memory>
 
 
-#include <hipsparse.h>
-
-
 #include "benchmark/utils/sparselib_linops.hpp"
 #include "benchmark/utils/types.hpp"
 #include "hip/base/hipsparse_bindings.hip.hpp"
diff --git a/benchmark/utils/iteration_control.hpp b/benchmark/utils/iteration_control.hpp
new file mode 100644
index 00000000000..295ae7870d6
--- /dev/null
+++ b/benchmark/utils/iteration_control.hpp
@@ -0,0 +1,326 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_BENCHMARK_UTILS_ITERATION_CONTROL_HPP_
+#define GKO_BENCHMARK_UTILS_ITERATION_CONTROL_HPP_
+
+
+#include <ginkgo/ginkgo.hpp>
+
+
+#include <memory>
+#include <string>
+#include <utility>
+
+
+#include "benchmark/utils/general.hpp"
+#include "benchmark/utils/timer.hpp"
+#include "benchmark/utils/types.hpp"
+#include "core/distributed/helpers.hpp"
+
+
+/**
+ * A class for controlling the number warmup and timed iterations.
+ *
+ * The behavior is determined by the following flags
+ * - 'repetitions' switch between fixed and adaptive number of iterations
+ * - 'warmup' warmup iterations, applies in fixed and adaptive case
+ * - 'min_repetitions' minimal number of repetitions (adaptive case)
+ * - 'max_repetitions' maximal number of repetitions (adaptive case)
+ * - 'min_runtime' minimal total runtime (adaptive case)
+ * - 'repetition_growth_factor' controls the increase between two successive
+ *   timings
+ *
+ * Usage:
+ * `IterationControl` exposes the member functions:
+ * - `warmup_run()`: controls run defined by `warmup` flag
+ * - `run(bool)`: controls run defined by all other flags
+ * - `get_timer()`: access to underlying timer
+ * The first two methods return an object that is to be used in a range-based
+ * for loop:
+ * ```
+ * IterationControl ic(get_timer(...));
+ *
+ * // warmup run always uses fixed number of iteration and does not issue
+ * // timings
+ * for(auto status: ic.warmup_run()){
+ *   // execute benchmark
+ * }
+ * // run may use adaptive number of iterations (depending on cmd line flag)
+ * // and issues timing (unless manage_timings is false)
+ * for(auto status: ic.run(manage_timings [default is true])){
+ *   if(! manage_timings) ic.get_timer->tic();
+ *   // execute benchmark
+ *   if(! manage_timings) ic.get_timer->toc();
+ * }
+ *
+ * ```
+ * At the beginning of both methods, the timer is reset.
+ * The `status` object exposes the member
+ * - `cur_it`, containing the current iteration number,
+ * and the methods
+ * - `is_finished`, checks if the benchmark is finished,
+ */
+class IterationControl {
+    using IndexType = unsigned int;  //!< to be compatible with GFLAGS type
+
+    class run_control;
+
+public:
+    /**
+     * Creates an `IterationControl` object.
+     *
+     * Uses the commandline flags to setup the stopping criteria for the
+     * warmup and timed run.
+     *
+     * @param timer  the timer that is to be used for the timings
+     */
+    explicit IterationControl(const std::shared_ptr<Timer>& timer)
+    {
+        status_warmup_ = {TimerManager{timer, false}, FLAGS_warmup,
+                          FLAGS_warmup, 0., 0};
+        if (FLAGS_repetitions == "auto") {
+            status_run_ = {TimerManager{timer, true}, FLAGS_min_repetitions,
+                           FLAGS_max_repetitions, FLAGS_min_runtime};
+        } else {
+            const auto reps =
+                static_cast<unsigned int>(std::stoi(FLAGS_repetitions));
+            status_run_ = {TimerManager{timer, true}, reps, reps, 0., 0};
+        }
+    }
+
+    IterationControl() = default;
+    IterationControl(const IterationControl&) = default;
+    IterationControl(IterationControl&&) = default;
+
+    /**
+     * Creates iterable `run_control` object for the warmup run.
+     *
+     * This run uses always a fixed number of iterations.
+     */
+    run_control warmup_run()
+    {
+        status_warmup_.cur_it = 0;
+        status_warmup_.managed_timer.clear();
+        return run_control{&status_warmup_};
+    }
+
+    /**
+     * Creates iterable `run_control` object for the timed run.
+     *
+     * This run may be adaptive, depending on the commandline flags.
+     *
+     * @param manage_timings If true, the timer calls (`tic/toc`) are handled
+     * by the `run_control` object, otherwise they need to be executed outside
+     */
+    run_control run(bool manage_timings = true)
+    {
+        status_run_.cur_it = 0;
+        status_run_.managed_timer.clear();
+        status_run_.managed_timer.manage_timings = manage_timings;
+        return run_control{&status_run_};
+    }
+
+    std::shared_ptr<Timer> get_timer() const
+    {
+        return status_run_.managed_timer.timer;
+    }
+
+    /**
+     * Compute the time from the given statistical method
+     *
+     * @param method  the statistical method. If the timer does not have the
+     *                same iteration as the IterationControl, it can only use
+     *                average from the IterationControl.
+     *
+     * @return the statistical time
+     */
+    double compute_time(const std::string& method = "average") const
+    {
+        if (status_run_.managed_timer.timer->get_num_repetitions() ==
+            this->get_num_repetitions()) {
+            return status_run_.managed_timer.compute_time(method);
+        } else {
+            assert(method == "average");
+            return status_run_.managed_timer.get_total_time() /
+                   this->get_num_repetitions();
+        }
+    }
+
+    IndexType get_num_repetitions() const { return status_run_.cur_it; }
+
+private:
+    struct TimerManager {
+        std::shared_ptr<Timer> timer;
+        bool manage_timings = false;
+
+        void tic()
+        {
+            if (manage_timings) {
+                timer->tic();
+            }
+        }
+        void toc(unsigned int num = 1)
+        {
+            if (manage_timings) {
+                timer->toc(num);
+            }
+        }
+
+        void clear() { timer->clear(); }
+
+        double get_total_time() const { return timer->get_total_time(); }
+
+        double compute_time(const std::string& method = "average") const
+        {
+            return timer->compute_time(method);
+        }
+    };
+
+    /**
+     * Stores stopping criteria of the adaptive benchmark run as well as the
+     * current iteration number.
+     */
+    struct status {
+        TimerManager managed_timer{};
+
+        IndexType min_it = 0;
+        IndexType max_it = 0;
+        double max_runtime = 0.;
+
+        IndexType cur_it = 0;
+
+        /**
+         * checks if the adaptive run is complete
+         *
+         * the adaptive run is complete if:
+         * - the minimum number of iteration is reached
+         * - and either:
+         *   - the maximum number of repetitions is reached
+         *   - the total runtime is above the threshold
+         *
+         * @return completeness state of the adaptive run
+         */
+        bool is_finished() const
+        {
+            return cur_it >= min_it &&
+                   (cur_it >= max_it ||
+                    managed_timer.get_total_time() >= max_runtime);
+        }
+    };
+
+    /**
+     * Iterable class managing the benchmark iteration.
+     *
+     * Has to be used in a range-based for loop.
+     */
+    struct run_control {
+        struct iterator {
+            /**
+             * Increases the current iteration count and finishes timing if
+             * necessary.
+             *
+             * As `++it` is the last step of a for-loop, the managed_timer is
+             * stopped, if enough iterations have passed since the last timing.
+             * The interval between two timings is steadily increased to
+             * reduce the timing overhead.
+             */
+            iterator operator++()
+            {
+                cur_info->cur_it++;
+                if (cur_info->cur_it >= next_timing && !stopped) {
+                    cur_info->managed_timer.toc(
+                        static_cast<unsigned>(cur_info->cur_it - start_timing));
+                    stopped = true;
+                    next_timing = static_cast<IndexType>(std::ceil(
+                        next_timing * FLAGS_repetition_growth_factor));
+                    // If repetition_growth_factor <= 1, next_timing will be
+                    // next iteration.
+                    if (next_timing <= cur_info->cur_it) {
+                        next_timing = cur_info->cur_it + 1;
+                    }
+                }
+                return *this;
+            }
+
+            status operator*() const { return *cur_info; }
+
+            /**
+             * Checks if the benchmark is finished and handles timing, if
+             * necessary.
+             *
+             * As `begin != end` is the first step in a for-loop, the
+             * managed_timer is started, if it was previously stopped.
+             * Additionally, if the benchmark is complete and the managed_timer
+             * is still running it is stopped. (This may occur if the maximal
+             * number of repetitions is surpassed)
+             *
+             * Uses only the information from the `status` object, i.e.
+             * the right hand side is ignored.
+             *
+             * @return true if benchmark is not finished, else false
+             */
+            bool operator!=(const iterator&)
+            {
+                const bool is_finished = cur_info->is_finished();
+                if (!is_finished && stopped) {
+                    stopped = false;
+                    cur_info->managed_timer.tic();
+                    start_timing = cur_info->cur_it;
+                } else if (is_finished && !stopped) {
+                    cur_info->managed_timer.toc(
+                        static_cast<unsigned>(cur_info->cur_it - start_timing));
+                    stopped = true;
+                }
+                return !is_finished;
+            }
+
+            status* cur_info;
+            IndexType next_timing = 1;   //!< next iteration to stop timing
+            IndexType start_timing = 0;  //!< iteration for starting timing
+            bool stopped = true;
+        };
+
+        iterator begin() const { return iterator{info}; }
+
+        // not used, could potentially be used in c++17 as a sentinel
+        iterator end() const { return iterator{}; }
+
+        status* info;
+    };
+
+    status status_warmup_;
+    status status_run_;
+};
+
+
+#endif  // GKO_BENCHMARK_UTILS_ITERATION_CONTROL_HPP_
diff --git a/benchmark/utils/json.hpp b/benchmark/utils/json.hpp
index b0cd384cae5..684db0229aa 100644
--- a/benchmark/utils/json.hpp
+++ b/benchmark/utils/json.hpp
@@ -34,69 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_BENCHMARK_UTILS_JSON_HPP_
 
 
-#include <ginkgo/ginkgo.hpp>
+#include <nlohmann/json.hpp>
 
 
-#include <type_traits>
-
-
-#include <rapidjson/document.h>
-#include <rapidjson/istreamwrapper.h>
-#include <rapidjson/ostreamwrapper.h>
-#include <rapidjson/prettywriter.h>
-
-
-// helper for setting rapidjson object members
-template <typename T, typename NameType, typename Allocator>
-std::enable_if_t<
-    !std::is_same<typename std::decay<T>::type, gko::size_type>::value, void>
-add_or_set_member(rapidjson::Value& object, NameType&& name, T&& value,
-                  Allocator&& allocator)
-{
-    if (object.HasMember(name)) {
-        object[name] = std::forward<T>(value);
-    } else {
-        auto n = rapidjson::Value(name, allocator);
-        object.AddMember(n, std::forward<T>(value), allocator);
-    }
-}
-
-
-/**
-   @internal This is required to fix some MacOS problems (and possibly other
-   compilers). There is no explicit RapidJSON constructor for `std::size_t` so a
-   conversion to a known constructor is required to solve any ambiguity. See the
-   last comments of https://github.com/ginkgo-project/ginkgo/issues/270.
- */
-template <typename T, typename NameType, typename Allocator>
-std::enable_if_t<
-    std::is_same<typename std::decay<T>::type, gko::size_type>::value, void>
-add_or_set_member(rapidjson::Value& object, NameType&& name, T&& value,
-                  Allocator&& allocator)
-{
-    if (object.HasMember(name)) {
-        object[name] =
-            std::forward<std::uint64_t>(static_cast<std::uint64_t>(value));
-    } else {
-        auto n = rapidjson::Value(name, allocator);
-        object.AddMember(
-            n, std::forward<std::uint64_t>(static_cast<std::uint64_t>(value)),
-            allocator);
-    }
-}
-
-
-// helper for writing out rapidjson Values
-inline std::ostream& operator<<(std::ostream& os, const rapidjson::Value& value)
-{
-    rapidjson::OStreamWrapper jos(os);
-    rapidjson::PrettyWriter<rapidjson::OStreamWrapper, rapidjson::UTF8<>,
-                            rapidjson::UTF8<>, rapidjson::CrtAllocator,
-                            rapidjson::kWriteNanAndInfFlag>
-        writer(jos);
-    value.Accept(writer);
-    return os;
-}
+using json = nlohmann::ordered_json;
 
 
 #endif  // GKO_BENCHMARK_UTILS_JSON_HPP_
diff --git a/benchmark/utils/loggers.hpp b/benchmark/utils/loggers.hpp
index e3e6228604e..89ea6108eda 100644
--- a/benchmark/utils/loggers.hpp
+++ b/benchmark/utils/loggers.hpp
@@ -50,10 +50,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 struct JsonSummaryWriter : gko::log::ProfilerHook::SummaryWriter,
                            gko::log::ProfilerHook::NestedSummaryWriter {
-    JsonSummaryWriter(rapidjson::Value& object,
-                      rapidjson::MemoryPoolAllocator<>& alloc,
-                      gko::uint32 repetitions)
-        : object{&object}, alloc{&alloc}, repetitions{repetitions}
+    JsonSummaryWriter(json& object, gko::uint32 repetitions)
+        : object{&object}, repetitions{repetitions}
     {}
 
     void write(
@@ -62,13 +60,11 @@ struct JsonSummaryWriter : gko::log::ProfilerHook::SummaryWriter,
     {
         for (const auto& entry : entries) {
             if (entry.name != "total") {
-                add_or_set_member(*object, entry.name.c_str(),
-                                  entry.exclusive.count() * 1e-9 / repetitions,
-                                  *alloc);
+                (*object)[entry.name] =
+                    entry.exclusive.count() * 1e-9 / repetitions;
             }
         }
-        add_or_set_member(*object, "overhead",
-                          overhead.count() * 1e-9 / repetitions, *alloc);
+        (*object)["overhead"] = overhead.count() * 1e-9 / repetitions;
     }
 
     void write_nested(const gko::log::ProfilerHook::nested_summary_entry& root,
@@ -84,27 +80,24 @@ struct JsonSummaryWriter : gko::log::ProfilerHook::SummaryWriter,
                 visit(visit, child, new_prefix);
                 exclusive -= child.elapsed;
             }
-            add_or_set_member(*object, (prefix + node.name).c_str(),
-                              exclusive.count() * 1e-9 / repetitions, *alloc);
+            (*object)[prefix + node.name] =
+                exclusive.count() * 1e-9 / repetitions;
         };
         // we don't need to annotate the total
         for (const auto& child : root.children) {
             visit(visit, child, "");
         }
-        add_or_set_member(*object, "overhead",
-                          overhead.count() * 1e-9 / repetitions, *alloc);
+        (*object)["overhead"] = overhead.count() * 1e-9 / repetitions;
     }
 
-    rapidjson::Value* object;
-    rapidjson::MemoryPoolAllocator<>* alloc;
+    json* object;
     gko::uint32 repetitions;
 };
 
 
 inline std::shared_ptr<gko::log::ProfilerHook> create_operations_logger(
     bool gpu_timer, bool nested, std::shared_ptr<gko::Executor> exec,
-    rapidjson::Value& object, rapidjson::MemoryPoolAllocator<>& alloc,
-    gko::uint32 repetitions)
+    json& object, gko::uint32 repetitions)
 {
     std::shared_ptr<gko::Timer> timer;
     if (gpu_timer) {
@@ -114,12 +107,10 @@ inline std::shared_ptr<gko::log::ProfilerHook> create_operations_logger(
     }
     if (nested) {
         return gko::log::ProfilerHook::create_nested_summary(
-            timer,
-            std::make_unique<JsonSummaryWriter>(object, alloc, repetitions));
+            timer, std::make_unique<JsonSummaryWriter>(object, repetitions));
     } else {
         return gko::log::ProfilerHook::create_summary(
-            timer,
-            std::make_unique<JsonSummaryWriter>(object, alloc, repetitions));
+            timer, std::make_unique<JsonSummaryWriter>(object, repetitions));
     }
 }
 
@@ -140,21 +131,18 @@ struct StorageLogger : gko::log::Logger {
         storage[location] = 0;
     }
 
-    void write_data(rapidjson::Value& output,
-                    rapidjson::MemoryPoolAllocator<>& allocator)
+    void write_data(json& output)
     {
         const std::lock_guard<std::mutex> lock(mutex);
         gko::size_type total{};
         for (const auto& e : storage) {
             total += e.second;
         }
-        add_or_set_member(output, "storage", total, allocator);
+        output["storage"] = total;
     }
 
 #if GINKGO_BUILD_MPI
-    void write_data(gko::experimental::mpi::communicator comm,
-                    rapidjson::Value& output,
-                    rapidjson::MemoryPoolAllocator<>& allocator)
+    void write_data(gko::experimental::mpi::communicator comm, json& output)
     {
         const std::lock_guard<std::mutex> lock(mutex);
         gko::size_type total{};
@@ -166,7 +154,7 @@ struct StorageLogger : gko::log::Logger {
                         ? static_cast<gko::size_type*>(MPI_IN_PLACE)
                         : &total,
                     &total, 1, MPI_SUM, 0);
-        add_or_set_member(output, "storage", total, allocator);
+        output["storage"] = total;
     }
 #endif
 
@@ -191,17 +179,16 @@ struct ResidualLogger : gko::log::Logger {
                                const gko::array<gko::stopping_status>* status,
                                bool all_stopped) const override
     {
-        timestamps.PushBack(std::chrono::duration<double>(
-                                std::chrono::steady_clock::now() - start)
-                                .count(),
-                            alloc);
+        timestamps->push_back(std::chrono::duration<double>(
+                                  std::chrono::steady_clock::now() - start)
+                                  .count());
         if (residual_norm) {
-            rec_res_norms.PushBack(
-                get_norm(gko::as<vec<rc_vtype>>(residual_norm)), alloc);
+            rec_res_norms->push_back(
+                get_norm(gko::as<vec<rc_vtype>>(residual_norm)));
         } else {
             gko::detail::vector_dispatch<rc_vtype>(
                 residual, [&](const auto v_residual) {
-                    rec_res_norms.PushBack(compute_norm2(v_residual), alloc);
+                    rec_res_norms->push_back(compute_norm2(v_residual));
                 });
         }
         if (solution) {
@@ -209,42 +196,34 @@ struct ResidualLogger : gko::log::Logger {
                 rc_vtype>(solution, [&](auto v_solution) {
                 using concrete_type =
                     std::remove_pointer_t<std::decay_t<decltype(v_solution)>>;
-                true_res_norms.PushBack(
-                    compute_residual_norm(matrix, gko::as<concrete_type>(b),
-                                          v_solution),
-                    alloc);
+                true_res_norms->push_back(compute_residual_norm(
+                    matrix, gko::as<concrete_type>(b), v_solution));
             });
         } else {
-            true_res_norms.PushBack(-1.0, alloc);
+            true_res_norms->push_back(-1.0);
         }
         if (implicit_sq_residual_norm) {
-            implicit_res_norms.PushBack(
-                std::sqrt(get_norm(
-                    gko::as<vec<rc_vtype>>(implicit_sq_residual_norm))),
-                alloc);
+            implicit_res_norms->push_back(std::sqrt(
+                get_norm(gko::as<vec<rc_vtype>>(implicit_sq_residual_norm))));
             has_implicit_res_norm = true;
         } else {
-            implicit_res_norms.PushBack(-1.0, alloc);
+            implicit_res_norms->push_back(-1.0);
         }
     }
 
     ResidualLogger(gko::ptr_param<const gko::LinOp> matrix,
-                   gko::ptr_param<const gko::LinOp> b,
-                   rapidjson::Value& rec_res_norms,
-                   rapidjson::Value& true_res_norms,
-                   rapidjson::Value& implicit_res_norms,
-                   rapidjson::Value& timestamps,
-                   rapidjson::MemoryPoolAllocator<>& alloc)
+                   gko::ptr_param<const gko::LinOp> b, json& rec_res_norms,
+                   json& true_res_norms, json& implicit_res_norms,
+                   json& timestamps)
         : gko::log::Logger(gko::log::Logger::iteration_complete_mask),
           matrix{matrix.get()},
           b{b.get()},
           start{std::chrono::steady_clock::now()},
-          rec_res_norms{rec_res_norms},
-          true_res_norms{true_res_norms},
+          rec_res_norms{&rec_res_norms},
+          true_res_norms{&true_res_norms},
           has_implicit_res_norm{},
-          implicit_res_norms{implicit_res_norms},
-          timestamps{timestamps},
-          alloc{alloc}
+          implicit_res_norms{&implicit_res_norms},
+          timestamps{&timestamps}
     {}
 
     bool has_implicit_res_norms() const { return has_implicit_res_norm; }
@@ -253,12 +232,11 @@ struct ResidualLogger : gko::log::Logger {
     const gko::LinOp* matrix;
     const gko::LinOp* b;
     std::chrono::steady_clock::time_point start;
-    rapidjson::Value& rec_res_norms;
-    rapidjson::Value& true_res_norms;
+    json* rec_res_norms;
+    json* true_res_norms;
     mutable bool has_implicit_res_norm;
-    rapidjson::Value& implicit_res_norms;
-    rapidjson::Value& timestamps;
-    rapidjson::MemoryPoolAllocator<>& alloc;
+    json* implicit_res_norms;
+    json* timestamps;
 };
 
 
@@ -279,11 +257,7 @@ struct IterationLogger : gko::log::Logger {
         : gko::log::Logger(gko::log::Logger::iteration_complete_mask)
     {}
 
-    void write_data(rapidjson::Value& output,
-                    rapidjson::MemoryPoolAllocator<>& allocator)
-    {
-        add_or_set_member(output, "iterations", this->num_iters, allocator);
-    }
+    void write_data(json& output) { output["iterations"] = this->num_iters; }
 
 private:
     mutable gko::size_type num_iters{0};
diff --git a/benchmark/utils/overhead_linop.hpp b/benchmark/utils/overhead_linop.hpp
index 168e650234d..d947b8de38e 100644
--- a/benchmark/utils/overhead_linop.hpp
+++ b/benchmark/utils/overhead_linop.hpp
@@ -104,27 +104,12 @@ class Overhead : public EnableLinOp<Overhead<ValueType>>,
     friend class EnablePolymorphicObject<Overhead, LinOp>;
 
 public:
-    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
-    {
-        /**
-         * Criterion factories.
-         */
-        std::vector<std::shared_ptr<const stop::CriterionFactory>>
-            GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr);
-
-        /**
-         * Preconditioner factory.
-         */
-        std::shared_ptr<const LinOpFactory> GKO_FACTORY_PARAMETER_SCALAR(
-            preconditioner, nullptr);
-
-        /**
-         * Already generated preconditioner. If one is provided, the factory
-         * `preconditioner` will be ignored.
-         */
-        std::shared_ptr<const LinOp> GKO_FACTORY_PARAMETER_SCALAR(
-            generated_preconditioner, nullptr);
-    };
+    class Factory;
+
+    struct parameters_type
+        : public gko::solver::
+              enable_preconditioned_iterative_solver_factory_parameters<
+                  parameters_type, Factory> {};
 
     GKO_ENABLE_LIN_OP_FACTORY(Overhead, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
diff --git a/benchmark/utils/preconditioners.hpp b/benchmark/utils/preconditioners.hpp
index 466d5f2d3f9..3450eb71b44 100644
--- a/benchmark/utils/preconditioners.hpp
+++ b/benchmark/utils/preconditioners.hpp
@@ -122,7 +122,7 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
                                 .on(exec));
              return gko::preconditioner::Ic<gko::solver::LowerTrs<etype, itype>,
                                             itype>::build()
-                 .with_factorization_factory(fact)
+                 .with_factorization(fact)
                  .on(exec);
          }},
         {"parict",
@@ -137,7 +137,7 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
              return gko::preconditioner::
                  Ilu<gko::solver::LowerTrs<etype, itype>,
                      gko::solver::UpperTrs<etype, itype>, false, itype>::build()
-                     .with_factorization_factory(fact)
+                     .with_factorization(fact)
                      .on(exec);
          }},
         {"parilu",
@@ -150,7 +150,7 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
              return gko::preconditioner::
                  Ilu<gko::solver::LowerTrs<etype, itype>,
                      gko::solver::UpperTrs<etype, itype>, false, itype>::build()
-                     .with_factorization_factory(fact)
+                     .with_factorization(fact)
                      .on(exec);
          }},
         {"parilut",
@@ -165,7 +165,7 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
              return gko::preconditioner::
                  Ilu<gko::solver::LowerTrs<etype, itype>,
                      gko::solver::UpperTrs<etype, itype>, false, itype>::build()
-                     .with_factorization_factory(fact)
+                     .with_factorization(fact)
                      .on(exec);
          }},
         {"ic",
@@ -174,7 +174,7 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
                  gko::factorization::Ic<etype, itype>::build().on(exec));
              return gko::preconditioner::Ic<gko::solver::LowerTrs<etype, itype>,
                                             itype>::build()
-                 .with_factorization_factory(fact)
+                 .with_factorization(fact)
                  .on(exec);
          }},
         {"ilu",
@@ -184,7 +184,7 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
              return gko::preconditioner::
                  Ilu<gko::solver::LowerTrs<etype, itype>,
                      gko::solver::UpperTrs<etype, itype>, false, itype>::build()
-                     .with_factorization_factory(fact)
+                     .with_factorization(fact)
                      .on(exec);
          }},
         {"paric-isai",
@@ -201,8 +201,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
              return gko::preconditioner::Ic<
                         gko::preconditioner::LowerIsai<etype, itype>,
                         itype>::build()
-                 .with_factorization_factory(fact)
-                 .with_l_solver_factory(lisai)
+                 .with_factorization(fact)
+                 .with_l_solver(lisai)
                  .on(exec);
          }},
         {"parict-isai",
@@ -221,8 +221,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
              return gko::preconditioner::Ic<
                         gko::preconditioner::LowerIsai<etype, itype>,
                         itype>::build()
-                 .with_factorization_factory(fact)
-                 .with_l_solver_factory(lisai)
+                 .with_factorization(fact)
+                 .with_l_solver(lisai)
                  .on(exec);
          }},
         {"parilu-isai",
@@ -244,9 +244,9 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
                         gko::preconditioner::LowerIsai<etype, itype>,
                         gko::preconditioner::UpperIsai<etype, itype>, false,
                         itype>::build()
-                 .with_factorization_factory(fact)
-                 .with_l_solver_factory(lisai)
-                 .with_u_solver_factory(uisai)
+                 .with_factorization(fact)
+                 .with_l_solver(lisai)
+                 .with_u_solver(uisai)
                  .on(exec);
          }},
         {"parilut-isai",
@@ -270,9 +270,9 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
                         gko::preconditioner::LowerIsai<etype, itype>,
                         gko::preconditioner::UpperIsai<etype, itype>, false,
                         itype>::build()
-                 .with_factorization_factory(fact)
-                 .with_l_solver_factory(lisai)
-                 .with_u_solver_factory(uisai)
+                 .with_factorization(fact)
+                 .with_l_solver(lisai)
+                 .with_u_solver(uisai)
                  .on(exec);
          }},
         {"ic-isai",
@@ -286,8 +286,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
              return gko::preconditioner::Ic<
                         gko::preconditioner::LowerIsai<etype, itype>,
                         itype>::build()
-                 .with_factorization_factory(fact)
-                 .with_l_solver_factory(lisai)
+                 .with_factorization(fact)
+                 .with_l_solver(lisai)
                  .on(exec);
          }},
         {"ilu-isai",
@@ -306,9 +306,9 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
                         gko::preconditioner::LowerIsai<etype, itype>,
                         gko::preconditioner::UpperIsai<etype, itype>, false,
                         itype>::build()
-                 .with_factorization_factory(fact)
-                 .with_l_solver_factory(lisai)
-                 .with_u_solver_factory(uisai)
+                 .with_factorization(fact)
+                 .with_l_solver(lisai)
+                 .with_u_solver(uisai)
                  .on(exec);
          }},
         {"general-isai",
@@ -326,8 +326,7 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
         {"overhead", [](std::shared_ptr<const gko::Executor> exec) {
              return gko::Overhead<etype>::build()
                  .with_criteria(gko::stop::ResidualNorm<etype>::build()
-                                    .with_reduction_factor(rc_etype{})
-                                    .on(exec))
+                                    .with_reduction_factor(rc_etype{}))
                  .on(exec);
          }}};
 
diff --git a/benchmark/utils/runner.hpp b/benchmark/utils/runner.hpp
new file mode 100644
index 00000000000..264dc3965db
--- /dev/null
+++ b/benchmark/utils/runner.hpp
@@ -0,0 +1,203 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_BENCHMARK_UTILS_RUNNER_HPP_
+#define GKO_BENCHMARK_UTILS_RUNNER_HPP_
+
+
+#include <ginkgo/ginkgo.hpp>
+
+
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+
+#include "benchmark/utils/general.hpp"
+
+
+std::shared_ptr<gko::log::ProfilerHook> create_profiler_hook(
+    std::shared_ptr<const gko::Executor> exec, bool do_print = true)
+{
+    using gko::log::ProfilerHook;
+    std::map<std::string, std::function<std::shared_ptr<ProfilerHook>()>>
+        hook_map{
+            {"none", [] { return std::shared_ptr<ProfilerHook>{}; }},
+            {"auto", [&] { return ProfilerHook::create_for_executor(exec); }},
+            {"nvtx", [] { return ProfilerHook::create_nvtx(); }},
+            {"roctx", [] { return ProfilerHook::create_roctx(); }},
+            {"tau", [] { return ProfilerHook::create_tau(); }},
+            {"vtune", [] { return ProfilerHook::create_vtune(); }},
+            {"debug", [do_print] {
+                 return ProfilerHook::create_custom(
+                     [do_print](const char* name,
+                                gko::log::profile_event_category) {
+                         if (do_print) {
+                             std::clog << "DEBUG: begin " << name << '\n';
+                         }
+                     },
+                     [do_print](const char* name,
+                                gko::log::profile_event_category) {
+                         if (do_print) {
+                             std::clog << "DEBUG: end   " << name << '\n';
+                         }
+                     });
+             }}};
+    return hook_map.at(FLAGS_profiler_hook)();
+}
+
+
+template <typename State>
+struct Benchmark {
+    /** The name to be used in the JSON output. */
+    virtual const std::string& get_name() const = 0;
+
+    /** The operations to loop over for each test case. */
+    virtual const std::vector<std::string>& get_operations() const = 0;
+
+    /** Should we write logging output? */
+    virtual bool should_print() const = 0;
+
+    /** Example JSON input */
+    virtual std::string get_example_config() const = 0;
+
+    /** Is the input test case in the correct format? */
+    virtual bool validate_config(const json& value) const = 0;
+
+    /** Textual representation of the test case for profiler annotation */
+    virtual std::string describe_config(const json& test_case) const = 0;
+
+    /** Sets up shared state and test case info */
+    virtual State setup(std::shared_ptr<gko::Executor> exec,
+                        json& test_case) const = 0;
+
+    /** Runs a single operation of the benchmark */
+    virtual void run(std::shared_ptr<gko::Executor> exec,
+                     std::shared_ptr<Timer> timer, annotate_functor annotate,
+                     State& state, const std::string& operation,
+                     json& operation_case) const = 0;
+
+    /** Post-process test case info. */
+    virtual void postprocess(json& test_case) const {}
+};
+
+
+template <typename State>
+void run_test_cases(const Benchmark<State>& benchmark,
+                    std::shared_ptr<gko::Executor> exec,
+                    std::shared_ptr<Timer> timer, json& test_cases)
+{
+    if (!test_cases.is_array()) {
+        if (benchmark.should_print()) {
+            std::cerr
+                << "Input has to be a JSON array of benchmark configurations:\n"
+                << benchmark.get_example_config() << std::endl;
+        }
+        std::exit(1);
+    }
+    for (const auto& test_case : test_cases) {
+        if (!test_case.is_object() || !benchmark.validate_config(test_case)) {
+            if (benchmark.should_print()) {
+                std::cerr << "Invalid test case:\n"
+                          << std::setw(4) << test_case << "\nInput format:\n"
+                          << benchmark.get_example_config() << std::endl;
+            }
+            std::exit(2);
+        }
+    }
+
+    auto profiler_hook = create_profiler_hook(exec, benchmark.should_print());
+    if (profiler_hook) {
+        exec->add_logger(profiler_hook);
+    }
+    auto annotate = annotate_functor(profiler_hook);
+
+    for (auto& test_case : test_cases) {
+        try {
+            // set up benchmark
+            if (!test_case.contains(benchmark.get_name())) {
+                test_case[benchmark.get_name()] = json::object();
+            }
+            auto test_case_desc = benchmark.describe_config(test_case);
+            if (benchmark.should_print()) {
+                std::clog << "Running test case " << test_case_desc
+                          << std::endl;
+            }
+            auto test_case_state = benchmark.setup(exec, test_case);
+            auto test_case_range = annotate(test_case_desc.c_str());
+            auto& benchmark_case = test_case[benchmark.get_name()];
+            for (const auto& operation_name : benchmark.get_operations()) {
+                if (benchmark_case.contains(operation_name) &&
+                    !FLAGS_overwrite) {
+                    continue;
+                }
+                benchmark_case[operation_name] = json::object();
+                if (benchmark.should_print()) {
+                    std::clog << "\tRunning " << benchmark.get_name() << ": "
+                              << operation_name << std::endl;
+                }
+                auto& operation_case = benchmark_case[operation_name];
+                try {
+                    auto operation_range = annotate(operation_name.c_str());
+                    benchmark.run(exec, timer, annotate, test_case_state,
+                                  operation_name, operation_case);
+                    operation_case["completed"] = true;
+                } catch (const std::exception& e) {
+                    operation_case["completed"] = false;
+                    operation_case["error_type"] =
+                        gko::name_demangling::get_dynamic_type(e);
+                    operation_case["error"] = e.what();
+                    std::cerr << "Error when processing test case\n"
+                              << test_case_desc << "\n"
+                              << "what(): " << e.what() << std::endl;
+                }
+
+                if (benchmark.should_print()) {
+                    backup_results(test_cases);
+                }
+            }
+            benchmark.postprocess(test_case);
+        } catch (const std::exception& e) {
+            std::cerr << "Error setting up benchmark, what(): " << e.what()
+                      << std::endl;
+            test_case["error_type"] = gko::name_demangling::get_dynamic_type(e);
+            test_case["error"] = e.what();
+        }
+    }
+
+    if (profiler_hook) {
+        exec->remove_logger(profiler_hook);
+    }
+}
+
+
+#endif  // GKO_BENCHMARK_UTILS_RUNNER_HPP_
diff --git a/benchmark/utils/timer_impl.hpp b/benchmark/utils/timer_impl.hpp
index 888cb496248..a6b9d968713 100644
--- a/benchmark/utils/timer_impl.hpp
+++ b/benchmark/utils/timer_impl.hpp
@@ -111,7 +111,8 @@ class Timer {
             return copy.back();
         } else if (method == "median") {
             auto mid = copy.size() / 2;
-            if (copy.size() % 2) {
+            if (copy.size() % 2 == 0) {
+                // contains even elements
                 return (copy.at(mid) + copy.at(mid - 1)) / 2;
             } else {
                 return copy.at(mid);
diff --git a/cmake/CTestScript.cmake b/cmake/CTestScript.cmake
index 61d53b0442a..81ff86625d1 100644
--- a/cmake/CTestScript.cmake
+++ b/cmake/CTestScript.cmake
@@ -4,7 +4,7 @@
 #
 # Runs our tests through CTest, with support for Coverage or memory checking.
 #
-# This script provides a full CTest run whith result submission to Ginkgo's
+# This script provides a full CTest run with result submission to Ginkgo's
 # CDash dashboard. The supported runs are:
 # + With or without coverage, requires the gcov tool.
 # + With or without address sanitizers.
diff --git a/cmake/DownloadNonCMakeCMakeLists.txt.in b/cmake/DownloadNonCMakeCMakeLists.txt.in
deleted file mode 100644
index c2d848e8d49..00000000000
--- a/cmake/DownloadNonCMakeCMakeLists.txt.in
+++ /dev/null
@@ -1,14 +0,0 @@
-cmake_minimum_required(VERSION 3.9)
-project(${package_name})
-
-include(ExternalProject)
-ExternalProject_Add(${package_name}
-    URL "${package_url}"
-    URL_HASH "${package_hash}"
-    DOWNLOAD_NO_PROGRESS TRUE
-    SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/src"
-    BINARY_DIR        "${CMAKE_CURRENT_BINARY_DIR}/build"
-    CONFIGURE_COMMAND "${config_command}" "${ARGN}"
-    INSTALL_COMMAND ""
-    UPDATE_DISCONNECTED ${GINKGO_SKIP_DEPENDENCY_UPDATE}
-    )
diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in
index a2857310183..41f3b8f2879 100644
--- a/cmake/GinkgoConfig.cmake.in
+++ b/cmake/GinkgoConfig.cmake.in
@@ -37,7 +37,7 @@ set(GINKGO_BUILD_OMP @GINKGO_BUILD_OMP@)
 set(GINKGO_BUILD_CUDA @GINKGO_BUILD_CUDA@)
 set(GINKGO_BUILD_HIP @GINKGO_BUILD_HIP@)
 set(GINKGO_BUILD_MPI @GINKGO_BUILD_MPI@)
-set(GINKGO_BUILD_DPCPP @GINKGO_BUILD_DPCPP@)
+set(GINKGO_BUILD_SYCL @GINKGO_BUILD_SYCL@)
 
 set(GINKGO_DEVEL_TOOLS @GINKGO_DEVEL_TOOLS@)
 set(GINKGO_BUILD_TESTS @GINKGO_BUILD_TESTS@)
@@ -61,27 +61,25 @@ set(GINKGO_IWYU_PATH @GINKGO_IWYU_PATH@)
 
 set(GINKGO_JACOBI_FULL_OPTIMIZATIONS @GINKGO_JACOBI_FULL_OPTIMIZATIONS@)
 
-set(GINKGO_CUDA_ARCHITECTURES @GINKGO_CUDA_ARCHITECTURES@)
-set(GINKGO_CUDA_DEFAULT_HOST_COMPILER @GINKGO_CUDA_DEFAULT_HOST_COMPILER@)
-set(GINKGO_CUDA_HOST_COMPILER @CMAKE_CUDA_HOST_COMPILER@)
-set(GINKGO_CUDA_ARCH_FLAGS @GINKGO_CUDA_ARCH_FLAGS@)
+set(GINKGO_CUDA_ARCHITECTURES "@CMAKE_CUDA_ARCHITECTURES@")
+set(GINKGO_CUDA_HOST_COMPILER "@CMAKE_CUDA_HOST_COMPILER@")
 
-set(GINKGO_HIP_COMPILER_FLAGS @GINKGO_HIP_COMPILER_FLAGS@)
-set(GINKGO_HIP_HCC_COMPILER_FLAGS @GINKGO_HIP_HCC_COMPILER_FLAGS@)
-set(GINKGO_HIP_NVCC_COMPILER_FLAGS @GINKGO_HIP_NVCC_COMPILER_FLAGS@)
-set(GINKGO_HIP_CLANG_COMPILER_FLAGS @GINKGO_HIP_CLANG_COMPILER_FLAGS@)
+set(GINKGO_HIP_COMPILER_FLAGS "@GINKGO_HIP_COMPILER_FLAGS@")
+set(GINKGO_HIP_HCC_COMPILER_FLAGS "@GINKGO_HIP_HCC_COMPILER_FLAGS@")
+set(GINKGO_HIP_NVCC_COMPILER_FLAGS "@GINKGO_HIP_NVCC_COMPILER_FLAGS@")
+set(GINKGO_HIP_CLANG_COMPILER_FLAGS "@GINKGO_HIP_CLANG_COMPILER_FLAGS@")
 set(GINKGO_HIP_PLATFORM @GINKGO_HIP_PLATFORM@)
-set(GINKGO_HIP_PLATFORM_AMD_REGEX @HIP_PLATFORM_AMD_REGEX@)
-set(GINKGO_HIP_PLATFORM_NVIDIA_REGEX @HIP_PLATFORM_NVIDIA_REGEX@)
-set(GINKGO_HIP_AMDGPU @GINKGO_HIP_AMDGPU@)
+set(GINKGO_HIP_PLATFORM_AMD_REGEX "@HIP_PLATFORM_AMD_REGEX@")
+set(GINKGO_HIP_PLATFORM_NVIDIA_REGEX "@HIP_PLATFORM_NVIDIA_REGEX@")
+set(GINKGO_HIP_AMDGPU "@GINKGO_HIP_AMDGPU@")
 set(GINKGO_HIP_VERSION @GINKGO_HIP_VERSION@)
-set(GINKGO_AMD_ARCH_FLAGS @GINKGO_AMD_ARCH_FLAGS@)
+set(GINKGO_AMD_ARCH_FLAGS "@GINKGO_AMD_ARCH_FLAGS@")
 
 set(GINKGO_DPCPP_VERSION @GINKGO_DPCPP_VERSION@)
 set(GINKGO_DPCPP_MAJOR_VERSION @GINKGO_DPCPP_MAJOR_VERSION@)
-set(GINKGO_DPCPP_FLAGS @GINKGO_DPCPP_FLAGS@)
-set(GINKGO_MKL_ROOT @GINKGO_MKL_ROOT@)
-set(GINKGO_DPL_ROOT @GINKGO_DPL_ROOT@)
+set(GINKGO_DPCPP_FLAGS "@GINKGO_DPCPP_FLAGS@")
+set(GINKGO_MKL_ROOT "@GINKGO_MKL_ROOT@")
+set(GINKGO_DPL_ROOT "@GINKGO_DPL_ROOT@")
 
 set(GINKGO_BUILD_MPI @GINKGO_BUILD_MPI@)
 
@@ -91,6 +89,14 @@ set(GINKGO_HAVE_HWLOC @GINKGO_HAVE_HWLOC@)
 
 set(GINKGO_HAVE_ROCTX @GINKGO_HAVE_ROCTX@)
 
+# Ginkgo compiler information
+set(GINKGO_CXX_COMPILER "@CMAKE_CXX_COMPILER@")
+set(GINKGO_CXX_COMPILER_SHORT "@CMAKE_CXX_COMPILER_ID@:@CMAKE_CXX_COMPILER_VERSION@")
+set(GINKGO_CUDA_COMPILER "@CMAKE_CUDA_COMPILER@")
+set(GINKGO_CUDA_COMPILER_SHORT "@CMAKE_CUDA_COMPILER_ID@:@CMAKE_CUDA_COMPILER_VERSION@")
+set(GINKGO_CUDA_HOST_COMPILER "@CMAKE_CUDA_HOST_COMPILER@")
+set(GINKGO_CUDA_HOST_COMPILER_SHORT "")  # dummy value to stay consistent
+
 # Ginkgo installation configuration
 set(GINKGO_INSTALL_PREFIX "@PACKAGE_CMAKE_INSTALL_PREFIX@")
 set(GINKGO_INSTALL_INCLUDE_DIR "@PACKAGE_CMAKE_INSTALL_FULL_INCLUDEDIR@")
@@ -107,7 +113,6 @@ if(GINKGO_BUILD_HIP)
 endif()
 list(APPEND CMAKE_PREFIX_PATH "${GINKGO_INSTALL_PREFIX}")
 
-
 set(GINKGO_INTERFACE_LINK_LIBRARIES "@GINKGO_INTERFACE_LINK_LIBRARIES@")
 set(GINKGO_INTERFACE_LINK_FLAGS "@GINKGO_INTERFACE_LINK_FLAGS@")
 set(GINKGO_INTERFACE_CXX_FLAGS "@GINKGO_INTERFACE_CXX_FLAGS@")
@@ -117,11 +122,6 @@ set(GINKGO_CUDA_COMPILER "@CMAKE_CUDA_COMPILER@")
 set(GINKGO_CUDA_COMPILER_VERSION @CMAKE_CUDA_COMPILER_VERSION@)
 set(GINKGO_CUDA_HOST_LINK_LAUNCHER "@CMAKE_CUDA_HOST_LINK_LAUNCHER@")
 
-set(GINKGO_CUBLAS_LIBRARIES @CUBLAS@)
-set(GINKGO_CUSPARSE_LIBRARIES @CUSPARSE@)
-set(GINKGO_CUDA_LIBRARIES @CUDA_RUNTIME_LIBS@)
-set(GINKGO_CUDA_TOOLKIT_INCLUDE_DIRECTORIES "@CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES@")
-
 set(GINKGO_CUDA_FLAGS "@CMAKE_CUDA_FLAGS_MODIFY@")
 set(GINKGO_CUDA_FLAGS_DEBUG "@CMAKE_CUDA_FLAGS_DEBUG_MODIFY@")
 set(GINKGO_CUDA_FLAGS_RELEASE "@CMAKE_CUDA_FLAGS_RELEASE_MODIFY@")
@@ -129,8 +129,8 @@ set(GINKGO_CUDA_FLAGS_RELEASE "@CMAKE_CUDA_FLAGS_RELEASE_MODIFY@")
 # OpenMP
 set(GINKGO_OPENMP_VERSION @OpenMP_CXX_VERSION@)
 
-set(GINKGO_OPENMP_LIB_NAMES @OpenMP_CXX_LIB_NAMES@)
-set(GINKGO_OPENMP_LIBRARIES @OpenMP_CXX_LIBRARIES@)
+set(GINKGO_OPENMP_LIB_NAMES "@OpenMP_CXX_LIB_NAMES@")
+set(GINKGO_OPENMP_LIBRARIES "@OpenMP_CXX_LIBRARIES@")
 
 set(GINKGO_OPENMP_FLAGS "@OpenMP_CXX_FLAGS@")
 
@@ -139,21 +139,14 @@ set(GINKGO_HAVE_VTUNE "@GINKGO_HAVE_VTUNE@")
 set(GINKGO_HAVE_METIS "@GINKGO_HAVE_METIS@")
 set(VTune_PATH "@VTune_PATH@")
 
+# ensure Threads settings 
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+
 # NOTE: we do not export benchmarks, examples, tests or devel tools
 #     so `third_party` libraries are currently unneeded.
 
-# propagate CUDA_HOST_COMPILER if needed
-if (GINKGO_BUILD_CUDA OR (GINKGO_BUILD_HIP
-    AND GINKGO_HIP_PLATFORM MATCHES "${GINKGO_HIP_PLATFORM_NVIDIA_REGEX}"))
-    if (GINKGO_CUDA_HOST_COMPILER AND NOT CMAKE_CUDA_HOST_COMPILER
-        AND EXISTS "${GINKGO_CUDA_HOST_COMPILER}")
-        message(STATUS "Ginkgo: Setting CUDA host compiler to ${GINKGO_CUDA_HOST_COMPILER}")
-        set(CMAKE_CUDA_HOST_COMPILER "${GINKGO_CUDA_HOST_COMPILER}" CACHE STRING "" FORCE)
-    endif()
-endif()
-
 if(GINKGO_HAVE_PAPI_SDE)
-    find_package(PAPI REQUIRED OPTIONAL_COMPONENTS sde)
+    find_package(PAPI REQUIRED COMPONENTS sde)
 endif()
 
 if(GINKGO_HAVE_HWLOC)
@@ -174,6 +167,7 @@ endif()
 # For details, see https://gitlab.kitware.com/cmake/cmake/issues/18614
 if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_CUDA)
     enable_language(CUDA)
+    find_package(CUDAToolkit REQUIRED)
     find_package(NVTX REQUIRED)
 endif()
 
@@ -190,7 +184,7 @@ if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_HIP)
     endif()
 endif()
 
-if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_DPCPP)
+if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_SYCL)
     find_package(MKL CONFIG REQUIRED HINTS "${GINKGO_MKL_ROOT}")
     find_package(oneDPL REQUIRED HINTS "${GINKGO_DPL_ROOT}")
 endif()
@@ -207,4 +201,20 @@ if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_HAVE_TAU)
     find_package(PerfStubs REQUIRED)
 endif()
 
+# Check that the same compilers as for Ginkgo are used
+function(_ginkgo_check_compiler lang)
+    if(NOT ${CMAKE_${lang}_COMPILER} STREQUAL ${GINKGO_${lang}_COMPILER})
+        set(_compiler_short "${CMAKE_${lang}_COMPILER_ID}:${CMAKE_${lang}_COMPILER_VERSION}")
+        if(NOT _compiler_short STREQUAL "${GINKGO_${lang}_COMPILER_SHORT}")
+            message(WARNING "The currently used ${lang} compiler: ${CMAKE_${lang}_COMPILER} does not match the compiler used to "
+                            "build Ginkgo: ${GINKGO_${lang}_COMPILER}. It is encouraged to use the same compiler as Ginkgo to prevent ABI mismatch.")
+        endif()
+    endif()
+endfunction()
+_ginkgo_check_compiler(CXX)
+if(GINKGO_BUILD_CUDA)
+    _ginkgo_check_compiler(CUDA)
+    _ginkgo_check_compiler(CUDA_HOST)
+endif()
+
 include(${CMAKE_CURRENT_LIST_DIR}/GinkgoTargets.cmake)
diff --git a/cmake/Modules/CudaArchitectureSelector.cmake b/cmake/Modules/CudaArchitectureSelector.cmake
index 63e8c767446..f863b144ab7 100644
--- a/cmake/Modules/CudaArchitectureSelector.cmake
+++ b/cmake/Modules/CudaArchitectureSelector.cmake
@@ -65,6 +65,15 @@
 # The command has the same result as ``cas_target_cuda_architectures``. It does 
 # not add the compiler flags to the target, but stores the compiler flags in 
 # the variable (string).
+# 
+#   cas_variable_cmake_cuda_architectures(
+#    [<variable>]               # variable for storing architecture list
+#    [<spec>]                   # list of architecture specifications
+#   )
+#
+# The command prepares an architecture list supported by the CMake
+# ``CUDA_ARCHITECTURES`` target property and ``CMAKE_CUDA_ARCHITECTURES``
+# variable. The architecture specification 
 #
 # 
 # ``ARCHITECTURES`` specification list
@@ -119,7 +128,7 @@
 # identifiers in this list will be removed from the list specified by the
 # ``ARCHITECTURES`` list. A warning will be printed for each removed entry.
 # The list also supports aggregates ``All``, ``Auto`` and GPU generation names
-# wich have the same meaning as in the ``ARCHITECTURES'' specification list.
+# which have the same meaning as in the ``ARCHITECTURES'' specification list.
 
 
 if(NOT DEFINED CMAKE_CUDA_COMPILER)
@@ -404,3 +413,34 @@ function(cas_variable_cuda_architectures variable)
     cas_get_compiler_flags(flags ${ARGN})
     set(${variable} "${flags}" PARENT_SCOPE)
 endfunction()
+
+
+function(cas_variable_cmake_cuda_architectures variable)
+    cas_get_supported_architectures(supported_archs)
+    if("${ARGN}" STREQUAL "All")
+        set(archs "${supported_archs}")
+    elseif("${ARGN}" STREQUAL "Auto")
+        cas_get_onboard_architectures(onboard_archs)
+        if (onboard_archs)
+            set(archs "${onboard_archs}")
+        else()
+            set(archs "${supported_archs}")
+        endif()
+    else()
+        set(archs)
+        foreach(arch IN LISTS ARGN)
+            if(arch MATCHES "${cas_spec_regex}")
+                if(CMAKE_MATCH_1)
+                    list(APPEND archs ${CMAKE_MATCH_1}-real)
+                endif()
+                if(CMAKE_MATCH_3)
+                    list(APPEND archs ${CMAKE_MATCH_3}-virtual)
+                endif()
+            else()
+                cas_get_architectures_by_name("${arch}" arch)
+                list(APPEND archs ${arch})
+            endif()
+        endforeach()
+    endif()
+    set("${variable}" "${archs}" PARENT_SCOPE)
+endfunction()
diff --git a/cmake/Modules/FindNVTX.cmake b/cmake/Modules/FindNVTX.cmake
index 7078c9dcb36..879c66f2d59 100644
--- a/cmake/Modules/FindNVTX.cmake
+++ b/cmake/Modules/FindNVTX.cmake
@@ -27,8 +27,8 @@
 # ``NVTX_FOUND``
 #   If false, do not try to use the NVTX library.
 
-find_path(NVTX3_INCLUDE_DIR NAMES nvToolsExt.h HINTS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/nvtx3)
-find_path(NVTX_INCLUDE_DIR NAMES nvToolsExt.h HINTS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+find_path(NVTX3_INCLUDE_DIR NAMES nvToolsExt.h HINTS ${CUDAToolkit_INCLUDE_DIRS}/nvtx3)
+find_path(NVTX_INCLUDE_DIR NAMES nvToolsExt.h HINTS ${CUDAToolkit_INCLUDE_DIRS})
 mark_as_advanced(NVTX3_INCLUDE_DIR)
 mark_as_advanced(NVTX_INCLUDE_DIR)
 include(FindPackageHandleStandardArgs)
diff --git a/cmake/Modules/FindPAPI.cmake b/cmake/Modules/FindPAPI.cmake
index 95f26a24684..04962970e35 100644
--- a/cmake/Modules/FindPAPI.cmake
+++ b/cmake/Modules/FindPAPI.cmake
@@ -57,6 +57,7 @@ if(NOT PAPI_LIBRARY)
     select_library_configurations(PAPI)
 endif()
 
+set(WORK_DIR "${PROJECT_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/FindPAPI")
 if(PAPI_INCLUDE_DIR)
     if(EXISTS "${PAPI_INCLUDE_DIR}/papi.h")
         file(STRINGS "${PAPI_INCLUDE_DIR}/papi.h" papi_version_str REGEX "^#define[\t ]+PAPI_VERSION[\t ]+.*")
@@ -70,7 +71,9 @@ if(PAPI_INCLUDE_DIR)
         # find the components
         enable_language(C)
         foreach(component IN LISTS PAPI_FIND_COMPONENTS)
-            file(WRITE "${PROJECT_BINARY_DIR}/papi_${component}_detect.c"
+            set(SRC_FILE "${WORK_DIR}/papi_${component}_detect.c")
+            set(BIN_FILE "${WORK_DIR}/papi_${component}_detect.bin")
+            file(WRITE "${SRC_FILE}"
                 "
                 #include <papi.h>
                 int main() {
@@ -78,17 +81,18 @@ if(PAPI_INCLUDE_DIR)
                  retval = PAPI_library_init(PAPI_VER_CURRENT);
                    if (retval != PAPI_VER_CURRENT && retval > 0)
                     return -1;
-                   if (PAPI_get_component_index(\"${component}\") < 0)
+                   if (PAPI_get_component_index(\"${component}\") == PAPI_ENOCMP)
                     return 0;
                    return 1;
                 }"
                 )
             try_run(PAPI_${component}_FOUND
                 gko_result_unused
-                "${PROJECT_BINARY_DIR}"
-                "${PROJECT_BINARY_DIR}/papi_${component}_detect.c"
+                "${WORK_DIR}"
+                "${SRC_FILE}"
                 CMAKE_FLAGS -DINCLUDE_DIRECTORIES=${PAPI_INCLUDE_DIR}
                 LINK_LIBRARIES ${PAPI_LIBRARY}
+                COPY_FILE ${BIN_FILE}
                 )
 
             if (NOT PAPI_${component}_FOUND EQUAL 1)
@@ -105,6 +109,33 @@ find_package_handle_standard_args(PAPI
                                   VERSION_VAR PAPI_VERSION_STRING
                                   HANDLE_COMPONENTS)
 
+if(PAPI_sde_FOUND)
+    # PAPI SDE is another library and header, let's try to find them
+    find_path(PAPI_SDE_INCLUDE_DIR NAMES sde_lib.h)
+    mark_as_advanced(PAPI_SDE_INCLUDE_DIR)
+
+    if(NOT PAPI_SDE_LIBRARY)
+        find_library(PAPI_SDE_LIBRARY_RELEASE NAMES
+            sde
+        )
+        mark_as_advanced(PAPI_SDE_LIBRARY_RELEASE)
+
+        find_library(PAPI_SDE_LIBRARY_DEBUG NAMES
+            sded
+            sde-d
+        )
+        mark_as_advanced(PAPI_SDE_LIBRARY_DEBUG)
+
+        include(SelectLibraryConfigurations)
+        select_library_configurations(PAPI_SDE)
+    endif()
+
+    # FIXME: with CMake>=3.17, use NAME_MISMATCHED to get rid of the warning
+    find_package_handle_standard_args(PAPI_SDE
+        REQUIRED_VARS PAPI_SDE_LIBRARY PAPI_SDE_INCLUDE_DIR
+        VERSION_VAR PAPI_VERSION_STRING)
+endif()
+
 if(PAPI_FOUND)
     set(PAPI_LIBRARIES ${PAPI_LIBRARY})
     set(PAPI_INCLUDE_DIRS ${PAPI_INCLUDE_DIR})
@@ -142,3 +173,41 @@ if(PAPI_FOUND)
         endif()
     endif()
 endif()
+
+if (PAPI_SDE_FOUND AND NOT TARGET PAPI::PAPI_SDE)
+    set(PAPI_SDE_LIBRARIES ${PAPI_SDE_LIBRARY})
+    set(PAPI_SDE_INCLUDE_DIRS ${PAPI_SDE_INCLUDE_DIR})
+    unset(PAPI_SDE_LIBRARY)
+    unset(PAPI_SDE_INCLUDE_DIR)
+
+    if(NOT TARGET PAPI::PAPI_SDE)
+        add_library(PAPI::PAPI_SDE UNKNOWN IMPORTED)
+        set_target_properties(PAPI::PAPI_SDE PROPERTIES
+            INTERFACE_INCLUDE_DIRECTORIES "${PAPI_SDE_INCLUDE_DIRS}")
+
+        if(EXISTS "${PAPI_SDE_LIBRARIES}")
+            set_target_properties(PAPI::PAPI_SDE PROPERTIES
+                IMPORTED_LINK_INTERFACE_LANGUAGES "C"
+                INTERFACE_LINK_LIBRARIES "${PAPI_SDE_LIBRARIES}"
+                IMPORTED_LOCATION "${PAPI_SDE_LIBRARIES}")
+        endif()
+        if(PAPI_SDE_LIBRARY_RELEASE)
+            set_property(TARGET PAPI::PAPI_SDE APPEND PROPERTY
+                IMPORTED_CONFIGURATIONS RELEASE)
+            set_target_properties(PAPI::PAPI_SDE PROPERTIES
+                IMPORTED_LINK_INTERFACE_LANGUAGES "C"
+                INTERFACE_LINK_LIBRARIES_RELEASE "${PAPI_SDE_LIBRARY_RELEASE}"
+                IMPORTED_LOCATION_RELEASE "${PAPI_SDE_LIBRARY_RELEASE}")
+            unset(PAPI_SDE_LIBRARY_RELEASE)
+        endif()
+        if(PAPI_SDE_LIBRARY_DEBUG)
+            set_property(TARGET PAPI::PAPI_SDE APPEND PROPERTY
+                IMPORTED_CONFIGURATIONS DEBUG)
+            set_target_properties(PAPI::PAPI_SDE PROPERTIES
+                IMPORTED_LINK_INTERFACE_LANGUAGES "C"
+                INTERFACE_LINK_LIBRARIES_DEBUG "${PAPI_SDE_LIBRARY_DEBUG}"
+                IMPORTED_LOCATION_DEBUG "${PAPI_SDE_LIBRARY_DEBUG}")
+            unset(PAPI_SDE_LIBRARY_DEBUG)
+        endif()
+    endif()
+endif()
diff --git a/cmake/autodetect_executors.cmake b/cmake/autodetect_executors.cmake
index 315e0eb3e38..757262f1ea1 100644
--- a/cmake/autodetect_executors.cmake
+++ b/cmake/autodetect_executors.cmake
@@ -1,7 +1,7 @@
 set(GINKGO_HAS_OMP OFF)
 set(GINKGO_HAS_MPI OFF)
 set(GINKGO_HAS_CUDA OFF)
-set(GINKGO_HAS_DPCPP OFF)
+set(GINKGO_HAS_SYCL OFF)
 set(GINKGO_HAS_HIP OFF)
 
 include(CheckLanguage)
@@ -37,12 +37,16 @@ if (NOT DEFINED GINKGO_BUILD_HIP)
     endif()
 endif()
 
-if (NOT DEFINED GINKGO_BUILD_DPCPP)
+if (NOT DEFINED GINKGO_BUILD_DPCPP AND NOT DEFINED GINKGO_BUILD_SYCL)
     try_compile(GKO_CAN_COMPILE_DPCPP ${PROJECT_BINARY_DIR}/dpcpp
         SOURCES ${PROJECT_SOURCE_DIR}/dpcpp/test_dpcpp.dp.cpp
+        # try_compile will pass the project CMAKE_CXX_FLAGS so passing -DCMAKE_CXX_FLAGS does not affect it.
+        # They append COMPILE_DEFINITIONS into CMAKE_CXX_FLAGS.
+        # Note. it is different from try_compile COMPILE_DEFINITIONS affect
+        CMAKE_FLAGS -DCOMPILE_DEFINITIONS=-fsycl
         CXX_STANDARD 17)
     if (GKO_CAN_COMPILE_DPCPP)
         message(STATUS "Enabling DPCPP executor")
-        set(GINKGO_HAS_DPCPP ON)
+        set(GINKGO_HAS_SYCL ON)
     endif()
 endif()
diff --git a/cmake/autodetect_system_libs.cmake b/cmake/autodetect_system_libs.cmake
new file mode 100644
index 00000000000..6f59a759aa8
--- /dev/null
+++ b/cmake/autodetect_system_libs.cmake
@@ -0,0 +1,7 @@
+if (NOT DEFINED GINKGO_BUILD_HWLOC)
+    find_package(HWLOC 2.1)
+endif()
+
+if (NOT DEFINED GINKGO_BUILD_PAPI_SDE)
+    find_package(PAPI 7.0.1.0 COMPONENTS sde)
+endif()
diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake
index a7b8c48acf3..34189a09450 100644
--- a/cmake/build_helpers.cmake
+++ b/cmake/build_helpers.cmake
@@ -139,7 +139,8 @@ function(ginkgo_extract_dpcpp_version DPCPP_COMPILER GINKGO_DPCPP_VERSION MACRO_
         "int main() {std::cout << ${MACRO_VAR} << '\\n'\;"
         "return 0\;}")
     file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver.cpp" ${DPCPP_VERSION_PROG})
-    execute_process(COMMAND ${DPCPP_COMPILER} ${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver.cpp
+    # we always add -fsycl
+    execute_process(COMMAND ${DPCPP_COMPILER} -fsycl ${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver.cpp
         -o ${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver
         ERROR_VARIABLE DPCPP_EXTRACT_VER_ERROR)
     execute_process(COMMAND ${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver
diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
index 3fbafe35858..522ad5f2ba7 100644
--- a/cmake/create_test.cmake
+++ b/cmake/create_test.cmake
@@ -1,26 +1,19 @@
-set(gko_test_single_args "MPI_SIZE")
+set(gko_test_resource_args "RESOURCE_LOCAL_CORES;RESOURCE_TYPE")
+set(gko_test_single_args "MPI_SIZE;${gko_test_resource_args}")
 set(gko_test_multi_args "DISABLE_EXECUTORS;ADDITIONAL_LIBRARIES;ADDITIONAL_INCLUDES")
+set(gko_test_option_args "NO_RESOURCES")
 
 ## Replaces / by _ to create valid target names from relative paths
 function(ginkgo_build_test_name test_name target_name)
     file(RELATIVE_PATH REL_BINARY_DIR
-        ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+         ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
     string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}")
     set(${target_name} ${TEST_TARGET_NAME} PARENT_SCOPE)
-endfunction(ginkgo_build_test_name)
-
-function(ginkgo_create_gtest_mpi_main)
-    add_library(gtest_mpi_main "")
-    target_sources(gtest_mpi_main
-      PRIVATE
-      ${PROJECT_SOURCE_DIR}/core/test/mpi/gtest/mpi_listener.cpp)
-    find_package(MPI 3.1 COMPONENTS CXX REQUIRED)
-    target_link_libraries(gtest_mpi_main PRIVATE GTest::GTest MPI::MPI_CXX)
-endfunction(ginkgo_create_gtest_mpi_main)
+endfunction()
 
 ## Set up shared target properties and handle ADDITIONAL_LIBRARIES/ADDITIONAL_INCLUDES
 ## `MPI_SIZE size` causes the tests to be run with `size` MPI processes.
-function(ginkgo_set_test_target_properties test_target_name)
+function(ginkgo_set_test_target_properties test_target_name test_library_suffix)
     cmake_parse_arguments(PARSE_ARGV 1 set_properties "" "${gko_test_single_args}" "${gko_test_multi_args}")
     if (GINKGO_FAST_TESTS)
         target_compile_definitions(${test_target_name} PRIVATE GINKGO_FAST_TESTS)
@@ -31,26 +24,56 @@ function(ginkgo_set_test_target_properties test_target_name)
     if (GINKGO_COMPILING_DPCPP_TEST AND GINKGO_DPCPP_SINGLE_MODE)
         target_compile_definitions(${test_target_name} PRIVATE GINKGO_DPCPP_SINGLE_MODE=1)
     endif()
-    if (GINKGO_CHECK_CIRCULAR_DEPS)
+    if(GINKGO_CHECK_CIRCULAR_DEPS)
         target_link_libraries(${test_target_name} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}")
     endif()
-    if (set_properties_MPI_SIZE)
-        if(NOT TARGET gtest_mpi_main)
-            ginkgo_create_gtest_mpi_main()
-        endif()
-        set(gtest_main gtest_mpi_main MPI::MPI_CXX)
+    if(set_properties_MPI_SIZE)
+        target_link_libraries(${test_target_name} PRIVATE ginkgo_gtest_main_mpi${test_library_suffix})
     else()
-        set(gtest_main GTest::Main)
+        target_link_libraries(${test_target_name} PRIVATE ginkgo_gtest_main${test_library_suffix})
     endif()
     target_compile_features(${test_target_name} PUBLIC cxx_std_14)
     target_compile_options(${test_target_name} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${GINKGO_COMPILER_FLAGS}>)
     target_include_directories(${test_target_name} PRIVATE ${Ginkgo_BINARY_DIR} ${set_properties_ADDITIONAL_INCLUDES})
-    target_link_libraries(${test_target_name} PRIVATE ginkgo ${gtest_main} GTest::GTest ${set_properties_ADDITIONAL_LIBRARIES})
+    target_link_libraries(${test_target_name} PRIVATE ginkgo GTest::GTest ${set_properties_ADDITIONAL_LIBRARIES})
+endfunction()
+
+function(ginkgo_add_resource_requirement test_name)
+    cmake_parse_arguments(PARSE_ARGV 1 add_rr "${gko_test_option_args}" "${gko_test_single_args}" "")
+    if(add_rr_NO_RESOURCES OR (NOT add_rr_RESOURCE_TYPE))
+        return()
+    endif ()
+
+    if(add_rr_RESOURCE_TYPE STREQUAL "cpu")
+        if(NOT add_rr_RESOURCE_LOCAL_CORES)
+            set(add_rr_RESOURCE_LOCAL_CORES ${GINKGO_CI_TEST_OMP_PARALLELISM})
+        endif()
+        if(NOT add_rr_RESOURCE_LOCAL_CORES MATCHES "^[0-9]+")
+            message(FATAL_ERROR "Resource specification is invalid: RESOURCE_LOCAL_CORES=${add_rr_RESOURCE_LOCAL_CORES}")
+        endif()
+
+        set(single_resource "cpu:${add_rr_RESOURCE_LOCAL_CORES}")
+    elseif(add_rr_RESOURCE_TYPE MATCHES "^(cudagpu|hipgpu|sycl)$")
+        set(single_resource "${add_rr_RESOURCE_TYPE}:1")
+    else()
+        message(FATAL_ERROR "Unrecognized resource type ${add_rr_RESOURCE_TYPE}, allowed are: cpu, cudagpu, hipgpu, sycl.")
+    endif()
+
+    if(NOT add_rr_MPI_SIZE)
+        set(add_rr_MPI_SIZE 1)
+    endif()
+    set_property(TEST ${test_name}
+                 PROPERTY
+                 RESOURCE_GROUPS "${add_rr_MPI_SIZE},${single_resource}")
 endfunction()
 
+
 ## Adds a test to the list executed by ctest and sets its output binary name
 ## Possible additional arguments:
 ## - `MPI_SIZE size` causes the tests to be run with `size` MPI processes.
+## - `RESOURCE_LOCAL_CORES` the number of threads used by a test, default is
+##    $GINKGO_CI_TEST_OMP_PARALLELISM
+## - `RESOURCE_TYPE` the resource type, can be cpu, cudagpu, hipgpu, sycl
 ## - `DISABLE_EXECUTORS exec1 exec2` disables the test for certain backends (if built for multiple)
 ## - `ADDITIONAL_LIBRARIES lib1 lib2` adds additional target link dependencies
 ## - `ADDITIONAL_INCLUDES path1 path2` adds additional target include paths
@@ -71,6 +94,9 @@ function(ginkgo_add_test test_name test_target_name)
                  COMMAND ${test_target_name}
                  WORKING_DIRECTORY "$<TARGET_FILE_DIR:ginkgo>")
     endif()
+
+    ginkgo_add_resource_requirement(${REL_BINARY_DIR}/${test_name} ${ARGN})
+
     set(test_preload)
     if (GINKGO_TEST_NONDEFAULT_STREAM AND GINKGO_BUILD_CUDA)
         set(test_preload $<TARGET_FILE:identify_stream_usage_cuda>:${test_preload})
@@ -87,8 +113,8 @@ endfunction()
 function(ginkgo_create_test test_name)
     ginkgo_build_test_name(${test_name} test_target_name)
     add_executable(${test_target_name} ${test_name}.cpp)
-    target_link_libraries(${test_target_name} PRIVATE ${create_test_ADDITIONAL_LIBRARIES})
-    ginkgo_set_test_target_properties(${test_target_name} ${ARGN})
+    target_link_libraries(${test_target_name})
+    ginkgo_set_test_target_properties(${test_target_name} "_cpu" ${ARGN})
     ginkgo_add_test(${test_name} ${test_target_name} ${ARGN})
 endfunction(ginkgo_create_test)
 
@@ -98,9 +124,10 @@ function(ginkgo_create_dpcpp_test test_name)
     add_executable(${test_target_name} ${test_name}.dp.cpp)
     target_compile_features(${test_target_name} PUBLIC cxx_std_17)
     target_compile_options(${test_target_name} PRIVATE ${GINKGO_DPCPP_FLAGS})
+    gko_add_sycl_to_target(TARGET ${test_target_name} SOURCES ${test_name}.dp.cpp)
     target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel)
-    ginkgo_set_test_target_properties(${test_target_name} ${ARGN})
-    ginkgo_add_test(${test_name} ${test_target_name} ${ARGN})
+    ginkgo_set_test_target_properties(${test_target_name} "_dpcpp" ${ARGN})
+    ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE sycl)
     # Note: MKL_ENV is empty on linux. Maybe need to apply MKL_ENV to all test.
     if (MKL_ENV)
         set_tests_properties(${test_target_name} PROPERTIES ENVIRONMENT "${MKL_ENV}")
@@ -119,7 +146,6 @@ function(ginkgo_create_cuda_test_internal test_name filename test_target_name)
     target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_CUDA)
     target_compile_options(${test_target_name}
         PRIVATE
-            $<$<COMPILE_LANGUAGE:CUDA>:${GINKGO_CUDA_ARCH_FLAGS}>
             $<$<COMPILE_LANGUAGE:CUDA>:${GINKGO_CUDA_COMPILER_FLAGS}>)
     if(MSVC)
         target_compile_options(${test_target_name}
@@ -134,8 +160,8 @@ function(ginkgo_create_cuda_test_internal test_name filename test_target_name)
     if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
         set_target_properties(${test_target_name} PROPERTIES CUDA_ARCHITECTURES OFF)
     endif()
-    ginkgo_set_test_target_properties(${test_target_name} ${ARGN})
-    ginkgo_add_test(${test_name} ${test_target_name} ${ARGN})
+    ginkgo_set_test_target_properties(${test_target_name} "_cuda" ${ARGN})
+    ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE cudagpu)
 endfunction(ginkgo_create_cuda_test_internal)
 
 ## Test compiled with HIP
@@ -190,10 +216,26 @@ function(ginkgo_create_hip_test_internal test_name filename test_target_name add
         ${hiprand_INCLUDE_DIRS}
         ${HIPSPARSE_INCLUDE_DIRS}
         )
-    ginkgo_set_test_target_properties(${test_target_name} ${ARGN})
-    ginkgo_add_test(${test_name} ${test_target_name} ${ARGN})
+    ginkgo_set_test_target_properties(${test_target_name} "_hip" ${ARGN})
+    ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE hipgpu)
 endfunction(ginkgo_create_hip_test_internal)
 
+
+## Test compiled with OpenMP
+function(ginkgo_create_omp_test test_name)
+    ginkgo_build_test_name(${test_name} test_target_name)
+    ginkgo_create_omp_test_internal(${test_name} ${test_name}.cpp ${test_target_name} "" ${ARGN})
+endfunction()
+
+function(ginkgo_create_omp_test_internal test_name filename test_target_name)
+    ginkgo_build_test_name(${test_name} test_target_name)
+    add_executable(${test_target_name} ${test_name}.cpp)
+    target_compile_definitions(${test_target_name} PRIVATE GKO_COMPILING_OMP)
+    target_link_libraries(${test_target_name} PRIVATE OpenMP::OpenMP_CXX)
+    ginkgo_set_test_target_properties(${test_target_name} "_omp" ${ARGN})
+    ginkgo_add_test(${test_name} ${test_target_name} ${ARGN} RESOURCE_TYPE cpu)
+endfunction()
+
 ## Common test compiled with the host compiler, one target for each enabled backend
 function(ginkgo_create_common_test test_name)
     if(GINKGO_BUILD_OMP)
@@ -205,7 +247,7 @@ function(ginkgo_create_common_test test_name)
     if(GINKGO_BUILD_CUDA)
         ginkgo_create_common_test_internal(${test_name} CudaExecutor cuda ${ARGN})
     endif()
-    if(GINKGO_BUILD_DPCPP)
+    if(GINKGO_BUILD_SYCL)
         ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN})
     endif()
 endfunction(ginkgo_create_common_test)
@@ -215,11 +257,29 @@ function(ginkgo_create_common_test_internal test_name exec_type exec)
     if(exec IN_LIST common_test_DISABLE_EXECUTORS)
         return()
     endif()
+    if (exec STREQUAL reference)
+        set(test_resource_type "")
+    elseif (exec STREQUAL omp)
+        set(test_resource_type cpu)
+    elseif (exec STREQUAL cuda)
+        set(test_resource_type cudagpu)
+    elseif (exec STREQUAL hip)
+        set(test_resource_type hipgpu)
+    else ()
+        set(test_resource_type sycl)
+    endif ()
     ginkgo_build_test_name(${test_name} test_target_name)
     string(TOUPPER ${exec} exec_upper)
+
     # set up actual test
     set(test_target_name ${test_target_name}_${exec})
     add_executable(${test_target_name} ${test_name}.cpp)
+
+    # also need to add runtime libraries for other backends
+    if (exec STREQUAL omp)
+        target_link_libraries(${test_target_name} PRIVATE OpenMP::OpenMP_CXX)
+    endif ()
+
     target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=${exec_type} EXEC_NAMESPACE=${exec} GKO_COMPILING_${exec_upper})
     target_link_libraries(${test_target_name} PRIVATE ${common_test_ADDITIONAL_LIBRARIES})
     # use float for DPC++ if necessary
@@ -227,18 +287,21 @@ function(ginkgo_create_common_test_internal test_name exec_type exec)
         target_compile_definitions(${test_target_name} PRIVATE GINKGO_COMMON_SINGLE_MODE=1)
         target_compile_definitions(${test_target_name} PRIVATE GINKGO_DPCPP_SINGLE_MODE=1)
     endif()
-    ginkgo_set_test_target_properties(${test_target_name} ${ARGN})
-    ginkgo_add_test(${test_name}_${exec} ${test_target_name} ${ARGN})
+    ginkgo_set_test_target_properties(${test_target_name} "_${exec}" ${ARGN})
+    ginkgo_add_test(${test_name}_${exec} ${test_target_name} ${ARGN} RESOURCE_TYPE ${test_resource_type})
 endfunction(ginkgo_create_common_test_internal)
 
 ## Common test compiled with the device compiler, one target for each enabled backend
 function(ginkgo_create_common_device_test test_name)
     cmake_parse_arguments(PARSE_ARGV 1 common_device_test "" "${gko_test_single_args}" "${gko_test_multi_args}")
     ginkgo_build_test_name(${test_name} test_target_name)
-    if(GINKGO_BUILD_DPCPP)
+    if(GINKGO_BUILD_SYCL)
         ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN})
         target_compile_features(${test_target_name}_dpcpp PRIVATE cxx_std_17)
         target_compile_options(${test_target_name}_dpcpp PRIVATE ${GINKGO_DPCPP_FLAGS})
+        # We need to use a new file to avoid sycl setting in other backends because add_sycl_to_target will change the source property.
+        configure_file(${test_name}.cpp ${test_name}.dp.cpp COPYONLY)
+        gko_add_sycl_to_target(TARGET ${test_target_name}_dpcpp SOURCES ${test_name}.dp.cpp)
         target_link_options(${test_target_name}_dpcpp PRIVATE -fsycl-device-lib=all -fsycl-device-code-split=per_kernel)
     endif()
     if(GINKGO_BUILD_OMP)
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index c5ba334e983..2e1c82db6b0 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -1,82 +1,16 @@
 enable_language(CUDA)
 
-if(MSVC)
-    # MSVC can not find CUDA automatically
-    # Use CUDA_COMPILER PATH to define the CUDA TOOLKIT ROOT DIR
-    string(REPLACE "/bin/nvcc.exe" "" CMAKE_CUDA_ROOT_DIR ${CMAKE_CUDA_COMPILER})
-    if("${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" STREQUAL "")
-        set(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES "${CMAKE_CUDA_ROOT_DIR}/include")
-    endif()
-    if("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" STREQUAL "")
-        set(CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES "${CMAKE_CUDA_ROOT_DIR}/lib/x64")
-    endif()
-endif()
+find_package(CUDAToolkit REQUIRED)
 
 include(cmake/Modules/CudaArchitectureSelector.cmake)
 
-set(CUDA_INCLUDE_DIRS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
-
-# Detect the CUDA architecture flags and propagate to all the project
-cas_variable_cuda_architectures(GINKGO_CUDA_ARCH_FLAGS
-    ARCHITECTURES ${GINKGO_CUDA_ARCHITECTURES}
-    UNSUPPORTED "20" "21")
-
-if (CMAKE_CXX_COMPILER_ID MATCHES "PGI|NVHPC")
-    find_package(NVHPC REQUIRED
-        HINTS
-        $ENV{NVIDIA_PATH}
-        ${CMAKE_CUDA_COMPILER}/../../..
-        )
-
-    set(CUDA_RUNTIME_LIBS_DYNAMIC ${NVHPC_CUDART_LIBRARY})
-    set(CUDA_RUNTIME_LIBS_STATIC ${NVHPC_CUDART_LIBRARY_STATIC})
-    set(CUBLAS ${NVHPC_CUBLAS_LIBRARY})
-    set(CUSPARSE ${NVHPC_CUSPARSE_LIBRARY})
-    set(CURAND ${NVHPC_CURAND_LIBRARY})
-    set(CUFFT ${NVHPC_CUFFT_LIBRARY})
-else()
-    find_library(CUDA_RUNTIME_LIBS_DYNAMIC cudart
-        HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
-    find_library(CUDA_RUNTIME_LIBS_STATIC cudart_static
-        HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
-
-    # CUDA 10.1/10.2 put cublas, cublasLt, cudnn in /usr/lib/<arch>-linux-gnu/, but
-    # others (<= 10.0 or >= 11) put them in cuda own directory
-    # If the environment installs several cuda including 10.1/10.2, cmake will find
-    # the 10.1/10.2 .so files when searching others cuda in the default path.
-    # CMake already puts /usr/lib/<arch>-linux-gnu/ after cuda own directory in the
-    # `CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES`, so we always put NO_DEFAULT_PATH here.
-    find_library(CUBLAS cublas
-        HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} NO_DEFAULT_PATH)
-    find_library(CUSPARSE cusparse
-        HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
-    find_library(CURAND curand
-        HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
-    find_library(CUFFT cufft
-        HINT ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+if(NOT CMAKE_CUDA_ARCHITECTURES)
+    # Detect the CUDA architecture and propagate it to the entire project
+    cas_variable_cmake_cuda_architectures(CMAKE_CUDA_ARCHITECTURES ${GINKGO_CUDA_ARCHITECTURES})
 endif()
 
 find_package(NVTX REQUIRED)
 
-# MSVC nvcc uses static cudartlibrary by default, and other platforms use shared cudartlibrary.
-# add `-cudart shared` or `-cudart=shared` according system into CMAKE_CUDA_FLAGS
-# to force nvcc to use dynamic cudart library in MSVC.
-if(MSVC)
-    if("${CMAKE_CUDA_FLAGS}" MATCHES "-cudart(=| )shared")
-        set(CUDA_RUNTIME_LIBS "${CUDA_RUNTIME_LIBS_DYNAMIC}" CACHE STRING "Path to a library" FORCE)
-    else()
-        set(CUDA_RUNTIME_LIBS "${CUDA_RUNTIME_LIBS_STATIC}" CACHE STRING "Path to a library" FORCE)
-    endif()
-else()
-    set(CUDA_RUNTIME_LIBS "${CUDA_RUNTIME_LIBS_DYNAMIC}" CACHE STRING "Path to a library" FORCE)
-endif()
-
-if (NOT CMAKE_CUDA_HOST_COMPILER AND NOT GINKGO_CUDA_DEFAULT_HOST_COMPILER)
-    set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE STRING "" FORCE)
-elseif(GINKGO_CUDA_DEFAULT_HOST_COMPILER)
-    unset(CMAKE_CUDA_HOST_COMPILER CACHE)
-endif()
-
 if(CMAKE_CUDA_HOST_COMPILER AND NOT CMAKE_CXX_COMPILER STREQUAL CMAKE_CUDA_HOST_COMPILER)
     message(WARNING "The CMake CXX compiler and CUDA host compiler do not match. "
         "If you encounter any build error, especially while linking, try to use "
@@ -84,13 +18,3 @@ if(CMAKE_CUDA_HOST_COMPILER AND NOT CMAKE_CXX_COMPILER STREQUAL CMAKE_CUDA_HOST_
         "The CXX compiler is ${CMAKE_CXX_COMPILER} with version ${CMAKE_CXX_COMPILER_VERSION}.\n"
         "The CUDA host compiler is ${CMAKE_CUDA_HOST_COMPILER}.")
 endif()
-
-if (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION
-    MATCHES "9.2" AND CMAKE_CUDA_HOST_COMPILER MATCHES ".*clang.*" )
-    ginkgo_extract_clang_version(${CMAKE_CUDA_HOST_COMPILER} GINKGO_CUDA_HOST_CLANG_VERSION)
-
-    if (GINKGO_CUDA_HOST_CLANG_VERSION MATCHES "5\.0.*")
-        message(FATAL_ERROR "There is a bug between nvcc 9.2 and clang 5.0 which create a compiling issue."
-            "Consider using a different CUDA host compiler or CUDA version.")
-    endif()
-endif()
diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake
index 2cf8dd06c3f..6b904189151 100644
--- a/cmake/get_info.cmake
+++ b/cmake/get_info.cmake
@@ -127,7 +127,7 @@ foreach(log_type ${log_types})
     ginkgo_print_module_footer(${${log_type}} "User configuration:")
     ginkgo_print_module_footer(${${log_type}} "  Enabled modules:")
     ginkgo_print_foreach_variable(${${log_type}}
-        "GINKGO_BUILD_OMP;GINKGO_BUILD_MPI;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_DPCPP")
+        "GINKGO_BUILD_OMP;GINKGO_BUILD_MPI;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_SYCL")
     ginkgo_print_module_footer(${${log_type}} "  Enabled features:")
     ginkgo_print_foreach_variable(${${log_type}}
         "GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI")
@@ -167,7 +167,7 @@ IF(GINKGO_BUILD_HIP)
     include(hip/get_info.cmake)
 ENDIF()
 
-IF(GINKGO_BUILD_DPCPP)
+IF(GINKGO_BUILD_SYCL)
     include(dpcpp/get_info.cmake)
 ENDIF()
 
@@ -190,16 +190,21 @@ ginkgo_print_module_footer(${detailed_log} "")
 
 ginkgo_print_generic_header(${minimal_log} "  Components:")
 ginkgo_print_generic_header(${detailed_log} "  Components:")
-if(PAPI_sde_FOUND)
+ginkgo_print_variable(${minimal_log} "GINKGO_BUILD_PAPI_SDE")
+ginkgo_print_variable(${detailed_log} "GINKGO_BUILD_PAPI_SDE")
+if(TARGET PAPI::PAPI)
     ginkgo_print_variable(${detailed_log} "PAPI_VERSION")
     ginkgo_print_variable(${detailed_log} "PAPI_INCLUDE_DIR")
     ginkgo_print_flags(${detailed_log} "PAPI_LIBRARY")
 endif()
+
 ginkgo_print_variable(${minimal_log} "GINKGO_BUILD_HWLOC")
 ginkgo_print_variable(${detailed_log} "GINKGO_BUILD_HWLOC")
-ginkgo_print_variable(${detailed_log} "HWLOC_VERSION")
-ginkgo_print_variable(${detailed_log} "HWLOC_LIBRARIES")
-ginkgo_print_variable(${detailed_log} "HWLOC_INCLUDE_DIRS")
+if(TARGET hwloc)
+    ginkgo_print_variable(${detailed_log} "HWLOC_VERSION")
+    ginkgo_print_variable(${detailed_log} "HWLOC_LIBRARIES")
+    ginkgo_print_variable(${detailed_log} "HWLOC_INCLUDE_DIRS")
+endif()
 
 _minimal(
     "
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index 1b9aa0e8723..72a7a3a86d8 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -22,11 +22,6 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.21)
     set(CMAKE_HIP_ARCHITECTURES OFF)
 endif()
 
-if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}"
-    AND GINKGO_BUILD_CUDA AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 9.2)
-    message(FATAL_ERROR "Ginkgo HIP backend requires CUDA >= 9.2.")
-endif()
-
 if(NOT DEFINED ROCM_PATH)
     if(DEFINED ENV{ROCM_PATH})
         set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCM has been installed")
@@ -185,11 +180,6 @@ endif()
 
 set(GINKGO_HIP_NVCC_ARCH "")
 if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}")
-    if (NOT CMAKE_CUDA_HOST_COMPILER AND NOT GINKGO_CUDA_DEFAULT_HOST_COMPILER)
-        set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE STRING "" FORCE)
-    elseif(GINKGO_CUDA_DEFAULT_HOST_COMPILER)
-        unset(CMAKE_CUDA_HOST_COMPILER CACHE)
-    endif()
     if (CMAKE_CUDA_HOST_COMPILER)
         list(APPEND GINKGO_HIP_NVCC_ADDITIONAL_FLAGS "-ccbin=${CMAKE_CUDA_HOST_COMPILER}")
     endif()
@@ -197,16 +187,6 @@ if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}")
     # Remove false positive CUDA warnings when calling one<T>() and zero<T>()
     list(APPEND GINKGO_HIP_NVCC_ADDITIONAL_FLAGS --expt-relaxed-constexpr --expt-extended-lambda)
 
-    if (GINKGO_HIP_PLATFROM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}"
-            AND CMAKE_CUDA_COMPILER_VERSION MATCHES "9.2"
-            AND CMAKE_CUDA_HOST_COMPILER MATCHES ".*clang.*" )
-        ginkgo_extract_clang_version(${CMAKE_CUDA_HOST_COMPILER} GINKGO_CUDA_HOST_CLANG_VERSION)
-
-        if (GINKGO_CUDA_HOST_CLANG_VERSION MATCHES "5\.0.*")
-            message(FATAL_ERROR "There is a bug between nvcc 9.2 and clang 5.0 which create a compiling issue."
-                "Consider using a different CUDA host compiler or CUDA version.")
-        endif()
-    endif()
     # select GPU architecture    
     include(cmake/Modules/CudaArchitectureSelector.cmake)
     cas_variable_cuda_architectures(GINKGO_HIP_NVCC_ARCH
@@ -227,7 +207,7 @@ set(GINKGO_HIPCC_OPTIONS ${GINKGO_HIP_COMPILER_FLAGS} "-std=c++14 -DGKO_COMPILIN
 set(GINKGO_HIP_NVCC_OPTIONS ${GINKGO_HIP_NVCC_COMPILER_FLAGS} ${GINKGO_HIP_NVCC_ARCH} ${GINKGO_HIP_NVCC_ADDITIONAL_FLAGS})
 set(GINKGO_HIP_CLANG_OPTIONS ${GINKGO_HIP_CLANG_COMPILER_FLAGS} ${GINKGO_AMD_ARCH_FLAGS})
 if(GINKGO_HIP_AMD_UNSAFE_ATOMIC AND HIP_VERSION VERSION_GREATER_EQUAL 5)
-    list(APPEND GINKGO_HIP_CLANG_OPTIONS -munsafe-fp-atomics)
+    list(APPEND GINKGO_HIP_CLANG_OPTIONS "-munsafe-fp-atomics -Wno-unused-command-line-argument")
 endif()
 # HIP's cmake support secretly carries around global state to remember
 # whether we created any shared libraries, and sets PIC flags accordingly.
diff --git a/cmake/information_helpers.cmake b/cmake/information_helpers.cmake
index 8bed7320caa..7ac7fdfeda5 100644
--- a/cmake/information_helpers.cmake
+++ b/cmake/information_helpers.cmake
@@ -78,11 +78,7 @@ macro(ginkgo_interface_libraries_recursively INTERFACE_LIBS)
             list(TRANSFORM GINKGO_LIBS_INTERFACE_LIBS REPLACE "\\$<LINK_ONLY:(.*)>" "\\1")
             ginkgo_interface_libraries_recursively("${GINKGO_LIBS_INTERFACE_LIBS}")
         elseif(EXISTS "${_libs}")
-            if ("${_libs}" MATCHES "${PROJECT_BINARY_DIR}.*hwloc.so")
-                list(APPEND GINKGO_INTERFACE_LIBS_FOUND "${CMAKE_INSTALL_FULL_LIBDIR}/libhwloc.so")
-            else()
-                list(APPEND GINKGO_INTERFACE_LIBS_FOUND "${_libs}")
-            endif()
+            list(APPEND GINKGO_INTERFACE_LIBS_FOUND "${_libs}")
         elseif("${_libs}" STREQUAL "${CMAKE_DL_LIBS}")
             list(APPEND GINKGO_INTERFACE_LIBS_FOUND "-l${_libs}")
         endif()
@@ -103,7 +99,7 @@ macro(ginkgo_interface_information)
     get_target_property(GINKGO_INTERFACE_LINK_LIBRARIES ginkgo INTERFACE_LINK_LIBRARIES)
     ginkgo_interface_libraries_recursively("${GINKGO_INTERFACE_LINK_LIBRARIES}")
     # Format and store the interface libraries found
-    # remove duplicates on the reversed list to keep the dependecy in the end of list.
+    # remove duplicates on the reversed list to keep the dependency in the end of list.
     list(REVERSE GINKGO_INTERFACE_LIBS_FOUND)
     list(REMOVE_DUPLICATES GINKGO_INTERFACE_LIBS_FOUND)
     list(REVERSE GINKGO_INTERFACE_LIBS_FOUND)
diff --git a/cmake/install_helpers.cmake b/cmake/install_helpers.cmake
index 58cc730bb14..601fc89a3db 100644
--- a/cmake/install_helpers.cmake
+++ b/cmake/install_helpers.cmake
@@ -30,10 +30,6 @@ function(ginkgo_add_install_rpath name)
     endif()
     if (GINKGO_INSTALL_RPATH_DEPENDENCIES)
         set(RPATH_DEPENDENCIES "${ARGN}")
-        if(GINKGO_HAVE_HWLOC AND HWLOC_FOUND)
-            get_filename_component(HWLOC_LIB_PATH ${HWLOC_LIBRARIES} DIRECTORY)
-            list(APPEND RPATH_DEPENDENCIES "${HWLOC_LIBRARIES}")
-        endif()
     endif()
     if (GINKGO_INSTALL_RPATH)
         set_property(TARGET "${name}" PROPERTY INSTALL_RPATH
@@ -80,11 +76,6 @@ function(ginkgo_install)
     install(FILES "${Ginkgo_BINARY_DIR}/include/ginkgo/config.hpp"
         DESTINATION "${CMAKE_INSTALL_FULL_INCLUDEDIR}/ginkgo"
         )
-    if (GINKGO_HAVE_PAPI_SDE)
-        install(FILES "${Ginkgo_SOURCE_DIR}/third_party/papi_sde/papi_sde_interface.h"
-            DESTINATION "${CMAKE_INSTALL_FULL_INCLUDEDIR}/third_party/papi_sde"
-            )
-    endif()
 
     if  (GINKGO_HAVE_HWLOC AND NOT HWLOC_FOUND)
         get_filename_component(HWLOC_LIB_PATH ${HWLOC_LIBRARIES} DIRECTORY)
diff --git a/cmake/openmpi_test.cpp b/cmake/openmpi_test.cpp
index 3b6f33dd5d0..94b2774503b 100644
--- a/cmake/openmpi_test.cpp
+++ b/cmake/openmpi_test.cpp
@@ -38,11 +38,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 int main()
 {
-#if defined(OPEN_MPI) && OPEN_MPI
-    std::printf("%d.%d.%d", OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
-                OMPI_RELEASE_VERSION);
-    return 1;
+#if CHECK_HAS_OPEN_MPI && defined(OPEN_MPI) && OPEN_MPI
+    static_assert(true, "Check availability of OpenMPI");
+#elif CHECK_OPEN_MPI_VERSION && defined(OPEN_MPI) && OPEN_MPI
+    static_assert(OMPI_MAJOR_VERSION > 4 ||
+                      (OMPI_MAJOR_VERSION == 4 && OMPI_MINOR_VERSION >= 1),
+                  "Check OpenMPI version.");
 #else
-    return 0;
+    static_assert(false, "No OpenMPI available");
 #endif
 }
diff --git a/cmake/package_helpers.cmake b/cmake/package_helpers.cmake
deleted file mode 100644
index e1d196ad553..00000000000
--- a/cmake/package_helpers.cmake
+++ /dev/null
@@ -1,59 +0,0 @@
-set(NON_CMAKE_PACKAGE_DOWNLOADER_SCRIPT
-    "${CMAKE_CURRENT_LIST_DIR}/DownloadNonCMakeCMakeLists.txt.in")
-
-
-#   Load a package from the url provided and run configure (Non-CMake projects)
-#
-#   \param package_name     Name of the package
-#   \param package_url      Url of the package
-#   \param package_tag      Tag or version of the package to be downloaded.
-#   \param config_command   The command for the configuration step.
-#
-function(ginkgo_load_and_configure_package package_name package_url package_hash config_command)
-    set(GINKGO_THIRD_PARTY_BUILD_TYPE "Debug")
-    if (CMAKE_BUILD_TYPE MATCHES "[Rr][Ee][Ll][Ee][Aa][Ss][Ee]")
-        set(GINKGO_THIRD_PARTY_BUILD_TYPE "Release")
-    endif()
-    configure_file(${NON_CMAKE_PACKAGE_DOWNLOADER_SCRIPT}
-        download/CMakeLists.txt)
-    set(TOOLSET "")
-    if (NOT "${CMAKE_GENERATOR_TOOLSET}" STREQUAL "")
-        set(TOOLSET "-T${CMAKE_GENERATOR_TOOLSET}")
-    endif()
-    execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" "${TOOLSET}" .
-        RESULT_VARIABLE result
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/download)
-    if(result)
-        message(FATAL_ERROR
-            "CMake step for ${package_name}/download failed: ${result}")
-        return()
-    endif()
-    execute_process(COMMAND ${CMAKE_COMMAND} --build .
-        RESULT_VARIABLE result
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/download)
-    if(result)
-        message(FATAL_ERROR
-            "Build step for ${package_name}/download failed: ${result}")
-        return()
-    endif()
-endfunction()
-
-
-#   Download a file and verify the download
-#
-#   \param url          The url of file to be downloaded
-#   \param filename     The name of the file
-#   \param hash_type    The type of hash, See CMake file() documentation for more details.
-#   \param hash         The hash itself, See CMake file() documentation for more details.
-#
-function(ginkgo_download_file url filename hash_type hash)
-    file(DOWNLOAD ${url} ${filename}
-        TIMEOUT 60  # seconds
-        EXPECTED_HASH "${hash_type}=${hash}"
-        TLS_VERIFY ON)
-    if(EXISTS ${filename})
-        message(STATUS "${filename} downloaded from ${url}")
-    else()
-        message(FATAL_ERROR "Download of ${filename} failed.")
-    endif()
-endfunction(ginkgo_download_file)
diff --git a/cmake/rename.cmake b/cmake/rename.cmake
new file mode 100644
index 00000000000..6c386bc24c6
--- /dev/null
+++ b/cmake/rename.cmake
@@ -0,0 +1,20 @@
+# Only for CACHE variable (option)
+macro(gko_rename_cache deprecated actual type doc_string)
+    if(DEFINED ${deprecated})
+        if(DEFINED ${actual})
+            message("actual ${actual} and deprecated ${deprecated}")
+            if("${${actual}}" STREQUAL "${${deprecated}}")
+                # They are the same, so only throw warning
+                message(WARNING "${deprecated} was deprecated, please only use ${actual} instead.")
+            else()
+                # They are different
+                message(FATAL_ERROR "Both ${deprecated} and ${actual} were specified differently, please only use ${actual} instead.")
+            endif()
+        else()
+            # Only set `deprecated`, move it to `actual`.
+            message(WARNING "${deprecated} was deprecated, please use ${actual} instead.  "
+                "We copy ${${deprecated}} to ${actual}")
+            set(${actual} ${${deprecated}} CACHE ${type} "${doc_string}")
+        endif()
+    endif()
+endmacro()
\ No newline at end of file
diff --git a/cmake/sycl.cmake b/cmake/sycl.cmake
new file mode 100644
index 00000000000..b0f4eab91f1
--- /dev/null
+++ b/cmake/sycl.cmake
@@ -0,0 +1,33 @@
+# IntelSYCL for dpcpp and icpx if the config is existed and cmake reaches the requirement
+if(CMAKE_CXX_COMPILER MATCHES "dpcpp|icpx")
+    if(CMAKE_HOST_WIN32 AND CMAKE_VERSION VERSION_GREATER_EQUAL 3.25)
+        find_package(IntelSYCL QUIET)
+    elseif(CMAKE_VERSION VERSION_GREATER_EQUAL 3.20.5)
+        find_package(IntelSYCL QUIET)
+    endif()
+endif()
+# If we do not have the config from compiler, try to set components to make it work.
+if(NOT COMMAND add_sycl_to_target) 
+    if(NOT DEFINED SYCL_FLAGS)
+        set(SYCL_FLAGS "-fsycl" CACHE STRING "SYCL flags for compiler")
+    endif()
+endif()
+
+# Provide a uniform way for those package without add_sycl_to_target
+function(gko_add_sycl_to_target)
+    if(COMMAND add_sycl_to_target)
+        add_sycl_to_target(${ARGN})
+        return()
+    endif()
+    # We handle them by adding SYCL_FLAGS to compile and link to the target
+    set(one_value_args TARGET)
+    set(multi_value_args SOURCES)
+    cmake_parse_arguments(SYCL
+        ""
+        "${one_value_args}"
+        "${multi_value_args}"
+        ${ARGN})
+    target_compile_options(${SYCL_TARGET} PRIVATE "${SYCL_FLAGS}")
+    target_link_options(${SYCL_TARGET} PRIVATE "${SYCL_FLAGS}")
+endfunction()
+
diff --git a/cmake/template_instantiation.cmake b/cmake/template_instantiation.cmake
new file mode 100644
index 00000000000..f77527e0092
--- /dev/null
+++ b/cmake/template_instantiation.cmake
@@ -0,0 +1,81 @@
+function(add_instantiation_files source_dir source_file output_files_var)
+    # if instantiation is disabled, compile the file directly
+    if(NOT GINKGO_SPLIT_TEMPLATE_INSTANTIATIONS)
+        set(${output_files_var} "${source_dir}/${source_file}" PARENT_SCOPE)
+        return()
+    endif()
+    # read full file into variable
+    set(source_path "${source_dir}/${source_file}")
+    file(READ "${source_path}" file_contents)
+    # escape semicolons and use them for line separation
+    string(REPLACE ";" "<semicolon>" file_contents "${file_contents}")
+    string(REGEX REPLACE "[\r\n]" ";" file_contents "${file_contents}")
+    # find location of // begin|split|end comments
+    set(begin_location)
+    set(end_location)
+    set(split_locations)
+    list(LENGTH file_contents total_length)
+    set(counter 0)
+    foreach(line IN LISTS file_contents)
+        if(line MATCHES "// begin")
+            if(begin_location)
+                message(FATAL_ERROR "Duplicate begin in line ${counter}, first found in ${begin_location}")
+            endif()
+            set(begin_location ${counter})
+        elseif(line MATCHES "// split")
+            if((NOT begin_location) OR end_location)
+                message(FATAL_ERROR "Found split outside begin/end in line ${counter}")
+            endif()
+            list(APPEND split_locations ${counter})
+        elseif(line MATCHES "// end")
+            if(end_location)
+                message(FATAL_ERROR "Duplicate end in line ${counter}, first found in ${end_location}")
+            endif()
+            set(end_location ${counter})
+        endif()
+        math(EXPR counter "${counter} + 1")
+    endforeach()
+    if (NOT (begin_location AND end_location AND split_locations))
+        message(FATAL_ERROR "Nothing to split")
+    endif()
+    if (begin_location GREATER_EQUAL end_location)
+        message(FATAL_ERROR "Incorrect begin/end order")
+    endif()
+    # determine which lines belong to the header and footer
+    set(range_begins ${begin_location} ${split_locations})
+    set(range_ends ${split_locations} ${end_location})
+    list(LENGTH split_locations range_count_minus_one)
+    math(EXPR length_header "${begin_location}")
+    math(EXPR end_location_past "${end_location} + 1")
+    math(EXPR length_footer "${total_length} - ${end_location_past}")
+    list(SUBLIST file_contents 0 ${length_header} header)
+    list(SUBLIST file_contents ${end_location_past} ${length_footer} footer)
+    set(output_files)
+    # for each range between // begin|split|end pairs
+    foreach(range RANGE 0 ${range_count_minus_one})
+        # create an output filename
+        string(REGEX REPLACE "(\.hip\.cpp|\.dp\.cpp|\.cpp|\.cu)$" ".${range}\\1" target_file "${source_file}")
+        set(target_path "${CMAKE_CURRENT_BINARY_DIR}/${target_file}")
+        list(APPEND output_files "${target_path}")
+        # extract the range between the comments
+        list(GET range_begins ${range} begin)
+        list(GET range_ends ${range} end)
+        math(EXPR begin "${begin} + 1")
+        math(EXPR length "${end} - ${begin}")
+        list(SUBLIST file_contents ${begin} ${length} content)
+        # concatenate header, content and footer and turn semicolons into newlines
+        string(REPLACE ";" "\n" content "${header};${content};${footer}")
+        # and escaped semicolons into regular semicolons again
+        string(REPLACE "<semicolon>" ";" content "${content}")
+        # create a .tmp file, but only copy it over if source file changed
+        # this way, we don't rebuild unnecessarily
+        file(WRITE "${target_path}.tmp" "${content}")
+        add_custom_command(
+            OUTPUT "${target_path}"
+            COMMAND ${CMAKE_COMMAND} -E copy "${target_path}.tmp" "${target_path}"
+            MAIN_DEPENDENCY "${source_path}")
+    endforeach()
+    # make sure cmake gets called when the source file was updated
+    set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${source_path}")
+    set(${output_files_var} ${output_files} PARENT_SCOPE)
+endfunction()
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 3a7cb1ceb15..77bdd7230b9 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -1,31 +1,2 @@
-set(UNIFIED_SOURCES
-    base/device_matrix_data_kernels.cpp
-    base/index_set_kernels.cpp
-    components/absolute_array_kernels.cpp
-    components/fill_array_kernels.cpp
-    components/format_conversion_kernels.cpp
-    components/precision_conversion_kernels.cpp
-    components/reduce_array_kernels.cpp
-    distributed/partition_kernels.cpp
-    matrix/coo_kernels.cpp
-    matrix/csr_kernels.cpp
-    matrix/dense_kernels.cpp
-    matrix/ell_kernels.cpp
-    matrix/hybrid_kernels.cpp
-    matrix/sellp_kernels.cpp
-    matrix/sparsity_csr_kernels.cpp
-    matrix/diagonal_kernels.cpp
-    multigrid/pgm_kernels.cpp
-    preconditioner/jacobi_kernels.cpp
-    solver/bicg_kernels.cpp
-    solver/bicgstab_kernels.cpp
-    solver/cg_kernels.cpp
-    solver/cgs_kernels.cpp
-    solver/common_gmres_kernels.cpp
-    solver/fcg_kernels.cpp
-    solver/gcr_kernels.cpp
-    solver/gmres_kernels.cpp
-    solver/ir_kernels.cpp
-    )
-list(TRANSFORM UNIFIED_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/unified/)
-set(GKO_UNIFIED_COMMON_SOURCES ${UNIFIED_SOURCES} PARENT_SCOPE)
+add_subdirectory(unified)
+set(GKO_UNIFIED_COMMON_SOURCES ${GKO_UNIFIED_COMMON_SOURCES} PARENT_SCOPE)
diff --git a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc
new file mode 100644
index 00000000000..6c0c5363baa
--- /dev/null
+++ b/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc
@@ -0,0 +1,150 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+template <typename ValueType>
+void scale(std::shared_ptr<const DefaultExecutor> exec,
+           const batch::MultiVector<ValueType>* const alpha,
+           batch::MultiVector<ValueType>* const x)
+{
+    const auto num_blocks = x->get_num_batch_items();
+    const auto alpha_ub = get_batch_struct(alpha);
+    const auto x_ub = get_batch_struct(x);
+    if (alpha->get_common_size()[1] == 1) {
+        scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+            alpha_ub, x_ub, [] __device__(int col) { return 0; });
+    } else {
+        scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+            alpha_ub, x_ub, [] __device__(int col) { return col; });
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL);
+
+
+template <typename ValueType>
+void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
+                const batch::MultiVector<ValueType>* const alpha,
+                const batch::MultiVector<ValueType>* const x,
+                batch::MultiVector<ValueType>* const y)
+{
+    const auto num_blocks = x->get_num_batch_items();
+    const size_type nrhs = x->get_common_size()[1];
+    const auto alpha_ub = get_batch_struct(alpha);
+    const auto x_ub = get_batch_struct(x);
+    const auto y_ub = get_batch_struct(y);
+    if (alpha->get_common_size()[1] == 1) {
+        add_scaled_kernel<<<num_blocks, default_block_size, 0,
+                            exec->get_stream()>>>(
+            alpha_ub, x_ub, y_ub, [] __device__(int col) { return 0; });
+    } else {
+        add_scaled_kernel<<<num_blocks, default_block_size, 0,
+                            exec->get_stream()>>>(
+            alpha_ub, x_ub, y_ub, [] __device__(int col) { return col; });
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL);
+
+
+template <typename ValueType>
+void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
+                 const batch::MultiVector<ValueType>* x,
+                 const batch::MultiVector<ValueType>* y,
+                 batch::MultiVector<ValueType>* result)
+{
+    const auto num_blocks = x->get_num_batch_items();
+    const auto num_rhs = x->get_common_size()[1];
+    const auto x_ub = get_batch_struct(x);
+    const auto y_ub = get_batch_struct(y);
+    const auto res_ub = get_batch_struct(result);
+    compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
+                                     exec->get_stream()>>>(
+        x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL);
+
+
+template <typename ValueType>
+void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
+                      const batch::MultiVector<ValueType>* x,
+                      const batch::MultiVector<ValueType>* y,
+                      batch::MultiVector<ValueType>* result)
+{
+    const auto num_blocks = x->get_num_batch_items();
+    const auto num_rhs = x->get_common_size()[1];
+    const auto x_ub = get_batch_struct(x);
+    const auto y_ub = get_batch_struct(y);
+    const auto res_ub = get_batch_struct(result);
+    compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
+                                     exec->get_stream()>>>(
+        x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL);
+
+
+template <typename ValueType>
+void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
+                   const batch::MultiVector<ValueType>* const x,
+                   batch::MultiVector<remove_complex<ValueType>>* const result)
+{
+    const auto num_blocks = x->get_num_batch_items();
+    const auto num_rhs = x->get_common_size()[1];
+    const auto x_ub = get_batch_struct(x);
+    const auto res_ub = get_batch_struct(result);
+    compute_norm2_kernel<<<num_blocks, default_block_size, 0,
+                           exec->get_stream()>>>(x_ub, res_ub);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL);
+
+
+template <typename ValueType>
+void copy(std::shared_ptr<const DefaultExecutor> exec,
+          const batch::MultiVector<ValueType>* x,
+          batch::MultiVector<ValueType>* result)
+{
+    const auto num_blocks = x->get_num_batch_items();
+    const auto result_ub = get_batch_struct(result);
+    const auto x_ub = get_batch_struct(x);
+    copy_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        x_ub, result_ub);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
new file mode 100644
index 00000000000..cb157d80fd5
--- /dev/null
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
@@ -0,0 +1,327 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+template <typename ValueType, typename Mapping>
+__device__ __forceinline__ void scale(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+    const gko::batch::multi_vector::batch_item<ValueType>& x, Mapping map)
+{
+    const int max_li = x.num_rows * x.num_rhs;
+    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
+        const int row = li / x.num_rhs;
+        const int col = li % x.num_rhs;
+
+        x.values[row * x.stride + col] =
+            alpha.values[map(col)] * x.values[row * x.stride + col];
+    }
+}
+
+template <typename ValueType, typename Mapping>
+__global__
+__launch_bounds__(default_block_size, sm_oversubscription) void scale_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x, Mapping map)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        scale(alpha_b, x_b, map);
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__device__ __forceinline__ void add_scaled(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<ValueType>& y, Mapping map)
+{
+    const int max_li = x.num_rows * x.num_rhs;
+    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
+        const int row = li / x.num_rhs;
+        const int col = li % x.num_rhs;
+
+        y.values[row * y.stride + col] +=
+            alpha.values[map(col)] * x.values[row * x.stride + col];
+    }
+}
+
+template <typename ValueType, typename Mapping>
+__global__ __launch_bounds__(
+    default_block_size,
+    sm_oversubscription) void add_scaled_kernel(const gko::batch::multi_vector::
+                                                    uniform_batch<
+                                                        const ValueType>
+                                                        alpha,
+                                                const gko::batch::multi_vector::
+                                                    uniform_batch<
+                                                        const ValueType>
+                                                        x,
+                                                const gko::batch::multi_vector::
+                                                    uniform_batch<ValueType>
+                                                        y,
+                                                Mapping map)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
+        add_scaled(alpha_b, x_b, y_b, map);
+    }
+}
+
+
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void single_rhs_compute_conj_dot(Group subgroup,
+                                                            const int num_rows,
+                                                            const ValueType* x,
+                                                            const ValueType* y,
+                                                            ValueType& result)
+
+{
+    ValueType val = zero<ValueType>();
+    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
+        val += conj(x[r]) * y[r];
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<ValueType>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result = val;
+    }
+}
+
+
+template <typename Group, typename ValueType, typename Mapping>
+__device__ __forceinline__ void gen_one_dot(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<const ValueType>& y,
+    const int rhs_index,
+    const gko::batch::multi_vector::batch_item<ValueType>& result,
+    Group subgroup, Mapping conj_map)
+{
+    ValueType val = zero<ValueType>();
+
+    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
+        val += conj_map(x.values[r * x.stride + rhs_index]) *
+               y.values[r * y.stride + rhs_index];
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<ValueType>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result.values[rhs_index] = val;
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__device__ __forceinline__ void compute_gen_dot_product(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<const ValueType>& y,
+    const gko::batch::multi_vector::batch_item<ValueType>& result,
+    Mapping conj_map)
+{
+    constexpr auto tile_size = config::warp_size;
+    auto thread_block = group::this_thread_block();
+    auto subgroup = group::tiled_partition<tile_size>(thread_block);
+    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
+    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
+
+    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
+         rhs_index += num_subgroups_per_block) {
+        gen_one_dot(x, y, rhs_index, result, subgroup, conj_map);
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__global__
+__launch_bounds__(default_block_size, sm_oversubscription) void compute_gen_dot_product_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> y,
+    const gko::batch::multi_vector::uniform_batch<ValueType> result,
+    Mapping map)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
+        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
+        compute_gen_dot_product(x_b, y_b, r_b, map);
+    }
+}
+
+
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void single_rhs_compute_norm2(
+    Group subgroup, const int num_rows, const ValueType* x,
+    remove_complex<ValueType>& result)
+{
+    using real_type = typename gko::remove_complex<ValueType>;
+    real_type val = zero<real_type>();
+
+    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
+        val += squared_norm(x[r]);
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result = sqrt(val);
+    }
+}
+
+
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void one_norm2(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const int rhs_index,
+    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
+        result,
+    Group subgroup)
+{
+    using real_type = typename gko::remove_complex<ValueType>;
+    real_type val = zero<real_type>();
+
+    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
+        val += squared_norm(x.values[r * x.stride + rhs_index]);
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result.values[rhs_index] = sqrt(val);
+    }
+}
+
+
+/**
+ * Computes the 2-norms of some column vectors in global or shared memory.
+ *
+ * @param x  A row-major multivector with nrhs columns.
+ * @param result  Holds norm value for each vector in x.
+ */
+template <typename ValueType>
+__device__ __forceinline__ void compute_norm2(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
+        result)
+{
+    constexpr auto tile_size = config::warp_size;
+    auto thread_block = group::this_thread_block();
+    auto subgroup = group::tiled_partition<tile_size>(thread_block);
+    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
+    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
+
+    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
+         rhs_index += num_subgroups_per_block) {
+        one_norm2(x, rhs_index, result, subgroup);
+    }
+}
+
+
+template <typename ValueType>
+__global__ __launch_bounds__(
+    default_block_size,
+    sm_oversubscription) void compute_norm2_kernel(const gko::batch::
+                                                       multi_vector::
+                                                           uniform_batch<
+                                                               const ValueType>
+                                                               x,
+                                                   const gko::batch::
+                                                       multi_vector::
+                                                           uniform_batch<
+                                                               remove_complex<
+                                                                   ValueType>>
+                                                               result)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
+        compute_norm2(x_b, r_b);
+    }
+}
+
+
+template <typename ValueType>
+__device__ __forceinline__ void single_rhs_copy(const int num_rows,
+                                                const ValueType* in,
+                                                ValueType* out)
+{
+    for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) {
+        out[iz] = in[iz];
+    }
+}
+
+
+/**
+ * Copies the values of one multi-vector into another.
+ *
+ * Note that the output multi-vector should already have memory allocated
+ * and stride set.
+ */
+template <typename ValueType>
+__device__ __forceinline__ void copy(
+    const gko::batch::multi_vector::batch_item<const ValueType>& in,
+    const gko::batch::multi_vector::batch_item<ValueType>& out)
+{
+    for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs;
+         iz += blockDim.x) {
+        const int i = iz / in.num_rhs;
+        const int j = iz % in.num_rhs;
+        out.values[i * out.stride + j] = in.values[i * in.stride + j];
+    }
+}
+
+
+template <typename ValueType>
+__global__
+__launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> src,
+    const gko::batch::multi_vector::uniform_batch<ValueType> dst)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto dst_b = gko::batch::extract_batch_item(dst, batch_id);
+        const auto src_b = gko::batch::extract_batch_item(src, batch_id);
+        copy(src_b, dst_b);
+    }
+}
diff --git a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc b/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc
index 5930902ed37..faf0ad15146 100644
--- a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc
+++ b/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc
@@ -35,19 +35,13 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
                   array<ValueType>& values, array<IndexType>& row_idxs,
                   array<IndexType>& col_idxs)
 {
-    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
-    // due to overly generic assignment operator and constructor leading to
-    // ambiguities. So we need to use our own fake_complex type
-    using device_value_type = device_member_type<ValueType>;
-    auto value_ptr =
-        reinterpret_cast<const device_value_type*>(values.get_const_data());
+    using device_value_type = device_type<ValueType>;
+    auto value_ptr = as_device_type(values.get_const_data());
     auto size = values.get_num_elems();
     // count nonzeros
-    auto nnz =
-        thrust::count_if(thrust_policy(exec), value_ptr, value_ptr + size,
-                         [] __device__(device_value_type value) {
-                             return is_nonzero(fake_complex_unpack(value));
-                         });
+    auto nnz = thrust::count_if(
+        thrust_policy(exec), value_ptr, value_ptr + size,
+        [] __device__(device_value_type value) { return is_nonzero(value); });
     if (nnz < size) {
         using tuple_type =
             thrust::tuple<IndexType, IndexType, device_value_type>;
@@ -58,14 +52,13 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
         // copy nonzeros
         auto it = thrust::make_zip_iterator(thrust::make_tuple(
             row_idxs.get_const_data(), col_idxs.get_const_data(), value_ptr));
-        auto out_it = thrust::make_zip_iterator(thrust::make_tuple(
-            new_row_idxs.get_data(), new_col_idxs.get_data(),
-            reinterpret_cast<device_value_type*>(new_values.get_data())));
-        thrust::copy_if(
-            thrust_policy(exec), it, it + size, out_it,
-            [] __device__(tuple_type entry) {
-                return is_nonzero(fake_complex_unpack(thrust::get<2>(entry)));
-            });
+        auto out_it = thrust::make_zip_iterator(
+            thrust::make_tuple(new_row_idxs.get_data(), new_col_idxs.get_data(),
+                               as_device_type(new_values.get_data())));
+        thrust::copy_if(thrust_policy(exec), it, it + size, out_it,
+                        [] __device__(tuple_type entry) {
+                            return is_nonzero(thrust::get<2>(entry));
+                        });
         // swap out storage
         values = std::move(new_values);
         row_idxs = std::move(new_row_idxs);
@@ -82,7 +75,6 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec, size_type,
                     array<ValueType>& values, array<IndexType>& row_idxs,
                     array<IndexType>& col_idxs)
 {
-    using device_value_type = device_member_type<ValueType>;
     const auto size = values.get_num_elems();
     const auto rows = row_idxs.get_const_data();
     const auto cols = col_idxs.get_const_data();
@@ -104,12 +96,10 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec, size_type,
         // reduce duplicates
         auto in_locs =
             thrust::make_zip_iterator(thrust::make_tuple(rows, cols));
-        auto in_vals =
-            reinterpret_cast<const device_value_type*>(values.get_const_data());
+        auto in_vals = as_device_type(values.get_const_data());
         auto out_locs = thrust::make_zip_iterator(thrust::make_tuple(
             new_row_idxs.get_data(), new_col_idxs.get_data()));
-        auto out_vals =
-            reinterpret_cast<device_value_type*>(new_values.get_data());
+        auto out_vals = as_device_type(new_values.get_data());
         thrust::reduce_by_key(thrust_policy(exec), in_locs, in_locs + size,
                               in_vals, out_locs, out_vals);
         // swap out storage
@@ -127,13 +117,9 @@ template <typename ValueType, typename IndexType>
 void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
                     device_matrix_data<ValueType, IndexType>& data)
 {
-    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
-    // due to overly generic assignment operator and constructor leading to
-    // ambiguities. So we need to use our own fake_complex type
-    using device_value_type = device_member_type<ValueType>;
     auto it = thrust::make_zip_iterator(
         thrust::make_tuple(data.get_row_idxs(), data.get_col_idxs()));
-    auto vals = reinterpret_cast<device_value_type*>(data.get_values());
+    auto vals = as_device_type(data.get_values());
     thrust::sort_by_key(thrust_policy(exec), it, it + data.get_num_elems(),
                         vals);
 }
diff --git a/common/cuda_hip/base/executor.hpp.inc b/common/cuda_hip/base/executor.hpp.inc
index 7e71a3e24c0..ad641ecea5b 100644
--- a/common/cuda_hip/base/executor.hpp.inc
+++ b/common/cuda_hip/base/executor.hpp.inc
@@ -40,7 +40,7 @@ inline int convert_sm_ver_to_cores(int major, int minor)
     // Defines for GPU Architecture types (using the SM version to determine
     // the # of cores per SM
     typedef struct {
-        int SM;  // 0xMm (hexidecimal notation), M = SM Major version,
+        int SM;  // 0xMm (hexadecimal notation), M = SM Major version,
         // and m = SM minor version
         int Cores;
     } sSMtoCores;
diff --git a/common/cuda_hip/components/volatile.hpp.inc b/common/cuda_hip/components/memory.hpp.inc
similarity index 100%
rename from common/cuda_hip/components/volatile.hpp.inc
rename to common/cuda_hip/components/memory.hpp.inc
diff --git a/common/cuda_hip/components/segment_scan.hpp.inc b/common/cuda_hip/components/segment_scan.hpp.inc
index 947c2c3afd7..584f44b6415 100644
--- a/common/cuda_hip/components/segment_scan.hpp.inc
+++ b/common/cuda_hip/components/segment_scan.hpp.inc
@@ -33,7 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /**
  * @internal
  *
- * Compute a segement scan using add operation (+) of a subwarp. Each segment
+ * Compute a segment scan using add operation (+) of a subwarp. Each segment
  * performs suffix sum. Works on the source array and returns whether the thread
  * is the first element of its segment with same `ind`.
  */
diff --git a/common/cuda_hip/components/syncfree.hpp.inc b/common/cuda_hip/components/syncfree.hpp.inc
index 6b6dcc70f24..a8fa767e4dd 100644
--- a/common/cuda_hip/components/syncfree.hpp.inc
+++ b/common/cuda_hip/components/syncfree.hpp.inc
@@ -93,18 +93,18 @@ public:
         const auto dep_block = dependency / (block_size / subwarp_size);
         const auto dep_local = dependency % (block_size / subwarp_size);
         // assert(dependency < work_id);
-        if (dep_block == block_id) {
-            // wait for a local dependency
-            while (!load(local.status, dep_local)) {
-                __threadfence();
-            }
-        } else {
-            // wait for a global dependency
-            while (!load(global.status, dependency)) {
-                __threadfence();
+        if (get_lane() == 0) {
+            if (dep_block == block_id) {
+                // wait for a local dependency
+                while (!load_acquire_shared(local.status + dep_local)) {
+                }
+            } else {
+                // wait for a global dependency
+                while (!load_acquire(global.status + dependency)) {
+                }
             }
         }
-        __threadfence();
+        group::tiled_partition<subwarp_size>(group::this_thread_block()).sync();
     }
 
     __device__ __forceinline__ bool peek(IndexType dependency)
@@ -114,27 +114,22 @@ public:
         // assert(dependency < work_id);
         if (dep_block == block_id) {
             // peek at a local dependency
-            auto finished = load(local.status, dep_local) != 0;
-            __threadfence();
-            return finished;
+            return load_acquire_shared(local.status + dep_local);
         } else {
             // peek at a global dependency
-            auto finished = load(global.status, dependency);
-            __threadfence();
-            return finished;
+            return load_acquire(global.status + dependency);
         }
     }
 
     __device__ __forceinline__ void mark_ready()
     {
         group::tiled_partition<subwarp_size>(group::this_thread_block()).sync();
-        __threadfence();
         if (get_lane() == 0) {
             const auto sh_id = get_work_id() % (block_size / subwarp_size);
             // notify local warps
-            store(local.status, sh_id, 1);
+            store_release_shared(local.status + sh_id, 1);
             // notify other blocks
-            store(global.status, get_work_id(), 1);
+            store_release(global.status + get_work_id(), 1);
         }
     }
 
diff --git a/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc b/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc
new file mode 100644
index 00000000000..f92794ec138
--- /dev/null
+++ b/common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc
@@ -0,0 +1,54 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+template <typename GlobalIndexType>
+void sort_by_range_start(
+    std::shared_ptr<const DefaultExecutor> exec,
+    array<GlobalIndexType>& range_start_ends,
+    array<experimental::distributed::comm_index_type>& part_ids)
+{
+    auto num_ranges = range_start_ends.get_num_elems() / 2;
+    auto strided_indices = thrust::make_transform_iterator(
+        thrust::make_counting_iterator(0),
+        [] __host__ __device__(const int i) { return 2 * i; });
+    auto start_it = thrust::make_permutation_iterator(
+        range_start_ends.get_data(), strided_indices);
+    auto end_it = thrust::make_permutation_iterator(
+        range_start_ends.get_data() + 1, strided_indices);
+    auto zip_it = thrust::make_zip_iterator(
+        thrust::make_tuple(end_it, part_ids.get_data()));
+    thrust::stable_sort_by_key(thrust_policy(exec), start_it,
+                               start_it + num_ranges, zip_it);
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
+    GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START);
diff --git a/common/cuda_hip/factorization/cholesky_kernels.hpp.inc b/common/cuda_hip/factorization/cholesky_kernels.hpp.inc
index f87969a7ad0..eb90127a8ca 100644
--- a/common/cuda_hip/factorization/cholesky_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/cholesky_kernels.hpp.inc
@@ -149,8 +149,6 @@ __global__ __launch_bounds__(default_block_size) void symbolic_factorize(
 template <typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void factorize(
     const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ cols,
-    const IndexType* __restrict__ elim_tree_child_ptrs,
-    const IndexType* __restrict__ elim_tree_children,
     const IndexType* __restrict__ storage_offsets,
     const int32* __restrict__ storage, const int64* __restrict__ row_descs,
     const IndexType* __restrict__ diag_idxs,
@@ -171,32 +169,21 @@ __global__ __launch_bounds__(default_block_size) void factorize(
     const auto row_begin = row_ptrs[row];
     const auto row_diag = diag_idxs[row];
     const auto row_end = row_ptrs[row + 1];
-    const auto child_begin = elim_tree_child_ptrs[row];
-    const auto child_end = elim_tree_child_ptrs[row + 1];
     gko::matrix::csr::device_sparsity_lookup<IndexType> lookup{
         row_ptrs, cols,      storage_offsets,
         storage,  row_descs, static_cast<size_type>(row)};
-    for (auto child = child_begin; child < child_end; child++) {
-        const auto dep = elim_tree_children[child];
-        scheduler.wait(dep);
-        // TODO evaluate parallel waiting with __all_sync
-    }
-    // for each lower triangular entry: eliminate with corresponding row
+    // for each lower triangular entry: eliminate with corresponding column
     for (auto lower_nz = row_begin; lower_nz < row_diag; lower_nz++) {
         const auto dep = cols[lower_nz];
-        auto val = vals[lower_nz];
+        scheduler.wait(dep);
+        const auto scale = vals[lower_nz];
         const auto diag_idx = diag_idxs[dep];
         const auto dep_end = row_ptrs[dep + 1];
-        const auto diag = vals[diag_idx];
-        const auto scale = val / diag;
-        if (lane == 0) {
-            vals[lower_nz] = scale;
-        }
-        // subtract all entries past the diagonal
-        for (auto upper_nz = diag_idx + 1 + lane; upper_nz < dep_end;
+        // subtract column dep from current column
+        for (auto upper_nz = diag_idx + lane; upper_nz < dep_end;
              upper_nz += config::warp_size) {
             const auto upper_col = cols[upper_nz];
-            if (upper_col < row) {
+            if (upper_col >= row) {
                 const auto upper_val = vals[upper_nz];
                 const auto output_pos =
                     lookup.lookup_unsafe(upper_col) + row_begin;
@@ -204,17 +191,16 @@ __global__ __launch_bounds__(default_block_size) void factorize(
             }
         }
     }
-    ValueType sum{};
-    for (auto lower_nz = row_begin + lane; lower_nz < row_diag;
-         lower_nz += config::warp_size) {
-        sum += squared_norm(vals[lower_nz]);
-        // copy the lower triangular entries to the transpose
-        vals[transpose_idxs[lower_nz]] = conj(vals[lower_nz]);
+    auto diag_val = sqrt(vals[row_diag]);
+    for (auto upper_nz = row_diag + 1 + lane; upper_nz < row_end;
+         upper_nz += config::warp_size) {
+        vals[upper_nz] /= diag_val;
+        // copy the upper triangular entries to the transpose
+        vals[transpose_idxs[upper_nz]] = conj(vals[upper_nz]);
     }
-    sum = reduce(warp, sum, thrust::plus<ValueType>{});
     if (lane == 0) {
         // store computed diagonal
-        vals[row_diag] = sqrt(vals[row_diag] - sum);
+        vals[row_diag] = diag_val;
     }
     scheduler.mark_ready();
 }
@@ -365,10 +351,9 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
         kernel::factorize<<<num_blocks, default_block_size, 0,
                             exec->get_stream()>>>(
             factors->get_const_row_ptrs(), factors->get_const_col_idxs(),
-            forest.child_ptrs.get_const_data(),
-            forest.children.get_const_data(), lookup_offsets, lookup_storage,
-            lookup_descs, diag_idxs, transpose_idxs,
-            as_device_type(factors->get_values()), storage, num_rows);
+            lookup_offsets, lookup_storage, lookup_descs, diag_idxs,
+            transpose_idxs, as_device_type(factors->get_values()), storage,
+            num_rows);
     }
 }
 
diff --git a/common/cuda_hip/factorization/lu_kernels.hpp.inc b/common/cuda_hip/factorization/lu_kernels.hpp.inc
index f3db34b3631..1503ede4be3 100644
--- a/common/cuda_hip/factorization/lu_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/lu_kernels.hpp.inc
@@ -106,7 +106,10 @@ __global__ __launch_bounds__(default_block_size) void factorize(
     // for each lower triangular entry: eliminate with corresponding row
     for (auto lower_nz = row_begin; lower_nz < row_diag; lower_nz++) {
         const auto dep = cols[lower_nz];
-        auto val = vals[lower_nz];
+        // we can load the value before synchronizing because the following
+        // updates only go past the diagonal of the dependency row, i.e. at
+        // least column dep + 1
+        const auto val = vals[lower_nz];
         const auto diag_idx = diag_idxs[dep];
         const auto dep_end = row_ptrs[dep + 1];
         scheduler.wait(dep);
@@ -128,6 +131,88 @@ __global__ __launch_bounds__(default_block_size) void factorize(
 }
 
 
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void symbolic_factorize_simple(
+    const IndexType* __restrict__ mtx_row_ptrs,
+    const IndexType* __restrict__ mtx_cols,
+    const IndexType* __restrict__ factor_row_ptrs,
+    const IndexType* __restrict__ factor_cols,
+    const IndexType* __restrict__ storage_offsets,
+    const int32* __restrict__ storage, const int64* __restrict__ row_descs,
+    IndexType* __restrict__ diag_idxs, ValueType* __restrict__ factor_vals,
+    IndexType* __restrict__ out_row_nnz, syncfree_storage dep_storage,
+    size_type num_rows)
+{
+    using scheduler_t =
+        syncfree_scheduler<default_block_size, config::warp_size, IndexType>;
+    __shared__ typename scheduler_t::shared_storage sh_dep_storage;
+    scheduler_t scheduler(dep_storage, sh_dep_storage);
+    const auto row = scheduler.get_work_id();
+    if (row >= num_rows) {
+        return;
+    }
+    const auto warp =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    const auto lane = warp.thread_rank();
+    const auto factor_begin = factor_row_ptrs[row];
+    const auto factor_end = factor_row_ptrs[row + 1];
+    const auto mtx_begin = mtx_row_ptrs[row];
+    const auto mtx_end = mtx_row_ptrs[row + 1];
+    gko::matrix::csr::device_sparsity_lookup<IndexType> lookup{
+        factor_row_ptrs, factor_cols, storage_offsets,
+        storage,         row_descs,   static_cast<size_type>(row)};
+    const auto row_diag = lookup.lookup_unsafe(row) + factor_begin;
+    // fill with zeros first
+    for (auto nz = factor_begin + lane; nz < factor_end;
+         nz += config::warp_size) {
+        factor_vals[nz] = zero<float>();
+    }
+    warp.sync();
+    // then fill in the system matrix
+    for (auto nz = mtx_begin + lane; nz < mtx_end; nz += config::warp_size) {
+        const auto col = mtx_cols[nz];
+        factor_vals[lookup.lookup_unsafe(col) + factor_begin] = one<float>();
+    }
+    // finally set diagonal and store diagonal index
+    if (lane == 0) {
+        diag_idxs[row] = row_diag;
+        factor_vals[row_diag] = one<float>();
+    }
+    warp.sync();
+    // for each lower triangular entry: eliminate with corresponding row
+    for (auto lower_nz = factor_begin; lower_nz < row_diag; lower_nz++) {
+        const auto dep = factor_cols[lower_nz];
+        const auto dep_end = factor_row_ptrs[dep + 1];
+        scheduler.wait(dep);
+        // read the diag entry after we are sure it was written.
+        const auto diag_idx = diag_idxs[dep];
+        if (factor_vals[lower_nz] == one<float>()) {
+            // eliminate with upper triangle/entries past the diagonal
+            for (auto upper_nz = diag_idx + 1 + lane; upper_nz < dep_end;
+                 upper_nz += config::warp_size) {
+                const auto upper_col = factor_cols[upper_nz];
+                const auto upper_val = factor_vals[upper_nz];
+                const auto output_pos =
+                    lookup.lookup_unsafe(upper_col) + factor_begin;
+                if (upper_val == one<float>()) {
+                    factor_vals[output_pos] = one<float>();
+                }
+            }
+        }
+    }
+    scheduler.mark_ready();
+    IndexType row_nnz{};
+    for (auto nz = factor_begin + lane; nz < factor_end;
+         nz += config::warp_size) {
+        row_nnz += factor_vals[nz] == one<float>() ? 1 : 0;
+    }
+    row_nnz = reduce(warp, row_nnz, thrust::plus<IndexType>{});
+    if (lane == 0) {
+        out_row_nnz[row] = row_nnz;
+    }
+}
+
+
 }  // namespace kernel
 
 
@@ -177,3 +262,70 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
+
+
+template <typename IndexType>
+void symbolic_factorize_simple(
+    std::shared_ptr<const DefaultExecutor> exec, const IndexType* row_ptrs,
+    const IndexType* col_idxs, const IndexType* lookup_offsets,
+    const int64* lookup_descs, const int32* lookup_storage,
+    matrix::Csr<float, IndexType>* factors, IndexType* out_row_nnz)
+{
+    const auto num_rows = factors->get_size()[0];
+    const auto factor_row_ptrs = factors->get_const_row_ptrs();
+    const auto factor_cols = factors->get_const_col_idxs();
+    const auto factor_vals = factors->get_values();
+    array<IndexType> diag_idx_array{exec, num_rows};
+    array<int> tmp_storage{exec};
+    const auto diag_idxs = diag_idx_array.get_data();
+    if (num_rows > 0) {
+        syncfree_storage dep_storage(exec, tmp_storage, num_rows);
+        const auto num_blocks =
+            ceildiv(num_rows, default_block_size / config::warp_size);
+        kernel::symbolic_factorize_simple<<<num_blocks, default_block_size, 0,
+                                            exec->get_stream()>>>(
+            row_ptrs, col_idxs, factor_row_ptrs, factor_cols, lookup_offsets,
+            lookup_storage, lookup_descs, diag_idxs, factor_vals, out_row_nnz,
+            dep_storage, num_rows);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE);
+
+
+struct first_eq_one_functor {
+    template <typename Pair>
+    __device__ __forceinline__ bool operator()(Pair pair) const
+    {
+        return thrust::get<0>(pair) == one<float>();
+    }
+};
+
+
+struct return_second_functor {
+    template <typename Pair>
+    __device__ __forceinline__ auto operator()(Pair pair) const
+    {
+        return thrust::get<1>(pair);
+    }
+};
+
+
+template <typename IndexType>
+void symbolic_factorize_simple_finalize(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<float, IndexType>* factors, IndexType* out_col_idxs)
+{
+    const auto col_idxs = factors->get_const_col_idxs();
+    const auto vals = factors->get_const_values();
+    const auto input_it =
+        thrust::make_zip_iterator(thrust::make_tuple(vals, col_idxs));
+    const auto output_it = thrust::make_transform_output_iterator(
+        out_col_idxs, return_second_functor{});
+    thrust::copy_if(thrust_policy(exec), input_it,
+                    input_it + factors->get_num_stored_elements(), output_it,
+                    first_eq_one_functor{});
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
+    GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE);
diff --git a/common/cuda_hip/factorization/par_ic_kernels.hpp.inc b/common/cuda_hip/factorization/par_ic_kernels.hpp.inc
index 9a4d605c6a3..7a3b3da8e32 100644
--- a/common/cuda_hip/factorization/par_ic_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/par_ic_kernels.hpp.inc
@@ -78,16 +78,18 @@ __global__ __launch_bounds__(default_block_size) void ic_sweep(
         auto l_col = l_col_idxs[l_row_begin];
         auto lh_row = l_col_idxs[lh_col_begin];
         if (l_col == lh_row && l_col < last_entry) {
-            sum += l_vals[l_row_begin] * conj(l_vals[lh_col_begin]);
+            sum += load_relaxed(l_vals + l_row_begin) *
+                   conj(load_relaxed(l_vals + lh_col_begin));
         }
         l_row_begin += l_col <= lh_row;
         lh_col_begin += l_col >= lh_row;
     }
-    auto to_write = row == col
-                        ? sqrt(a_val - sum)
-                        : (a_val - sum) / l_vals[l_row_ptrs[col + 1] - 1];
+    auto to_write =
+        row == col
+            ? sqrt(a_val - sum)
+            : (a_val - sum) / load_relaxed(l_vals + (l_row_ptrs[col + 1] - 1));
     if (is_finite(to_write)) {
-        l_vals[l_nz] = to_write;
+        store_relaxed(l_vals + l_nz, to_write);
     }
 }
 
diff --git a/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc b/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc
index 7eccbda61d2..d54fe3c6c77 100644
--- a/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/par_ict_sweep_kernels.hpp.inc
@@ -75,8 +75,8 @@ __global__ __launch_bounds__(default_block_size) void ict_sweep(
             // we don't need to use the `bool valid` because last_entry is
             // already a smaller sentinel value than the one used in group_merge
             if (l_col == lh_row && l_col < last_entry) {
-                sum += l_vals[l_idx + l_row_begin] *
-                       conj(l_vals[lh_idx + lh_col_begin]);
+                sum += load_relaxed(l_vals + (l_idx + l_row_begin)) *
+                       conj(load_relaxed(l_vals + (lh_idx + lh_col_begin)));
             }
             // remember the transposed element
             auto found_transp = subwarp.ballot(lh_row == row);
@@ -90,11 +90,12 @@ __global__ __launch_bounds__(default_block_size) void ict_sweep(
     sum = reduce(subwarp, sum, [](ValueType a, ValueType b) { return a + b; });
 
     if (subwarp.thread_rank() == 0) {
-        auto to_write = row == col
-                            ? sqrt(a_val - sum)
-                            : (a_val - sum) / l_vals[l_row_ptrs[col + 1] - 1];
+        auto to_write =
+            row == col ? sqrt(a_val - sum)
+                       : (a_val - sum) /
+                             load_relaxed(l_vals + (l_row_ptrs[col + 1] - 1));
         if (is_finite(to_write)) {
-            l_vals[l_nz] = to_write;
+            store_relaxed(l_vals + l_nz, to_write);
         }
     }
 }
diff --git a/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc
index 08bd5bf8b4e..6785c161674 100644
--- a/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/par_ilu_kernels.hpp.inc
@@ -57,7 +57,8 @@ __global__ __launch_bounds__(default_block_size) void compute_l_u_factors(
             const auto u_col = u_col_idxs[u_idx];
             last_operation = zero<ValueType>();
             if (l_col == u_col) {
-                last_operation = l_values[l_idx] * u_values[u_idx];
+                last_operation = load_relaxed(l_values + l_idx) *
+                                 load_relaxed(u_values + u_idx);
                 sum -= last_operation;
             }
             l_idx += (l_col <= u_col);
@@ -65,14 +66,15 @@ __global__ __launch_bounds__(default_block_size) void compute_l_u_factors(
         }
         sum += last_operation;  // undo the last operation
         if (row > col) {
-            auto to_write = sum / u_values[u_row_ptrs[col + 1] - 1];
+            auto to_write =
+                sum / load_relaxed(u_values + (u_row_ptrs[col + 1] - 1));
             if (is_finite(to_write)) {
-                l_values[l_idx - 1] = to_write;
+                store_relaxed(l_values + (l_idx - 1), to_write);
             }
         } else {
             auto to_write = sum;
             if (is_finite(to_write)) {
-                u_values[u_idx - 1] = to_write;
+                store_relaxed(u_values + (u_idx - 1), to_write);
             }
         }
     }
diff --git a/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc b/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc
index e99888b35b3..d3cc4330c39 100644
--- a/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc
+++ b/common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc
@@ -87,8 +87,8 @@ __global__ __launch_bounds__(default_block_size) void sweep(
             // we don't need to use the `bool valid` because last_entry is
             // already a smaller sentinel value than the one used in group_merge
             if (l_col == ut_row && l_col < last_entry) {
-                sum += l_vals[l_idx + l_row_begin] *
-                       ut_vals[ut_idx + ut_col_begin];
+                sum += load_relaxed(l_vals + (l_idx + l_row_begin)) *
+                       load_relaxed(ut_vals + (ut_idx + ut_col_begin));
             }
             // remember the transposed element
             auto found_transp = subwarp.ballot(ut_row == row);
@@ -103,15 +103,16 @@ __global__ __launch_bounds__(default_block_size) void sweep(
 
     if (subwarp.thread_rank() == 0) {
         if (lower) {
-            auto to_write = (a_val - sum) / ut_vals[ut_col_ptrs[col + 1] - 1];
+            auto to_write = (a_val - sum) /
+                            load_relaxed(ut_vals + (ut_col_ptrs[col + 1] - 1));
             if (is_finite(to_write)) {
-                l_vals[l_nz] = to_write;
+                store_relaxed(l_vals + l_nz, to_write);
             }
         } else {
             auto to_write = a_val - sum;
             if (is_finite(to_write)) {
-                u_vals[u_nz] = to_write;
-                ut_vals[ut_nz] = to_write;
+                store_relaxed(u_vals + u_nz, to_write);
+                store_relaxed(ut_vals + ut_nz, to_write);
             }
         }
     }
diff --git a/common/cuda_hip/log/batch_logger.hpp.inc b/common/cuda_hip/log/batch_logger.hpp.inc
new file mode 100644
index 00000000000..e8cf77960ef
--- /dev/null
+++ b/common/cuda_hip/log/batch_logger.hpp.inc
@@ -0,0 +1,56 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+/**
+ * @see reference/log/batch_logger.hpp
+ */
+template <typename RealType>
+class SimpleFinalLogger final {
+public:
+    using real_type = RealType;
+
+    SimpleFinalLogger(real_type* const batch_residuals, int* const batch_iters)
+        : final_residuals_{batch_residuals}, final_iters_{batch_iters}
+    {}
+
+    __device__ __forceinline__ void log_iteration(const size_type batch_idx,
+                                                  const int iter,
+                                                  const real_type res_norm)
+    {
+        final_iters_[batch_idx] = iter;
+        final_residuals_[batch_idx] = res_norm;
+    }
+
+private:
+    real_type* const final_residuals_;
+    int* const final_iters_;
+};
diff --git a/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc
new file mode 100644
index 00000000000..23ae8ebd5f0
--- /dev/null
+++ b/common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc
@@ -0,0 +1,78 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+template <typename ValueType>
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const batch::matrix::Dense<ValueType>* mat,
+                  const batch::MultiVector<ValueType>* b,
+                  batch::MultiVector<ValueType>* x)
+{
+    const auto num_blocks = mat->get_num_batch_items();
+    const auto b_ub = get_batch_struct(b);
+    const auto x_ub = get_batch_struct(x);
+    const auto mat_ub = get_batch_struct(mat);
+    if (b->get_common_size()[1] > 1) {
+        GKO_NOT_IMPLEMENTED;
+    }
+    simple_apply_kernel<<<num_blocks, default_block_size, 0,
+                          exec->get_stream()>>>(mat_ub, b_ub, x_ub);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL);
+
+
+template <typename ValueType>
+void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
+                    const batch::MultiVector<ValueType>* alpha,
+                    const batch::matrix::Dense<ValueType>* mat,
+                    const batch::MultiVector<ValueType>* b,
+                    const batch::MultiVector<ValueType>* beta,
+                    batch::MultiVector<ValueType>* x)
+{
+    const auto num_blocks = mat->get_num_batch_items();
+    const auto b_ub = get_batch_struct(b);
+    const auto x_ub = get_batch_struct(x);
+    const auto mat_ub = get_batch_struct(mat);
+    const auto alpha_ub = get_batch_struct(alpha);
+    const auto beta_ub = get_batch_struct(beta);
+    if (b->get_common_size()[1] > 1) {
+        GKO_NOT_IMPLEMENTED;
+    }
+    advanced_apply_kernel<<<num_blocks, default_block_size, 0,
+                            exec->get_stream()>>>(alpha_ub, mat_ub, b_ub,
+                                                  beta_ub, x_ub);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
diff --git a/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc
new file mode 100644
index 00000000000..7a38cfea215
--- /dev/null
+++ b/common/cuda_hip/matrix/batch_dense_kernels.hpp.inc
@@ -0,0 +1,164 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+template <typename ValueType>
+__device__ __forceinline__ void simple_apply(
+    const gko::batch::matrix::dense::batch_item<const ValueType>& mat,
+    const ValueType* const __restrict__ b, ValueType* const __restrict__ x)
+{
+    constexpr auto tile_size = config::warp_size;
+
+    auto thread_block = group::this_thread_block();
+    auto subgroup = group::tiled_partition<tile_size>(thread_block);
+    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
+    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
+
+    for (int row = subgroup_id; row < mat.num_rows;
+         row += num_subgroups_per_block) {
+        ValueType temp = zero<ValueType>();
+        for (int j = subgroup.thread_rank(); j < mat.num_cols;
+             j += subgroup.size()) {
+            const ValueType val = mat.values[row * mat.stride + j];
+            temp += val * b[j];
+        }
+
+        // subgroup level reduction
+        temp = reduce(subgroup, temp, thrust::plus<ValueType>{});
+
+        if (subgroup.thread_rank() == 0) {
+            x[row] = temp;
+        }
+    }
+}
+
+template <typename ValueType>
+__global__ __launch_bounds__(
+    default_block_size,
+    sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix::
+                                                      dense::uniform_batch<
+                                                          const ValueType>
+                                                          mat,
+                                                  const gko::batch::
+                                                      multi_vector::
+                                                          uniform_batch<
+                                                              const ValueType>
+                                                              b,
+                                                  const gko::batch::
+                                                      multi_vector::
+                                                          uniform_batch<
+                                                              ValueType>
+                                                              x)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto mat_b =
+            gko::batch::matrix::extract_batch_item(mat, batch_id);
+        const auto b_b = gko::batch::extract_batch_item(b, batch_id);
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        simple_apply(mat_b, b_b.values, x_b.values);
+    }
+}
+
+
+template <typename ValueType>
+__device__ __forceinline__ void advanced_apply(
+    const ValueType alpha,
+    const gko::batch::matrix::dense::batch_item<const ValueType>& mat,
+    const ValueType* const __restrict__ b, const ValueType beta,
+    ValueType* const __restrict__ x)
+{
+    constexpr auto tile_size = config::warp_size;
+
+    auto thread_block = group::this_thread_block();
+    auto subgroup = group::tiled_partition<tile_size>(thread_block);
+    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
+    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
+
+    for (int row = subgroup_id; row < mat.num_rows;
+         row += num_subgroups_per_block) {
+        ValueType temp = zero<ValueType>();
+        for (int j = subgroup.thread_rank(); j < mat.num_cols;
+             j += subgroup.size()) {
+            const ValueType val = mat.values[row * mat.stride + j];
+            temp += alpha * val * b[j];
+        }
+
+        // subgroup level reduction
+        temp = reduce(subgroup, temp, thrust::plus<ValueType>{});
+
+        if (subgroup.thread_rank() == 0) {
+            x[row] = temp + beta * x[row];
+        }
+    }
+}
+
+template <typename ValueType>
+__global__ __launch_bounds__(
+    default_block_size,
+    sm_oversubscription) void advanced_apply_kernel(const gko::batch::
+                                                        multi_vector::
+                                                            uniform_batch<
+                                                                const ValueType>
+                                                                alpha,
+                                                    const gko::batch::matrix::
+                                                        dense::uniform_batch<
+                                                            const ValueType>
+                                                            mat,
+                                                    const gko::batch::
+                                                        multi_vector::
+                                                            uniform_batch<
+                                                                const ValueType>
+                                                                b,
+                                                    const gko::batch::
+                                                        multi_vector::
+                                                            uniform_batch<
+                                                                const ValueType>
+                                                                beta,
+                                                    const gko::batch::
+                                                        multi_vector::
+                                                            uniform_batch<
+                                                                ValueType>
+                                                                x)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto mat_b =
+            gko::batch::matrix::extract_batch_item(mat, batch_id);
+        const auto b_b = gko::batch::extract_batch_item(b, batch_id);
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
+        const auto beta_b = gko::batch::extract_batch_item(beta, batch_id);
+        advanced_apply(alpha_b.values[0], mat_b, b_b.values, beta_b.values[0],
+                       x_b.values);
+    }
+}
diff --git a/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc
new file mode 100644
index 00000000000..f8da432aa4d
--- /dev/null
+++ b/common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc
@@ -0,0 +1,78 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+template <typename ValueType, typename IndexType>
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const batch::matrix::Ell<ValueType, IndexType>* mat,
+                  const batch::MultiVector<ValueType>* b,
+                  batch::MultiVector<ValueType>* x)
+{
+    const auto num_blocks = mat->get_num_batch_items();
+    const auto b_ub = get_batch_struct(b);
+    const auto x_ub = get_batch_struct(x);
+    const auto mat_ub = get_batch_struct(mat);
+    if (b->get_common_size()[1] > 1) {
+        GKO_NOT_IMPLEMENTED;
+    }
+    simple_apply_kernel<<<num_blocks, default_block_size, 0,
+                          exec->get_stream()>>>(mat_ub, b_ub, x_ub);
+}
+
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+    GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
+                    const batch::MultiVector<ValueType>* alpha,
+                    const batch::matrix::Ell<ValueType, IndexType>* mat,
+                    const batch::MultiVector<ValueType>* b,
+                    const batch::MultiVector<ValueType>* beta,
+                    batch::MultiVector<ValueType>* x)
+{
+    const auto num_blocks = mat->get_num_batch_items();
+    const auto b_ub = get_batch_struct(b);
+    const auto x_ub = get_batch_struct(x);
+    const auto mat_ub = get_batch_struct(mat);
+    const auto alpha_ub = get_batch_struct(alpha);
+    const auto beta_ub = get_batch_struct(beta);
+    if (b->get_common_size()[1] > 1) {
+        GKO_NOT_IMPLEMENTED;
+    }
+    advanced_apply_kernel<<<num_blocks, default_block_size, 0,
+                            exec->get_stream()>>>(alpha_ub, mat_ub, b_ub,
+                                                  beta_ub, x_ub);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+    GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
diff --git a/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
new file mode 100644
index 00000000000..de6ca879890
--- /dev/null
+++ b/common/cuda_hip/matrix/batch_ell_kernels.hpp.inc
@@ -0,0 +1,156 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+template <typename ValueType, typename IndexType>
+__device__ __forceinline__ void simple_apply(
+    const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& mat,
+    const ValueType* const __restrict__ b, ValueType* const __restrict__ x)
+{
+    const auto num_rows = mat.num_rows;
+    const auto num_stored_elements_per_row = mat.num_stored_elems_per_row;
+    const auto stride = mat.stride;
+    const auto val = mat.values;
+    const auto col = mat.col_idxs;
+    for (int tidx = threadIdx.x; tidx < num_rows; tidx += blockDim.x) {
+        auto temp = zero<ValueType>();
+        for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
+            const auto ind = tidx + idx * stride;
+            const auto col_idx = col[ind];
+            if (col_idx == invalid_index<IndexType>()) {
+                break;
+            } else {
+                temp += val[ind] * b[col_idx];
+            }
+        }
+        x[tidx] = temp;
+    }
+}
+
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(
+    default_block_size,
+    sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix::
+                                                      ell::uniform_batch<
+                                                          const ValueType,
+                                                          IndexType>
+                                                          mat,
+                                                  const gko::batch::
+                                                      multi_vector::
+                                                          uniform_batch<
+                                                              const ValueType>
+                                                              b,
+                                                  const gko::batch::
+                                                      multi_vector::
+                                                          uniform_batch<
+                                                              ValueType>
+                                                              x)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto mat_b =
+            gko::batch::matrix::extract_batch_item(mat, batch_id);
+        const auto b_b = gko::batch::extract_batch_item(b, batch_id);
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        simple_apply(mat_b, b_b.values, x_b.values);
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+__device__ __forceinline__ void advanced_apply(
+    const ValueType alpha,
+    const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& mat,
+    const ValueType* const __restrict__ b, const ValueType beta,
+    ValueType* const __restrict__ x)
+{
+    const auto num_rows = mat.num_rows;
+    const auto num_stored_elements_per_row = mat.num_stored_elems_per_row;
+    const auto stride = mat.stride;
+    const auto val = mat.values;
+    const auto col = mat.col_idxs;
+    for (int tidx = threadIdx.x; tidx < num_rows; tidx += blockDim.x) {
+        auto temp = zero<ValueType>();
+        for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
+            const auto ind = tidx + idx * stride;
+            const auto col_idx = col[ind];
+            if (col_idx == invalid_index<IndexType>()) {
+                break;
+            } else {
+                temp += alpha * val[ind] * b[col_idx];
+            }
+        }
+        x[tidx] = temp + beta * x[tidx];
+    }
+}
+
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(
+    default_block_size,
+    sm_oversubscription) void advanced_apply_kernel(const gko::batch::
+                                                        multi_vector::
+                                                            uniform_batch<
+                                                                const ValueType>
+                                                                alpha,
+                                                    const gko::batch::matrix::
+                                                        ell::uniform_batch<
+                                                            const ValueType,
+                                                            IndexType>
+                                                            mat,
+                                                    const gko::batch::
+                                                        multi_vector::
+                                                            uniform_batch<
+                                                                const ValueType>
+                                                                b,
+                                                    const gko::batch::
+                                                        multi_vector::
+                                                            uniform_batch<
+                                                                const ValueType>
+                                                                beta,
+                                                    const gko::batch::
+                                                        multi_vector::
+                                                            uniform_batch<
+                                                                ValueType>
+                                                                x)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto mat_b =
+            gko::batch::matrix::extract_batch_item(mat, batch_id);
+        const auto b_b = gko::batch::extract_batch_item(b, batch_id);
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
+        const auto beta_b = gko::batch::extract_batch_item(beta, batch_id);
+        advanced_apply(alpha_b.values[0], mat_b, b_b.values, beta_b.values[0],
+                       x_b.values);
+    }
+}
diff --git a/common/cuda_hip/matrix/csr_common.hpp.inc b/common/cuda_hip/matrix/csr_common.hpp.inc
index 0fce02aecfa..35718464c42 100644
--- a/common/cuda_hip/matrix/csr_common.hpp.inc
+++ b/common/cuda_hip/matrix/csr_common.hpp.inc
@@ -102,7 +102,6 @@ __global__ __launch_bounds__(default_block_size) void check_diagonal_entries(
             if (tile_grp.thread_rank() == 0) {
                 *has_all_diags = false;
             }
-            return;
         }
     }
 }
diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc
index e73dfde00fb..9687678dc58 100644
--- a/common/cuda_hip/matrix/csr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc
@@ -606,7 +606,7 @@ __global__ __launch_bounds__(default_block_size) void spgeam(
             }
             // advance by the number of merged elements
             // in theory, we would need to mask by `valid`, but this
-            // would only be false somwhere in the last iteration, where
+            // would only be false somewhere in the last iteration, where
             // we don't need the value of c_begin afterwards, anyways.
             c_begin += popcnt(~prev_equal_mask & lanemask_full);
             return true;
@@ -665,8 +665,8 @@ __global__ __launch_bounds__(default_block_size) void row_ptr_permute(
     if (tid >= num_rows) {
         return;
     }
-    auto in_row = permutation[tid];
-    auto out_row = tid;
+    const auto in_row = permutation[tid];
+    const auto out_row = tid;
     out_nnz[out_row] = in_row_ptrs[in_row + 1] - in_row_ptrs[in_row];
 }
 
@@ -680,8 +680,8 @@ __global__ __launch_bounds__(default_block_size) void inv_row_ptr_permute(
     if (tid >= num_rows) {
         return;
     }
-    auto in_row = tid;
-    auto out_row = permutation[tid];
+    const auto in_row = tid;
+    const auto out_row = permutation[tid];
     out_nnz[out_row] = in_row_ptrs[in_row + 1] - in_row_ptrs[in_row];
 }
 
@@ -699,12 +699,12 @@ __global__ __launch_bounds__(default_block_size) void row_permute(
     if (tid >= num_rows) {
         return;
     }
-    auto lane = threadIdx.x % subwarp_size;
-    auto in_row = permutation[tid];
-    auto out_row = tid;
-    auto in_begin = in_row_ptrs[in_row];
-    auto in_size = in_row_ptrs[in_row + 1] - in_begin;
-    auto out_begin = out_row_ptrs[out_row];
+    const auto lane = threadIdx.x % subwarp_size;
+    const auto in_row = permutation[tid];
+    const auto out_row = tid;
+    const auto in_begin = in_row_ptrs[in_row];
+    const auto in_size = in_row_ptrs[in_row + 1] - in_begin;
+    const auto out_begin = out_row_ptrs[out_row];
     for (IndexType i = lane; i < in_size; i += subwarp_size) {
         out_cols[out_begin + i] = in_cols[in_begin + i];
         out_vals[out_begin + i] = in_vals[in_begin + i];
@@ -725,12 +725,12 @@ __global__ __launch_bounds__(default_block_size) void inv_row_permute(
     if (tid >= num_rows) {
         return;
     }
-    auto lane = threadIdx.x % subwarp_size;
-    auto in_row = tid;
-    auto out_row = permutation[tid];
-    auto in_begin = in_row_ptrs[in_row];
-    auto in_size = in_row_ptrs[in_row + 1] - in_begin;
-    auto out_begin = out_row_ptrs[out_row];
+    const auto lane = threadIdx.x % subwarp_size;
+    const auto in_row = tid;
+    const auto out_row = permutation[tid];
+    const auto in_begin = in_row_ptrs[in_row];
+    const auto in_size = in_row_ptrs[in_row + 1] - in_begin;
+    const auto out_begin = out_row_ptrs[out_row];
     for (IndexType i = lane; i < in_size; i += subwarp_size) {
         out_cols[out_begin + i] = in_cols[in_begin + i];
         out_vals[out_begin + i] = in_vals[in_begin + i];
@@ -751,12 +751,12 @@ __global__ __launch_bounds__(default_block_size) void inv_symm_permute(
     if (tid >= num_rows) {
         return;
     }
-    auto lane = threadIdx.x % subwarp_size;
-    auto in_row = tid;
-    auto out_row = permutation[tid];
-    auto in_begin = in_row_ptrs[in_row];
-    auto in_size = in_row_ptrs[in_row + 1] - in_begin;
-    auto out_begin = out_row_ptrs[out_row];
+    const auto lane = threadIdx.x % subwarp_size;
+    const auto in_row = tid;
+    const auto out_row = permutation[tid];
+    const auto in_begin = in_row_ptrs[in_row];
+    const auto in_size = in_row_ptrs[in_row + 1] - in_begin;
+    const auto out_begin = out_row_ptrs[out_row];
     for (IndexType i = lane; i < in_size; i += subwarp_size) {
         out_cols[out_begin + i] = permutation[in_cols[in_begin + i]];
         out_vals[out_begin + i] = in_vals[in_begin + i];
@@ -764,6 +764,147 @@ __global__ __launch_bounds__(default_block_size) void inv_symm_permute(
 }
 
 
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void inv_nonsymm_permute(
+    size_type num_rows, const IndexType* __restrict__ row_permutation,
+    const IndexType* __restrict__ col_permutation,
+    const IndexType* __restrict__ in_row_ptrs,
+    const IndexType* __restrict__ in_cols,
+    const ValueType* __restrict__ in_vals,
+    const IndexType* __restrict__ out_row_ptrs,
+    IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals)
+{
+    auto tid = thread::get_subwarp_id_flat<subwarp_size>();
+    if (tid >= num_rows) {
+        return;
+    }
+    const auto lane = threadIdx.x % subwarp_size;
+    const auto in_row = tid;
+    const auto out_row = row_permutation[tid];
+    const auto in_begin = in_row_ptrs[in_row];
+    const auto in_size = in_row_ptrs[in_row + 1] - in_begin;
+    const auto out_begin = out_row_ptrs[out_row];
+    for (IndexType i = lane; i < in_size; i += subwarp_size) {
+        out_cols[out_begin + i] = col_permutation[in_cols[in_begin + i]];
+        out_vals[out_begin + i] = in_vals[in_begin + i];
+    }
+}
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void row_scale_permute(
+    size_type num_rows, const ValueType* __restrict__ scale,
+    const IndexType* __restrict__ permutation,
+    const IndexType* __restrict__ in_row_ptrs,
+    const IndexType* __restrict__ in_cols,
+    const ValueType* __restrict__ in_vals,
+    const IndexType* __restrict__ out_row_ptrs,
+    IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals)
+{
+    auto tid = thread::get_subwarp_id_flat<subwarp_size>();
+    if (tid >= num_rows) {
+        return;
+    }
+    const auto lane = threadIdx.x % subwarp_size;
+    const auto in_row = permutation[tid];
+    const auto out_row = tid;
+    const auto in_begin = in_row_ptrs[in_row];
+    const auto in_size = in_row_ptrs[in_row + 1] - in_begin;
+    const auto out_begin = out_row_ptrs[out_row];
+    for (IndexType i = lane; i < in_size; i += subwarp_size) {
+        out_cols[out_begin + i] = in_cols[in_begin + i];
+        out_vals[out_begin + i] = in_vals[in_begin + i] * scale[in_row];
+    }
+}
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void inv_row_scale_permute(
+    size_type num_rows, const ValueType* __restrict__ scale,
+    const IndexType* __restrict__ permutation,
+    const IndexType* __restrict__ in_row_ptrs,
+    const IndexType* __restrict__ in_cols,
+    const ValueType* __restrict__ in_vals,
+    const IndexType* __restrict__ out_row_ptrs,
+    IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals)
+{
+    auto tid = thread::get_subwarp_id_flat<subwarp_size>();
+    if (tid >= num_rows) {
+        return;
+    }
+    const auto lane = threadIdx.x % subwarp_size;
+    const auto in_row = tid;
+    const auto out_row = permutation[tid];
+    const auto in_begin = in_row_ptrs[in_row];
+    const auto in_size = in_row_ptrs[in_row + 1] - in_begin;
+    const auto out_begin = out_row_ptrs[out_row];
+    for (IndexType i = lane; i < in_size; i += subwarp_size) {
+        out_cols[out_begin + i] = in_cols[in_begin + i];
+        out_vals[out_begin + i] = in_vals[in_begin + i] / scale[out_row];
+    }
+}
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void inv_symm_scale_permute(
+    size_type num_rows, const ValueType* __restrict__ scale,
+    const IndexType* __restrict__ permutation,
+    const IndexType* __restrict__ in_row_ptrs,
+    const IndexType* __restrict__ in_cols,
+    const ValueType* __restrict__ in_vals,
+    const IndexType* __restrict__ out_row_ptrs,
+    IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals)
+{
+    auto tid = thread::get_subwarp_id_flat<subwarp_size>();
+    if (tid >= num_rows) {
+        return;
+    }
+    const auto lane = threadIdx.x % subwarp_size;
+    const auto in_row = tid;
+    const auto out_row = permutation[tid];
+    const auto in_begin = in_row_ptrs[in_row];
+    const auto in_size = in_row_ptrs[in_row + 1] - in_begin;
+    const auto out_begin = out_row_ptrs[out_row];
+    for (IndexType i = lane; i < in_size; i += subwarp_size) {
+        const auto out_col = permutation[in_cols[in_begin + i]];
+        out_cols[out_begin + i] = out_col;
+        out_vals[out_begin + i] =
+            in_vals[in_begin + i] / (scale[out_row] * scale[out_col]);
+    }
+}
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void inv_nonsymm_scale_permute(
+    size_type num_rows, const ValueType* __restrict__ row_scale,
+    const IndexType* __restrict__ row_permutation,
+    const ValueType* __restrict__ col_scale,
+    const IndexType* __restrict__ col_permutation,
+    const IndexType* __restrict__ in_row_ptrs,
+    const IndexType* __restrict__ in_cols,
+    const ValueType* __restrict__ in_vals,
+    const IndexType* __restrict__ out_row_ptrs,
+    IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals)
+{
+    auto tid = thread::get_subwarp_id_flat<subwarp_size>();
+    if (tid >= num_rows) {
+        return;
+    }
+    const auto lane = threadIdx.x % subwarp_size;
+    const auto in_row = tid;
+    const auto out_row = row_permutation[tid];
+    const auto in_begin = in_row_ptrs[in_row];
+    const auto in_size = in_row_ptrs[in_row + 1] - in_begin;
+    const auto out_begin = out_row_ptrs[out_row];
+    for (IndexType i = lane; i < in_size; i += subwarp_size) {
+        const auto out_col = col_permutation[in_cols[in_begin + i]];
+        out_cols[out_begin + i] = out_col;
+        out_vals[out_begin + i] =
+            in_vals[in_begin + i] / (row_scale[out_row] * col_scale[out_col]);
+    }
+}
+
+
 template <typename ValueType, typename IndexType>
 __global__
 __launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals(
@@ -826,15 +967,19 @@ __global__ __launch_bounds__(default_block_size) void add_scaled_identity(
     auto tile_grp =
         group::tiled_partition<warp_size>(group::this_thread_block());
     const auto warpid = thread::get_subwarp_id_flat<warp_size, IndexType>();
-    const auto num_warps = thread::get_subwarp_num_flat<warp_size, IndexType>();
     if (warpid < num_rows) {
         const auto tid_in_warp = tile_grp.thread_rank();
         const IndexType row_start = row_ptrs[warpid];
         const IndexType num_nz = row_ptrs[warpid + 1] - row_start;
+        const auto beta_val = beta[0];
+        const auto alpha_val = alpha[0];
         for (IndexType iz = tid_in_warp; iz < num_nz; iz += warp_size) {
-            values[iz + row_start] *= beta[0];
-            if (col_idxs[iz + row_start] == warpid) {
-                values[iz + row_start] += alpha[0];
+            if (beta_val != one<ValueType>()) {
+                values[iz + row_start] *= beta_val;
+            }
+            if (col_idxs[iz + row_start] == warpid &&
+                alpha_val != zero<ValueType>()) {
+                values[iz + row_start] += alpha_val;
             }
         }
     }
@@ -872,11 +1017,7 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
     }
     auto in_rows = in_row_idxs.get_data();
     auto in_cols = in_col_idxs.get_data();
-    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
-    // due to overly generic assignment operator and constructor leading to
-    // ambiguities. So we need to use our own fake_complex type
-    auto in_vals =
-        reinterpret_cast<device_member_type<ValueType>*>(in_values.get_data());
+    auto in_vals = as_device_type(in_values.get_data());
     auto in_loc_it =
         thrust::make_zip_iterator(thrust::make_tuple(in_rows, in_cols));
     thrust::sort_by_key(thrust_policy(exec), in_loc_it, in_loc_it + nnz,
@@ -924,22 +1065,19 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,
     // fill in values
     components::fill_array(exec, block_value_array.get_data(),
                            num_blocks * bs * bs, zero<ValueType>());
-    thrust::for_each_n(
-        thrust_policy(exec), iota, num_blocks,
-        [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols, in_vals,
-         values] __device__(size_type i) {
-            const auto block_begin = block_ptrs[i];
-            const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz;
-            for (auto nz = block_begin; nz < block_end; nz++) {
-                values[i * bs * bs + (in_cols[nz] % bs) * bs +
-                       (in_rows[nz] % bs)] = fake_complex_unpack(in_vals[nz]);
-            }
-        });
+    thrust::for_each_n(thrust_policy(exec), iota, num_blocks,
+                       [block_ptrs, nnz, num_blocks, bs, in_rows, in_cols,
+                        in_vals, values] __device__(size_type i) {
+                           const auto block_begin = block_ptrs[i];
+                           const auto block_end =
+                               i < num_blocks - 1 ? block_ptrs[i + 1] : nnz;
+                           for (auto nz = block_begin; nz < block_end; nz++) {
+                               values[i * bs * bs + (in_cols[nz] % bs) * bs +
+                                      (in_rows[nz] % bs)] = in_vals[nz];
+                           }
+                       });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
-
 
 namespace kernel {
 
@@ -1122,7 +1260,407 @@ void build_lookup(std::shared_ptr<const DefaultExecutor> exec,
             storage);
 }
 
-GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL);
+
+namespace {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void spgeam(syn::value_list<int, subwarp_size>,
+            std::shared_ptr<const DefaultExecutor> exec, const ValueType* alpha,
+            const IndexType* a_row_ptrs, const IndexType* a_col_idxs,
+            const ValueType* a_vals, const ValueType* beta,
+            const IndexType* b_row_ptrs, const IndexType* b_col_idxs,
+            const ValueType* b_vals, matrix::Csr<ValueType, IndexType>* c)
+{
+    auto m = static_cast<IndexType>(c->get_size()[0]);
+    auto c_row_ptrs = c->get_row_ptrs();
+    // count nnz for alpha * A + beta * B
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(m, subwarps_per_block);
+    if (num_blocks > 0) {
+        kernel::spgeam_nnz<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs);
+    }
+
+    // build row pointers
+    components::prefix_sum_nonnegative(exec, c_row_ptrs, m + 1);
+
+    // accumulate non-zeros for alpha * A + beta * B
+    matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
+    auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m);
+    c_builder.get_col_idx_array().resize_and_reset(c_nnz);
+    c_builder.get_value_array().resize_and_reset(c_nnz);
+    auto c_col_idxs = c->get_col_idxs();
+    auto c_vals = c->get_values();
+    if (num_blocks > 0) {
+        kernel::spgeam<subwarp_size>
+            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                as_device_type(alpha), a_row_ptrs, a_col_idxs,
+                as_device_type(a_vals), as_device_type(beta), b_row_ptrs,
+                b_col_idxs, as_device_type(b_vals), m, c_row_ptrs, c_col_idxs,
+                as_device_type(c_vals));
+    }
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam);
+
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void spgeam(std::shared_ptr<const DefaultExecutor> exec,
+            const matrix::Dense<ValueType>* alpha,
+            const matrix::Csr<ValueType, IndexType>* a,
+            const matrix::Dense<ValueType>* beta,
+            const matrix::Csr<ValueType, IndexType>* b,
+            matrix::Csr<ValueType, IndexType>* c)
+{
+    auto total_nnz =
+        a->get_num_stored_elements() + b->get_num_stored_elements();
+    auto nnz_per_row = total_nnz / a->get_size()[0];
+    select_spgeam(
+        spgeam_kernels(),
+        [&](int compiled_subwarp_size) {
+            return compiled_subwarp_size >= nnz_per_row ||
+                   compiled_subwarp_size == config::warp_size;
+        },
+        syn::value_list<int>(), syn::type_list<>(), exec,
+        alpha->get_const_values(), a->get_const_row_ptrs(),
+        a->get_const_col_idxs(), a->get_const_values(),
+        beta->get_const_values(), b->get_const_row_ptrs(),
+        b->get_const_col_idxs(), b->get_const_values(), c);
+}
+
+
+template <typename ValueType, typename IndexType>
+void fill_in_dense(std::shared_ptr<const DefaultExecutor> exec,
+                   const matrix::Csr<ValueType, IndexType>* source,
+                   matrix::Dense<ValueType>* result)
+{
+    const auto num_rows = result->get_size()[0];
+    const auto num_cols = result->get_size()[1];
+    const auto stride = result->get_stride();
+    const auto row_ptrs = source->get_const_row_ptrs();
+    const auto col_idxs = source->get_const_col_idxs();
+    const auto vals = source->get_const_values();
+
+    auto grid_dim = ceildiv(num_rows, default_block_size);
+    if (grid_dim > 0) {
+        kernel::fill_in_dense<<<grid_dim, default_block_size, 0,
+                                exec->get_stream()>>>(
+            num_rows, as_device_type(row_ptrs), as_device_type(col_idxs),
+            as_device_type(vals), stride, as_device_type(result->get_values()));
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void inv_symm_permute(std::shared_ptr<const DefaultExecutor> exec,
+                      const IndexType* perm,
+                      const matrix::Csr<ValueType, IndexType>* orig,
+                      matrix::Csr<ValueType, IndexType>* permuted)
+{
+    auto num_rows = orig->get_size()[0];
+    auto count_num_blocks = ceildiv(num_rows, default_block_size);
+    if (count_num_blocks > 0) {
+        kernel::inv_row_ptr_permute<<<count_num_blocks, default_block_size, 0,
+                                      exec->get_stream()>>>(
+            num_rows, perm, orig->get_const_row_ptrs(),
+            permuted->get_row_ptrs());
+    }
+    components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(),
+                                       num_rows + 1);
+    auto copy_num_blocks =
+        ceildiv(num_rows, default_block_size / config::warp_size);
+    if (copy_num_blocks > 0) {
+        kernel::inv_symm_permute<config::warp_size>
+            <<<copy_num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                num_rows, perm, orig->get_const_row_ptrs(),
+                orig->get_const_col_idxs(),
+                as_device_type(orig->get_const_values()),
+                permuted->get_row_ptrs(), permuted->get_col_idxs(),
+                as_device_type(permuted->get_values()));
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void inv_nonsymm_permute(std::shared_ptr<const DefaultExecutor> exec,
+                         const IndexType* row_perm, const IndexType* col_perm,
+                         const matrix::Csr<ValueType, IndexType>* orig,
+                         matrix::Csr<ValueType, IndexType>* permuted)
+{
+    auto num_rows = orig->get_size()[0];
+    auto count_num_blocks = ceildiv(num_rows, default_block_size);
+    if (count_num_blocks > 0) {
+        kernel::inv_row_ptr_permute<<<count_num_blocks, default_block_size, 0,
+                                      exec->get_stream()>>>(
+            num_rows, row_perm, orig->get_const_row_ptrs(),
+            permuted->get_row_ptrs());
+    }
+    components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(),
+                                       num_rows + 1);
+    auto copy_num_blocks =
+        ceildiv(num_rows, default_block_size / config::warp_size);
+    if (copy_num_blocks > 0) {
+        kernel::inv_nonsymm_permute<config::warp_size>
+            <<<copy_num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                num_rows, row_perm, col_perm, orig->get_const_row_ptrs(),
+                orig->get_const_col_idxs(),
+                as_device_type(orig->get_const_values()),
+                permuted->get_row_ptrs(), permuted->get_col_idxs(),
+                as_device_type(permuted->get_values()));
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void row_permute(std::shared_ptr<const DefaultExecutor> exec,
+                 const IndexType* perm,
+                 const matrix::Csr<ValueType, IndexType>* orig,
+                 matrix::Csr<ValueType, IndexType>* row_permuted)
+{
+    auto num_rows = orig->get_size()[0];
+    auto count_num_blocks = ceildiv(num_rows, default_block_size);
+    if (count_num_blocks > 0) {
+        kernel::row_ptr_permute<<<count_num_blocks, default_block_size, 0,
+                                  exec->get_stream()>>>(
+            num_rows, perm, orig->get_const_row_ptrs(),
+            row_permuted->get_row_ptrs());
+    }
+    components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(),
+                                       num_rows + 1);
+    auto copy_num_blocks =
+        ceildiv(num_rows, default_block_size / config::warp_size);
+    if (copy_num_blocks > 0) {
+        kernel::row_permute<config::warp_size>
+            <<<copy_num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                num_rows, perm, orig->get_const_row_ptrs(),
+                orig->get_const_col_idxs(),
+                as_device_type(orig->get_const_values()),
+                row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(),
+                as_device_type(row_permuted->get_values()));
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void inv_row_permute(std::shared_ptr<const DefaultExecutor> exec,
+                     const IndexType* perm,
+                     const matrix::Csr<ValueType, IndexType>* orig,
+                     matrix::Csr<ValueType, IndexType>* row_permuted)
+{
+    auto num_rows = orig->get_size()[0];
+    auto count_num_blocks = ceildiv(num_rows, default_block_size);
+    if (count_num_blocks > 0) {
+        kernel::inv_row_ptr_permute<<<count_num_blocks, default_block_size, 0,
+                                      exec->get_stream()>>>(
+            num_rows, perm, orig->get_const_row_ptrs(),
+            row_permuted->get_row_ptrs());
+    }
+    components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(),
+                                       num_rows + 1);
+    auto copy_num_blocks =
+        ceildiv(num_rows, default_block_size / config::warp_size);
+    if (copy_num_blocks > 0) {
+        kernel::inv_row_permute<config::warp_size>
+            <<<copy_num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                num_rows, perm, orig->get_const_row_ptrs(),
+                orig->get_const_col_idxs(),
+                as_device_type(orig->get_const_values()),
+                row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(),
+                as_device_type(row_permuted->get_values()));
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void inv_symm_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
+                            const ValueType* scale, const IndexType* perm,
+                            const matrix::Csr<ValueType, IndexType>* orig,
+                            matrix::Csr<ValueType, IndexType>* permuted)
+{
+    auto num_rows = orig->get_size()[0];
+    auto count_num_blocks = ceildiv(num_rows, default_block_size);
+    if (count_num_blocks > 0) {
+        kernel::inv_row_ptr_permute<<<count_num_blocks, default_block_size, 0,
+                                      exec->get_stream()>>>(
+            num_rows, perm, orig->get_const_row_ptrs(),
+            permuted->get_row_ptrs());
+    }
+    components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(),
+                                       num_rows + 1);
+    auto copy_num_blocks =
+        ceildiv(num_rows, default_block_size / config::warp_size);
+    if (copy_num_blocks > 0) {
+        kernel::inv_symm_scale_permute<config::warp_size>
+            <<<copy_num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                num_rows, as_device_type(scale), perm,
+                orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
+                as_device_type(orig->get_const_values()),
+                permuted->get_row_ptrs(), permuted->get_col_idxs(),
+                as_device_type(permuted->get_values()));
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void inv_nonsymm_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
+                               const ValueType* row_scale,
+                               const IndexType* row_perm,
+                               const ValueType* col_scale,
+                               const IndexType* col_perm,
+                               const matrix::Csr<ValueType, IndexType>* orig,
+                               matrix::Csr<ValueType, IndexType>* permuted)
+{
+    auto num_rows = orig->get_size()[0];
+    auto count_num_blocks = ceildiv(num_rows, default_block_size);
+    if (count_num_blocks > 0) {
+        kernel::inv_row_ptr_permute<<<count_num_blocks, default_block_size, 0,
+                                      exec->get_stream()>>>(
+            num_rows, row_perm, orig->get_const_row_ptrs(),
+            permuted->get_row_ptrs());
+    }
+    components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(),
+                                       num_rows + 1);
+    auto copy_num_blocks =
+        ceildiv(num_rows, default_block_size / config::warp_size);
+    if (copy_num_blocks > 0) {
+        kernel::inv_nonsymm_scale_permute<config::warp_size>
+            <<<copy_num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                num_rows, as_device_type(row_scale), row_perm,
+                as_device_type(col_scale), col_perm, orig->get_const_row_ptrs(),
+                orig->get_const_col_idxs(),
+                as_device_type(orig->get_const_values()),
+                permuted->get_row_ptrs(), permuted->get_col_idxs(),
+                as_device_type(permuted->get_values()));
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void row_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
+                       const ValueType* scale, const IndexType* perm,
+                       const matrix::Csr<ValueType, IndexType>* orig,
+                       matrix::Csr<ValueType, IndexType>* row_permuted)
+{
+    auto num_rows = orig->get_size()[0];
+    auto count_num_blocks = ceildiv(num_rows, default_block_size);
+    if (count_num_blocks > 0) {
+        kernel::row_ptr_permute<<<count_num_blocks, default_block_size, 0,
+                                  exec->get_stream()>>>(
+            num_rows, perm, orig->get_const_row_ptrs(),
+            row_permuted->get_row_ptrs());
+    }
+    components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(),
+                                       num_rows + 1);
+    auto copy_num_blocks =
+        ceildiv(num_rows, default_block_size / config::warp_size);
+    if (copy_num_blocks > 0) {
+        kernel::row_scale_permute<config::warp_size>
+            <<<copy_num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                num_rows, as_device_type(scale), perm,
+                orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
+                as_device_type(orig->get_const_values()),
+                row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(),
+                as_device_type(row_permuted->get_values()));
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void inv_row_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
+                           const ValueType* scale, const IndexType* perm,
+                           const matrix::Csr<ValueType, IndexType>* orig,
+                           matrix::Csr<ValueType, IndexType>* row_permuted)
+{
+    auto num_rows = orig->get_size()[0];
+    auto count_num_blocks = ceildiv(num_rows, default_block_size);
+    if (count_num_blocks > 0) {
+        kernel::inv_row_ptr_permute<<<count_num_blocks, default_block_size, 0,
+                                      exec->get_stream()>>>(
+            num_rows, perm, orig->get_const_row_ptrs(),
+            row_permuted->get_row_ptrs());
+    }
+    components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(),
+                                       num_rows + 1);
+    auto copy_num_blocks =
+        ceildiv(num_rows, default_block_size / config::warp_size);
+    if (copy_num_blocks > 0) {
+        kernel::inv_row_scale_permute<config::warp_size>
+            <<<copy_num_blocks, default_block_size, 0, exec->get_stream()>>>(
+                num_rows, as_device_type(scale), perm,
+                orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
+                as_device_type(orig->get_const_values()),
+                row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(),
+                as_device_type(row_permuted->get_values()));
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void calculate_nonzeros_per_row_in_span(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
+    const span& col_span, array<IndexType>* row_nnz)
+{
+    const auto num_rows = source->get_size()[0];
+    auto row_ptrs = source->get_const_row_ptrs();
+    auto col_idxs = source->get_const_col_idxs();
+    auto grid_dim = ceildiv(row_span.length(), default_block_size);
+    if (grid_dim > 0) {
+        kernel::calculate_nnz_per_row_in_span<<<grid_dim, default_block_size, 0,
+                                                exec->get_stream()>>>(
+            row_span, col_span, as_device_type(row_ptrs),
+            as_device_type(col_idxs), as_device_type(row_nnz->get_data()));
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
+                       const matrix::Csr<ValueType, IndexType>* source,
+                       gko::span row_span, gko::span col_span,
+                       matrix::Csr<ValueType, IndexType>* result)
+{
+    auto row_offset = row_span.begin;
+    auto col_offset = col_span.begin;
+    auto num_rows = result->get_size()[0];
+    auto num_cols = result->get_size()[1];
+    auto row_ptrs = source->get_const_row_ptrs();
+    auto grid_dim = ceildiv(num_rows, default_block_size);
+    if (grid_dim > 0) {
+        kernel::compute_submatrix_idxs_and_vals<<<grid_dim, default_block_size,
+                                                  0, exec->get_stream()>>>(
+            num_rows, num_cols, row_offset, col_offset,
+            as_device_type(source->get_const_row_ptrs()),
+            as_device_type(source->get_const_col_idxs()),
+            as_device_type(source->get_const_values()),
+            as_device_type(result->get_const_row_ptrs()),
+            as_device_type(result->get_col_idxs()),
+            as_device_type(result->get_values()));
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void calculate_nonzeros_per_row_in_index_set(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* source,
+    const gko::index_set<IndexType>& row_index_set,
+    const gko::index_set<IndexType>& col_index_set,
+    IndexType* row_nnz) GKO_NOT_IMPLEMENTED;
+
+
+template <typename ValueType, typename IndexType>
+void compute_submatrix_from_index_set(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* source,
+    const gko::index_set<IndexType>& row_index_set,
+    const gko::index_set<IndexType>& col_index_set,
+    matrix::Csr<ValueType, IndexType>* result) GKO_NOT_IMPLEMENTED;
 
 
 template <typename ValueType, typename IndexType>
@@ -1135,13 +1673,10 @@ void fallback_transpose(std::shared_ptr<const DefaultExecutor> exec,
     const auto nnz = output->get_num_stored_elements();
     const auto in_row_ptrs = input->get_const_row_ptrs();
     const auto in_col_idxs = input->get_const_col_idxs();
-    // workaround for CUDA 9.2 Thrust unconstrained constructor issues
-    const auto in_vals = reinterpret_cast<const device_member_type<ValueType>*>(
-        input->get_const_values());
+    const auto in_vals = as_device_type(input->get_const_values());
     const auto out_row_ptrs = output->get_row_ptrs();
     const auto out_col_idxs = output->get_col_idxs();
-    const auto out_vals =
-        reinterpret_cast<device_member_type<ValueType>*>(output->get_values());
+    const auto out_vals = as_device_type(output->get_values());
     array<IndexType> out_row_idxs{exec, nnz};
     components::convert_ptrs_to_idxs(exec, in_row_ptrs, in_num_rows,
                                      out_col_idxs);
@@ -1161,8 +1696,7 @@ void fallback_sort(std::shared_ptr<const DefaultExecutor> exec,
 {
     const auto row_ptrs = to_sort->get_const_row_ptrs();
     const auto col_idxs = to_sort->get_col_idxs();
-    const auto vals =
-        reinterpret_cast<device_member_type<ValueType>*>(to_sort->get_values());
+    const auto vals = as_device_type(to_sort->get_values());
     const auto nnz = to_sort->get_num_stored_elements();
     const auto num_rows = to_sort->get_size()[0];
     array<IndexType> row_idx_array(exec, nnz);
@@ -1178,3 +1712,91 @@ void fallback_sort(std::shared_ptr<const DefaultExecutor> exec,
     thrust::stable_sort_by_key(thrust_policy(exec), row_idxs, row_idxs + nnz,
                                col_val_it);
 }
+
+
+template <typename ValueType, typename IndexType>
+void is_sorted_by_column_index(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* to_check, bool* is_sorted)
+{
+    *is_sorted = true;
+    auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted);
+    auto gpu_array = array<bool>{exec, cpu_array};
+    auto block_size = default_block_size;
+    auto num_rows = static_cast<IndexType>(to_check->get_size()[0]);
+    auto num_blocks = ceildiv(num_rows, block_size);
+    if (num_blocks > 0) {
+        kernel::
+            check_unsorted<<<num_blocks, block_size, 0, exec->get_stream()>>>(
+                to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(),
+                num_rows, gpu_array.get_data());
+    }
+    cpu_array = gpu_array;
+}
+
+
+template <typename ValueType, typename IndexType>
+void extract_diagonal(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Csr<ValueType, IndexType>* orig,
+                      matrix::Diagonal<ValueType>* diag)
+{
+    const auto nnz = orig->get_num_stored_elements();
+    const auto diag_size = diag->get_size()[0];
+    const auto num_blocks =
+        ceildiv(config::warp_size * diag_size, default_block_size);
+
+    const auto orig_values = orig->get_const_values();
+    const auto orig_row_ptrs = orig->get_const_row_ptrs();
+    const auto orig_col_idxs = orig->get_const_col_idxs();
+    auto diag_values = diag->get_values();
+    if (num_blocks > 0) {
+        kernel::extract_diagonal<<<num_blocks, default_block_size, 0,
+                                   exec->get_stream()>>>(
+            diag_size, nnz, as_device_type(orig_values),
+            as_device_type(orig_row_ptrs), as_device_type(orig_col_idxs),
+            as_device_type(diag_values));
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void check_diagonal_entries_exist(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* const mtx, bool& has_all_diags)
+{
+    const auto num_diag = static_cast<IndexType>(
+        std::min(mtx->get_size()[0], mtx->get_size()[1]));
+    if (num_diag > 0) {
+        const IndexType num_blocks =
+            ceildiv(num_diag, default_block_size / config::warp_size);
+        array<bool> has_diags(exec, {true});
+        kernel::check_diagonal_entries<<<num_blocks, default_block_size, 0,
+                                         exec->get_stream()>>>(
+            num_diag, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
+            has_diags.get_data());
+        has_all_diags = exec->copy_val_to_host(has_diags.get_const_data());
+    } else {
+        has_all_diags = true;
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
+                         const matrix::Dense<ValueType>* const alpha,
+                         const matrix::Dense<ValueType>* const beta,
+                         matrix::Csr<ValueType, IndexType>* const mtx)
+{
+    const auto nrows = mtx->get_size()[0];
+    if (nrows == 0) {
+        return;
+    }
+    const auto nthreads = nrows * config::warp_size;
+    const auto nblocks = ceildiv(nthreads, default_block_size);
+    kernel::add_scaled_identity<<<nblocks, default_block_size, 0,
+                                  exec->get_stream()>>>(
+        as_device_type(alpha->get_const_values()),
+        as_device_type(beta->get_const_values()), static_cast<IndexType>(nrows),
+        mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
+        as_device_type(mtx->get_values()));
+}
diff --git a/common/cuda_hip/matrix/ell_kernels.hpp.inc b/common/cuda_hip/matrix/ell_kernels.hpp.inc
index 6c81fb4964c..e7bcac351cb 100644
--- a/common/cuda_hip/matrix/ell_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/ell_kernels.hpp.inc
@@ -43,13 +43,14 @@ __device__ void spmv_kernel(
     acc::range<b_accessor> b, OutputValueType* __restrict__ c,
     const size_type c_stride, Closure op)
 {
+    using arithmetic_type = typename a_accessor::arithmetic_type;
     const auto tidx = thread::get_thread_id_flat();
     const decltype(tidx) column_id = blockIdx.y;
     if (num_thread_per_worker == 1) {
         // Specialize the num_thread_per_worker = 1. It doesn't need the shared
         // memory, __syncthreads, and atomic_add
         if (tidx < num_rows) {
-            auto temp = zero<OutputValueType>();
+            auto temp = zero<arithmetic_type>();
             for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
                 const auto ind = tidx + idx * stride;
                 const auto col_idx = col[ind];
@@ -69,13 +70,13 @@ __device__ void spmv_kernel(
             const auto worker_id = tidx / num_rows;
             const auto step_size = num_worker_per_row * num_thread_per_worker;
             __shared__ uninitialized_array<
-                OutputValueType, default_block_size / num_thread_per_worker>
+                arithmetic_type, default_block_size / num_thread_per_worker>
                 storage;
             if (idx_in_worker == 0) {
                 storage[threadIdx.x] = 0;
             }
             __syncthreads();
-            auto temp = zero<OutputValueType>();
+            auto temp = zero<arithmetic_type>();
             for (size_type idx =
                      worker_id * num_thread_per_worker + idx_in_worker;
                  idx < num_stored_elements_per_row; idx += step_size) {
@@ -114,7 +115,9 @@ __global__ __launch_bounds__(default_block_size) void spmv(
     spmv_kernel<num_thread_per_worker, atomic>(
         num_rows, num_worker_per_row, val, col, stride,
         num_stored_elements_per_row, b, c, c_stride,
-        [](const OutputValueType& x, const OutputValueType& y) { return x; });
+        [](const auto& x, const OutputValueType& y) {
+            return static_cast<OutputValueType>(x);
+        });
 }
 
 
@@ -128,7 +131,8 @@ __global__ __launch_bounds__(default_block_size) void spmv(
     const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c,
     const size_type c_stride)
 {
-    const OutputValueType alpha_val = alpha(0);
+    using arithmetic_type = typename a_accessor::arithmetic_type;
+    const auto alpha_val = alpha(0);
     const OutputValueType beta_val = beta[0];
     if (atomic) {
         // Because the atomic operation changes the values of c during
@@ -139,16 +143,16 @@ __global__ __launch_bounds__(default_block_size) void spmv(
         spmv_kernel<num_thread_per_worker, atomic>(
             num_rows, num_worker_per_row, val, col, stride,
             num_stored_elements_per_row, b, c, c_stride,
-            [&alpha_val](const OutputValueType& x, const OutputValueType& y) {
-                return alpha_val * x;
+            [&alpha_val](const auto& x, const OutputValueType& y) {
+                return static_cast<OutputValueType>(alpha_val * x);
             });
     } else {
         spmv_kernel<num_thread_per_worker, atomic>(
             num_rows, num_worker_per_row, val, col, stride,
             num_stored_elements_per_row, b, c, c_stride,
-            [&alpha_val, &beta_val](const OutputValueType& x,
-                                    const OutputValueType& y) {
-                return alpha_val * x + beta_val * y;
+            [&alpha_val, &beta_val](const auto& x, const OutputValueType& y) {
+                return static_cast<OutputValueType>(
+                    alpha_val * x + static_cast<arithmetic_type>(beta_val * y));
             });
     }
 }
diff --git a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
index 27314c06a59..607ec5046ea 100644
--- a/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
@@ -172,11 +172,7 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
     }
     auto in_rows = data.get_row_idxs();
     auto in_cols = data.get_col_idxs();
-    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
-    // due to overly generic assignment operator and constructor leading to
-    // ambiguities. So we need to use our own fake_complex type
-    auto in_vals =
-        reinterpret_cast<device_member_type<ValueType>*>(data.get_values());
+    auto in_vals = as_device_type(data.get_values());
     auto in_loc_it =
         thrust::make_zip_iterator(thrust::make_tuple(in_rows, in_cols));
     thrust::sort_by_key(thrust_policy(exec), in_loc_it, in_loc_it + nnz,
@@ -232,15 +228,11 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
             const auto block_end = i < num_blocks - 1 ? block_ptrs[i + 1] : nnz;
             for (auto nz = block_begin; nz < block_end; nz++) {
                 block_values[i * bs * bs + (in_cols[nz] % bs) * bs +
-                             (in_rows[nz] % bs)] =
-                    fake_complex_unpack(in_vals[nz]);
+                             (in_rows[nz] % bs)] = in_vals[nz];
             }
         });
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);
-
 
 namespace kernel {
 
@@ -323,9 +315,6 @@ void fill_in_dense(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
 void convert_to_csr(const std::shared_ptr<const DefaultExecutor> exec,
@@ -345,9 +334,6 @@ void convert_to_csr(const std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
 void is_sorted_by_column_index(
@@ -372,23 +358,14 @@ void is_sorted_by_column_index(
     *is_sorted = exec->copy_val_to_host(gpu_array.get_data());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX);
-
 
 template <typename ValueType, typename IndexType>
 void sort_by_column_index(const std::shared_ptr<const DefaultExecutor> exec,
                           matrix::Fbcsr<ValueType, IndexType>* const to_sort)
     GKO_NOT_IMPLEMENTED;
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX);
-
 
 template <typename ValueType, typename IndexType>
 void extract_diagonal(std::shared_ptr<const DefaultExecutor> exec,
                       const matrix::Fbcsr<ValueType, IndexType>* orig,
                       matrix::Diagonal<ValueType>* diag) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL);
diff --git a/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc b/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc
index dddd7946a04..2d2ca9a5183 100644
--- a/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc
@@ -121,19 +121,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
-                          matrix::SparsityCsr<ValueType, IndexType>* to_sort)
-    GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
-
-
-template <typename ValueType, typename IndexType>
-void is_sorted_by_column_index(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::SparsityCsr<ValueType, IndexType>* to_check,
-    bool* is_sorted) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX);
+void fallback_sort(std::shared_ptr<const DefaultExecutor> exec,
+                   matrix::SparsityCsr<ValueType, IndexType>* to_sort)
+{
+    const auto row_ptrs = to_sort->get_const_row_ptrs();
+    const auto col_idxs = to_sort->get_col_idxs();
+    const auto nnz = to_sort->get_num_nonzeros();
+    const auto num_rows = to_sort->get_size()[0];
+    array<IndexType> row_idx_array(exec, nnz);
+    const auto row_idxs = row_idx_array.get_data();
+    components::convert_ptrs_to_idxs(exec, row_ptrs, num_rows, row_idxs);
+    // two sorts by integer keys hopefully enable Thrust to use cub's RadixSort
+    thrust::sort_by_key(thrust_policy(exec), col_idxs, col_idxs + nnz,
+                        row_idxs);
+    thrust::stable_sort_by_key(thrust_policy(exec), row_idxs, row_idxs + nnz,
+                               col_idxs);
+}
diff --git a/common/cuda_hip/multigrid/pgm_kernels.hpp.inc b/common/cuda_hip/multigrid/pgm_kernels.hpp.inc
index 30cce92b8de..b08e86efaaa 100644
--- a/common/cuda_hip/multigrid/pgm_kernels.hpp.inc
+++ b/common/cuda_hip/multigrid/pgm_kernels.hpp.inc
@@ -45,15 +45,11 @@ template <typename ValueType, typename IndexType>
 void sort_row_major(std::shared_ptr<const DefaultExecutor> exec, size_type nnz,
                     IndexType* row_idxs, IndexType* col_idxs, ValueType* vals)
 {
-    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
-    // due to overly generic assignment operator and constructor leading to
-    // ambiguities. So we need to use our own fake_complex type
-    using device_value_type = device_member_type<ValueType>;
-    auto vals_it = reinterpret_cast<device_value_type*>(vals);
+    auto vals_it = as_device_type(vals);
     auto it = thrust::make_zip_iterator(thrust::make_tuple(row_idxs, col_idxs));
-    // Because reduce_by_key is not determinstic, so we do not need
+    // Because reduce_by_key is not deterministic, so we do not need
     // stable_sort_by_key
-    // TODO: If we have determinstic reduce_by_key, it should be
+    // TODO: If we have deterministic reduce_by_key, it should be
     // stable_sort_by_key
     thrust::sort_by_key(thrust_policy(exec), it, it + nnz, vals_it);
 }
@@ -67,16 +63,11 @@ void compute_coarse_coo(std::shared_ptr<const DefaultExecutor> exec,
                         const IndexType* col_idxs, const ValueType* vals,
                         matrix::Coo<ValueType, IndexType>* coarse_coo)
 {
-    // workaround for CUDA 9.2 Thrust: Their complex<> implementation is broken
-    // due to overly generic assignment operator and constructor leading to
-    // ambiguities. So we need to use our own fake_complex type
-    using device_value_type = device_member_type<ValueType>;
-    auto vals_it = reinterpret_cast<const device_value_type*>(vals);
+    auto vals_it = as_device_type(vals);
     auto key_it =
         thrust::make_zip_iterator(thrust::make_tuple(row_idxs, col_idxs));
 
-    auto coarse_vals_it =
-        reinterpret_cast<device_value_type*>(coarse_coo->get_values());
+    auto coarse_vals_it = as_device_type(coarse_coo->get_values());
     auto coarse_key_it = thrust::make_zip_iterator(thrust::make_tuple(
         coarse_coo->get_row_idxs(), coarse_coo->get_col_idxs()));
 
diff --git a/common/cuda_hip/preconditioner/batch_identity.hpp.inc b/common/cuda_hip/preconditioner/batch_identity.hpp.inc
new file mode 100644
index 00000000000..923ed4ce946
--- /dev/null
+++ b/common/cuda_hip/preconditioner/batch_identity.hpp.inc
@@ -0,0 +1,61 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+/**
+ * @see reference/preconditioner/batch_identity.hpp
+ */
+template <typename ValueType>
+class Identity final {
+public:
+    using value_type = ValueType;
+
+    static constexpr int work_size = 0;
+
+    __host__ __device__ static constexpr int dynamic_work_size(int, int)
+    {
+        return 0;
+    }
+
+    template <typename batch_item_type>
+    __device__ __forceinline__ void generate(size_type, const batch_item_type&,
+                                             ValueType*)
+    {}
+
+    __device__ __forceinline__ void apply(const int num_rows,
+                                          const ValueType* const r,
+                                          ValueType* const z) const
+    {
+        for (int li = threadIdx.x; li < num_rows; li += blockDim.x) {
+            z[li] = r[li];
+        }
+    }
+};
diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc b/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc
index 8827a47620b..2a0f7bd0dd7 100644
--- a/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc
+++ b/common/cuda_hip/preconditioner/jacobi_kernels.hpp.inc
@@ -197,23 +197,23 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_transpose_jacobi(
 
     const auto block_stride = storage_scheme.get_stride();
     const auto rank = subwarp.thread_rank();
-    if (rank < block_size) {
-        GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
-            ValueType, block_precisions[block_id],
-            auto local_block =
-                reinterpret_cast<const resolved_precision*>(
-                    blocks + storage_scheme.get_group_offset(block_id)) +
-                storage_scheme.get_block_offset(block_id);
-            auto local_out_block =
-                reinterpret_cast<resolved_precision*>(
-                    out_blocks + storage_scheme.get_group_offset(block_id)) +
-                storage_scheme.get_block_offset(block_id);
-            for (IndexType i = 0; i < block_size; ++i) {
-                auto val = local_block[i * block_stride + rank];
-                local_out_block[i + rank * block_stride] =
-                    conjugate ? conj(val) : val;
-            });
-    }
+    GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION(
+        ValueType, block_precisions[block_id],
+        auto local_block =
+            reinterpret_cast<const resolved_precision*>(
+                blocks + storage_scheme.get_group_offset(block_id)) +
+            storage_scheme.get_block_offset(block_id);
+        auto local_out_block =
+            reinterpret_cast<resolved_precision*>(
+                out_blocks + storage_scheme.get_group_offset(block_id)) +
+            storage_scheme.get_block_offset(block_id);
+        for (int i = rank; i < block_size * block_size; i += subwarp_size) {
+            int row = i % block_size;
+            int col = i / block_size;
+            auto val = local_block[row + col * block_stride];
+            local_out_block[row * block_stride + col] =
+                conjugate ? conj(val) : val;
+        });
 }
 
 
diff --git a/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
new file mode 100644
index 00000000000..faee2e069a7
--- /dev/null
+++ b/common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc
@@ -0,0 +1,382 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+template <typename Group, typename BatchMatrixType_entry, typename ValueType>
+__device__ __forceinline__ void initialize(
+    Group subgroup, const int num_rows, const BatchMatrixType_entry& mat_entry,
+    const ValueType* const b_global_entry,
+    const ValueType* const x_global_entry, ValueType& rho_old, ValueType& omega,
+    ValueType& alpha, ValueType* const x_shared_entry,
+    ValueType* const r_shared_entry, ValueType* const r_hat_shared_entry,
+    ValueType* const p_shared_entry, ValueType* const p_hat_shared_entry,
+    ValueType* const v_shared_entry,
+    typename gko::remove_complex<ValueType>& rhs_norm,
+    typename gko::remove_complex<ValueType>& res_norm)
+{
+    rho_old = one<ValueType>();
+    omega = one<ValueType>();
+    alpha = one<ValueType>();
+
+    // copy x from global to shared memory
+    // r = b
+    for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) {
+        x_shared_entry[iz] = x_global_entry[iz];
+        r_shared_entry[iz] = b_global_entry[iz];
+    }
+    __syncthreads();
+
+    // r = b - A*x
+    advanced_apply(static_cast<ValueType>(-1.0), mat_entry, x_shared_entry,
+                   static_cast<ValueType>(1.0), r_shared_entry);
+    __syncthreads();
+
+    if (threadIdx.x / config::warp_size == 0) {
+        single_rhs_compute_norm2(subgroup, num_rows, r_shared_entry, res_norm);
+    } else if (threadIdx.x / config::warp_size == 1) {
+        // Compute norms of rhs
+        single_rhs_compute_norm2(subgroup, num_rows, b_global_entry, rhs_norm);
+    }
+    __syncthreads();
+
+    for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) {
+        r_hat_shared_entry[iz] = r_shared_entry[iz];
+        p_shared_entry[iz] = zero<ValueType>();
+        p_hat_shared_entry[iz] = zero<ValueType>();
+        v_shared_entry[iz] = zero<ValueType>();
+    }
+}
+
+
+template <typename ValueType>
+__device__ __forceinline__ void update_p(
+    const int num_rows, const ValueType& rho_new, const ValueType& rho_old,
+    const ValueType& alpha, const ValueType& omega,
+    const ValueType* const r_shared_entry,
+    const ValueType* const v_shared_entry, ValueType* const p_shared_entry)
+{
+    const ValueType beta = (rho_new / rho_old) * (alpha / omega);
+    for (int r = threadIdx.x; r < num_rows; r += blockDim.x) {
+        p_shared_entry[r] =
+            r_shared_entry[r] +
+            beta * (p_shared_entry[r] - omega * v_shared_entry[r]);
+    }
+}
+
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void compute_alpha(
+    Group subgroup, const int num_rows, const ValueType& rho_new,
+    const ValueType* const r_hat_shared_entry,
+    const ValueType* const v_shared_entry, ValueType& alpha)
+{
+    if (threadIdx.x / config::warp_size == 0) {
+        single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_shared_entry,
+                                    v_shared_entry, alpha);
+    }
+    __syncthreads();
+    if (threadIdx.x == 0) {
+        alpha = rho_new / alpha;
+    }
+}
+
+
+template <typename ValueType>
+__device__ __forceinline__ void update_s(const int num_rows,
+                                         const ValueType* const r_shared_entry,
+                                         const ValueType& alpha,
+                                         const ValueType* const v_shared_entry,
+                                         ValueType* const s_shared_entry)
+{
+    for (int r = threadIdx.x; r < num_rows; r += blockDim.x) {
+        s_shared_entry[r] = r_shared_entry[r] - alpha * v_shared_entry[r];
+    }
+}
+
+
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void compute_omega(
+    Group subgroup, const int num_rows, const ValueType* const t_shared_entry,
+    const ValueType* const s_shared_entry, ValueType& temp, ValueType& omega)
+{
+    if (threadIdx.x / config::warp_size == 0) {
+        single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
+                                    s_shared_entry, omega);
+    } else if (threadIdx.x / config::warp_size == 1) {
+        single_rhs_compute_conj_dot(subgroup, num_rows, t_shared_entry,
+                                    t_shared_entry, temp);
+    }
+
+    __syncthreads();
+    if (threadIdx.x == 0) {
+        omega /= temp;
+    }
+}
+
+template <typename ValueType>
+__device__ __forceinline__ void update_x_and_r(
+    const int num_rows, const ValueType* const p_hat_shared_entry,
+    const ValueType* const s_hat_shared_entry, const ValueType& alpha,
+    const ValueType& omega, const ValueType* const s_shared_entry,
+    const ValueType* const t_shared_entry, ValueType* const x_shared_entry,
+    ValueType* const r_shared_entry)
+{
+    for (int r = threadIdx.x; r < num_rows; r += blockDim.x) {
+        x_shared_entry[r] = x_shared_entry[r] + alpha * p_hat_shared_entry[r] +
+                            omega * s_hat_shared_entry[r];
+        r_shared_entry[r] = s_shared_entry[r] - omega * t_shared_entry[r];
+    }
+}
+
+
+template <typename ValueType>
+__device__ __forceinline__ void update_x_middle(
+    const int num_rows, const ValueType& alpha,
+    const ValueType* const p_hat_shared_entry, ValueType* const x_shared_entry)
+{
+    for (int r = threadIdx.x; r < num_rows; r += blockDim.x) {
+        x_shared_entry[r] = x_shared_entry[r] + alpha * p_hat_shared_entry[r];
+    }
+}
+
+
+template <typename StopType, int n_shared, bool prec_shared_bool,
+          typename PrecType, typename LogType, typename BatchMatrixType,
+          typename ValueType>
+__global__ void apply_kernel(
+    const gko::kernels::batch_bicgstab::storage_config sconf,
+    const int max_iter, const gko::remove_complex<ValueType> tol,
+    LogType logger, PrecType prec_shared, const BatchMatrixType mat,
+    const ValueType* const __restrict__ b, ValueType* const __restrict__ x,
+    ValueType* const __restrict__ workspace = nullptr)
+{
+    using real_type = typename gko::remove_complex<ValueType>;
+    const auto num_batch_items = mat.num_batch_items;
+    const auto num_rows = mat.num_rows;
+
+    constexpr auto tile_size = config::warp_size;
+    auto thread_block = group::this_thread_block();
+    auto subgroup = group::tiled_partition<tile_size>(thread_block);
+
+    for (int batch_id = blockIdx.x; batch_id < num_batch_items;
+         batch_id += gridDim.x) {
+        const int gmem_offset =
+            batch_id * sconf.gmem_stride_bytes / sizeof(ValueType);
+        extern __shared__ char local_mem_sh[];
+
+        ValueType* p_hat_sh;
+        ValueType* s_hat_sh;
+        ValueType* p_sh;
+        ValueType* s_sh;
+        ValueType* r_sh;
+        ValueType* r_hat_sh;
+        ValueType* v_sh;
+        ValueType* t_sh;
+        ValueType* x_sh;
+        ValueType* prec_work_sh;
+
+        if (n_shared >= 1) {
+            p_hat_sh = reinterpret_cast<ValueType*>(local_mem_sh);
+        } else {
+            p_hat_sh = workspace + gmem_offset;
+        }
+        if (n_shared == 1) {
+            s_hat_sh = workspace + gmem_offset;
+        } else {
+            s_hat_sh = p_hat_sh + sconf.padded_vec_len;
+        }
+        if (n_shared == 2) {
+            v_sh = workspace + gmem_offset;
+        } else {
+            v_sh = s_hat_sh + sconf.padded_vec_len;
+        }
+        if (n_shared == 3) {
+            t_sh = workspace + gmem_offset;
+        } else {
+            t_sh = v_sh + sconf.padded_vec_len;
+        }
+        if (n_shared == 4) {
+            p_sh = workspace + gmem_offset;
+        } else {
+            p_sh = t_sh + sconf.padded_vec_len;
+        }
+        if (n_shared == 5) {
+            s_sh = workspace + gmem_offset;
+        } else {
+            s_sh = p_sh + sconf.padded_vec_len;
+        }
+        if (n_shared == 6) {
+            r_sh = workspace + gmem_offset;
+        } else {
+            r_sh = s_sh + sconf.padded_vec_len;
+        }
+        if (n_shared == 7) {
+            r_hat_sh = workspace + gmem_offset;
+        } else {
+            r_hat_sh = r_sh + sconf.padded_vec_len;
+        }
+        if (n_shared == 8) {
+            x_sh = workspace + gmem_offset;
+        } else {
+            x_sh = r_hat_sh + sconf.padded_vec_len;
+        }
+        if (!prec_shared_bool && n_shared == 9) {
+            prec_work_sh = workspace + gmem_offset;
+        } else {
+            prec_work_sh = x_sh + sconf.padded_vec_len;
+        }
+
+        __shared__ uninitialized_array<ValueType, 1> rho_old_sh;
+        __shared__ uninitialized_array<ValueType, 1> rho_new_sh;
+        __shared__ uninitialized_array<ValueType, 1> omega_sh;
+        __shared__ uninitialized_array<ValueType, 1> alpha_sh;
+        __shared__ uninitialized_array<ValueType, 1> temp_sh;
+        __shared__ real_type norms_rhs_sh[1];
+        __shared__ real_type norms_res_sh[1];
+
+        const auto mat_entry =
+            gko::batch::matrix::extract_batch_item(mat, batch_id);
+        const ValueType* const b_entry_ptr =
+            gko::batch::multi_vector::batch_item_ptr(b, 1, num_rows, batch_id);
+        ValueType* const x_gl_entry_ptr =
+            gko::batch::multi_vector::batch_item_ptr(x, 1, num_rows, batch_id);
+
+        // generate preconditioner
+        prec_shared.generate(batch_id, mat_entry, prec_work_sh);
+
+        // initialization
+        // rho_old = 1, omega = 1, alpha = 1
+        // compute b norms
+        // copy x from global to shared memory
+        // r = b - A*x
+        // compute residual norms
+        // r_hat = r
+        // p = 0
+        // p_hat = 0
+        // v = 0
+        initialize(subgroup, num_rows, mat_entry, b_entry_ptr, x_gl_entry_ptr,
+                   rho_old_sh[0], omega_sh[0], alpha_sh[0], x_sh, r_sh,
+                   r_hat_sh, p_sh, p_hat_sh, v_sh, norms_rhs_sh[0],
+                   norms_res_sh[0]);
+        __syncthreads();
+
+        // stopping criterion object
+        StopType stop(tol, norms_rhs_sh);
+
+        int iter = 0;
+        for (; iter < max_iter; iter++) {
+            if (stop.check_converged(norms_res_sh)) {
+                logger.log_iteration(batch_id, iter, norms_res_sh[0]);
+                break;
+            }
+
+            // rho_new =  < r_hat , r > = (r_hat)' * (r)
+            if (threadIdx.x / config::warp_size == 0) {
+                single_rhs_compute_conj_dot(subgroup, num_rows, r_hat_sh, r_sh,
+                                            rho_new_sh[0]);
+            }
+            __syncthreads();
+
+            // beta = (rho_new / rho_old)*(alpha / omega)
+            // p = r + beta*(p - omega * v)
+            update_p(num_rows, rho_new_sh[0], rho_old_sh[0], alpha_sh[0],
+                     omega_sh[0], r_sh, v_sh, p_sh);
+            __syncthreads();
+
+            // p_hat = precond * p
+            prec_shared.apply(num_rows, p_sh, p_hat_sh);
+            __syncthreads();
+
+            // v = A * p_hat
+            simple_apply(mat_entry, p_hat_sh, v_sh);
+            __syncthreads();
+
+            // alpha = rho_new / < r_hat , v>
+            compute_alpha(subgroup, num_rows, rho_new_sh[0], r_hat_sh, v_sh,
+                          alpha_sh[0]);
+            __syncthreads();
+
+            // s = r - alpha*v
+            update_s(num_rows, r_sh, alpha_sh[0], v_sh, s_sh);
+            __syncthreads();
+
+            // an estimate of residual norms
+            if (threadIdx.x / config::warp_size == 0) {
+                single_rhs_compute_norm2(subgroup, num_rows, s_sh,
+                                         norms_res_sh[0]);
+            }
+            __syncthreads();
+
+            // if (norms_res_sh[0] / norms_rhs_sh[0] < tol) {
+            if (stop.check_converged(norms_res_sh)) {
+                update_x_middle(num_rows, alpha_sh[0], p_hat_sh, x_sh);
+                logger.log_iteration(batch_id, iter, norms_res_sh[0]);
+                break;
+            }
+
+            // s_hat = precond * s
+            prec_shared.apply(num_rows, s_sh, s_hat_sh);
+            __syncthreads();
+
+            // t = A * s_hat
+            simple_apply(mat_entry, s_hat_sh, t_sh);
+            __syncthreads();
+
+            // omega = <t,s> / <t,t>
+            compute_omega(subgroup, num_rows, t_sh, s_sh, temp_sh[0],
+                          omega_sh[0]);
+            __syncthreads();
+
+            // x = x + alpha*p_hat + omega *s_hat
+            // r = s - omega * t
+            update_x_and_r(num_rows, p_hat_sh, s_hat_sh, alpha_sh[0],
+                           omega_sh[0], s_sh, t_sh, x_sh, r_sh);
+            __syncthreads();
+
+            if (threadIdx.x / config::warp_size == 0) {
+                single_rhs_compute_norm2(subgroup, num_rows, r_sh,
+                                         norms_res_sh[0]);
+            }
+            //__syncthreads();
+
+            if (threadIdx.x == blockDim.x - 1) {
+                rho_old_sh[0] = rho_new_sh[0];
+            }
+            __syncthreads();
+        }
+
+        logger.log_iteration(batch_id, iter, norms_res_sh[0]);
+
+        // copy x back to global memory
+        single_rhs_copy(num_rows, x_sh, x_gl_entry_ptr);
+        __syncthreads();
+    }
+}
diff --git a/benchmark/utils/spmv_validation.hpp b/common/cuda_hip/stop/batch_criteria.hpp.inc
similarity index 59%
rename from benchmark/utils/spmv_validation.hpp
rename to common/cuda_hip/stop/batch_criteria.hpp.inc
index 83ea2085ec2..d9ca9d10487 100644
--- a/benchmark/utils/spmv_validation.hpp
+++ b/common/cuda_hip/stop/batch_criteria.hpp.inc
@@ -30,54 +30,51 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#ifndef GKO_BENCHMARK_UTILS_SPMV_VALIDATION_HPP_
-#define GKO_BENCHMARK_UTILS_SPMV_VALIDATION_HPP_
-
-
-#include <ginkgo/ginkgo.hpp>
-
-
-#include <cstdlib>
-#include <iostream>
-
-
-#include <rapidjson/document.h>
-
-
-std::string example_config = R"(
-  [
-    {"filename": "my_file.mtx"},
-    {"filename": "my_file2.mtx"},
-    {"size": 100, "stencil": "7pt"},
-  ]
-)";
-
 
 /**
- * Function which outputs the input format for benchmarks similar to the spmv.
+ * @see reference/stop/batch_criteria.hpp
  */
-[[noreturn]] void print_config_error_and_exit()
-{
-    std::cerr << "Input has to be a JSON array of matrix configurations:\n"
-              << example_config << std::endl;
-    std::exit(1);
-}
+template <typename ValueType>
+class SimpleRelResidual {
+public:
+    using real_type = remove_complex<ValueType>;
+
+    __device__ __forceinline__ SimpleRelResidual(
+        const real_type rel_res_tol, const real_type* const rhs_b_norms)
+        : rel_tol_{rel_res_tol}, rhs_norms_{rhs_b_norms}
+    {}
+
+    __device__ __forceinline__ bool check_converged(
+        const real_type* const residual_norms) const
+    {
+        return residual_norms[0] <= (rel_tol_ * rhs_norms_[0]);
+    }
+
+private:
+    const real_type rel_tol_;
+    const real_type* const rhs_norms_;
+};
 
 
 /**
- * Validates whether the input format is correct for spmv-like benchmarks.
- *
- * @param value  the JSON value to test.
+ * @see reference/stop/batch_criteria.hpp
  */
-void validate_option_object(const rapidjson::Value& value)
-{
-    if (!value.IsObject() ||
-        !((value.HasMember("size") && value.HasMember("stencil") &&
-           value["size"].IsInt64() && value["stencil"].IsString()) ||
-          (value.HasMember("filename") && value["filename"].IsString()))) {
-        print_config_error_and_exit();
+template <typename ValueType>
+class SimpleAbsResidual {
+public:
+    using real_type = remove_complex<ValueType>;
+
+    __device__ __forceinline__ SimpleAbsResidual(const real_type tol,
+                                                 const real_type*)
+        : abs_tol_{tol}
+    {}
+
+    __device__ __forceinline__ bool check_converged(
+        const real_type* const residual_norms) const
+    {
+        return (residual_norms[0] <= abs_tol_);
     }
-}
-
 
-#endif  // GKO_BENCHMARK_UTILS_SPMV_VALIDATION_HPP_
+private:
+    const real_type abs_tol_;
+};
diff --git a/common/unified/CMakeLists.txt b/common/unified/CMakeLists.txt
new file mode 100644
index 00000000000..7ac6b3df40c
--- /dev/null
+++ b/common/unified/CMakeLists.txt
@@ -0,0 +1,33 @@
+set(UNIFIED_SOURCES
+    base/device_matrix_data_kernels.cpp
+    base/index_set_kernels.cpp
+    components/absolute_array_kernels.cpp
+    components/fill_array_kernels.cpp
+    components/format_conversion_kernels.cpp
+    components/precision_conversion_kernels.cpp
+    components/reduce_array_kernels.cpp
+    distributed/partition_helpers_kernels.cpp
+    distributed/partition_kernels.cpp
+    matrix/coo_kernels.cpp
+    matrix/csr_kernels.cpp
+    matrix/ell_kernels.cpp
+    matrix/hybrid_kernels.cpp
+    matrix/permutation_kernels.cpp
+    matrix/scaled_permutation_kernels.cpp
+    matrix/sellp_kernels.cpp
+    matrix/sparsity_csr_kernels.cpp
+    matrix/diagonal_kernels.cpp
+    multigrid/pgm_kernels.cpp
+    preconditioner/jacobi_kernels.cpp
+    solver/bicg_kernels.cpp
+    solver/bicgstab_kernels.cpp
+    solver/cg_kernels.cpp
+    solver/cgs_kernels.cpp
+    solver/common_gmres_kernels.cpp
+    solver/fcg_kernels.cpp
+    solver/gcr_kernels.cpp
+    solver/gmres_kernels.cpp
+    solver/ir_kernels.cpp
+    )
+list(TRANSFORM UNIFIED_SOURCES PREPEND ${CMAKE_CURRENT_SOURCE_DIR}/)
+set(GKO_UNIFIED_COMMON_SOURCES ${UNIFIED_SOURCES} PARENT_SCOPE)
\ No newline at end of file
diff --git a/common/unified/distributed/partition_helpers_kernels.cpp b/common/unified/distributed/partition_helpers_kernels.cpp
new file mode 100644
index 00000000000..3c041dd7e4b
--- /dev/null
+++ b/common/unified/distributed/partition_helpers_kernels.cpp
@@ -0,0 +1,102 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/distributed/partition_helpers_kernels.hpp"
+
+
+#include "common/unified/base/kernel_launch.hpp"
+#include "common/unified/base/kernel_launch_reduction.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace partition_helpers {
+
+
+template <typename GlobalIndexType>
+void check_consecutive_ranges(std::shared_ptr<const DefaultExecutor> exec,
+                              const array<GlobalIndexType>& range_start_ends,
+                              bool& result)
+{
+    array<uint32> result_uint32{exec, 1};
+    auto num_ranges = range_start_ends.get_num_elems() / 2;
+    // need additional guard because DPCPP doesn't return the initial value for
+    // empty inputs
+    if (num_ranges > 1) {
+        run_kernel_reduction(
+            exec,
+            [] GKO_KERNEL(const auto i, const auto* ranges) {
+                return ranges[2 * i] == ranges[2 * i + 1];
+            },
+            [] GKO_KERNEL(const auto a, const auto b) {
+                return static_cast<uint32>(a && b);
+            },
+            [] GKO_KERNEL(auto x) { return x; }, static_cast<uint32>(true),
+            result_uint32.get_data(), num_ranges - 1,
+            range_start_ends.get_const_data() + 1);
+        result =
+            static_cast<bool>(exec->copy_val_to_host(result_uint32.get_data()));
+    } else {
+        result = true;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
+    GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES);
+
+
+template <typename GlobalIndexType>
+void compress_ranges(std::shared_ptr<const DefaultExecutor> exec,
+                     const array<GlobalIndexType>& range_start_ends,
+                     array<GlobalIndexType>& range_offsets)
+{
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(const auto i, const auto* start_ends, auto* offsets) {
+            if (i == 0) {
+                offsets[0] = start_ends[0];
+            }
+            offsets[i + 1] = start_ends[2 * i + 1];
+        },
+        range_offsets.get_num_elems() - 1, range_start_ends.get_const_data(),
+        range_offsets.get_data());
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
+    GKO_DECLARE_PARTITION_HELPERS_COMPRESS_RANGES);
+
+
+}  // namespace partition_helpers
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/unified/distributed/partition_kernels.cpp b/common/unified/distributed/partition_kernels.cpp
index cb0f4813da5..dc13fec9f1b 100644
--- a/common/unified/distributed/partition_kernels.cpp
+++ b/common/unified/distributed/partition_kernels.cpp
@@ -66,19 +66,22 @@ void count_ranges(std::shared_ptr<const DefaultExecutor> exec,
 template <typename GlobalIndexType>
 void build_from_contiguous(std::shared_ptr<const DefaultExecutor> exec,
                            const array<GlobalIndexType>& ranges,
+                           const array<comm_index_type>& part_id_mapping,
                            GlobalIndexType* range_bounds,
                            comm_index_type* part_ids)
 {
     run_kernel(
         exec,
-        [] GKO_KERNEL(auto i, auto ranges, auto bounds, auto ids) {
+        [] GKO_KERNEL(auto i, auto ranges, auto mapping, auto bounds, auto ids,
+                      bool uses_mapping) {
             if (i == 0) {
                 bounds[0] = 0;
             }
             bounds[i + 1] = ranges[i + 1];
-            ids[i] = i;
+            ids[i] = uses_mapping ? mapping[i] : i;
         },
-        ranges.get_num_elems() - 1, ranges, range_bounds, part_ids);
+        ranges.get_num_elems() - 1, ranges, part_id_mapping, range_bounds,
+        part_ids, part_id_mapping.get_num_elems() > 0);
 }
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_PARTITION_BUILD_FROM_CONTIGUOUS);
diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp
index f4e034998bd..d1abb043c44 100644
--- a/common/unified/matrix/csr_kernels.cpp
+++ b/common/unified/matrix/csr_kernels.cpp
@@ -54,53 +54,71 @@ namespace GKO_DEVICE_NAMESPACE {
 namespace csr {
 
 
-template <typename IndexType>
-void invert_permutation(std::shared_ptr<const DefaultExecutor> exec,
-                        size_type size, const IndexType* permutation_indices,
-                        IndexType* inv_permutation)
+template <typename ValueType, typename IndexType>
+void inv_col_permute(std::shared_ptr<const DefaultExecutor> exec,
+                     const IndexType* perm,
+                     const matrix::Csr<ValueType, IndexType>* orig,
+                     matrix::Csr<ValueType, IndexType>* col_permuted)
 {
+    auto num_rows = orig->get_size()[0];
+    auto nnz = orig->get_num_stored_elements();
+    auto size = std::max(num_rows + 1, nnz);
     run_kernel(
         exec,
-        [] GKO_KERNEL(auto tid, auto permutation, auto inv_permutation) {
-            inv_permutation[permutation[tid]] = tid;
+        [] GKO_KERNEL(auto tid, auto num_rows, auto num_nonzeros,
+                      auto permutation, auto in_row_ptrs, auto in_col_idxs,
+                      auto in_vals, auto out_row_ptrs, auto out_col_idxs,
+                      auto out_vals) {
+            if (tid < num_nonzeros) {
+                out_col_idxs[tid] = permutation[in_col_idxs[tid]];
+                out_vals[tid] = in_vals[tid];
+            }
+            if (tid <= num_rows) {
+                out_row_ptrs[tid] = in_row_ptrs[tid];
+            }
         },
-        size, permutation_indices, inv_permutation);
+        size, num_rows, nnz, perm, orig->get_const_row_ptrs(),
+        orig->get_const_col_idxs(), orig->get_const_values(),
+        col_permuted->get_row_ptrs(), col_permuted->get_col_idxs(),
+        col_permuted->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_INVERT_PERMUTATION_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
-void inverse_column_permute(std::shared_ptr<const DefaultExecutor> exec,
-                            const IndexType* perm,
-                            const matrix::Csr<ValueType, IndexType>* orig,
-                            matrix::Csr<ValueType, IndexType>* column_permuted)
+void inv_col_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
+                           const ValueType* scale, const IndexType* perm,
+                           const matrix::Csr<ValueType, IndexType>* orig,
+                           matrix::Csr<ValueType, IndexType>* col_permuted)
 {
     auto num_rows = orig->get_size()[0];
     auto nnz = orig->get_num_stored_elements();
-    auto size = std::max(num_rows, nnz);
+    auto size = std::max(num_rows + 1, nnz);
     run_kernel(
         exec,
-        [] GKO_KERNEL(auto tid, auto num_rows, auto num_nonzeros,
+        [] GKO_KERNEL(auto tid, auto num_rows, auto num_nonzeros, auto scale,
                       auto permutation, auto in_row_ptrs, auto in_col_idxs,
                       auto in_vals, auto out_row_ptrs, auto out_col_idxs,
                       auto out_vals) {
             if (tid < num_nonzeros) {
-                out_col_idxs[tid] = permutation[in_col_idxs[tid]];
-                out_vals[tid] = in_vals[tid];
+                const auto out_col = permutation[in_col_idxs[tid]];
+                out_col_idxs[tid] = out_col;
+                out_vals[tid] = in_vals[tid] / scale[out_col];
             }
             if (tid <= num_rows) {
                 out_row_ptrs[tid] = in_row_ptrs[tid];
             }
         },
-        size, num_rows, nnz, perm, orig->get_const_row_ptrs(),
+        size, num_rows, nnz, scale, perm, orig->get_const_row_ptrs(),
         orig->get_const_col_idxs(), orig->get_const_values(),
-        column_permuted->get_row_ptrs(), column_permuted->get_col_idxs(),
-        column_permuted->get_values());
+        col_permuted->get_row_ptrs(), col_permuted->get_col_idxs(),
+        col_permuted->get_values());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL);
+    GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -154,8 +172,8 @@ void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
             for (auto i = row_begin; i < row_begin + slice_length; i++) {
                 cols[out_idx] =
                     i < row_end ? in_cols[i] : invalid_index<IndexType>();
-                values[out_idx] = i < row_end ? unpack_member(in_values[i])
-                                              : zero(values[out_idx]);
+                values[out_idx] =
+                    i < row_end ? in_values[i] : zero(values[out_idx]);
                 out_idx += slice_size;
             }
         },
@@ -185,8 +203,8 @@ void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
             for (auto i = row_begin; i < row_begin + ell_length; i++) {
                 cols[out_idx] =
                     i < row_end ? in_cols[i] : invalid_index<IndexType>();
-                values[out_idx] = i < row_end ? unpack_member(in_values[i])
-                                              : zero(values[out_idx]);
+                values[out_idx] =
+                    i < row_end ? in_values[i] : zero(values[out_idx]);
                 out_idx += ell_stride;
             }
         },
diff --git a/common/unified/matrix/dense_kernels.instantiate.cpp b/common/unified/matrix/dense_kernels.instantiate.cpp
new file mode 100644
index 00000000000..73e06385f54
--- /dev/null
+++ b/common/unified/matrix/dense_kernels.instantiate.cpp
@@ -0,0 +1,130 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "common/unified/matrix/dense_kernels.template.cpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace dense {
+
+
+// begin
+GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(
+    GKO_DECLARE_DENSE_COPY_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_FILL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
+    GKO_DECLARE_DENSE_INV_SCALE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
+    GKO_DECLARE_DENSE_ADD_SCALED_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
+    GKO_DECLARE_DENSE_SUB_SCALED_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(
+    GKO_DECLARE_DENSE_ROW_GATHER_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(
+    GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
+    GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL);
+// end
+
+
+}  // namespace dense
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/unified/matrix/dense_kernels.cpp b/common/unified/matrix/dense_kernels.template.cpp
similarity index 68%
rename from common/unified/matrix/dense_kernels.cpp
rename to common/unified/matrix/dense_kernels.template.cpp
index 18d2fbabe6c..ed508066ba8 100644
--- a/common/unified/matrix/dense_kernels.cpp
+++ b/common/unified/matrix/dense_kernels.template.cpp
@@ -67,9 +67,6 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
         input->get_size(), input, output);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(
-    GKO_DECLARE_DENSE_COPY_KERNEL);
-
 
 template <typename ValueType>
 void fill(std::shared_ptr<const DefaultExecutor> exec,
@@ -83,8 +80,6 @@ void fill(std::shared_ptr<const DefaultExecutor> exec,
         mat->get_size(), mat, value);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_FILL_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
 void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
@@ -100,9 +95,6 @@ void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
         data.get_const_col_idxs(), data.get_const_values(), output);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
-
 
 template <typename ValueType, typename ScalarType>
 void scale(std::shared_ptr<const DefaultExecutor> exec,
@@ -125,8 +117,6 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL);
-
 
 template <typename ValueType, typename ScalarType>
 void inv_scale(std::shared_ptr<const DefaultExecutor> exec,
@@ -150,9 +140,6 @@ void inv_scale(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
-    GKO_DECLARE_DENSE_INV_SCALE_KERNEL);
-
 
 template <typename ValueType, typename ScalarType>
 void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
@@ -176,9 +163,6 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
-    GKO_DECLARE_DENSE_ADD_SCALED_KERNEL);
-
 
 template <typename ValueType, typename ScalarType>
 void sub_scaled(std::shared_ptr<const DefaultExecutor> exec,
@@ -202,9 +186,6 @@ void sub_scaled(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
-    GKO_DECLARE_DENSE_SUB_SCALED_KERNEL);
-
 
 template <typename ValueType>
 void add_scaled_diag(std::shared_ptr<const DefaultExecutor> exec,
@@ -221,8 +202,6 @@ void add_scaled_diag(std::shared_ptr<const DefaultExecutor> exec,
         x->get_size()[0], alpha->get_const_values(), x->get_const_values(), y);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
-
 
 template <typename ValueType>
 void sub_scaled_diag(std::shared_ptr<const DefaultExecutor> exec,
@@ -239,8 +218,6 @@ void sub_scaled_diag(std::shared_ptr<const DefaultExecutor> exec,
         x->get_size()[0], alpha->get_const_values(), x->get_const_values(), y);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
-
 
 template <typename ValueType>
 void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
@@ -257,8 +234,6 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
         tmp, x, y);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
-
 
 template <typename ValueType>
 void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
@@ -275,8 +250,6 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
         tmp, x, y);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
-
 
 template <typename ValueType>
 void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
@@ -292,8 +265,6 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
         result->get_values(), x->get_size(), tmp, x);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
-
 template <typename ValueType>
 void compute_norm1(std::shared_ptr<const DefaultExecutor> exec,
                    const matrix::Dense<ValueType>* x,
@@ -306,7 +277,21 @@ void compute_norm1(std::shared_ptr<const DefaultExecutor> exec,
         x->get_size(), tmp, x);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
+
+template <typename ValueType>
+void compute_mean(std::shared_ptr<const DefaultExecutor> exec,
+                  const matrix::Dense<ValueType>* x,
+                  matrix::Dense<ValueType>* result, array<char>& tmp)
+{
+    using ValueType_nc = gko::remove_complex<ValueType>;
+    run_kernel_col_reduction_cached(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto x, auto inv_total_size) {
+            return x(i, j) * inv_total_size;
+        },
+        GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(),
+        tmp, x, ValueType_nc{1.} / x->get_size()[0]);
+}
 
 
 template <typename ValueType>
@@ -325,9 +310,6 @@ void compute_max_nnz_per_row(std::shared_ptr<const DefaultExecutor> exec,
                                     source->get_size()[0]);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL);
-
 
 template <typename ValueType>
 void compute_slice_sets(std::shared_ptr<const DefaultExecutor> exec,
@@ -357,9 +339,6 @@ void compute_slice_sets(std::shared_ptr<const DefaultExecutor> exec,
     components::prefix_sum_nonnegative(exec, slice_sets, num_slices + 1);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
 void count_nonzeros_per_row(std::shared_ptr<const DefaultExecutor> exec,
@@ -374,11 +353,6 @@ void count_nonzeros_per_row(std::shared_ptr<const DefaultExecutor> exec,
         GKO_KERNEL_REDUCE_SUM(IndexType), result, 1, mtx->get_size(), mtx);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T);
-
 
 template <typename ValueType>
 void compute_squared_norm2(std::shared_ptr<const DefaultExecutor> exec,
@@ -393,9 +367,6 @@ void compute_squared_norm2(std::shared_ptr<const DefaultExecutor> exec,
         x->get_size(), tmp, x);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL);
-
 
 template <typename ValueType>
 void compute_sqrt(std::shared_ptr<const DefaultExecutor> exec,
@@ -409,12 +380,10 @@ void compute_sqrt(std::shared_ptr<const DefaultExecutor> exec,
         x->get_size(), x);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
 void symm_permute(std::shared_ptr<const DefaultExecutor> exec,
-                  const array<IndexType>* permutation_indices,
+                  const IndexType* permutation_indices,
                   const matrix::Dense<ValueType>* orig,
                   matrix::Dense<ValueType>* permuted)
 {
@@ -423,16 +392,13 @@ void symm_permute(std::shared_ptr<const DefaultExecutor> exec,
         [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) {
             permuted(row, col) = orig(perm[row], perm[col]);
         },
-        orig->get_size(), orig, *permutation_indices, permuted);
+        orig->get_size(), orig, permutation_indices, permuted);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
 void inv_symm_permute(std::shared_ptr<const DefaultExecutor> exec,
-                      const array<IndexType>* permutation_indices,
+                      const IndexType* permutation_indices,
                       const matrix::Dense<ValueType>* orig,
                       matrix::Dense<ValueType>* permuted)
 {
@@ -441,17 +407,49 @@ void inv_symm_permute(std::shared_ptr<const DefaultExecutor> exec,
         [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) {
             permuted(perm[row], perm[col]) = orig(row, col);
         },
-        orig->get_size(), orig, *permutation_indices, permuted);
+        orig->get_size(), orig, permutation_indices, permuted);
+}
+
+
+template <typename ValueType, typename IndexType>
+void nonsymm_permute(std::shared_ptr<const DefaultExecutor> exec,
+                     const IndexType* row_permutation_indices,
+                     const IndexType* column_permutation_indices,
+                     const matrix::Dense<ValueType>* orig,
+                     matrix::Dense<ValueType>* permuted)
+{
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto row, auto col, auto orig, auto row_perm,
+                      auto col_perm, auto permuted) {
+            permuted(row, col) = orig(row_perm[row], col_perm[col]);
+        },
+        orig->get_size(), orig, row_permutation_indices,
+        column_permutation_indices, permuted);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL);
+
+template <typename ValueType, typename IndexType>
+void inv_nonsymm_permute(std::shared_ptr<const DefaultExecutor> exec,
+                         const IndexType* row_permutation_indices,
+                         const IndexType* column_permutation_indices,
+                         const matrix::Dense<ValueType>* orig,
+                         matrix::Dense<ValueType>* permuted)
+{
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto row, auto col, auto orig, auto row_perm,
+                      auto col_perm, auto permuted) {
+            permuted(row_perm[row], col_perm[col]) = orig(row, col);
+        },
+        orig->get_size(), orig, row_permutation_indices,
+        column_permutation_indices, permuted);
+}
 
 
 template <typename ValueType, typename OutputType, typename IndexType>
 void row_gather(std::shared_ptr<const DefaultExecutor> exec,
-                const array<IndexType>* row_idxs,
-                const matrix::Dense<ValueType>* orig,
+                const IndexType* row_idxs, const matrix::Dense<ValueType>* orig,
                 matrix::Dense<OutputType>* row_collection)
 {
     run_kernel(
@@ -459,18 +457,14 @@ void row_gather(std::shared_ptr<const DefaultExecutor> exec,
         [] GKO_KERNEL(auto row, auto col, auto orig, auto rows, auto gathered) {
             gathered(row, col) = orig(rows[row], col);
         },
-        dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, orig, *row_idxs,
-        row_collection);
+        row_collection->get_size(), orig, row_idxs, row_collection);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(
-    GKO_DECLARE_DENSE_ROW_GATHER_KERNEL);
-
 
 template <typename ValueType, typename OutputType, typename IndexType>
 void advanced_row_gather(std::shared_ptr<const DefaultExecutor> exec,
                          const matrix::Dense<ValueType>* alpha,
-                         const array<IndexType>* row_idxs,
+                         const IndexType* row_idxs,
                          const matrix::Dense<ValueType>* orig,
                          const matrix::Dense<ValueType>* beta,
                          matrix::Dense<OutputType>* row_collection)
@@ -485,67 +479,202 @@ void advanced_row_gather(std::shared_ptr<const DefaultExecutor> exec,
                 static_cast<type>(beta[0]) *
                     static_cast<type>(gathered(row, col));
         },
-        dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]},
-        alpha->get_const_values(), orig, *row_idxs, beta->get_const_values(),
-        row_collection);
+        row_collection->get_size(), alpha->get_const_values(), orig, row_idxs,
+        beta->get_const_values(), row_collection);
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(
-    GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
-void column_permute(std::shared_ptr<const DefaultExecutor> exec,
-                    const array<IndexType>* permutation_indices,
-                    const matrix::Dense<ValueType>* orig,
-                    matrix::Dense<ValueType>* column_permuted)
+void col_permute(std::shared_ptr<const DefaultExecutor> exec,
+                 const IndexType* permutation_indices,
+                 const matrix::Dense<ValueType>* orig,
+                 matrix::Dense<ValueType>* col_permuted)
 {
     run_kernel(
         exec,
         [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) {
             permuted(row, col) = orig(row, perm[col]);
         },
-        orig->get_size(), orig, *permutation_indices, column_permuted);
+        orig->get_size(), orig, permutation_indices, col_permuted);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
-void inverse_row_permute(std::shared_ptr<const DefaultExecutor> exec,
-                         const array<IndexType>* permutation_indices,
-                         const matrix::Dense<ValueType>* orig,
-                         matrix::Dense<ValueType>* row_permuted)
+void inv_row_permute(std::shared_ptr<const DefaultExecutor> exec,
+                     const IndexType* permutation_indices,
+                     const matrix::Dense<ValueType>* orig,
+                     matrix::Dense<ValueType>* row_permuted)
 {
     run_kernel(
         exec,
         [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) {
             permuted(perm[row], col) = orig(row, col);
         },
-        orig->get_size(), orig, *permutation_indices, row_permuted);
+        orig->get_size(), orig, permutation_indices, row_permuted);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
-void inverse_column_permute(std::shared_ptr<const DefaultExecutor> exec,
-                            const array<IndexType>* permutation_indices,
-                            const matrix::Dense<ValueType>* orig,
-                            matrix::Dense<ValueType>* column_permuted)
+void inv_col_permute(std::shared_ptr<const DefaultExecutor> exec,
+                     const IndexType* permutation_indices,
+                     const matrix::Dense<ValueType>* orig,
+                     matrix::Dense<ValueType>* col_permuted)
 {
     run_kernel(
         exec,
         [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) {
             permuted(row, perm[col]) = orig(row, col);
         },
-        orig->get_size(), orig, *permutation_indices, column_permuted);
+        orig->get_size(), orig, permutation_indices, col_permuted);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL);
+
+template <typename ValueType, typename IndexType>
+void symm_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
+                        const ValueType* scale, const IndexType* perm,
+                        const matrix::Dense<ValueType>* orig,
+                        matrix::Dense<ValueType>* permuted)
+{
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig,
+                      auto permuted) {
+            const auto row = perm[i];
+            const auto col = perm[j];
+            permuted(i, j) = scale[row] * scale[col] * orig(row, col);
+        },
+        orig->get_size(), scale, perm, orig, permuted);
+}
+
+
+template <typename ValueType, typename IndexType>
+void inv_symm_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
+                            const ValueType* scale, const IndexType* perm,
+                            const matrix::Dense<ValueType>* orig,
+                            matrix::Dense<ValueType>* permuted)
+{
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig,
+                      auto permuted) {
+            const auto row = perm[i];
+            const auto col = perm[j];
+            permuted(row, col) = orig(i, j) / (scale[row] * scale[col]);
+        },
+        orig->get_size(), scale, perm, orig, permuted);
+}
+
+
+template <typename ValueType, typename IndexType>
+void nonsymm_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
+                           const ValueType* row_scale,
+                           const IndexType* row_perm,
+                           const ValueType* col_scale,
+                           const IndexType* col_perm,
+                           const matrix::Dense<ValueType>* orig,
+                           matrix::Dense<ValueType>* permuted)
+{
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto row_scale, auto row_perm,
+                      auto col_scale, auto col_perm, auto orig, auto permuted) {
+            const auto row = row_perm[i];
+            const auto col = col_perm[j];
+            permuted(i, j) = row_scale[row] * col_scale[col] * orig(row, col);
+        },
+        orig->get_size(), row_scale, row_perm, col_scale, col_perm, orig,
+        permuted);
+}
+
+
+template <typename ValueType, typename IndexType>
+void inv_nonsymm_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
+                               const ValueType* row_scale,
+                               const IndexType* row_perm,
+                               const ValueType* col_scale,
+                               const IndexType* col_perm,
+                               const matrix::Dense<ValueType>* orig,
+                               matrix::Dense<ValueType>* permuted)
+{
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto row_scale, auto row_perm,
+                      auto col_scale, auto col_perm, auto orig, auto permuted) {
+            const auto row = row_perm[i];
+            const auto col = col_perm[j];
+            permuted(row, col) = orig(i, j) / (row_scale[row] * col_scale[col]);
+        },
+        orig->get_size(), row_scale, row_perm, col_scale, col_perm, orig,
+        permuted);
+}
+
+
+template <typename ValueType, typename IndexType>
+void row_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
+                       const ValueType* scale, const IndexType* perm,
+                       const matrix::Dense<ValueType>* orig,
+                       matrix::Dense<ValueType>* permuted)
+{
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig,
+                      auto permuted) {
+            const auto row = perm[i];
+            permuted(i, j) = scale[row] * orig(row, j);
+        },
+        orig->get_size(), scale, perm, orig, permuted);
+}
+
+
+template <typename ValueType, typename IndexType>
+void inv_row_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
+                           const ValueType* scale, const IndexType* perm,
+                           const matrix::Dense<ValueType>* orig,
+                           matrix::Dense<ValueType>* permuted)
+{
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig,
+                      auto permuted) {
+            const auto row = perm[i];
+            permuted(row, j) = orig(i, j) / scale[row];
+        },
+        orig->get_size(), scale, perm, orig, permuted);
+}
+
+
+template <typename ValueType, typename IndexType>
+void col_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
+                       const ValueType* scale, const IndexType* perm,
+                       const matrix::Dense<ValueType>* orig,
+                       matrix::Dense<ValueType>* permuted)
+{
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig,
+                      auto permuted) {
+            const auto col = perm[j];
+            permuted(i, j) = scale[col] * orig(i, col);
+        },
+        orig->get_size(), scale, perm, orig, permuted);
+}
+
+
+template <typename ValueType, typename IndexType>
+void inv_col_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
+                           const ValueType* scale, const IndexType* perm,
+                           const matrix::Dense<ValueType>* orig,
+                           matrix::Dense<ValueType>* permuted)
+{
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto scale, auto perm, auto orig,
+                      auto permuted) {
+            const auto col = perm[j];
+            permuted(i, col) = orig(i, j) / scale[col];
+        },
+        orig->get_size(), scale, perm, orig, permuted);
+}
 
 
 template <typename ValueType>
@@ -559,8 +688,6 @@ void extract_diagonal(std::shared_ptr<const DefaultExecutor> exec,
         diag->get_size()[0], orig, diag->get_values());
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL);
-
 
 template <typename ValueType>
 void inplace_absolute_dense(std::shared_ptr<const DefaultExecutor> exec,
@@ -574,8 +701,6 @@ void inplace_absolute_dense(std::shared_ptr<const DefaultExecutor> exec,
         source->get_size(), source);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL);
-
 
 template <typename ValueType>
 void outplace_absolute_dense(std::shared_ptr<const DefaultExecutor> exec,
@@ -590,8 +715,6 @@ void outplace_absolute_dense(std::shared_ptr<const DefaultExecutor> exec,
         source->get_size(), source, result);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL);
-
 
 template <typename ValueType>
 void make_complex(std::shared_ptr<const DefaultExecutor> exec,
@@ -606,8 +729,6 @@ void make_complex(std::shared_ptr<const DefaultExecutor> exec,
         source->get_size(), source, result);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL);
-
 
 template <typename ValueType>
 void get_real(std::shared_ptr<const DefaultExecutor> exec,
@@ -622,8 +743,6 @@ void get_real(std::shared_ptr<const DefaultExecutor> exec,
         source->get_size(), source, result);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL);
-
 
 template <typename ValueType>
 void get_imag(std::shared_ptr<const DefaultExecutor> exec,
@@ -638,8 +757,6 @@ void get_imag(std::shared_ptr<const DefaultExecutor> exec,
         source->get_size(), source, result);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL);
-
 
 template <typename ValueType, typename ScalarType>
 void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
@@ -659,9 +776,6 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
         mtx);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(
-    GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
-
 
 }  // namespace dense
 }  // namespace GKO_DEVICE_NAMESPACE
diff --git a/common/unified/matrix/permutation_kernels.cpp b/common/unified/matrix/permutation_kernels.cpp
new file mode 100644
index 00000000000..e437737c524
--- /dev/null
+++ b/common/unified/matrix/permutation_kernels.cpp
@@ -0,0 +1,85 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/permutation_kernels.hpp"
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "common/unified/base/kernel_launch.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace permutation {
+
+
+template <typename IndexType>
+void invert(std::shared_ptr<const DefaultExecutor> exec,
+            const IndexType* permutation_indices, size_type size,
+            IndexType* inv_permutation)
+{
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto permutation, auto inv_permutation) {
+            inv_permutation[permutation[i]] = i;
+        },
+        size, permutation_indices, inv_permutation);
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PERMUTATION_INVERT_KERNEL);
+
+
+template <typename IndexType>
+void compose(std::shared_ptr<const DefaultExecutor> exec,
+             const IndexType* first_permutation,
+             const IndexType* second_permutation, size_type size,
+             IndexType* output_permutation)
+{
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto first_permutation, auto second_permutation,
+                      auto output_permutation) {
+            output_permutation[i] = first_permutation[second_permutation[i]];
+        },
+        size, first_permutation, second_permutation, output_permutation);
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PERMUTATION_COMPOSE_KERNEL);
+
+
+}  // namespace permutation
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/unified/matrix/scaled_permutation_kernels.cpp b/common/unified/matrix/scaled_permutation_kernels.cpp
new file mode 100644
index 00000000000..ff3bb55becb
--- /dev/null
+++ b/common/unified/matrix/scaled_permutation_kernels.cpp
@@ -0,0 +1,98 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/scaled_permutation_kernels.hpp"
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "common/unified/base/kernel_launch.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace scaled_permutation {
+
+
+template <typename ValueType, typename IndexType>
+void invert(std::shared_ptr<const DefaultExecutor> exec,
+            const ValueType* input_scale, const IndexType* input_permutation,
+            size_type size, ValueType* output_scale,
+            IndexType* output_permutation)
+{
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto input_scale, auto input_permutation,
+                      auto output_scale, auto output_permutation) {
+            const auto ip = input_permutation[i];
+            output_permutation[ip] = i;
+            output_scale[i] = one(input_scale[ip]) / input_scale[ip];
+        },
+        size, input_scale, input_permutation, output_scale, output_permutation);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void compose(std::shared_ptr<const DefaultExecutor> exec,
+             const ValueType* first_scale, const IndexType* first_permutation,
+             const ValueType* second_scale, const IndexType* second_permutation,
+             size_type size, ValueType* output_scale,
+             IndexType* output_permutation)
+{
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto first_scale, auto first_permutation,
+                      auto second_scale, auto second_permutation,
+                      auto output_permutation, auto output_scale) {
+            const auto second_permuted = second_permutation[i];
+            const auto combined_permuted = first_permutation[second_permuted];
+            output_permutation[i] = combined_permuted;
+            output_scale[combined_permuted] =
+                first_scale[combined_permuted] * second_scale[second_permuted];
+        },
+        size, first_scale, first_permutation, second_scale, second_permutation,
+        output_permutation, output_scale);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL);
+
+
+}  // namespace scaled_permutation
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp
index 5836486f2a6..a61b32dacbd 100644
--- a/common/unified/multigrid/pgm_kernels.cpp
+++ b/common/unified/multigrid/pgm_kernels.cpp
@@ -135,7 +135,7 @@ void map_row(std::shared_ptr<const DefaultExecutor> exec,
         exec,
         [] GKO_KERNEL(auto tidx, auto fine_row_ptrs, auto agg, auto row_idxs) {
             const auto coarse_row = agg[tidx];
-            // TODO: when it is neccessary, it can use warp per row to improve.
+            // TODO: when it is necessary, it can use warp per row to improve.
             for (auto i = fine_row_ptrs[tidx]; i < fine_row_ptrs[tidx + 1];
                  i++) {
                 row_idxs[i] = coarse_row;
@@ -232,7 +232,7 @@ void find_strongest_neighbor(
                 // all neighbor is agg, connect to the strongest agg
                 // Also, no others will use this item as their
                 // strongest_neighbor because they are already aggregated. Thus,
-                // it is determinstic behavior
+                // it is deterministic behavior
                 agg[row] = agg[strongest_agg];
             } else if (strongest_unagg != -1) {
                 // set the strongest neighbor in the unagg group
@@ -260,7 +260,7 @@ void assign_to_exist_agg(std::shared_ptr<const DefaultExecutor> exec,
 {
     const auto num = agg.get_num_elems();
     if (intermediate_agg.get_num_elems() > 0) {
-        // determinstic kernel
+        // deterministic kernel
         run_kernel(
             exec,
             [] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs,
diff --git a/contributors.txt b/contributors.txt
index 1f1259bc082..aec120d93dd 100644
--- a/contributors.txt
+++ b/contributors.txt
@@ -20,6 +20,7 @@ Kashi Aditya <aditya.kashi@kit.edu> Karlsruhe Institute of Technology
 Koch Marcel <marcel.koch@kit.edu> Karlsruhe Institute of Technology
 Maier Matthias <matthias@43-1.org> Texas A&M University
 Nayak Pratik <pratik.nayak@kit.edu> Karlsruhe Institute of Technology
+Nguyen Phuong <phuong.nguyen@icl.utk.edu> University of Tennessee, Knoxville
 Olenik Gregor <go@hpsim.de> HPSim
 Ribizel Tobias <mail@upsj.de> Karlsruhe Institute of Technology
 Riemer Lukas <lksriemer@gmail.com> Karlsruhe Institute of Technology
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 2f9643115c9..ce4a52037b9 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -4,12 +4,14 @@ add_library(ginkgo "")
 target_sources(ginkgo
     PRIVATE
     base/array.cpp
+    base/batch_multi_vector.cpp
     base/combination.cpp
     base/composition.cpp
     base/dense_cache.cpp
     base/device_matrix_data.cpp
     base/executor.cpp
     base/index_set.cpp
+    base/memory.cpp
     base/mpi.cpp
     base/mtx_io.cpp
     base/perturbation.cpp
@@ -27,6 +29,7 @@ target_sources(ginkgo
     factorization/par_ilu.cpp
     factorization/par_ilut.cpp
     factorization/symbolic.cpp
+    log/batch_logger.cpp
     log/convergence.cpp
     log/logger.cpp
     log/performance_hint.cpp
@@ -37,6 +40,9 @@ target_sources(ginkgo
     log/vtune.cpp
     log/record.cpp
     log/stream.cpp
+    matrix/batch_dense.cpp
+    matrix/batch_ell.cpp
+    matrix/batch_identity.cpp
     matrix/coo.cpp
     matrix/csr.cpp
     matrix/dense.cpp
@@ -47,16 +53,19 @@ target_sources(ginkgo
     matrix/hybrid.cpp
     matrix/identity.cpp
     matrix/permutation.cpp
+    matrix/row_gatherer.cpp
+    matrix/scaled_permutation.cpp
     matrix/sellp.cpp
     matrix/sparsity_csr.cpp
-    matrix/row_gatherer.cpp
     multigrid/pgm.cpp
     multigrid/fixed_coarsening.cpp
     preconditioner/isai.cpp
     preconditioner/jacobi.cpp
     reorder/amd.cpp
+    reorder/mc64.cpp
     reorder/rcm.cpp
     reorder/scaled_reordered.cpp
+    solver/batch_bicgstab.cpp
     solver/bicg.cpp
     solver/bicgstab.cpp
     solver/cb_gmres.cpp
@@ -93,6 +102,7 @@ if(GINKGO_BUILD_MPI)
         PRIVATE
         mpi/exception.cpp
         distributed/matrix.cpp
+        distributed/partition_helpers.cpp
         distributed/vector.cpp
         distributed/preconditioner/schwarz.cpp)
 endif()
@@ -111,10 +121,7 @@ target_link_libraries(ginkgo
 set(GKO_RPATH_ADDITIONS "")
 
 if(GINKGO_HAVE_PAPI_SDE)
-    target_link_libraries(ginkgo PUBLIC PAPI::PAPI)
-    list(GET PAPI_LIBRARIES 0 PAPI_FIRST_LIB)
-    get_filename_component(GKO_PAPI_LIBDIR "${PAPI_FIRST_LIB}" DIRECTORY)
-    list(APPEND GKO_RPATH_ADDITIONS "${GKO_PAPI_LIBDIR}")
+    target_link_libraries(ginkgo PUBLIC PAPI::PAPI_SDE)
 endif()
 
 if(GINKGO_HAVE_TAU)
diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp
new file mode 100644
index 00000000000..6dcf8dd90b5
--- /dev/null
+++ b/core/base/batch_multi_vector.cpp
@@ -0,0 +1,299 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+
+
+#include <algorithm>
+#include <type_traits>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/matrix_data.hpp>
+#include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+
+#include "core/base/batch_multi_vector_kernels.hpp"
+
+
+namespace gko {
+namespace batch {
+namespace multi_vector {
+namespace {
+
+
+GKO_REGISTER_OPERATION(scale, batch_multi_vector::scale);
+GKO_REGISTER_OPERATION(add_scaled, batch_multi_vector::add_scaled);
+GKO_REGISTER_OPERATION(compute_dot, batch_multi_vector::compute_dot);
+GKO_REGISTER_OPERATION(compute_conj_dot, batch_multi_vector::compute_conj_dot);
+GKO_REGISTER_OPERATION(compute_norm2, batch_multi_vector::compute_norm2);
+GKO_REGISTER_OPERATION(copy, batch_multi_vector::copy);
+
+
+}  // namespace
+}  // namespace multi_vector
+
+
+namespace detail {
+
+
+template <typename ValueType>
+batch_dim<2> compute_batch_size(
+    const std::vector<gko::matrix::Dense<ValueType>*>& matrices)
+{
+    auto common_size = matrices[0]->get_size();
+    for (size_type i = 1; i < matrices.size(); ++i) {
+        GKO_ASSERT_EQUAL_DIMENSIONS(common_size, matrices[i]->get_size());
+    }
+    return batch_dim<2>{matrices.size(), common_size};
+}
+
+
+}  // namespace detail
+
+
+template <typename ValueType>
+std::unique_ptr<gko::matrix::Dense<ValueType>>
+MultiVector<ValueType>::create_view_for_item(size_type item_id)
+{
+    auto exec = this->get_executor();
+    auto num_rows = this->get_common_size()[0];
+    auto stride = this->get_common_size()[1];
+    auto mat = unbatch_type::create(
+        exec, this->get_common_size(),
+        make_array_view(exec, num_rows * stride,
+                        this->get_values_for_item(item_id)),
+        stride);
+    return mat;
+}
+
+
+template <typename ValueType>
+std::unique_ptr<const gko::matrix::Dense<ValueType>>
+MultiVector<ValueType>::create_const_view_for_item(size_type item_id) const
+{
+    auto exec = this->get_executor();
+    auto num_rows = this->get_common_size()[0];
+    auto stride = this->get_common_size()[1];
+    auto mat = unbatch_type::create_const(
+        exec, this->get_common_size(),
+        make_const_array_view(exec, num_rows * stride,
+                              this->get_const_values_for_item(item_id)),
+        stride);
+    return mat;
+}
+
+
+template <typename ValueType>
+MultiVector<ValueType>::MultiVector(std::shared_ptr<const Executor> exec,
+                                    const batch_dim<2>& size)
+    : EnablePolymorphicObject<MultiVector<ValueType>>(exec),
+      batch_size_(size),
+      values_(exec, compute_num_elems(size))
+{}
+
+
+template <typename ValueType>
+std::unique_ptr<MultiVector<ValueType>>
+MultiVector<ValueType>::create_with_config_of(
+    ptr_param<const MultiVector> other)
+{
+    // De-referencing `other` before calling the functions (instead of
+    // using operator `->`) is currently required to be compatible with
+    // CUDA 10.1.
+    // Otherwise, it results in a compile error.
+    return (*other).create_with_same_config();
+}
+
+
+template <typename ValueType>
+std::unique_ptr<const MultiVector<ValueType>>
+MultiVector<ValueType>::create_const(
+    std::shared_ptr<const Executor> exec, const batch_dim<2>& sizes,
+    gko::detail::const_array_view<ValueType>&& values)
+{
+    // cast const-ness away, but return a const object afterwards,
+    // so we can ensure that no modifications take place.
+    return std::unique_ptr<const MultiVector>(new MultiVector{
+        exec, sizes, gko::detail::array_const_cast(std::move(values))});
+}
+
+
+template <typename ValueType>
+void MultiVector<ValueType>::fill(ValueType value)
+{
+    GKO_ASSERT(this->values_.get_num_elems() > 0);
+    this->values_.fill(value);
+}
+
+
+template <typename ValueType>
+void MultiVector<ValueType>::set_size(const batch_dim<2>& value) noexcept
+{
+    batch_size_ = value;
+}
+
+
+template <typename ValueType>
+std::unique_ptr<MultiVector<ValueType>>
+MultiVector<ValueType>::create_with_same_config() const
+{
+    return MultiVector<ValueType>::create(this->get_executor(),
+                                          this->get_size());
+}
+
+
+template <typename ValueType>
+void MultiVector<ValueType>::scale(
+    ptr_param<const MultiVector<ValueType>> alpha)
+{
+    GKO_ASSERT_EQ(alpha->get_num_batch_items(), this->get_num_batch_items());
+    GKO_ASSERT_EQUAL_ROWS(alpha->get_common_size(), dim<2>(1, 1));
+    if (alpha->get_common_size()[1] != 1) {
+        // different alpha for each column
+        GKO_ASSERT_EQUAL_COLS(this->get_common_size(),
+                              alpha->get_common_size());
+    }
+    auto exec = this->get_executor();
+    exec->run(multi_vector::make_scale(make_temporary_clone(exec, alpha).get(),
+                                       this));
+}
+
+
+template <typename ValueType>
+void MultiVector<ValueType>::add_scaled(
+    ptr_param<const MultiVector<ValueType>> alpha,
+    ptr_param<const MultiVector<ValueType>> b)
+{
+    GKO_ASSERT_EQ(alpha->get_num_batch_items(), this->get_num_batch_items());
+    GKO_ASSERT_EQUAL_ROWS(alpha->get_common_size(), dim<2>(1, 1));
+    if (alpha->get_common_size()[1] != 1) {
+        // different alpha for each column
+        GKO_ASSERT_EQUAL_COLS(this->get_common_size(),
+                              alpha->get_common_size());
+    }
+    GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items());
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size());
+
+    auto exec = this->get_executor();
+    exec->run(multi_vector::make_add_scaled(
+        make_temporary_clone(exec, alpha).get(),
+        make_temporary_clone(exec, b).get(), this));
+}
+
+
+inline const batch_dim<2> get_col_sizes(const batch_dim<2>& sizes)
+{
+    return batch_dim<2>(sizes.get_num_batch_items(),
+                        dim<2>(1, sizes.get_common_size()[1]));
+}
+
+
+template <typename ValueType>
+void MultiVector<ValueType>::compute_conj_dot(
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<MultiVector<ValueType>> result) const
+{
+    GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items());
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size());
+    GKO_ASSERT_EQ(this->get_num_batch_items(), result->get_num_batch_items());
+    GKO_ASSERT_EQUAL_DIMENSIONS(
+        result->get_common_size(),
+        get_col_sizes(this->get_size()).get_common_size());
+    auto exec = this->get_executor();
+    exec->run(multi_vector::make_compute_conj_dot(
+        this, make_temporary_clone(exec, b).get(),
+        make_temporary_output_clone(exec, result).get()));
+}
+
+
+template <typename ValueType>
+void MultiVector<ValueType>::compute_dot(
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<MultiVector<ValueType>> result) const
+{
+    GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items());
+    GKO_ASSERT_EQUAL_DIMENSIONS(this->get_common_size(), b->get_common_size());
+    GKO_ASSERT_EQ(this->get_num_batch_items(), result->get_num_batch_items());
+    GKO_ASSERT_EQUAL_DIMENSIONS(
+        result->get_common_size(),
+        get_col_sizes(this->get_size()).get_common_size());
+    auto exec = this->get_executor();
+    exec->run(multi_vector::make_compute_dot(
+        this, make_temporary_clone(exec, b).get(),
+        make_temporary_output_clone(exec, result).get()));
+}
+
+
+template <typename ValueType>
+void MultiVector<ValueType>::compute_norm2(
+    ptr_param<MultiVector<remove_complex<ValueType>>> result) const
+{
+    GKO_ASSERT_EQ(this->get_num_batch_items(), result->get_num_batch_items());
+    GKO_ASSERT_EQUAL_DIMENSIONS(
+        result->get_common_size(),
+        get_col_sizes(this->get_size()).get_common_size());
+
+    auto exec = this->get_executor();
+    exec->run(multi_vector::make_compute_norm2(
+        this, make_temporary_output_clone(exec, result).get()));
+}
+
+
+template <typename ValueType>
+void MultiVector<ValueType>::convert_to(
+    MultiVector<next_precision<ValueType>>* result) const
+{
+    result->values_ = this->values_;
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType>
+void MultiVector<ValueType>::move_to(
+    MultiVector<next_precision<ValueType>>* result)
+{
+    this->convert_to(result);
+}
+
+
+#define GKO_DECLARE_BATCH_MULTI_VECTOR(_type) class MultiVector<_type>
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR);
+
+
+}  // namespace batch
+}  // namespace gko
diff --git a/core/base/batch_multi_vector_kernels.hpp b/core/base/batch_multi_vector_kernels.hpp
new file mode 100644
index 00000000000..5a39567f470
--- /dev/null
+++ b/core/base/batch_multi_vector_kernels.hpp
@@ -0,0 +1,111 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_
+#define GKO_CORE_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+#include "core/base/kernel_declaration.hpp"
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL(_type)  \
+    void scale(std::shared_ptr<const DefaultExecutor> exec, \
+               const batch::MultiVector<_type>* alpha,      \
+               batch::MultiVector<_type>* x)
+
+#define GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL(_type)  \
+    void add_scaled(std::shared_ptr<const DefaultExecutor> exec, \
+                    const batch::MultiVector<_type>* alpha,      \
+                    const batch::MultiVector<_type>* x,          \
+                    batch::MultiVector<_type>* y)
+
+#define GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL(_type)  \
+    void compute_dot(std::shared_ptr<const DefaultExecutor> exec, \
+                     const batch::MultiVector<_type>* x,          \
+                     const batch::MultiVector<_type>* y,          \
+                     batch::MultiVector<_type>* result)
+
+#define GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL(_type)  \
+    void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec, \
+                          const batch::MultiVector<_type>* x,          \
+                          const batch::MultiVector<_type>* y,          \
+                          batch::MultiVector<_type>* result)
+
+#define GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL(_type)  \
+    void compute_norm2(std::shared_ptr<const DefaultExecutor> exec, \
+                       const batch::MultiVector<_type>* x,          \
+                       batch::MultiVector<remove_complex<_type>>* result)
+
+#define GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL(_type)  \
+    void copy(std::shared_ptr<const DefaultExecutor> exec, \
+              const batch::MultiVector<_type>* x,          \
+              batch::MultiVector<_type>* result)
+
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                   \
+    template <typename ValueType>                                      \
+    GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL(ValueType);            \
+    template <typename ValueType>                                      \
+    GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL(ValueType);       \
+    template <typename ValueType>                                      \
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL(ValueType);      \
+    template <typename ValueType>                                      \
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL(ValueType); \
+    template <typename ValueType>                                      \
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL(ValueType);    \
+    template <typename ValueType>                                      \
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL(ValueType)
+
+
+GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(batch_multi_vector,
+                                        GKO_DECLARE_ALL_AS_TEMPLATES);
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CORE_BASE_BATCH_MULTI_VECTOR_KERNELS_HPP_
diff --git a/core/base/batch_struct.hpp b/core/base/batch_struct.hpp
new file mode 100644
index 00000000000..041630af66e
--- /dev/null
+++ b/core/base/batch_struct.hpp
@@ -0,0 +1,142 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_BASE_BATCH_STRUCT_HPP_
+#define GKO_CORE_BASE_BATCH_STRUCT_HPP_
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace batch {
+namespace multi_vector {
+
+
+/**
+ * Encapsulates one matrix from a batch of multi-vectors.
+ */
+template <typename ValueType>
+struct batch_item {
+    using value_type = ValueType;
+    ValueType* values;
+    int32 stride;
+    int32 num_rows;
+    int32 num_rhs;
+};
+
+
+/**
+ * A 'simple' structure to store a global uniform batch of multi-vectors.
+ */
+template <typename ValueType>
+struct uniform_batch {
+    using value_type = ValueType;
+    using entry_type = batch_item<ValueType>;
+
+    ValueType* values;
+    size_type num_batch_items;
+    int32 stride;
+    int32 num_rows;
+    int32 num_rhs;
+
+    inline size_type get_single_item_num_nnz() const
+    {
+        return static_cast<size_type>(stride * num_rows);
+    }
+};
+
+
+template <typename ValueType>
+GKO_ATTRIBUTES GKO_INLINE ValueType* batch_item_ptr(
+    ValueType* const batch_start, const size_type stride, const int num_rows,
+    const size_type batch_idx)
+{
+    return batch_start + batch_idx * stride * num_rows;
+}
+
+
+}  // namespace multi_vector
+
+
+template <typename ValueType>
+GKO_ATTRIBUTES GKO_INLINE multi_vector::batch_item<const ValueType> to_const(
+    const multi_vector::batch_item<ValueType>& b)
+{
+    return {b.values, b.stride, b.num_rows, b.num_rhs};
+}
+
+
+template <typename ValueType>
+GKO_ATTRIBUTES GKO_INLINE multi_vector::uniform_batch<const ValueType> to_const(
+    const multi_vector::uniform_batch<ValueType>& ub)
+{
+    return {ub.values, ub.num_batch_items, ub.stride, ub.num_rows, ub.num_rhs};
+}
+
+
+/**
+ * Extract one object (matrix, vector etc.) from a batch of objects
+ *
+ * This overload is for batch multi-vectors.
+ * These overloads are intended to be called from within a kernel.
+ *
+ * @param batch  The batch of objects to extract from
+ * @param batch_idx  The position of the desired object in the batch
+ */
+template <typename ValueType>
+GKO_ATTRIBUTES GKO_INLINE multi_vector::batch_item<ValueType>
+extract_batch_item(const multi_vector::uniform_batch<ValueType>& batch,
+                   const size_type batch_idx)
+{
+    return {batch.values + batch_idx * batch.stride * batch.num_rows,
+            batch.stride, batch.num_rows, batch.num_rhs};
+}
+
+template <typename ValueType>
+GKO_ATTRIBUTES GKO_INLINE multi_vector::batch_item<ValueType>
+extract_batch_item(ValueType* const batch_values, const int32 stride,
+                   const int32 num_rows, const int32 num_rhs,
+                   const size_type batch_idx)
+{
+    return {batch_values + batch_idx * stride * num_rows, stride, num_rows,
+            num_rhs};
+}
+
+
+}  // namespace batch
+}  // namespace gko
+
+
+#endif  // GKO_CORE_BASE_BATCH_STRUCT_HPP_
diff --git a/core/base/batch_utilities.hpp b/core/base/batch_utilities.hpp
new file mode 100644
index 00000000000..cc92d294173
--- /dev/null
+++ b/core/base/batch_utilities.hpp
@@ -0,0 +1,451 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_BASE_BATCH_UTILITIES_HPP_
+#define GKO_CORE_BASE_BATCH_UTILITIES_HPP_
+
+
+#include <algorithm>
+#include <type_traits>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/matrix_data.hpp>
+#include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/base/utils_helper.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+
+namespace gko {
+namespace batch {
+
+
+/**
+ * Duplicate a given input batch object.
+ */
+template <typename OutputType, typename... TArgs>
+std::unique_ptr<OutputType> duplicate(std::shared_ptr<const Executor> exec,
+                                      size_type num_duplications,
+                                      const OutputType* input,
+                                      TArgs&&... create_args)
+{
+    auto num_batch_items = input->get_num_batch_items();
+    auto tmp =
+        OutputType::create(exec,
+                           batch_dim<2>(num_batch_items * num_duplications,
+                                        input->get_common_size()),
+                           std::forward<TArgs>(create_args)...);
+
+    for (size_type i = 0; i < num_duplications; ++i) {
+        for (size_type b = 0; b < num_batch_items; ++b) {
+            tmp->create_view_for_item(i * num_batch_items + b)
+                ->copy_from(input->create_const_view_for_item(b).get());
+        }
+    }
+
+    return std::move(tmp);
+}
+
+
+/**
+ * Duplicate a monolithic matrix and create a batch object.
+ */
+template <typename OutputType, typename... TArgs>
+std::unique_ptr<OutputType> create_from_item(
+    std::shared_ptr<const Executor> exec, const size_type num_duplications,
+    const typename OutputType::unbatch_type* input, TArgs&&... create_args)
+{
+    auto num_batch_items = num_duplications;
+    auto tmp = OutputType::create(
+        exec, batch_dim<2>(num_batch_items, input->get_size()),
+        std::forward<TArgs>(create_args)...);
+
+    for (size_type b = 0; b < num_batch_items; ++b) {
+        tmp->create_view_for_item(b)->copy_from(input);
+    }
+
+    return std::move(tmp);
+}
+
+
+/**
+ * Create a batch object from a vector of monolithic object that share the same
+ * sparsity pattern.
+ *
+ * @note The sparsity of the elements in the input vector of matrices needs to
+ * be the same. TODO: Check for same sparsity among the different input items
+ */
+template <typename OutputType, typename... TArgs>
+std::unique_ptr<OutputType> create_from_item(
+    std::shared_ptr<const Executor> exec,
+    const std::vector<typename OutputType::unbatch_type*>& input,
+    TArgs&&... create_args)
+{
+    auto num_batch_items = input.size();
+    auto tmp = OutputType::create(
+        exec, batch_dim<2>(num_batch_items, input[0]->get_size()),
+        std::forward<TArgs>(create_args)...);
+
+    for (size_type b = 0; b < num_batch_items; ++b) {
+        tmp->create_view_for_item(b)->copy_from(input[b]);
+    }
+
+    return std::move(tmp);
+}
+
+
+/**
+ * Unbatch a batched object into a vector of items of its unbatch_type.
+ */
+template <typename InputType>
+auto unbatch(const InputType* batch_object)
+{
+    auto unbatched_mats =
+        std::vector<std::unique_ptr<typename InputType::unbatch_type>>{};
+    for (size_type b = 0; b < batch_object->get_num_batch_items(); ++b) {
+        unbatched_mats.emplace_back(
+            batch_object->create_const_view_for_item(b)->clone());
+    }
+    return unbatched_mats;
+}
+
+
+namespace detail {
+
+
+template <typename ValueType, typename IndexType>
+void assert_same_sparsity_in_batched_data(
+    const std::vector<gko::matrix_data<ValueType, IndexType>>& data)
+{
+    if (data.empty()) {
+        return;
+    }
+    auto num_nnz = data.at(0).nonzeros.size();
+    auto base_data = data.at(0);
+    base_data.ensure_row_major_order();
+    for (int b = 1; b < data.size(); ++b) {
+        if (data[b].nonzeros.size() != num_nnz) {
+            GKO_NOT_IMPLEMENTED;
+        }
+        auto temp_data = data.at(b);
+        temp_data.ensure_row_major_order();
+        for (int nnz = 0; nnz < num_nnz; ++nnz) {
+            if (temp_data.nonzeros.at(nnz).row !=
+                    base_data.nonzeros.at(nnz).row ||
+                temp_data.nonzeros.at(nnz).column !=
+                    base_data.nonzeros.at(nnz).column) {
+                GKO_NOT_IMPLEMENTED;
+            }
+        }
+    }
+}
+
+
+}  // namespace detail
+
+
+/**
+ * Create a batch object from a vector of gko::matrix_data objects. Each item of
+ * the vector needs to store the same sparsity pattern.
+ */
+template <typename ValueType, typename IndexType, typename OutputType,
+          typename... TArgs>
+std::unique_ptr<OutputType> read(
+    std::shared_ptr<const Executor> exec,
+    const std::vector<gko::matrix_data<ValueType, IndexType>>& data,
+    TArgs&&... create_args)
+{
+    auto num_batch_items = data.size();
+    // Throw if all the items in the batch dont have same sparsity.
+    if (!std::is_same<OutputType,
+                      gko::batch::matrix::Dense<ValueType>>::value &&
+        !std::is_same<OutputType, gko::batch::MultiVector<ValueType>>::value) {
+        detail::assert_same_sparsity_in_batched_data(data);
+    }
+    auto tmp =
+        OutputType::create(exec, batch_dim<2>(num_batch_items, data.at(0).size),
+                           std::forward<TArgs>(create_args)...);
+
+    for (size_type b = 0; b < num_batch_items; ++b) {
+        if (data.at(b).size != data.at(0).size) {
+            GKO_INVALID_STATE("Incorrect data passed in");
+        }
+        tmp->create_view_for_item(b)->read(data[b]);
+    }
+
+    return std::move(tmp);
+}
+
+
+/**
+ * Write a vector of matrix data objects from an input batch object.
+ */
+template <typename ValueType, typename IndexType, typename OutputType>
+std::vector<gko::matrix_data<ValueType, IndexType>> write(
+    const OutputType* mvec)
+{
+    auto data = std::vector<gko::matrix_data<ValueType, IndexType>>(
+        mvec->get_num_batch_items());
+
+    for (size_type b = 0; b < mvec->get_num_batch_items(); ++b) {
+        data[b] = {mvec->get_common_size(), {}};
+        mvec->create_const_view_for_item(b)->write(data[b]);
+    }
+
+    return data;
+}
+
+
+/**
+ * Creates and initializes a batch of the specified Matrix type from a series of
+ * single column-vectors.
+ *
+ * @tparam Matrix  matrix type to initialize (It has to implement the
+ *                 read<Matrix> function)
+ * @tparam TArgs  argument types for Matrix::create method
+ *                (not including the implied Executor as the first argument)
+ *
+ * @param vals  values used to initialize the batch vector
+ * @param exec  Executor associated to the vector
+ * @param create_args  additional arguments passed to Matrix::create, not
+ *                     including the Executor, which is passed as the first
+ *                     argument
+ *
+ * @ingroup mat_formats
+ */
+template <typename Matrix, typename... TArgs>
+std::unique_ptr<Matrix> initialize(
+    std::initializer_list<std::initializer_list<typename Matrix::value_type>>
+        vals,
+    std::shared_ptr<const Executor> exec, TArgs&&... create_args)
+{
+    using value_type = typename Matrix::value_type;
+    using index_type = typename Matrix::index_type;
+    using mat_data = gko::matrix_data<value_type, index_type>;
+    size_type num_batch_items = vals.size();
+    GKO_THROW_IF_INVALID(num_batch_items > 0, "Input data is empty");
+    auto vals_begin = begin(vals);
+    size_type common_num_rows = vals_begin ? vals_begin->size() : 0;
+    auto common_size = dim<2>(common_num_rows, 1);
+    for (auto& val : vals) {
+        GKO_ASSERT_EQ(common_num_rows, val.size());
+    }
+    auto b_size = batch_dim<2>(num_batch_items, common_size);
+    size_type batch = 0;
+    std::vector<mat_data> input_mat_data(num_batch_items, common_size);
+    for (const auto& b : vals) {
+        input_mat_data[batch].nonzeros.reserve(b.size());
+        size_type idx = 0;
+        for (const auto& elem : b) {
+            if (elem != zero<value_type>()) {
+                input_mat_data[batch].nonzeros.emplace_back(idx, 0, elem);
+            }
+            ++idx;
+        }
+        ++batch;
+    }
+    return read<value_type, index_type, Matrix>(
+        exec, input_mat_data, std::forward<TArgs>(create_args)...);
+}
+
+
+/**
+ * Creates and initializes a batch of matrices.
+ *
+ * @tparam Matrix  matrix type to initialize (It has to implement the
+ *                 read<Matrix> function)
+ * @tparam TArgs  argument types for Matrix::create method
+ *                (not including the implied Executor as the first argument)
+ *
+ * @param vals  values used to initialize the matrix
+ * @param exec  Executor associated with the matrix
+ * @param create_args  additional arguments passed to Matrix::create, not
+ *                     including the Executor, which is passed as the first
+ *                     argument
+ *
+ * @ingroup mat_formats
+ */
+template <typename Matrix, typename... TArgs>
+std::unique_ptr<Matrix> initialize(
+    std::initializer_list<std::initializer_list<
+        std::initializer_list<typename Matrix::value_type>>>
+        vals,
+    std::shared_ptr<const Executor> exec, TArgs&&... create_args)
+{
+    using value_type = typename Matrix::value_type;
+    using index_type = typename Matrix::index_type;
+    using mat_data = gko::matrix_data<value_type, index_type>;
+    size_type num_batch_items = vals.size();
+    GKO_THROW_IF_INVALID(num_batch_items > 0, "Input data is empty");
+    auto vals_begin = begin(vals);
+    size_type common_num_rows = vals_begin ? vals_begin->size() : 0;
+    size_type common_num_cols =
+        vals_begin->begin() ? vals_begin->begin()->size() : 0;
+    auto common_size = dim<2>(common_num_rows, common_num_cols);
+    for (const auto& b : vals) {
+        auto num_rows = b.size();
+        auto num_cols = begin(b)->size();
+        auto b_size = dim<2>(num_rows, num_cols);
+        GKO_ASSERT_EQUAL_DIMENSIONS(b_size, common_size);
+    }
+
+    auto b_size = batch_dim<2>(num_batch_items, common_size);
+    size_type batch = 0;
+    std::vector<mat_data> input_mat_data(num_batch_items, common_size);
+    for (const auto& b : vals) {
+        size_type ridx = 0;
+        for (const auto& row : b) {
+            size_type cidx = 0;
+            for (const auto& elem : row) {
+                if (elem != zero<value_type>()) {
+                    input_mat_data[batch].nonzeros.emplace_back(ridx, cidx,
+                                                                elem);
+                }
+                ++cidx;
+            }
+            ++ridx;
+        }
+        ++batch;
+    }
+    return read<value_type, index_type, Matrix>(
+        exec, input_mat_data, std::forward<TArgs>(create_args)...);
+}
+
+
+/**
+ * Creates and initializes a batch of specified Matrix type with a single
+ * column-vector by making copies of the single input column vector.
+ *
+ * @tparam Matrix  matrix type to initialize (It has to implement the
+ *                 read<Matrix> function)
+ * @tparam TArgs  argument types for Matrix::create method
+ *                (not including the implied Executor as the first argument)
+ *
+ * @param num_batch_items  The number of times the input vector is to be
+ *                         duplicated
+ * @param vals  values used to initialize each vector in the temp. batch
+ * @param exec  Executor associated with the matrix
+ * @param create_args  additional arguments passed to Matrix::create, not
+ *                     including the Executor, which is passed as the first
+ *                     argument
+ *
+ * @ingroup mat_formats
+ */
+template <typename Matrix, typename... TArgs>
+std::unique_ptr<Matrix> initialize(
+    const size_type num_batch_items,
+    std::initializer_list<typename Matrix::value_type> vals,
+    std::shared_ptr<const Executor> exec, TArgs&&... create_args)
+{
+    using value_type = typename Matrix::value_type;
+    using index_type = typename Matrix::index_type;
+    using mat_data = gko::matrix_data<value_type, index_type>;
+    GKO_THROW_IF_INVALID(num_batch_items > 0 && vals.size() > 0,
+                         "Input data is empty");
+    auto num_rows = begin(vals) ? vals.size() : 0;
+    auto common_size = dim<2>(num_rows, 1);
+    auto b_size = batch_dim<2>(num_batch_items, common_size);
+    mat_data single_mat_data(common_size);
+    single_mat_data.nonzeros.reserve(num_rows);
+    size_type idx = 0;
+    for (const auto& elem : vals) {
+        if (elem != zero<value_type>()) {
+            single_mat_data.nonzeros.emplace_back(idx, 0, elem);
+        }
+        ++idx;
+    }
+    std::vector<mat_data> input_mat_data(num_batch_items, single_mat_data);
+    return read<value_type, index_type, Matrix>(
+        exec, input_mat_data, std::forward<TArgs>(create_args)...);
+}
+
+
+/**
+ * Creates and initializes a matrix from copies of a given matrix.
+ *
+ * @tparam Matrix  matrix type to initialize (It has to implement the
+ *                 read<Matrix> function)
+ * @tparam TArgs  argument types for Matrix::create method
+ *                (not including the implied Executor as the first argument)
+ *
+ * @param num_batch_items  The number of times the input matrix is duplicated
+ * @param vals  values used to initialize each matrix in the temp. batch
+ * @param exec  Executor associated to the matrix
+ * @param create_args  additional arguments passed to Matrix::create, not
+ *                     including the Executor, which is passed as the first
+ *                     argument
+ *
+ * @ingroup mat_formats
+ */
+template <typename Matrix, typename... TArgs>
+std::unique_ptr<Matrix> initialize(
+    const size_type num_batch_items,
+    std::initializer_list<std::initializer_list<typename Matrix::value_type>>
+        vals,
+    std::shared_ptr<const Executor> exec, TArgs&&... create_args)
+{
+    using value_type = typename Matrix::value_type;
+    using index_type = typename Matrix::index_type;
+    using mat_data = gko::matrix_data<value_type, index_type>;
+    GKO_THROW_IF_INVALID(num_batch_items > 0 && vals.size() > 0,
+                         "Input data is empty");
+    auto common_size = dim<2>(begin(vals) ? vals.size() : 0,
+                              begin(vals) ? begin(vals)->size() : 0);
+    batch_dim<2> b_size(num_batch_items, common_size);
+    mat_data single_mat_data(common_size);
+    size_type ridx = 0;
+    for (const auto& row : vals) {
+        size_type cidx = 0;
+        for (const auto& elem : row) {
+            if (elem != zero<value_type>()) {
+                single_mat_data.nonzeros.emplace_back(ridx, cidx, elem);
+            }
+            ++cidx;
+        }
+        ++ridx;
+    }
+    std::vector<mat_data> input_mat_data(num_batch_items, single_mat_data);
+    return read<value_type, index_type, Matrix>(
+        exec, input_mat_data, std::forward<TArgs>(create_args)...);
+}
+
+
+}  // namespace batch
+}  // namespace gko
+
+
+#endif  // GKO_CORE_BASE_BATCH_UTILITIES_HPP_
diff --git a/core/base/copy_assignable.hpp b/core/base/copy_assignable.hpp
new file mode 100644
index 00000000000..7f5e4125e10
--- /dev/null
+++ b/core/base/copy_assignable.hpp
@@ -0,0 +1,130 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_BASE_COPY_ASSIGNABLE_HPP_
+#define GKO_CORE_BASE_COPY_ASSIGNABLE_HPP_
+
+
+#include <vector>
+
+
+namespace gko {
+namespace detail {
+
+
+template <typename T, typename = void>
+class copy_assignable;
+
+
+/**
+ * Helper class to make a type copy assignable.
+ *
+ * This class wraps an object of a type that has a copy constructor, but not
+ * a copy assignment. This is most often the case for lambdas. The wrapped
+ * object can then be copy assigned, by relying on the copy constructor.
+ *
+ * @tparam T  type with a copy constructor
+ */
+template <typename T>
+class copy_assignable<
+    T, typename std::enable_if<std::is_copy_constructible<T>::value>::type> {
+public:
+    copy_assignable() = default;
+
+    copy_assignable(const copy_assignable& other)
+    {
+        if (this != &other) {
+            *this = other;
+        }
+    }
+
+    copy_assignable(copy_assignable&& other) noexcept
+    {
+        if (this != &other) {
+            *this = std::move(other);
+        }
+    }
+
+    copy_assignable(const T& obj) : obj_{new (buf)(T)(obj)} {}
+
+    copy_assignable(T&& obj) : obj_{new (buf)(T)(std::move(obj))} {}
+
+    copy_assignable& operator=(const copy_assignable& other)
+    {
+        if (this != &other) {
+            if (obj_) {
+                obj_->~T();
+            }
+            obj_ = new (buf)(T)(*other.obj_);
+        }
+        return *this;
+    }
+
+    copy_assignable& operator=(copy_assignable&& other) noexcept
+    {
+        if (this != &other) {
+            if (obj_) {
+                obj_->~T();
+            }
+            obj_ = new (buf)(T)(std::move(*other.obj_));
+        }
+        return *this;
+    }
+
+    ~copy_assignable()
+    {
+        if (obj_) {
+            obj_->~T();
+        }
+    }
+
+    template <typename... Args>
+    decltype(auto) operator()(Args&&... args) const
+    {
+        return (*obj_)(std::forward<Args>(args)...);
+    }
+
+    T const& get() const { return *obj_; }
+
+    T& get() { return *obj_; }
+
+private:
+    //!< Store wrapped object on the stack, should use std::optional in c++17
+    T* obj_{};
+    alignas(T) unsigned char buf[sizeof(T)];
+};
+
+
+}  // namespace detail
+}  // namespace gko
+
+#endif  // GKO_CORE_BASE_COPY_ASSIGNABLE_HPP_
diff --git a/core/base/dispatch_helper.hpp b/core/base/dispatch_helper.hpp
index 155d5ef6c23..7ca04107575 100644
--- a/core/base/dispatch_helper.hpp
+++ b/core/base/dispatch_helper.hpp
@@ -54,16 +54,16 @@ namespace gko {
  * @note this is the end case
  */
 template <typename T, typename Func, typename... Args>
-void run(T, Func, Args...)
+void run(T obj, Func, Args...)
 {
-    GKO_NOT_IMPLEMENTED;
+    GKO_NOT_SUPPORTED(obj);
 }
 
 /**
  * run uses template to go through the list and select the valid
  * template and run it.
  *
- * @tparam K  the current type tried in the convertion
+ * @tparam K  the current type tried in the conversion
  * @tparam ...Types  the other types will be tried in the conversion if K fails
  * @tparam T  the type of input object
  * @tparam Func  the function will run if the object can be converted to K
@@ -97,9 +97,9 @@ void run(T obj, Func f, Args... args)
  */
 template <template <typename> class Base, typename T, typename Func,
           typename... Args>
-void run(T, Func, Args...)
+void run(T obj, Func, Args...)
 {
-    GKO_NOT_IMPLEMENTED;
+    GKO_NOT_SUPPORTED(obj);
 }
 
 /**
@@ -108,7 +108,7 @@ void run(T, Func, Args...)
  *
  * @tparam Base  the Base class with one template
  * @tparam K  the current template type of B. pointer of const Base<K> is tried
- *            in the convertion.
+ *            in the conversion.
  * @tparam ...Types  the other types will be tried in the conversion if K fails
  * @tparam T  the type of input object waiting converted
  * @tparam Func  the function will run if the object can be converted to pointer
diff --git a/core/base/iterator_factory.hpp b/core/base/iterator_factory.hpp
index 6384d5bfbce..bbc1d3b4b2b 100644
--- a/core/base/iterator_factory.hpp
+++ b/core/base/iterator_factory.hpp
@@ -42,6 +42,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <utility>
 
 
+#include "core/base/copy_assignable.hpp"
+
+
 namespace gko {
 namespace detail {
 
@@ -84,7 +87,7 @@ class zip_iterator_reference
     template <std::size_t... idxs>
     value_type cast_impl(std::index_sequence<idxs...>) const
     {
-        // gcc 5 throws error as using unintialized array
+        // gcc 5 throws error as using uninitialized array
         // std::tuple<int, char> t = { 1, '2' }; is not allowed.
         // converting to 'std::tuple<...>' from initializer list would use
         // explicit constructor
@@ -366,6 +369,133 @@ void swap(zip_iterator_reference<Iterators...> a,
 }
 
 
+/**
+ * Random access iterator that uses a function to transform the index.
+ *
+ * For a function `fn` and an underlying iterator `it`, accessing the
+ * permute_iterator at index `i` will result in accessing `it[fn(i)]`.
+ *
+ * @tparam IteratorType  Underlying iterator, has to be random access.
+ * @tparam PermuteFn  A function `difference_type -> difference_type` that
+ *                    transforms any given index. It doesn't have to be a strict
+ *                    permutation of indices (i.e. not bijective).
+ */
+template <typename IteratorType, typename PermuteFn>
+class permute_iterator {
+public:
+    using difference_type = std::ptrdiff_t;
+    using value_type = typename std::iterator_traits<IteratorType>::value_type;
+    using pointer = typename std::iterator_traits<IteratorType>::pointer;
+    using reference = typename std::iterator_traits<IteratorType>::reference;
+    using iterator_category = std::random_access_iterator_tag;
+
+    permute_iterator() = default;
+
+    explicit permute_iterator(IteratorType it, PermuteFn perm)
+        : it_{std::move(it)}, idx_{}, perm_{std::move(perm)}
+    {}
+
+    permute_iterator& operator+=(difference_type i)
+    {
+        idx_ += i;
+        return *this;
+    }
+
+    permute_iterator& operator-=(difference_type i) { return *this += -i; }
+
+    permute_iterator& operator++() { return *this += 1; }
+
+    permute_iterator operator++(int)
+    {
+        auto tmp = *this;
+        ++(*this);
+        return tmp;
+    }
+
+    permute_iterator& operator--() { return *this -= 1; }
+
+    permute_iterator operator--(int)
+    {
+        auto tmp = *this;
+        --(*this);
+        return tmp;
+    }
+
+    permute_iterator operator+(difference_type i) const
+    {
+        auto tmp = *this;
+        tmp += i;
+        return tmp;
+    }
+
+    friend permute_iterator operator+(difference_type i,
+                                      const permute_iterator& iter)
+    {
+        return iter + i;
+    }
+
+    permute_iterator operator-(difference_type i) const
+    {
+        auto tmp = *this;
+        tmp -= i;
+        return tmp;
+    }
+
+    difference_type operator-(const permute_iterator& other) const
+    {
+        return idx_ - other.idx_;
+    }
+
+    reference operator*() const { return it_[perm_(idx_)]; }
+
+    reference operator[](difference_type i) const { return *(*this + i); }
+
+    bool operator==(const permute_iterator& other) const
+    {
+        return idx_ == other.idx_;
+    }
+
+    bool operator!=(const permute_iterator& other) const
+    {
+        return !(*this == other);
+    }
+
+    bool operator<(const permute_iterator& other) const
+    {
+        return idx_ < other.idx_;
+    }
+
+    bool operator<=(const permute_iterator& other) const
+    {
+        return idx_ <= other.idx_;
+    }
+
+    bool operator>(const permute_iterator& other) const
+    {
+        return !(*this <= other);
+    }
+
+    bool operator>=(const permute_iterator& other) const
+    {
+        return !(*this < other);
+    }
+
+private:
+    IteratorType it_;
+    difference_type idx_;
+    copy_assignable<PermuteFn> perm_;
+};
+
+
+template <typename IteratorType, typename PermutationFn>
+permute_iterator<IteratorType, PermutationFn> make_permute_iterator(
+    IteratorType it, PermutationFn perm)
+{
+    return permute_iterator<IteratorType, PermutationFn>{std::move(it),
+                                                         std::move(perm)};
+}
+
+
 }  // namespace detail
 }  // namespace gko
 
diff --git a/core/base/memory.cpp b/core/base/memory.cpp
new file mode 100644
index 00000000000..b6c6f8f265c
--- /dev/null
+++ b/core/base/memory.cpp
@@ -0,0 +1,59 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/memory.hpp>
+
+
+#include <new>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+namespace gko {
+
+
+void* CpuAllocator::allocate(size_type num_bytes)
+{
+    auto ptr = ::operator new (num_bytes, std::nothrow_t{});
+    GKO_ENSURE_ALLOCATED(ptr, "cpu", num_bytes);
+    return ptr;
+}
+
+
+void CpuAllocator::deallocate(void* ptr)
+{
+    ::operator delete (ptr, std::nothrow_t{});
+}
+
+
+}  // namespace gko
diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp
index 84b0af21c5e..b5c1e37569b 100644
--- a/core/base/mixed_precision_types.hpp
+++ b/core/base/mixed_precision_types.hpp
@@ -39,42 +39,65 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #ifdef GINKGO_MIXED_PRECISION
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...)  \
-    template _macro(float, float, float, __VA_ARGS__);          \
-    template _macro(float, float, double, __VA_ARGS__);         \
-    template _macro(float, double, float, __VA_ARGS__);         \
-    template _macro(float, double, double, __VA_ARGS__);        \
-    template _macro(double, float, float, __VA_ARGS__);         \
-    template _macro(double, float, double, __VA_ARGS__);        \
-    template _macro(double, double, float, __VA_ARGS__);        \
-    template _macro(double, double, double, __VA_ARGS__);       \
-    template _macro(std::complex<float>, std::complex<float>,   \
-                    std::complex<float>, __VA_ARGS__);          \
-    template _macro(std::complex<float>, std::complex<float>,   \
-                    std::complex<double>, __VA_ARGS__);         \
-    template _macro(std::complex<float>, std::complex<double>,  \
-                    std::complex<float>, __VA_ARGS__);          \
-    template _macro(std::complex<float>, std::complex<double>,  \
-                    std::complex<double>, __VA_ARGS__);         \
-    template _macro(std::complex<double>, std::complex<float>,  \
-                    std::complex<float>, __VA_ARGS__);          \
-    template _macro(std::complex<double>, std::complex<float>,  \
-                    std::complex<double>, __VA_ARGS__);         \
-    template _macro(std::complex<double>, std::complex<double>, \
-                    std::complex<float>, __VA_ARGS__);          \
-    template _macro(std::complex<double>, std::complex<double>, \
+
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \
+    template _macro(float, float, float, __VA_ARGS__);                \
+    template _macro(float, float, double, __VA_ARGS__);               \
+    template _macro(float, double, float, __VA_ARGS__);               \
+    template _macro(float, double, double, __VA_ARGS__)
+
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) \
+    template _macro(double, float, float, __VA_ARGS__);               \
+    template _macro(double, float, double, __VA_ARGS__);              \
+    template _macro(double, double, float, __VA_ARGS__);              \
+    template _macro(double, double, double, __VA_ARGS__)
+
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) \
+    template _macro(std::complex<float>, std::complex<float>,         \
+                    std::complex<float>, __VA_ARGS__);                \
+    template _macro(std::complex<float>, std::complex<float>,         \
+                    std::complex<double>, __VA_ARGS__);               \
+    template _macro(std::complex<float>, std::complex<double>,        \
+                    std::complex<float>, __VA_ARGS__);                \
+    template _macro(std::complex<float>, std::complex<double>,        \
+                    std::complex<double>, __VA_ARGS__)
+
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) \
+    template _macro(std::complex<double>, std::complex<float>,        \
+                    std::complex<float>, __VA_ARGS__);                \
+    template _macro(std::complex<double>, std::complex<float>,        \
+                    std::complex<double>, __VA_ARGS__);               \
+    template _macro(std::complex<double>, std::complex<double>,       \
+                    std::complex<float>, __VA_ARGS__);                \
+    template _macro(std::complex<double>, std::complex<double>,       \
                     std::complex<double>, __VA_ARGS__)
+
 #else
-#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...)  \
-    template _macro(float, float, float, __VA_ARGS__);          \
-    template _macro(double, double, double, __VA_ARGS__);       \
-    template _macro(std::complex<float>, std::complex<float>,   \
-                    std::complex<float>, __VA_ARGS__);          \
-    template _macro(std::complex<double>, std::complex<double>, \
+
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \
+    template _macro(float, float, float, __VA_ARGS__)
+
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) \
+    template _macro(double, double, double, __VA_ARGS__)
+
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) \
+    template _macro(std::complex<float>, std::complex<float>,         \
+                    std::complex<float>, __VA_ARGS__)
+
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) \
+    template _macro(std::complex<double>, std::complex<double>,       \
                     std::complex<double>, __VA_ARGS__)
+
 #endif
 
 
+#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...)             \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, __VA_ARGS__); \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, __VA_ARGS__); \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, __VA_ARGS__); \
+    GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__)
+
+
 #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \
     GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, int32);       \
     GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, int64)
diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp
index d8604e95b5f..de4f6ec1e86 100644
--- a/core/base/mtx_io.cpp
+++ b/core/base/mtx_io.cpp
@@ -267,7 +267,7 @@ class mtx_io {
 
     /**
      * storage modifier hierarchy provides algorithms for handling storage
-     * modifiers (general, symetric, skew symetric, hermitian) and filling the
+     * modifiers (general, symmetric, skew symmetric, hermitian) and filling the
      * entire matrix from the stored parts
      */
     struct storage_modifier {
@@ -491,7 +491,7 @@ class mtx_io {
          * @param os  The output stream to write to
          * @param data  The matrix data to write
          * @param entry_writer  The entry format to write in.
-         * @param modifier  The strorage modifer
+         * @param modifier  The storage modifier
          */
         virtual void write_data(std::ostream& os,
                                 const matrix_data<ValueType, IndexType>& data,
@@ -554,7 +554,7 @@ class mtx_io {
          * @param os  The output stream to write to
          * @param data  The matrix data to write
          * @param entry_writer  The entry format to write in.
-         * @param modifier  The strorage modifer
+         * @param modifier  The storage modifier
          */
         void write_data(std::ostream& os,
                         const matrix_data<ValueType, IndexType>& data,
@@ -623,7 +623,7 @@ class mtx_io {
          * @param os  The output stream to write to
          * @param data  The matrix data to write
          * @param entry_writer  The entry format to write in.
-         * @param modifier  The strorage modifer
+         * @param modifier  The storage modifier
          */
         void write_data(std::ostream& os,
                         const matrix_data<ValueType, IndexType>& data,
diff --git a/core/base/types.hpp b/core/base/types.hpp
index 5f90ed2cafe..39ca169d486 100644
--- a/core/base/types.hpp
+++ b/core/base/types.hpp
@@ -109,7 +109,7 @@ constexpr std::enable_if_t<(num_groups > current_shift + 1), int> shift(
  *
  * The usage will be the following
  * Set the method with bits Cfg = ConfigSet<b_0, b_1, ..., b_k>
- * Encode the given infomation encoded = Cfg::encode(x_0, x_1, ..., x_k)
+ * Encode the given information encoded = Cfg::encode(x_0, x_1, ..., x_k)
  * Decode the specific position information x_t = Cfg::decode<t>(encoded)
  * The encoded result will use 32 bits to record
  * rrrrr0..01....1...k..k, which 1/2/.../k means the bits store the information
diff --git a/core/components/addressable_pq.hpp b/core/components/addressable_pq.hpp
new file mode 100644
index 00000000000..64502eb5b99
--- /dev/null
+++ b/core/components/addressable_pq.hpp
@@ -0,0 +1,250 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_COMPONENTS_ADDRESSABLE_PQ_HPP_
+#define GKO_CORE_COMPONENTS_ADDRESSABLE_PQ_HPP_
+
+
+#include <algorithm>
+#include <vector>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+#include "core/base/allocator.hpp"
+
+
+namespace gko {
+
+
+/**
+ * An addressable priority queue based on a k-ary heap.
+ *
+ * It allows inserting key-node pairs, modifying their key as well as accessing
+ * and removing the key-node pair with the minimum key.
+ *
+ * @tparam KeyType    The type of the keys
+ * @tparam IndexType  The type of the nodes, it needs to be an integer type.
+ * @tparam degree     The node degree k
+ */
+template <typename KeyType, typename IndexType, int degree = 4>
+struct addressable_priority_queue {
+    /**
+     * Constructs an addressable PQ from a host executor.
+     *
+     * @param host_exec  the host executor for allocating the data
+     * @param num_nodes  the number of nodes that may be inserted into this
+     *                   queue. Every node ID inserted must be below num_nodes.
+     */
+    addressable_priority_queue(std::shared_ptr<const Executor> host_exec,
+                               size_type num_nodes)
+        : keys_{host_exec},
+          nodes_{host_exec},
+          heap_pos_{num_nodes, invalid_index<IndexType>(), host_exec}
+    {}
+
+    /**
+     * Inserts the given key-node pair into the PQ.
+     * Duplicate keys are allowed, they may be returned in an arbitrary order.
+     *
+     * @param key  the key by which the queue is ordered
+     * @param node  the node associated with the key. Every node may only be
+     *              inserted once!
+     */
+    void insert(KeyType key, IndexType node)
+    {
+        GKO_ASSERT(node < static_cast<IndexType>(heap_pos_.size()));
+        GKO_ASSERT(node >= 0);
+        GKO_ASSERT(heap_pos_[node] == invalid_index<IndexType>());
+        keys_.push_back(key);
+        nodes_.push_back(node);
+        const auto new_pos = size() - 1;
+        heap_pos_[node] = new_pos;
+        sift_up(new_pos);
+    }
+
+    /**
+     * Updates the key of a node with the given new key.
+     * Duplicate keys are allowed, they may be returned in an arbitrary order.
+     *
+     * @param new_key  the key by which the queue is ordered
+     * @param node  the node associated with the key. It must have been inserted
+     *              beforehand.
+     */
+    void update_key(KeyType new_key, IndexType node)
+    {
+        GKO_ASSERT(node < static_cast<IndexType>(heap_pos_.size()));
+        GKO_ASSERT(node >= 0);
+        auto pos = heap_pos_[node];
+        GKO_ASSERT(pos < size());
+        GKO_ASSERT(pos != invalid_index<IndexType>());
+        GKO_ASSERT(nodes_[pos] == node);
+        auto old_key = keys_[pos];
+        keys_[pos] = new_key;
+        if (old_key < new_key) {
+            sift_down(pos);
+        } else {
+            sift_up(pos);
+        }
+    }
+
+    /**
+     * Returns the minimum key from the queue.
+     *
+     * @return the minimum key from the queue
+     */
+    KeyType min_key() const { return keys_[0]; }
+
+    /**
+     * Returns the node belonging to the minimum key from the queue.
+     *
+     * @return the node corresponding to the minimum key
+     */
+    IndexType min_node() const { return nodes_[0]; }
+
+    /**
+     * Returns the key-node pair with the minimum key from the queue.
+     *
+     * @return the key-node pair corresponding to the minimum key
+     */
+    std::pair<KeyType, IndexType> min() const
+    {
+        return {min_key(), min_node()};
+    }
+
+    /**
+     * Removes the key-node pair with the minimum key from the queue.
+     */
+    void pop_min()
+    {
+        GKO_ASSERT(!empty());
+        swap(0, size() - 1);
+        heap_pos_[nodes_.back()] = invalid_index<IndexType>();
+        keys_.pop_back();
+        nodes_.pop_back();
+        sift_down(0);
+    }
+
+    /**
+     * Returns the number of key-node pairs in the queue.
+     *
+     * @return  the number of key-node pairs in the queue
+     */
+    std::size_t size() const { return keys_.size(); }
+
+    /**
+     * Returns true if and only if the queue has size 0.
+     *
+     * @return if queue has size 0
+     */
+    bool empty() const { return size() == 0; }
+
+    /** Clears the queue, removing all entries. */
+    void reset()
+    {
+        for (auto node : nodes_) {
+            heap_pos_[node] = invalid_index<IndexType>();
+        }
+        keys_.clear();
+        nodes_.clear();
+    }
+
+private:
+    std::size_t parent(std::size_t i) const { return (i - 1) / degree; }
+
+    std::size_t first_child(std::size_t i) const { return degree * i + 1; }
+
+    void swap(std::size_t i, std::size_t j)
+    {
+        std::swap(keys_[i], keys_[j]);
+        std::swap(nodes_[i], nodes_[j]);
+        std::swap(heap_pos_[nodes_[i]], heap_pos_[nodes_[j]]);
+    }
+
+    /**
+     * Restores the heap invariant downwards, i.e. the
+     * Moves the key-node pair at position i down (toward the leaves)
+     * until its key is smaller or equal to the one of all its children.
+     */
+    void sift_down(std::size_t i)
+    {
+        auto cur = i;
+        while (first_child(cur) < size()) {
+            auto it = keys_.cbegin();
+            if (first_child(cur + 1) < size()) {
+                // fast path: known loop trip count
+                it = std::min_element(keys_.cbegin() + first_child(cur),
+                                      keys_.cbegin() + first_child(cur + 1));
+            } else {
+                // slow path: unknown loop trip count
+                it = std::min_element(keys_.cbegin() + first_child(cur),
+                                      keys_.cbegin() + size());
+            }
+            if (keys_[cur] <= *it) {
+                break;
+            }
+            auto min_child = std::distance(keys_.cbegin(), it);
+            swap(cur, min_child);
+            cur = min_child;
+        }
+    }
+
+    /**
+     * Moves the key-node pair at position i up (toward the root)
+     * until its key is larger or equal to the one of its parent.
+     */
+    void sift_up(std::size_t i)
+    {
+        auto cur = i;
+        while (cur > 0) {
+            if (keys_[cur] >= keys_[parent(cur)]) {
+                break;
+            }
+            swap(cur, parent(cur));
+            cur = parent(cur);
+        }
+    }
+
+    vector<KeyType> keys_;
+    vector<IndexType> nodes_;
+    // for each node, heap_pos_[node] stores the position of this node inside
+    // the heap, or invalid_index<IndexType>() if it's not in the heap.
+    vector<IndexType> heap_pos_;
+};
+
+
+}  // namespace gko
+
+
+#endif  // GKO_CORE_COMPONENTS_ADDRESSABLE_PQ_HPP_
diff --git a/core/device_hooks/CMakeLists.txt b/core/device_hooks/CMakeLists.txt
index 901acef7797..573f87fad93 100644
--- a/core/device_hooks/CMakeLists.txt
+++ b/core/device_hooks/CMakeLists.txt
@@ -8,7 +8,7 @@ if(NOT GINKGO_BUILD_CUDA)
     ginkgo_install_library(ginkgo_cuda)
 endif()
 
-if (NOT GINKGO_BUILD_DPCPP)
+if (NOT GINKGO_BUILD_SYCL)
     add_library(ginkgo_dpcpp
         $<TARGET_OBJECTS:ginkgo_dpcpp_device>
         dpcpp_hooks.cpp)
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index f1af9318f9f..762d7d78a16 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/types.hpp>
 
 
+#include "core/base/batch_multi_vector_kernels.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/base/index_set_kernels.hpp"
 #include "core/base/mixed_precision_types.hpp"
@@ -44,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/components/reduce_array_kernels.hpp"
 #include "core/distributed/matrix_kernels.hpp"
+#include "core/distributed/partition_helpers_kernels.hpp"
 #include "core/distributed/partition_kernels.hpp"
 #include "core/distributed/vector_kernels.hpp"
 #include "core/factorization/cholesky_kernels.hpp"
@@ -55,6 +57,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/factorization/par_ict_kernels.hpp"
 #include "core/factorization/par_ilu_kernels.hpp"
 #include "core/factorization/par_ilut_kernels.hpp"
+#include "core/matrix/batch_dense_kernels.hpp"
+#include "core/matrix/batch_ell_kernels.hpp"
 #include "core/matrix/coo_kernels.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/matrix/dense_kernels.hpp"
@@ -63,12 +67,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/matrix/fbcsr_kernels.hpp"
 #include "core/matrix/fft_kernels.hpp"
 #include "core/matrix/hybrid_kernels.hpp"
+#include "core/matrix/permutation_kernels.hpp"
+#include "core/matrix/scaled_permutation_kernels.hpp"
 #include "core/matrix/sellp_kernels.hpp"
 #include "core/matrix/sparsity_csr_kernels.hpp"
 #include "core/multigrid/pgm_kernels.hpp"
 #include "core/preconditioner/isai_kernels.hpp"
 #include "core/preconditioner/jacobi_kernels.hpp"
 #include "core/reorder/rcm_kernels.hpp"
+#include "core/solver/batch_bicgstab_kernels.hpp"
 #include "core/solver/bicg_kernels.hpp"
 #include "core/solver/bicgstab_kernels.hpp"
 #include "core/solver/cb_gmres_kernels.hpp"
@@ -134,6 +141,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
     GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro)
 
+#define GKO_STUB_VALUE_AND_INT32_TYPE(_macro)                       \
+    template <typename ValueType, typename IndexType>               \
+    _macro(ValueType, IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \
+    GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro)
+
 #define GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(_macro)                     \
     template <typename InputValueType, typename MatrixValueType,        \
               typename OutputValueType, typename IndexType>             \
@@ -254,6 +266,17 @@ GKO_STUB_LOCAL_GLOBAL_TYPE(GKO_DECLARE_PARTITION_IS_ORDERED);
 }  // namespace partition
 
 
+namespace partition_helpers {
+
+
+GKO_STUB_INDEX_TYPE(GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START);
+GKO_STUB_INDEX_TYPE(GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES);
+GKO_STUB_INDEX_TYPE(GKO_DECLARE_PARTITION_HELPERS_COMPRESS_RANGES);
+
+
+}  // namespace partition_helpers
+
+
 namespace distributed_vector {
 
 
@@ -272,6 +295,40 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_BUILD_LOCAL_NONLOCAL);
 }  // namespace distributed_matrix
 
 
+namespace batch_multi_vector {
+
+
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
+
+
+}  // namespace batch_multi_vector
+
+
+namespace batch_dense {
+
+
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
+
+
+}  // namespace batch_dense
+
+
+namespace batch_ell {
+
+
+GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
+GKO_STUB_VALUE_AND_INT32_TYPE(GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
+
+
+}  // namespace batch_ell
+
+
 namespace dense {
 
 
@@ -293,6 +350,7 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
@@ -317,9 +375,20 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL);
 GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2(GKO_DECLARE_DENSE_ROW_GATHER_KERNEL);
 GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE_2(
     GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL);
@@ -346,6 +415,15 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DIAGONAL_FILL_IN_MATRIX_DATA_KERNEL);
 }  // namespace diagonal
 
 
+namespace batch_bicgstab {
+
+
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL);
+
+
+}  // namespace batch_bicgstab
+
+
 namespace cg {
 
 
@@ -536,11 +614,16 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL);
-GKO_STUB_INDEX_TYPE(GKO_DECLARE_INVERT_PERMUTATION_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
@@ -653,6 +736,26 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL);
 }  // namespace hybrid
 
 
+namespace permutation {
+
+
+GKO_STUB_INDEX_TYPE(GKO_DECLARE_PERMUTATION_INVERT_KERNEL);
+GKO_STUB_INDEX_TYPE(GKO_DECLARE_PERMUTATION_COMPOSE_KERNEL);
+
+
+}  // namespace permutation
+
+
+namespace scaled_permutation {
+
+
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL);
+
+
+}  // namespace scaled_permutation
+
+
 namespace sellp {
 
 
@@ -755,6 +858,8 @@ namespace lu_factorization {
 
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
+GKO_STUB_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE);
+GKO_STUB_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE);
 
 
 }  // namespace lu_factorization
@@ -801,6 +906,8 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(
 
 
 }  // namespace par_ilut_factorization
+
+
 namespace rcm {
 
 
diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp
index dd4c3f19f7c..ff644a5f05f 100644
--- a/core/device_hooks/cuda_hooks.cpp
+++ b/core/device_hooks/cuda_hooks.cpp
@@ -35,6 +35,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/memory.hpp>
+#include <ginkgo/core/base/stream.hpp>
 #include <ginkgo/core/base/timer.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/version.hpp>
@@ -52,12 +54,77 @@ version version_info::get_cuda_version() noexcept
 }
 
 
+void* CudaAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(cuda);
+
+
+void CudaAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda);
+
+
+CudaAsyncAllocator::CudaAsyncAllocator(CUstream_st* stream)
+    GKO_NOT_COMPILED(cuda);
+
+
+void* CudaAsyncAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(cuda);
+
+
+void CudaAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda);
+
+
+bool CudaAsyncAllocator::check_environment(int device_id,
+                                           CUstream_st* stream) const
+    GKO_NOT_COMPILED(cuda);
+
+
+CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id)
+    GKO_NOT_COMPILED(cuda);
+
+
+CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id, unsigned int flags)
+    GKO_NOT_COMPILED(cuda);
+
+
+void* CudaUnifiedAllocator::allocate(size_type num_bytes)
+    GKO_NOT_COMPILED(cuda);
+
+
+void CudaUnifiedAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda);
+
+
+bool CudaUnifiedAllocator::check_environment(int device_id,
+                                             CUstream_st* stream) const
+    GKO_NOT_COMPILED(cuda);
+
+
+CudaHostAllocator::CudaHostAllocator(int device_id) GKO_NOT_COMPILED(cuda);
+
+
+void* CudaHostAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(cuda);
+
+
+void CudaHostAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda);
+
+
+bool CudaHostAllocator::check_environment(int device_id,
+                                          CUstream_st* stream) const
+    GKO_NOT_COMPILED(cuda);
+
+
 std::shared_ptr<CudaExecutor> CudaExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
     allocation_mode alloc_mode, CUstream_st* stream)
+{
+    return std::shared_ptr<CudaExecutor>(
+        new CudaExecutor(device_id, std::move(master),
+                         std::make_shared<CudaAllocator>(), stream));
+}
+
+
+std::shared_ptr<CudaExecutor> CudaExecutor::create(
+    int device_id, std::shared_ptr<Executor> master,
+    std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
 {
     return std::shared_ptr<CudaExecutor>(new CudaExecutor(
-        device_id, std::move(master), device_reset, alloc_mode, stream));
+        device_id, std::move(master), std::move(alloc), stream));
 }
 
 
@@ -154,6 +221,9 @@ scoped_device_id_guard::scoped_device_id_guard(const CudaExecutor* exec,
     GKO_NOT_COMPILED(cuda);
 
 
+cuda_stream::cuda_stream() GKO_NOT_COMPILED(cuda);
+
+
 cuda_stream::cuda_stream(int device_id) GKO_NOT_COMPILED(cuda);
 
 
diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp
index a08f1f608fb..532e9c55bbe 100644
--- a/core/device_hooks/dpcpp_hooks.cpp
+++ b/core/device_hooks/dpcpp_hooks.cpp
@@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/memory.hpp>
 #include <ginkgo/core/base/timer.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/version.hpp>
diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp
index 50637f7b3f0..521b2590626 100644
--- a/core/device_hooks/hip_hooks.cpp
+++ b/core/device_hooks/hip_hooks.cpp
@@ -36,6 +36,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/memory.hpp>
+#include <ginkgo/core/base/stream.hpp>
 #include <ginkgo/core/base/timer.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/version.hpp>
@@ -53,12 +55,75 @@ version version_info::get_hip_version() noexcept
 }
 
 
+void* HipAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
+
+
+void HipAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
+
+
+HipAsyncAllocator::HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream)
+    GKO_NOT_COMPILED(hip);
+
+
+void* HipAsyncAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
+
+
+void HipAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
+
+
+bool HipAsyncAllocator::check_environment(int device_id,
+                                          GKO_HIP_STREAM_STRUCT* stream) const
+    GKO_NOT_COMPILED(hip);
+
+
+HipUnifiedAllocator::HipUnifiedAllocator(int device_id) GKO_NOT_COMPILED(hip);
+
+
+HipUnifiedAllocator::HipUnifiedAllocator(int device_id, unsigned int flags)
+    GKO_NOT_COMPILED(hip);
+
+
+void* HipUnifiedAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
+
+
+void HipUnifiedAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
+
+
+bool HipUnifiedAllocator::check_environment(int device_id,
+                                            GKO_HIP_STREAM_STRUCT* stream) const
+    GKO_NOT_COMPILED(hip);
+
+
+HipHostAllocator::HipHostAllocator(int device_id) GKO_NOT_COMPILED(hip);
+
+
+void* HipHostAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
+
+
+void HipHostAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
+
+
+bool HipHostAllocator::check_environment(int device_id,
+                                         GKO_HIP_STREAM_STRUCT* stream) const
+    GKO_NOT_COMPILED(hip);
+
+
 std::shared_ptr<HipExecutor> HipExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
     allocation_mode alloc_mode, GKO_HIP_STREAM_STRUCT* stream)
 {
-    return std::shared_ptr<HipExecutor>(new HipExecutor(
-        device_id, std::move(master), device_reset, alloc_mode, stream));
+    return std::shared_ptr<HipExecutor>(
+        new HipExecutor(device_id, std::move(master),
+                        std::make_shared<HipAllocator>(), stream));
+}
+
+
+std::shared_ptr<HipExecutor> HipExecutor::create(
+    int device_id, std::shared_ptr<Executor> master,
+    std::shared_ptr<HipAllocatorBase> alloc, GKO_HIP_STREAM_STRUCT* stream)
+{
+    return std::shared_ptr<HipExecutor>(
+        new HipExecutor(device_id, std::move(master), alloc, stream));
 }
 
 
@@ -155,6 +220,9 @@ scoped_device_id_guard::scoped_device_id_guard(const HipExecutor* exec,
     GKO_NOT_COMPILED(hip);
 
 
+hip_stream::hip_stream() GKO_NOT_COMPILED(hip);
+
+
 hip_stream::hip_stream(int device_id) GKO_NOT_COMPILED(hip);
 
 
diff --git a/core/device_hooks/omp_hooks.cpp b/core/device_hooks/omp_hooks.cpp
index f652a4d4582..f79ddfdeca6 100644
--- a/core/device_hooks/omp_hooks.cpp
+++ b/core/device_hooks/omp_hooks.cpp
@@ -31,6 +31,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
 #include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/scoped_device_id_guard.hpp>
 #include <ginkgo/core/base/version.hpp>
 
@@ -51,6 +52,9 @@ scoped_device_id_guard::scoped_device_id_guard(const OmpExecutor* exec,
     GKO_NOT_COMPILED(omp);
 
 
+int OmpExecutor::get_num_omp_threads() { return 1; }
+
+
 }  // namespace gko
 
 
diff --git a/core/distributed/partition.cpp b/core/distributed/partition.cpp
index a1db99396e7..bfeb5e8c286 100644
--- a/core/distributed/partition.cpp
+++ b/core/distributed/partition.cpp
@@ -76,14 +76,21 @@ Partition<LocalIndexType, GlobalIndexType>::build_from_mapping(
 template <typename LocalIndexType, typename GlobalIndexType>
 std::unique_ptr<Partition<LocalIndexType, GlobalIndexType>>
 Partition<LocalIndexType, GlobalIndexType>::build_from_contiguous(
-    std::shared_ptr<const Executor> exec, const array<GlobalIndexType>& ranges)
+    std::shared_ptr<const Executor> exec, const array<GlobalIndexType>& ranges,
+    const array<comm_index_type>& part_ids)
 {
+    GKO_ASSERT(part_ids.get_num_elems() == 0 ||
+               part_ids.get_num_elems() + 1 == ranges.get_num_elems());
+
+    array<comm_index_type> empty(exec);
     auto local_ranges = make_temporary_clone(exec, &ranges);
+    auto local_part_ids = make_temporary_clone(
+        exec, part_ids.get_num_elems() > 0 ? &part_ids : &empty);
     auto result = Partition::create(
         exec, static_cast<comm_index_type>(ranges.get_num_elems() - 1),
         ranges.get_num_elems() - 1);
     exec->run(partition::make_build_from_contiguous(
-        *local_ranges.get(), result->offsets_.get_data(),
+        *local_ranges, *local_part_ids, result->offsets_.get_data(),
         result->part_ids_.get_data()));
     result->finalize_construction();
     return result;
@@ -117,7 +124,7 @@ void Partition<LocalIndexType, GlobalIndexType>::finalize_construction()
 
 
 template <typename LocalIndexType, typename GlobalIndexType>
-bool Partition<LocalIndexType, GlobalIndexType>::has_connected_parts()
+bool Partition<LocalIndexType, GlobalIndexType>::has_connected_parts() const
 {
     return this->get_num_parts() - this->get_num_empty_parts() ==
            this->get_num_ranges();
@@ -125,7 +132,7 @@ bool Partition<LocalIndexType, GlobalIndexType>::has_connected_parts()
 
 
 template <typename LocalIndexType, typename GlobalIndexType>
-bool Partition<LocalIndexType, GlobalIndexType>::has_ordered_parts()
+bool Partition<LocalIndexType, GlobalIndexType>::has_ordered_parts() const
 {
     if (this->has_connected_parts()) {
         auto exec = this->get_executor();
diff --git a/core/distributed/partition_helpers.cpp b/core/distributed/partition_helpers.cpp
new file mode 100644
index 00000000000..b1fd1dd9bc5
--- /dev/null
+++ b/core/distributed/partition_helpers.cpp
@@ -0,0 +1,155 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/distributed/partition_helpers.hpp>
+
+
+#include <numeric>
+
+
+#include <ginkgo/core/distributed/partition.hpp>
+
+
+#include "core/components/fill_array_kernels.hpp"
+#include "core/distributed/partition_helpers_kernels.hpp"
+
+
+namespace gko {
+namespace experimental {
+namespace distributed {
+namespace components {
+namespace {
+
+
+GKO_REGISTER_OPERATION(fill_seq_array, components::fill_seq_array);
+
+
+}  // namespace
+}  // namespace components
+
+
+namespace partition_helpers {
+namespace {
+
+
+GKO_REGISTER_OPERATION(sort_by_range_start,
+                       partition_helpers::sort_by_range_start);
+GKO_REGISTER_OPERATION(check_consecutive_ranges,
+                       partition_helpers::check_consecutive_ranges);
+GKO_REGISTER_OPERATION(compress_ranges, partition_helpers::compress_ranges);
+
+
+}  // namespace
+}  // namespace partition_helpers
+
+
+template <typename LocalIndexType, typename GlobalIndexType>
+std::unique_ptr<Partition<LocalIndexType, GlobalIndexType>>
+build_partition_from_local_range(std::shared_ptr<const Executor> exec,
+                                 mpi::communicator comm, span local_range)
+{
+    std::array<GlobalIndexType, 2> range{
+        static_cast<GlobalIndexType>(local_range.begin),
+        static_cast<GlobalIndexType>(local_range.end)};
+
+    // make all range_start_ends available on each rank
+    // note: not all combination of MPI + GPU library seem to support
+    // mixing host and device buffers, e.g. OpenMPI 4.0.5 and Rocm 4.0
+    auto mpi_exec = exec->get_master();
+    array<GlobalIndexType> ranges_start_end(mpi_exec, comm.size() * 2);
+    ranges_start_end.fill(invalid_index<GlobalIndexType>());
+    comm.all_gather(mpi_exec, range.data(), 2, ranges_start_end.get_data(), 2);
+    ranges_start_end.set_executor(exec);
+
+    // make_sort_by_range_start
+    array<comm_index_type> part_ids(exec, comm.size());
+    exec->run(components::make_fill_seq_array(part_ids.get_data(),
+                                              part_ids.get_num_elems()));
+    exec->run(partition_helpers::make_sort_by_range_start(ranges_start_end,
+                                                          part_ids));
+
+    // check for consistency
+    bool consecutive_ranges = false;
+    exec->run(partition_helpers::make_check_consecutive_ranges(
+        ranges_start_end, consecutive_ranges));
+    if (!consecutive_ranges) {
+        GKO_INVALID_STATE("The partition contains gaps.");
+    }
+
+    // join (now consecutive) starts and ends into combined array
+    array<GlobalIndexType> ranges(exec, comm.size() + 1);
+    exec->run(
+        partition_helpers::make_compress_ranges(ranges_start_end, ranges));
+
+    return Partition<LocalIndexType, GlobalIndexType>::build_from_contiguous(
+        exec, ranges, part_ids);
+}
+
+#define GKO_DECLARE_BUILD_PARTITION_FROM_LOCAL_RANGE(_local_type,          \
+                                                     _global_type)         \
+    std::unique_ptr<Partition<_local_type, _global_type>>                  \
+    build_partition_from_local_range(std::shared_ptr<const Executor> exec, \
+                                     mpi::communicator comm, span local_range)
+GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_BUILD_PARTITION_FROM_LOCAL_RANGE);
+
+
+template <typename LocalIndexType, typename GlobalIndexType>
+std::unique_ptr<Partition<LocalIndexType, GlobalIndexType>>
+build_partition_from_local_size(std::shared_ptr<const Executor> exec,
+                                mpi::communicator comm, size_type local_size)
+{
+    auto local_size_gi = static_cast<GlobalIndexType>(local_size);
+    array<GlobalIndexType> sizes(exec->get_master(), comm.size());
+    comm.all_gather(exec, &local_size_gi, 1, sizes.get_data(), 1);
+
+    array<GlobalIndexType> offsets(exec->get_master(), comm.size() + 1);
+    offsets.get_data()[0] = 0;
+    std::partial_sum(sizes.get_data(), sizes.get_data() + comm.size(),
+                     offsets.get_data() + 1);
+
+    return Partition<LocalIndexType, GlobalIndexType>::build_from_contiguous(
+        exec, offsets);
+}
+
+#define GKO_DECLARE_BUILD_PARTITION_FROM_LOCAL_SIZE(_local_type, _global_type) \
+    std::unique_ptr<Partition<_local_type, _global_type>>                      \
+    build_partition_from_local_size(std::shared_ptr<const Executor> exec,      \
+                                    mpi::communicator comm,                    \
+                                    size_type local_range)
+GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_BUILD_PARTITION_FROM_LOCAL_SIZE);
+
+
+}  // namespace distributed
+}  // namespace experimental
+}  // namespace gko
diff --git a/core/distributed/partition_helpers_kernels.hpp b/core/distributed/partition_helpers_kernels.hpp
new file mode 100644
index 00000000000..ed9fa60364f
--- /dev/null
+++ b/core/distributed/partition_helpers_kernels.hpp
@@ -0,0 +1,86 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GINKGO_PARTITION_HELPERS_KERNELS_HPP
+#define GINKGO_PARTITION_HELPERS_KERNELS_HPP
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+#include "core/base/kernel_declaration.hpp"
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START(_type) \
+    void sort_by_range_start(                                    \
+        std::shared_ptr<const DefaultExecutor> exec,             \
+        array<_type>& range_start_ends,                          \
+        array<experimental::distributed::comm_index_type>& part_ids)
+
+
+#define GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES(_type)          \
+    void check_consecutive_ranges(std::shared_ptr<const DefaultExecutor> exec, \
+                                  const array<_type>& range_start_ends,        \
+                                  bool& result)
+
+
+#define GKO_DECLARE_PARTITION_HELPERS_COMPRESS_RANGES(_type)          \
+    void compress_ranges(std::shared_ptr<const DefaultExecutor> exec, \
+                         const array<_type>& range_start_ends,        \
+                         array<_type>& range_offsets)
+
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                         \
+    template <typename GlobalIndexType>                                      \
+    GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START(GlobalIndexType);      \
+    template <typename GlobalIndexType>                                      \
+    GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES(GlobalIndexType); \
+    template <typename GlobalIndexType>                                      \
+    GKO_DECLARE_PARTITION_HELPERS_COMPRESS_RANGES(GlobalIndexType)
+
+
+GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(partition_helpers,
+                                        GKO_DECLARE_ALL_AS_TEMPLATES);
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GINKGO_PARTITION_HELPERS_KERNELS_HPP
diff --git a/core/distributed/partition_kernels.hpp b/core/distributed/partition_kernels.hpp
index 3d66ed113e8..070ff0839b4 100644
--- a/core/distributed/partition_kernels.hpp
+++ b/core/distributed/partition_kernels.hpp
@@ -49,10 +49,11 @@ namespace kernels {
                       const array<comm_index_type>& mapping,       \
                       size_type& num_ranges)
 
-#define GKO_PARTITION_BUILD_FROM_CONTIGUOUS(GlobalIndexType)                \
-    void build_from_contiguous(std::shared_ptr<const DefaultExecutor> exec, \
-                               const array<GlobalIndexType>& ranges,        \
-                               GlobalIndexType* range_bounds,               \
+#define GKO_PARTITION_BUILD_FROM_CONTIGUOUS(GlobalIndexType)                  \
+    void build_from_contiguous(std::shared_ptr<const DefaultExecutor> exec,   \
+                               const array<GlobalIndexType>& ranges,          \
+                               const array<comm_index_type>& part_id_mapping, \
+                               GlobalIndexType* range_bounds,                 \
                                comm_index_type* part_ids)
 
 #define GKO_PARTITION_BUILD_FROM_MAPPING(GlobalIndexType)                \
diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp
index 31c57947704..45536c9df87 100644
--- a/core/distributed/preconditioner/schwarz.cpp
+++ b/core/distributed/preconditioner/schwarz.cpp
@@ -98,18 +98,42 @@ void Schwarz<ValueType, LocalIndexType, GlobalIndexType>::apply_impl(
 }
 
 
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void Schwarz<ValueType, LocalIndexType, GlobalIndexType>::set_solver(
+    std::shared_ptr<const LinOp> new_solver)
+{
+    auto exec = this->get_executor();
+    if (new_solver) {
+        if (new_solver->get_executor() != exec) {
+            new_solver = gko::clone(exec, new_solver);
+        }
+    }
+    this->local_solver_ = new_solver;
+}
+
+
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void Schwarz<ValueType, LocalIndexType, GlobalIndexType>::generate(
     std::shared_ptr<const LinOp> system_matrix)
 {
-    if (parameters_.local_solver_factory) {
-        this->local_solver_ = parameters_.local_solver_factory->generate(
-            as<experimental::distributed::Matrix<ValueType, LocalIndexType,
-                                                 GlobalIndexType>>(
-                system_matrix)
-                ->get_local_matrix());
+    if (parameters_.local_solver && parameters_.generated_local_solver) {
+        GKO_INVALID_STATE(
+            "Provided both a generated solver and a solver factory");
+    }
+
+    if (!parameters_.local_solver && !parameters_.generated_local_solver) {
+        GKO_INVALID_STATE(
+            "Requires either a generated solver or an solver factory");
+    }
+
+    if (parameters_.local_solver) {
+        this->set_solver(gko::share(parameters_.local_solver->generate(
+            as<experimental::distributed::Matrix<
+                ValueType, LocalIndexType, GlobalIndexType>>(system_matrix)
+                ->get_local_matrix())));
+
     } else {
-        GKO_NOT_IMPLEMENTED;
+        this->set_solver(parameters_.generated_local_solver);
     }
 }
 
diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp
index 001cf75b76d..8b6b60f0963 100644
--- a/core/distributed/vector.cpp
+++ b/core/distributed/vector.cpp
@@ -573,12 +573,54 @@ void Vector<ValueType>::compute_squared_norm2(ptr_param<LinOp> result,
 }
 
 
+template <typename ValueType>
+void Vector<ValueType>::compute_mean(ptr_param<LinOp> result) const
+{
+    array<char> tmp{this->get_executor()};
+    this->compute_mean(result, tmp);
+}
+
+
+template <typename ValueType>
+void Vector<ValueType>::compute_mean(ptr_param<LinOp> result,
+                                     array<char>& tmp) const
+{
+    using MeanVector = local_vector_type;
+    const auto global_size = this->get_size()[0];
+    const auto local_size = this->get_local_vector()->get_size()[0];
+    const auto num_vecs = static_cast<int>(this->get_size()[1]);
+    GKO_ASSERT_EQUAL_COLS(result, this);
+    auto exec = this->get_executor();
+    const auto comm = this->get_communicator();
+    auto dense_res = make_temporary_clone(exec, as<MeanVector>(result));
+    this->get_local_vector()->compute_mean(dense_res.get());
+
+    // scale by its weight ie ratio of local to global size
+    auto weight = initialize<matrix::Dense<remove_complex<ValueType>>>(
+        {static_cast<remove_complex<ValueType>>(local_size) / global_size},
+        this->get_executor());
+    dense_res->scale(weight.get());
+
+    exec->synchronize();
+    if (mpi::requires_host_buffer(exec, comm)) {
+        host_reduction_buffer_.init(exec->get_master(), dense_res->get_size());
+        host_reduction_buffer_->copy_from(dense_res.get());
+        comm.all_reduce(exec->get_master(),
+                        host_reduction_buffer_->get_values(), num_vecs,
+                        MPI_SUM);
+        dense_res->copy_from(host_reduction_buffer_.get());
+    } else {
+        comm.all_reduce(exec, dense_res->get_values(), num_vecs, MPI_SUM);
+    }
+}
+
 template <typename ValueType>
 ValueType& Vector<ValueType>::at_local(size_type row, size_type col) noexcept
 {
     return local_.at(row, col);
 }
 
+
 template <typename ValueType>
 ValueType Vector<ValueType>::at_local(size_type row,
                                       size_type col) const noexcept
@@ -586,6 +628,7 @@ ValueType Vector<ValueType>::at_local(size_type row,
     return local_.at(row, col);
 }
 
+
 template <typename ValueType>
 ValueType& Vector<ValueType>::at_local(size_type idx) noexcept
 {
diff --git a/core/factorization/factorization.cpp b/core/factorization/factorization.cpp
index d38d18ca3e5..5877124bf77 100644
--- a/core/factorization/factorization.cpp
+++ b/core/factorization/factorization.cpp
@@ -33,18 +33,89 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/factorization/factorization.hpp>
 
 
+#include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "core/factorization/factorization_kernels.hpp"
+
+
 namespace gko {
 namespace experimental {
 namespace factorization {
+namespace {
+
+
+GKO_REGISTER_OPERATION(initialize_row_ptrs_l_u,
+                       factorization::initialize_row_ptrs_l_u);
+GKO_REGISTER_OPERATION(initialize_l_u, factorization::initialize_l_u);
+GKO_REGISTER_OPERATION(initialize_row_ptrs_l,
+                       factorization::initialize_row_ptrs_l);
+GKO_REGISTER_OPERATION(initialize_l, factorization::initialize_l);
+
+
+}  // namespace
 
 
 template <typename ValueType, typename IndexType>
 std::unique_ptr<Factorization<ValueType, IndexType>>
-Factorization<ValueType, IndexType>::unpack() const GKO_NOT_IMPLEMENTED;
+Factorization<ValueType, IndexType>::unpack() const
+{
+    const auto exec = this->get_executor();
+    const auto size = this->get_size();
+    switch (this->get_storage_type()) {
+    case storage_type::empty:
+        GKO_NOT_SUPPORTED(nullptr);
+    case storage_type::composition:
+    case storage_type::symm_composition:
+        return this->clone();
+    case storage_type::combined_lu: {
+        // count nonzeros
+        array<index_type> l_row_ptrs{exec, size[0] + 1};
+        array<index_type> u_row_ptrs{exec, size[0] + 1};
+        const auto mtx = this->get_combined();
+        exec->run(make_initialize_row_ptrs_l_u(mtx.get(), l_row_ptrs.get_data(),
+                                               u_row_ptrs.get_data()));
+        const auto l_nnz = static_cast<size_type>(
+            exec->copy_val_to_host(l_row_ptrs.get_const_data() + size[0]));
+        const auto u_nnz = static_cast<size_type>(
+            exec->copy_val_to_host(u_row_ptrs.get_const_data() + size[0]));
+        // create matrices
+        auto l_mtx = matrix_type::create(
+            exec, size, array<value_type>{exec, l_nnz},
+            array<index_type>{exec, l_nnz}, std::move(l_row_ptrs));
+        auto u_mtx = matrix_type::create(
+            exec, size, array<value_type>{exec, u_nnz},
+            array<index_type>{exec, u_nnz}, std::move(u_row_ptrs));
+        // fill matrices
+        exec->run(make_initialize_l_u(mtx.get(), l_mtx.get(), u_mtx.get()));
+        return create_from_composition(
+            composition_type::create(std::move(l_mtx), std::move(u_mtx)));
+    }
+    case storage_type::symm_combined_cholesky: {
+        // count nonzeros
+        array<index_type> l_row_ptrs{exec, size[0] + 1};
+        const auto mtx = this->get_combined();
+        exec->run(make_initialize_row_ptrs_l(mtx.get(), l_row_ptrs.get_data()));
+        const auto l_nnz = static_cast<size_type>(
+            exec->copy_val_to_host(l_row_ptrs.get_const_data() + size[0]));
+        // create matrices
+        auto l_mtx = matrix_type::create(
+            exec, size, array<value_type>{exec, l_nnz},
+            array<index_type>{exec, l_nnz}, std::move(l_row_ptrs));
+        // fill matrices
+        exec->run(make_initialize_l(mtx.get(), l_mtx.get(), false));
+        auto u_mtx = l_mtx->conj_transpose();
+        return create_from_symm_composition(
+            composition_type::create(std::move(l_mtx), std::move(u_mtx)));
+    }
+    case storage_type::combined_ldu:
+    case storage_type::symm_combined_ldl:
+    default:
+        GKO_NOT_IMPLEMENTED;
+    }
+}
 
 
 template <typename ValueType, typename IndexType>
@@ -58,7 +129,7 @@ template <typename ValueType, typename IndexType>
 std::shared_ptr<const gko::matrix::Csr<ValueType, IndexType>>
 Factorization<ValueType, IndexType>::get_lower_factor() const
 {
-    switch (storage_type_) {
+    switch (this->get_storage_type()) {
     case storage_type::composition:
     case storage_type::symm_composition:
         GKO_ASSERT(factors_->get_operators().size() == 2 ||
diff --git a/core/factorization/lu.cpp b/core/factorization/lu.cpp
index 47f2711c4c4..fecc9bc9425 100644
--- a/core/factorization/lu.cpp
+++ b/core/factorization/lu.cpp
@@ -60,6 +60,8 @@ GKO_REGISTER_OPERATION(factorize, lu_factorization::factorize);
 GKO_REGISTER_HOST_OPERATION(symbolic_cholesky,
                             gko::factorization::symbolic_cholesky);
 GKO_REGISTER_HOST_OPERATION(symbolic_lu, gko::factorization::symbolic_lu);
+GKO_REGISTER_HOST_OPERATION(symbolic_lu_near_symm,
+                            gko::factorization::symbolic_lu_near_symm);
 
 
 }  // namespace
@@ -95,12 +97,21 @@ std::unique_ptr<LinOp> Lu<ValueType, IndexType>::generate_impl(
     const auto num_rows = mtx->get_size()[0];
     std::unique_ptr<matrix_type> factors;
     if (!parameters_.symbolic_factorization) {
-        if (parameters_.symmetric_sparsity) {
+        switch (parameters_.symbolic_algorithm) {
+        case symbolic_type::general:
+            exec->run(make_symbolic_lu(mtx.get(), factors));
+            break;
+        case symbolic_type::near_symmetric:
+            exec->run(make_symbolic_lu_near_symm(mtx.get(), factors));
+            break;
+        case symbolic_type::symmetric: {
             std::unique_ptr<gko::factorization::elimination_forest<IndexType>>
                 forest;
             exec->run(make_symbolic_cholesky(mtx.get(), true, factors, forest));
-        } else {
-            exec->run(make_symbolic_lu(mtx.get(), factors));
+            break;
+        }
+        default:
+            GKO_INVALID_STATE("Invalid symbolic factorization algorithm");
         }
     } else {
         const auto& symbolic = parameters_.symbolic_factorization;
diff --git a/core/factorization/lu_kernels.hpp b/core/factorization/lu_kernels.hpp
index d3e7aea8f08..1aae4a31479 100644
--- a/core/factorization/lu_kernels.hpp
+++ b/core/factorization/lu_kernels.hpp
@@ -66,11 +66,30 @@ namespace kernels {
                    array<int>& tmp_storage)
 
 
-#define GKO_DECLARE_ALL_AS_TEMPLATES                  \
-    template <typename ValueType, typename IndexType> \
-    GKO_DECLARE_LU_INITIALIZE(ValueType, IndexType);  \
-    template <typename ValueType, typename IndexType> \
-    GKO_DECLARE_LU_FACTORIZE(ValueType, IndexType)
+#define GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE(IndexType)                  \
+    void symbolic_factorize_simple(                                           \
+        std::shared_ptr<const DefaultExecutor> exec,                          \
+        const IndexType* row_ptrs, const IndexType* col_idxs,                 \
+        const IndexType* factor_lookup_offsets,                               \
+        const int64* factor_lookup_descs, const int32* factor_lookup_storage, \
+        matrix::Csr<float, IndexType>* factors, IndexType* out_row_nnz)
+
+
+#define GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE(IndexType) \
+    void symbolic_factorize_simple_finalize(                          \
+        std::shared_ptr<const DefaultExecutor> exec,                  \
+        const matrix::Csr<float, IndexType>* factors, IndexType* col_idxs)
+
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                      \
+    template <typename ValueType, typename IndexType>     \
+    GKO_DECLARE_LU_INITIALIZE(ValueType, IndexType);      \
+    template <typename ValueType, typename IndexType>     \
+    GKO_DECLARE_LU_FACTORIZE(ValueType, IndexType);       \
+    template <typename IndexType>                         \
+    GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE(IndexType); \
+    template <typename IndexType>                         \
+    GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE(IndexType)
 
 
 GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(lu_factorization,
diff --git a/core/factorization/symbolic.cpp b/core/factorization/symbolic.cpp
index f4c27ffffe6..f568ad3d603 100644
--- a/core/factorization/symbolic.cpp
+++ b/core/factorization/symbolic.cpp
@@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/factorization/symbolic.hpp"
 
 
+#include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/temporary_clone.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
@@ -44,6 +45,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/factorization/cholesky_kernels.hpp"
 #include "core/factorization/elimination_forest.hpp"
 #include "core/factorization/lu_kernels.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/matrix/csr_lookup.hpp"
 
 
 namespace gko {
@@ -53,10 +56,14 @@ namespace {
 
 GKO_REGISTER_OPERATION(symbolic_count, cholesky::symbolic_count);
 GKO_REGISTER_OPERATION(symbolic, cholesky::symbolic_factorize);
+GKO_REGISTER_OPERATION(build_lookup_offsets, csr::build_lookup_offsets);
+GKO_REGISTER_OPERATION(build_lookup, csr::build_lookup);
 GKO_REGISTER_OPERATION(prefix_sum_nonnegative,
                        components::prefix_sum_nonnegative);
-GKO_REGISTER_OPERATION(initialize, lu_factorization::initialize);
-GKO_REGISTER_OPERATION(factorize, lu_factorization::factorize);
+GKO_REGISTER_OPERATION(symbolic_factorize_simple,
+                       lu_factorization::symbolic_factorize_simple);
+GKO_REGISTER_OPERATION(symbolic_factorize_simple_finalize,
+                       lu_factorization::symbolic_factorize_simple_finalize);
 GKO_REGISTER_HOST_OPERATION(compute_elim_forest, compute_elim_forest);
 
 
@@ -70,6 +77,7 @@ void symbolic_cholesky(
     std::unique_ptr<elimination_forest<IndexType>>& forest)
 {
     using matrix_type = matrix::Csr<ValueType, IndexType>;
+    GKO_ASSERT_IS_SQUARE_MATRIX(mtx);
     const auto exec = mtx->get_executor();
     const auto host_exec = exec->get_master();
     exec->run(make_compute_elim_forest(mtx, forest));
@@ -104,6 +112,85 @@ void symbolic_cholesky(
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SYMBOLIC_CHOLESKY);
 
 
+template <typename ValueType, typename IndexType>
+void symbolic_lu_near_symm(
+    const matrix::Csr<ValueType, IndexType>* mtx,
+    std::unique_ptr<matrix::Csr<ValueType, IndexType>>& factors)
+{
+    using matrix_type = matrix::Csr<ValueType, IndexType>;
+    using float_matrix_type = matrix::Csr<float, IndexType>;
+    using scalar_type = gko::matrix::Dense<float>;
+    using id_type = gko::matrix::Identity<float>;
+    GKO_ASSERT_IS_SQUARE_MATRIX(mtx);
+    const auto exec = mtx->get_executor();
+    const auto size = mtx->get_size();
+    std::unique_ptr<float_matrix_type> symm_factors;
+    {
+        const auto nnz = mtx->get_num_stored_elements();
+        // turn the input matrix into a symbolic float matrix
+        array<float> dummy_values{exec, nnz};
+        const auto float_mtx = float_matrix_type::create_const(
+            exec, size, dummy_values.as_const_view(),
+            make_const_array_view(exec, nnz, mtx->get_const_col_idxs()),
+            make_const_array_view(exec, size[0] + 1,
+                                  mtx->get_const_row_ptrs()));
+        // compute A + A^T symbolically
+        const auto scalar = gko::initialize<scalar_type>({one<float>()}, exec);
+        const auto symm_mtx = as<float_matrix_type>(float_mtx->transpose());
+        const auto id = id_type::create(exec, size);
+        float_mtx->apply(scalar, id, scalar, symm_mtx);
+        // compute Cholesky factorization
+        std::unique_ptr<elimination_forest<IndexType>> forest;
+        symbolic_cholesky(symm_mtx.get(), true, symm_factors, forest);
+    }
+    // build lookup structure
+    array<IndexType> storage_offsets{exec, size[0] + 1};
+    array<int64> row_descs{exec, size[0]};
+    array<IndexType> diag_idxs{exec, size[0]};
+    const auto allowed_sparsity = gko::matrix::csr::sparsity_type::bitmap |
+                                  gko::matrix::csr::sparsity_type::full |
+                                  gko::matrix::csr::sparsity_type::hash;
+    exec->run(make_build_lookup_offsets(
+        symm_factors->get_const_row_ptrs(), symm_factors->get_const_col_idxs(),
+        size[0], allowed_sparsity, storage_offsets.get_data()));
+    const auto storage_size = static_cast<size_type>(
+        exec->copy_val_to_host(storage_offsets.get_const_data() + size[0]));
+    array<int32> storage{exec, storage_size};
+    exec->run(make_build_lookup(
+        symm_factors->get_const_row_ptrs(), symm_factors->get_const_col_idxs(),
+        size[0], allowed_sparsity, storage_offsets.get_const_data(),
+        row_descs.get_data(), storage.get_data()));
+    // compute "numerical" factorization with 1s and 0s
+    array<IndexType> factor_row_ptrs{exec, size[0] + 1};
+    exec->run(make_symbolic_factorize_simple(
+        mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
+        storage_offsets.get_const_data(), row_descs.get_const_data(),
+        storage.get_const_data(), symm_factors.get(),
+        factor_row_ptrs.get_data()));
+    // build row pointers from nnz
+    exec->run(
+        make_prefix_sum_nonnegative(factor_row_ptrs.get_data(), size[0] + 1));
+    const auto factor_nnz = static_cast<size_type>(
+        exec->copy_val_to_host(factor_row_ptrs.get_const_data() + size[0]));
+    // copy over nonzero columns
+    array<IndexType> factor_cols{exec, factor_nnz};
+    exec->run(make_symbolic_factorize_simple_finalize(symm_factors.get(),
+                                                      factor_cols.get_data()));
+    factors =
+        matrix_type::create(exec, size, array<ValueType>{exec, factor_nnz},
+                            std::move(factor_cols), std::move(factor_row_ptrs));
+}
+
+
+#define GKO_DECLARE_SYMBOLIC_LU_NEAR_SYMM(ValueType, IndexType) \
+    void symbolic_lu_near_symm(                                 \
+        const matrix::Csr<ValueType, IndexType>* mtx,           \
+        std::unique_ptr<matrix::Csr<ValueType, IndexType>>& factors)
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SYMBOLIC_LU_NEAR_SYMM);
+
+
 template <typename ValueType, typename IndexType>
 void symbolic_lu(const matrix::Csr<ValueType, IndexType>* mtx,
                  std::unique_ptr<matrix::Csr<ValueType, IndexType>>& factors)
@@ -189,5 +276,6 @@ void symbolic_lu(const matrix::Csr<ValueType, IndexType>* mtx,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_SYMBOLIC_LU);
 
+
 }  // namespace factorization
 }  // namespace gko
diff --git a/core/factorization/symbolic.hpp b/core/factorization/symbolic.hpp
index 492a0b0ee40..ea2ee23419b 100644
--- a/core/factorization/symbolic.hpp
+++ b/core/factorization/symbolic.hpp
@@ -69,6 +69,23 @@ template <typename ValueType, typename IndexType>
 void symbolic_lu(const matrix::Csr<ValueType, IndexType>* mtx,
                  std::unique_ptr<matrix::Csr<ValueType, IndexType>>& factors);
 
+/**
+ * Computes the symbolic LU factorization of the given, nearly symmetric matrix.
+ *
+ * The implementation uses a symbolic Cholesky factorization of A + A^T and
+ * computes which entries of the resulting matrix are part of the LU
+ * factorization using a kernel similar to the numerical factorization.
+ * It works best if the amount of fill-in for A + A^T is similar to the amount
+ * of fill-in for A.
+ *
+ * @param mtx  the input matrix
+ * @param factors  the output factors stored in a combined pattern
+ */
+template <typename ValueType, typename IndexType>
+void symbolic_lu_near_symm(
+    const matrix::Csr<ValueType, IndexType>* mtx,
+    std::unique_ptr<matrix::Csr<ValueType, IndexType>>& factors);
+
 
 }  // namespace factorization
 }  // namespace gko
diff --git a/core/log/batch_logger.cpp b/core/log/batch_logger.cpp
new file mode 100644
index 00000000000..22726c44e9d
--- /dev/null
+++ b/core/log/batch_logger.cpp
@@ -0,0 +1,70 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/log/batch_logger.hpp>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+
+namespace gko {
+namespace batch {
+namespace log {
+
+
+template <typename ValueType>
+void BatchConvergence<ValueType>::on_batch_solver_completed(
+    const array<int>& iteration_count,
+    const array<remove_complex<ValueType>>& residual_norm) const
+{
+    if (this->iteration_count_.get_num_elems() == 0) {
+        this->iteration_count_ = gko::array<int>(
+            iteration_count.get_executor(), iteration_count.get_num_elems());
+    }
+    if (this->residual_norm_.get_num_elems() == 0) {
+        this->residual_norm_ = gko::array<remove_complex<ValueType>>(
+            residual_norm.get_executor(), residual_norm.get_num_elems());
+    }
+    this->iteration_count_ = iteration_count;
+    this->residual_norm_ = residual_norm;
+}
+
+
+#define GKO_DECLARE_BATCH_CONVERGENCE(_type) class BatchConvergence<_type>
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CONVERGENCE);
+
+
+}  // namespace log
+}  // namespace batch
+}  // namespace gko
diff --git a/core/log/logger.cpp b/core/log/logger.cpp
index 81f75842474..3cccb66d34c 100644
--- a/core/log/logger.cpp
+++ b/core/log/logger.cpp
@@ -43,6 +43,7 @@ constexpr Logger::mask_type Logger::operation_events_mask;
 constexpr Logger::mask_type Logger::polymorphic_object_events_mask;
 constexpr Logger::mask_type Logger::linop_events_mask;
 constexpr Logger::mask_type Logger::linop_factory_events_mask;
+constexpr Logger::mask_type Logger::batch_linop_factory_events_mask;
 constexpr Logger::mask_type Logger::criterion_events_mask;
 
 constexpr Logger::mask_type Logger::allocation_started_mask;
@@ -74,6 +75,8 @@ constexpr Logger::mask_type Logger::linop_factory_generate_completed_mask;
 constexpr Logger::mask_type Logger::criterion_check_started_mask;
 constexpr Logger::mask_type Logger::criterion_check_completed_mask;
 
+constexpr Logger::mask_type Logger::batch_solver_completed_mask;
+
 constexpr Logger::mask_type Logger::iteration_complete_mask;
 
 
diff --git a/core/log/tau.cpp b/core/log/tau.cpp
index 62b68732de1..e1b29c9c953 100644
--- a/core/log/tau.cpp
+++ b/core/log/tau.cpp
@@ -30,16 +30,15 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/log/profiler_hook.hpp>
+
 #if GKO_HAVE_TAU
 #define PERFSTUBS_USE_TIMERS
 #include <perfstubs_api/timer.h>
 #endif
 
-
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/log/profiler_hook.hpp>
-
-
 namespace gko {
 namespace log {
 
@@ -56,7 +55,7 @@ void begin_tau(const char* name, profile_event_category)
 }
 
 
-void end_tau(const char*, profile_event_category)
+void end_tau(const char* name, profile_event_category)
 {
     PERFSTUBS_STOP_STRING(name);
 }
diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp
new file mode 100644
index 00000000000..58c7fa25cea
--- /dev/null
+++ b/core/matrix/batch_dense.cpp
@@ -0,0 +1,222 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+
+#include <algorithm>
+#include <type_traits>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/matrix/batch_dense_kernels.hpp"
+
+
+namespace gko {
+namespace batch {
+namespace matrix {
+namespace dense {
+namespace {
+
+
+GKO_REGISTER_OPERATION(simple_apply, batch_dense::simple_apply);
+GKO_REGISTER_OPERATION(advanced_apply, batch_dense::advanced_apply);
+
+
+}  // namespace
+}  // namespace dense
+
+
+template <typename ValueType>
+std::unique_ptr<gko::matrix::Dense<ValueType>>
+Dense<ValueType>::create_view_for_item(size_type item_id)
+{
+    auto exec = this->get_executor();
+    auto num_rows = this->get_common_size()[0];
+    auto stride = this->get_common_size()[1];
+    auto mat = unbatch_type::create(
+        exec, this->get_common_size(),
+        make_array_view(exec, num_rows * stride,
+                        this->get_values_for_item(item_id)),
+        stride);
+    return mat;
+}
+
+
+template <typename ValueType>
+std::unique_ptr<const gko::matrix::Dense<ValueType>>
+Dense<ValueType>::create_const_view_for_item(size_type item_id) const
+{
+    auto exec = this->get_executor();
+    auto num_rows = this->get_common_size()[0];
+    auto stride = this->get_common_size()[1];
+    auto mat = unbatch_type::create_const(
+        exec, this->get_common_size(),
+        make_const_array_view(exec, num_rows * stride,
+                              this->get_const_values_for_item(item_id)),
+        stride);
+    return mat;
+}
+
+
+template <typename ValueType>
+std::unique_ptr<const Dense<ValueType>> Dense<ValueType>::create_const(
+    std::shared_ptr<const Executor> exec, const batch_dim<2>& sizes,
+    gko::detail::const_array_view<ValueType>&& values)
+{
+    // cast const-ness away, but return a const object afterwards,
+    // so we can ensure that no modifications take place.
+    return std::unique_ptr<const Dense>(new Dense{
+        exec, sizes, gko::detail::array_const_cast(std::move(values))});
+}
+
+
+template <typename ValueType>
+Dense<ValueType>::Dense(std::shared_ptr<const Executor> exec,
+                        const batch_dim<2>& size)
+    : EnableBatchLinOp<Dense<ValueType>>(exec, size),
+      values_(exec, compute_num_elems(size))
+{}
+
+
+template <typename ValueType>
+Dense<ValueType>* Dense<ValueType>::apply(
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<MultiVector<ValueType>> x)
+{
+    this->validate_application_parameters(b.get(), x.get());
+    auto exec = this->get_executor();
+    this->apply_impl(make_temporary_clone(exec, b).get(),
+                     make_temporary_clone(exec, x).get());
+    return this;
+}
+
+
+template <typename ValueType>
+const Dense<ValueType>* Dense<ValueType>::apply(
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<MultiVector<ValueType>> x) const
+{
+    this->validate_application_parameters(b.get(), x.get());
+    auto exec = this->get_executor();
+    this->apply_impl(make_temporary_clone(exec, b).get(),
+                     make_temporary_clone(exec, x).get());
+    return this;
+}
+
+
+template <typename ValueType>
+Dense<ValueType>* Dense<ValueType>::apply(
+    ptr_param<const MultiVector<ValueType>> alpha,
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<const MultiVector<ValueType>> beta,
+    ptr_param<MultiVector<ValueType>> x)
+{
+    this->validate_application_parameters(alpha.get(), b.get(), beta.get(),
+                                          x.get());
+    auto exec = this->get_executor();
+    this->apply_impl(make_temporary_clone(exec, alpha).get(),
+                     make_temporary_clone(exec, b).get(),
+                     make_temporary_clone(exec, beta).get(),
+                     make_temporary_clone(exec, x).get());
+    return this;
+}
+
+
+template <typename ValueType>
+const Dense<ValueType>* Dense<ValueType>::apply(
+    ptr_param<const MultiVector<ValueType>> alpha,
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<const MultiVector<ValueType>> beta,
+    ptr_param<MultiVector<ValueType>> x) const
+{
+    this->validate_application_parameters(alpha.get(), b.get(), beta.get(),
+                                          x.get());
+    auto exec = this->get_executor();
+    this->apply_impl(make_temporary_clone(exec, alpha).get(),
+                     make_temporary_clone(exec, b).get(),
+                     make_temporary_clone(exec, beta).get(),
+                     make_temporary_clone(exec, x).get());
+    return this;
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::apply_impl(const MultiVector<ValueType>* b,
+                                  MultiVector<ValueType>* x) const
+{
+    this->get_executor()->run(dense::make_simple_apply(this, b, x));
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::apply_impl(const MultiVector<ValueType>* alpha,
+                                  const MultiVector<ValueType>* b,
+                                  const MultiVector<ValueType>* beta,
+                                  MultiVector<ValueType>* x) const
+{
+    this->get_executor()->run(
+        dense::make_advanced_apply(alpha, this, b, beta, x));
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::convert_to(
+    Dense<next_precision<ValueType>>* result) const
+{
+    result->values_ = this->values_;
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::move_to(Dense<next_precision<ValueType>>* result)
+{
+    this->convert_to(result);
+}
+
+
+#define GKO_DECLARE_BATCH_DENSE_MATRIX(_type) class Dense<_type>
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_MATRIX);
+
+
+}  // namespace matrix
+}  // namespace batch
+}  // namespace gko
diff --git a/core/matrix/batch_dense_kernels.hpp b/core/matrix/batch_dense_kernels.hpp
new file mode 100644
index 00000000000..ef59ff3e9cc
--- /dev/null
+++ b/core/matrix/batch_dense_kernels.hpp
@@ -0,0 +1,83 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_MATRIX_BATCH_DENSE_KERNELS_HPP_
+#define GKO_CORE_MATRIX_BATCH_DENSE_KERNELS_HPP_
+
+
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+#include "core/base/kernel_declaration.hpp"
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL(_type)         \
+    void simple_apply(std::shared_ptr<const DefaultExecutor> exec, \
+                      const batch::matrix::Dense<_type>* a,        \
+                      const batch::MultiVector<_type>* b,          \
+                      batch::MultiVector<_type>* c)
+
+#define GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL(_type)         \
+    void advanced_apply(std::shared_ptr<const DefaultExecutor> exec, \
+                        const batch::MultiVector<_type>* alpha,      \
+                        const batch::matrix::Dense<_type>* a,        \
+                        const batch::MultiVector<_type>* b,          \
+                        const batch::MultiVector<_type>* beta,       \
+                        batch::MultiVector<_type>* c)
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                        \
+    template <typename ValueType>                           \
+    GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL(ValueType); \
+    template <typename ValueType>                           \
+    GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL(ValueType)
+
+
+GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(batch_dense,
+                                        GKO_DECLARE_ALL_AS_TEMPLATES);
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CORE_MATRIX_BATCH_DENSE_KERNELS_HPP_
diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp
new file mode 100644
index 00000000000..88863a05dd4
--- /dev/null
+++ b/core/matrix/batch_ell.cpp
@@ -0,0 +1,238 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include <algorithm>
+#include <type_traits>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/matrix/ell.hpp>
+
+
+#include "core/matrix/batch_ell_kernels.hpp"
+
+
+namespace gko {
+namespace batch {
+namespace matrix {
+namespace ell {
+namespace {
+
+
+GKO_REGISTER_OPERATION(simple_apply, batch_ell::simple_apply);
+GKO_REGISTER_OPERATION(advanced_apply, batch_ell::advanced_apply);
+
+
+}  // namespace
+}  // namespace ell
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<gko::matrix::Ell<ValueType, IndexType>>
+Ell<ValueType, IndexType>::create_view_for_item(size_type item_id)
+{
+    auto exec = this->get_executor();
+    auto num_rows = this->get_common_size()[0];
+    auto stride = this->get_common_size()[0];
+    auto mat = unbatch_type::create(
+        exec, this->get_common_size(),
+        make_array_view(exec, this->get_num_elements_per_item(),
+                        this->get_values_for_item(item_id)),
+        make_array_view(exec, this->get_num_elements_per_item(),
+                        this->get_col_idxs()),
+        this->get_num_stored_elements_per_row(), stride);
+    return mat;
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<const gko::matrix::Ell<ValueType, IndexType>>
+Ell<ValueType, IndexType>::create_const_view_for_item(size_type item_id) const
+{
+    auto exec = this->get_executor();
+    auto num_rows = this->get_common_size()[0];
+    auto stride = this->get_common_size()[0];
+    auto mat = unbatch_type::create_const(
+        exec, this->get_common_size(),
+        make_const_array_view(exec, this->get_num_elements_per_item(),
+                              this->get_const_values_for_item(item_id)),
+        make_const_array_view(exec, this->get_num_elements_per_item(),
+                              this->get_const_col_idxs()),
+        this->get_num_stored_elements_per_row(), stride);
+    return mat;
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<const Ell<ValueType, IndexType>>
+Ell<ValueType, IndexType>::create_const(
+    std::shared_ptr<const Executor> exec, const batch_dim<2>& sizes,
+    const IndexType num_elems_per_row,
+    gko::detail::const_array_view<ValueType>&& values,
+    gko::detail::const_array_view<IndexType>&& col_idxs)
+{
+    // cast const-ness away, but return a const object afterwards,
+    // so we can ensure that no modifications take place.
+    return std::unique_ptr<const Ell>(
+        new Ell{exec, sizes, num_elems_per_row,
+                gko::detail::array_const_cast(std::move(values)),
+                gko::detail::array_const_cast(std::move(col_idxs))});
+}
+
+
+template <typename ValueType, typename IndexType>
+Ell<ValueType, IndexType>::Ell(std::shared_ptr<const Executor> exec,
+                               const batch_dim<2>& size,
+                               IndexType num_elems_per_row)
+    : EnableBatchLinOp<Ell<ValueType, IndexType>>(exec, size),
+      num_elems_per_row_(num_elems_per_row == 0 ? size.get_common_size()[1]
+                                                : num_elems_per_row),
+      values_(exec, compute_num_elems(size, num_elems_per_row_)),
+      col_idxs_(exec, this->get_common_size()[0] * num_elems_per_row_)
+{}
+
+
+template <typename ValueType, typename IndexType>
+Ell<ValueType, IndexType>* Ell<ValueType, IndexType>::apply(
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<MultiVector<ValueType>> x)
+{
+    this->validate_application_parameters(b.get(), x.get());
+    auto exec = this->get_executor();
+    this->apply_impl(make_temporary_clone(exec, b).get(),
+                     make_temporary_clone(exec, x).get());
+    return this;
+}
+
+
+template <typename ValueType, typename IndexType>
+const Ell<ValueType, IndexType>* Ell<ValueType, IndexType>::apply(
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<MultiVector<ValueType>> x) const
+{
+    this->validate_application_parameters(b.get(), x.get());
+    auto exec = this->get_executor();
+    this->apply_impl(make_temporary_clone(exec, b).get(),
+                     make_temporary_clone(exec, x).get());
+    return this;
+}
+
+
+template <typename ValueType, typename IndexType>
+Ell<ValueType, IndexType>* Ell<ValueType, IndexType>::apply(
+    ptr_param<const MultiVector<ValueType>> alpha,
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<const MultiVector<ValueType>> beta,
+    ptr_param<MultiVector<ValueType>> x)
+{
+    this->validate_application_parameters(alpha.get(), b.get(), beta.get(),
+                                          x.get());
+    auto exec = this->get_executor();
+    this->apply_impl(make_temporary_clone(exec, alpha).get(),
+                     make_temporary_clone(exec, b).get(),
+                     make_temporary_clone(exec, beta).get(),
+                     make_temporary_clone(exec, x).get());
+    return this;
+}
+
+
+template <typename ValueType, typename IndexType>
+const Ell<ValueType, IndexType>* Ell<ValueType, IndexType>::apply(
+    ptr_param<const MultiVector<ValueType>> alpha,
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<const MultiVector<ValueType>> beta,
+    ptr_param<MultiVector<ValueType>> x) const
+{
+    this->validate_application_parameters(alpha.get(), b.get(), beta.get(),
+                                          x.get());
+    auto exec = this->get_executor();
+    this->apply_impl(make_temporary_clone(exec, alpha).get(),
+                     make_temporary_clone(exec, b).get(),
+                     make_temporary_clone(exec, beta).get(),
+                     make_temporary_clone(exec, x).get());
+    return this;
+}
+
+
+template <typename ValueType, typename IndexType>
+void Ell<ValueType, IndexType>::apply_impl(const MultiVector<ValueType>* b,
+                                           MultiVector<ValueType>* x) const
+{
+    this->get_executor()->run(ell::make_simple_apply(this, b, x));
+}
+
+
+template <typename ValueType, typename IndexType>
+void Ell<ValueType, IndexType>::apply_impl(const MultiVector<ValueType>* alpha,
+                                           const MultiVector<ValueType>* b,
+                                           const MultiVector<ValueType>* beta,
+                                           MultiVector<ValueType>* x) const
+{
+    this->get_executor()->run(
+        ell::make_advanced_apply(alpha, this, b, beta, x));
+}
+
+
+template <typename ValueType, typename IndexType>
+void Ell<ValueType, IndexType>::convert_to(
+    Ell<next_precision<ValueType>, IndexType>* result) const
+{
+    result->values_ = this->values_;
+    result->col_idxs_ = this->col_idxs_;
+    result->num_elems_per_row_ = this->num_elems_per_row_;
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType, typename IndexType>
+void Ell<ValueType, IndexType>::move_to(
+    Ell<next_precision<ValueType>, IndexType>* result)
+{
+    this->convert_to(result);
+}
+
+
+#define GKO_DECLARE_BATCH_ELL_MATRIX(ValueType) class Ell<ValueType, int32>
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_ELL_MATRIX);
+
+
+}  // namespace matrix
+}  // namespace batch
+}  // namespace gko
diff --git a/core/matrix/batch_ell_kernels.hpp b/core/matrix/batch_ell_kernels.hpp
new file mode 100644
index 00000000000..d3acc582f9b
--- /dev/null
+++ b/core/matrix/batch_ell_kernels.hpp
@@ -0,0 +1,84 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_MATRIX_BATCH_ELL_KERNELS_HPP_
+#define GKO_CORE_MATRIX_BATCH_ELL_KERNELS_HPP_
+
+
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+#include "core/base/kernel_declaration.hpp"
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL(_vtype, _itype)  \
+    void simple_apply(std::shared_ptr<const DefaultExecutor> exec, \
+                      const batch::matrix::Ell<_vtype, _itype>* a, \
+                      const batch::MultiVector<_vtype>* b,         \
+                      batch::MultiVector<_vtype>* c)
+
+#define GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL(_vtype, _itype)  \
+    void advanced_apply(std::shared_ptr<const DefaultExecutor> exec, \
+                        const batch::MultiVector<_vtype>* alpha,     \
+                        const batch::matrix::Ell<_vtype, _itype>* a, \
+                        const batch::MultiVector<_vtype>* b,         \
+                        const batch::MultiVector<_vtype>* beta,      \
+                        batch::MultiVector<_vtype>* c)
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                 \
+    template <typename ValueType, typename IndexType>                \
+    GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL(ValueType, IndexType); \
+    template <typename ValueType, typename IndexType>                \
+    GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL(ValueType, IndexType)
+
+
+GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(batch_ell,
+                                        GKO_DECLARE_ALL_AS_TEMPLATES);
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CORE_MATRIX_BATCH_ELL_KERNELS_HPP_
diff --git a/core/matrix/batch_identity.cpp b/core/matrix/batch_identity.cpp
new file mode 100644
index 00000000000..0e4a7b59838
--- /dev/null
+++ b/core/matrix/batch_identity.cpp
@@ -0,0 +1,143 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/batch_identity.hpp>
+
+
+#include <algorithm>
+#include <type_traits>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/batch_dim.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/matrix/identity.hpp>
+
+
+namespace gko {
+namespace batch {
+namespace matrix {
+
+
+template <typename ValueType>
+Identity<ValueType>::Identity(std::shared_ptr<const Executor> exec,
+                              const batch_dim<2>& size)
+    : EnableBatchLinOp<Identity<ValueType>>(exec, size)
+{
+    GKO_ASSERT_BATCH_HAS_SQUARE_DIMENSIONS(this->get_size());
+}
+
+
+template <typename ValueType>
+Identity<ValueType>* Identity<ValueType>::apply(
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<MultiVector<ValueType>> x)
+{
+    static_cast<const Identity*>(this)->apply(b, x);
+    return this;
+}
+
+
+template <typename ValueType>
+const Identity<ValueType>* Identity<ValueType>::apply(
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<MultiVector<ValueType>> x) const
+{
+    this->validate_application_parameters(b.get(), x.get());
+    auto exec = this->get_executor();
+    this->apply_impl(make_temporary_clone(exec, b).get(),
+                     make_temporary_clone(exec, x).get());
+    return this;
+}
+
+
+template <typename ValueType>
+Identity<ValueType>* Identity<ValueType>::apply(
+    ptr_param<const MultiVector<ValueType>> alpha,
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<const MultiVector<ValueType>> beta,
+    ptr_param<MultiVector<ValueType>> x)
+{
+    static_cast<const Identity*>(this)->apply(alpha, b, beta, x);
+    return this;
+}
+
+
+template <typename ValueType>
+const Identity<ValueType>* Identity<ValueType>::apply(
+    ptr_param<const MultiVector<ValueType>> alpha,
+    ptr_param<const MultiVector<ValueType>> b,
+    ptr_param<const MultiVector<ValueType>> beta,
+    ptr_param<MultiVector<ValueType>> x) const
+{
+    this->validate_application_parameters(alpha.get(), b.get(), beta.get(),
+                                          x.get());
+    auto exec = this->get_executor();
+    this->apply_impl(make_temporary_clone(exec, alpha).get(),
+                     make_temporary_clone(exec, b).get(),
+                     make_temporary_clone(exec, beta).get(),
+                     make_temporary_clone(exec, x).get());
+    return this;
+}
+
+
+template <typename ValueType>
+void Identity<ValueType>::apply_impl(const MultiVector<ValueType>* b,
+                                     MultiVector<ValueType>* x) const
+{
+    x->copy_from(b);
+}
+
+
+template <typename ValueType>
+void Identity<ValueType>::apply_impl(const MultiVector<ValueType>* alpha,
+                                     const MultiVector<ValueType>* b,
+                                     const MultiVector<ValueType>* beta,
+                                     MultiVector<ValueType>* x) const
+{
+    x->scale(beta);
+    x->add_scaled(alpha, b);
+}
+
+
+#define GKO_DECLARE_BATCH_IDENTITY_MATRIX(ValueType) class Identity<ValueType>
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_IDENTITY_MATRIX);
+
+
+}  // namespace matrix
+}  // namespace batch
+}  // namespace gko
diff --git a/core/matrix/batch_struct.hpp b/core/matrix/batch_struct.hpp
new file mode 100644
index 00000000000..4d5027e159e
--- /dev/null
+++ b/core/matrix/batch_struct.hpp
@@ -0,0 +1,220 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_MATRIX_BATCH_STRUCT_HPP_
+#define GKO_CORE_MATRIX_BATCH_STRUCT_HPP_
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+namespace gko {
+namespace batch {
+namespace matrix {
+namespace dense {
+
+
+/**
+ * Encapsulates one matrix from a batch of dense matrices.
+ */
+template <typename ValueType>
+struct batch_item {
+    using value_type = ValueType;
+    value_type* values;
+    int32 stride;
+    int32 num_rows;
+    int32 num_cols;
+};
+
+
+/**
+ * A 'simple' structure to store a global uniform batch of dense matrices.
+ */
+template <typename ValueType>
+struct uniform_batch {
+    using value_type = ValueType;
+    using entry_type = batch_item<ValueType>;
+
+    ValueType* values;
+    size_type num_batch_items;
+    int32 stride;
+    int32 num_rows;
+    int32 num_cols;
+
+    inline size_type get_single_item_num_nnz() const
+    {
+        return static_cast<size_type>(stride * num_rows);
+    }
+};
+
+
+}  // namespace dense
+
+
+namespace ell {
+
+
+/**
+ * Encapsulates one matrix from a batch of ell matrices.
+ */
+template <typename ValueType, typename IndexType>
+struct batch_item {
+    using value_type = ValueType;
+    using index_type = IndexType;
+
+    ValueType* values;
+    const index_type* col_idxs;
+    index_type stride;
+    index_type num_rows;
+    index_type num_cols;
+    index_type num_stored_elems_per_row;
+};
+
+
+/**
+ * A 'simple' structure to store a global uniform batch of ell matrices.
+ */
+template <typename ValueType, typename IndexType>
+struct uniform_batch {
+    using value_type = ValueType;
+    using index_type = IndexType;
+    using entry_type = batch_item<value_type, index_type>;
+
+    ValueType* values;
+    const index_type* col_idxs;
+    size_type num_batch_items;
+    index_type stride;
+    index_type num_rows;
+    index_type num_cols;
+    index_type num_stored_elems_per_row;
+
+    inline size_type get_single_item_num_nnz() const
+    {
+        return static_cast<size_type>(stride * num_stored_elems_per_row);
+    }
+};
+
+
+}  // namespace ell
+
+
+template <typename ValueType>
+GKO_ATTRIBUTES GKO_INLINE dense::batch_item<const ValueType> to_const(
+    const dense::batch_item<ValueType>& b)
+{
+    return {b.values, b.stride, b.num_rows, b.num_cols};
+}
+
+
+template <typename ValueType>
+GKO_ATTRIBUTES GKO_INLINE dense::uniform_batch<const ValueType> to_const(
+    const dense::uniform_batch<ValueType>& ub)
+{
+    return {ub.values, ub.num_batch_items, ub.stride, ub.num_rows, ub.num_cols};
+}
+
+
+template <typename ValueType>
+GKO_ATTRIBUTES GKO_INLINE dense::batch_item<ValueType> extract_batch_item(
+    const dense::uniform_batch<ValueType>& batch, const size_type batch_idx)
+{
+    return {batch.values + batch_idx * batch.stride * batch.num_rows,
+            batch.stride, batch.num_rows, batch.num_cols};
+}
+
+template <typename ValueType>
+GKO_ATTRIBUTES GKO_INLINE dense::batch_item<ValueType> extract_batch_item(
+    ValueType* const batch_values, const int32 stride, const int32 num_rows,
+    const int32 num_cols, const size_type batch_idx)
+{
+    return {batch_values + batch_idx * stride * num_rows, stride, num_rows,
+            num_cols};
+}
+
+
+template <typename ValueType, typename IndexType>
+GKO_ATTRIBUTES GKO_INLINE ell::batch_item<const ValueType, const IndexType>
+to_const(const ell::batch_item<ValueType, IndexType>& b)
+{
+    return {b.values,   b.col_idxs, b.stride,
+            b.num_rows, b.num_cols, b.num_stored_elems_per_row};
+}
+
+
+template <typename ValueType, typename IndexType>
+GKO_ATTRIBUTES GKO_INLINE ell::uniform_batch<const ValueType, const IndexType>
+to_const(const ell::uniform_batch<ValueType, IndexType>& ub)
+{
+    return {ub.values,   ub.col_idxs, ub.num_batch_items,         ub.stride,
+            ub.num_rows, ub.num_cols, ub.num_stored_elems_per_row};
+}
+
+
+template <typename ValueType, typename IndexType>
+GKO_ATTRIBUTES GKO_INLINE ell::batch_item<ValueType, IndexType>
+extract_batch_item(const ell::uniform_batch<ValueType, IndexType>& batch,
+                   const size_type batch_idx)
+{
+    return {batch.values +
+                batch_idx * batch.num_stored_elems_per_row * batch.num_rows,
+            batch.col_idxs,
+            batch.stride,
+            batch.num_rows,
+            batch.num_cols,
+            batch.num_stored_elems_per_row};
+}
+
+template <typename ValueType, typename IndexType>
+GKO_ATTRIBUTES GKO_INLINE ell::batch_item<ValueType, IndexType>
+extract_batch_item(ValueType* const batch_values,
+                   IndexType* const batch_col_idxs, const int stride,
+                   const int num_rows, const int num_cols,
+                   int num_elems_per_row, const size_type batch_idx)
+{
+    return {batch_values + batch_idx * num_elems_per_row * num_rows,
+            batch_col_idxs,
+            stride,
+            num_rows,
+            num_cols,
+            num_elems_per_row};
+}
+
+
+}  // namespace matrix
+}  // namespace batch
+}  // namespace gko
+
+
+#endif  // GKO_CORE_MATRIX_BATCH_STRUCT_HPP_
diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp
index 9a4697c1195..b34cae206f5 100644
--- a/core/matrix/csr.cpp
+++ b/core/matrix/csr.cpp
@@ -45,6 +45,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/ell.hpp>
 #include <ginkgo/core/matrix/fbcsr.hpp>
 #include <ginkgo/core/matrix/identity.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
+#include <ginkgo/core/matrix/scaled_permutation.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
@@ -57,6 +59,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/matrix/csr_kernels.hpp"
 #include "core/matrix/ell_kernels.hpp"
 #include "core/matrix/hybrid_kernels.hpp"
+#include "core/matrix/permutation.hpp"
 #include "core/matrix/sellp_kernels.hpp"
 
 
@@ -93,9 +96,15 @@ GKO_REGISTER_OPERATION(transpose, csr::transpose);
 GKO_REGISTER_OPERATION(conj_transpose, csr::conj_transpose);
 GKO_REGISTER_OPERATION(inv_symm_permute, csr::inv_symm_permute);
 GKO_REGISTER_OPERATION(row_permute, csr::row_permute);
-GKO_REGISTER_OPERATION(inverse_row_permute, csr::inverse_row_permute);
-GKO_REGISTER_OPERATION(inverse_column_permute, csr::inverse_column_permute);
-GKO_REGISTER_OPERATION(invert_permutation, csr::invert_permutation);
+GKO_REGISTER_OPERATION(inv_row_permute, csr::inv_row_permute);
+GKO_REGISTER_OPERATION(inv_col_permute, csr::inv_col_permute);
+GKO_REGISTER_OPERATION(inv_nonsymm_permute, csr::inv_nonsymm_permute);
+GKO_REGISTER_OPERATION(inv_symm_scale_permute, csr::inv_symm_scale_permute);
+GKO_REGISTER_OPERATION(row_scale_permute, csr::row_scale_permute);
+GKO_REGISTER_OPERATION(inv_row_scale_permute, csr::inv_row_scale_permute);
+GKO_REGISTER_OPERATION(inv_col_scale_permute, csr::inv_col_scale_permute);
+GKO_REGISTER_OPERATION(inv_nonsymm_scale_permute,
+                       csr::inv_nonsymm_scale_permute);
 GKO_REGISTER_OPERATION(convert_ptrs_to_sizes,
                        components::convert_ptrs_to_sizes);
 GKO_REGISTER_OPERATION(sort_by_column_index, csr::sort_by_column_index);
@@ -520,26 +529,210 @@ std::unique_ptr<LinOp> Csr<ValueType, IndexType>::conj_transpose() const
 }
 
 
+template <typename ValueType, typename IndexType>
+std::unique_ptr<Csr<ValueType, IndexType>> Csr<ValueType, IndexType>::permute(
+    ptr_param<const Permutation<IndexType>> permutation,
+    permute_mode mode) const
+{
+    const auto exec = this->get_executor();
+    const auto size = this->get_size();
+    const auto nnz = this->get_num_stored_elements();
+    validate_permute_dimensions(size, permutation->get_size(), mode);
+    if ((mode & permute_mode::symmetric) == permute_mode::none) {
+        return this->clone();
+    }
+    auto result = Csr::create(exec, size, nnz, this->get_strategy()->copy());
+    auto local_permutation = make_temporary_clone(exec, permutation);
+    std::unique_ptr<const Permutation<IndexType>> inv_permutation;
+    const auto perm_idxs = local_permutation->get_const_permutation();
+    const IndexType* inv_perm_idxs{};
+    // Due to the sparse storage, we can only inverse-permute columns, so we
+    // need to compute the inverse for forward-permutations.
+    bool needs_inverse =
+        (mode & permute_mode::inverse_columns) == permute_mode::columns;
+    if (needs_inverse) {
+        inv_permutation = local_permutation->compute_inverse();
+        inv_perm_idxs = inv_permutation->get_const_permutation();
+    }
+    switch (mode) {
+    case permute_mode::rows:
+        exec->run(csr::make_row_permute(perm_idxs, this, result.get()));
+        break;
+    case permute_mode::columns:
+        exec->run(csr::make_inv_col_permute(inv_perm_idxs, this, result.get()));
+        break;
+    case permute_mode::inverse_rows:
+        exec->run(csr::make_inv_row_permute(perm_idxs, this, result.get()));
+        break;
+    case permute_mode::inverse_columns:
+        exec->run(csr::make_inv_col_permute(perm_idxs, this, result.get()));
+        break;
+    case permute_mode::symmetric:
+        exec->run(
+            csr::make_inv_symm_permute(inv_perm_idxs, this, result.get()));
+        break;
+    case permute_mode::inverse_symmetric:
+        exec->run(csr::make_inv_symm_permute(perm_idxs, this, result.get()));
+        break;
+    default:
+        GKO_INVALID_STATE("Invalid permute mode");
+    }
+    result->make_srow();
+    if ((mode & permute_mode::columns) == permute_mode::columns) {
+        result->sort_by_column_index();
+    }
+    return result;
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<Csr<ValueType, IndexType>> Csr<ValueType, IndexType>::permute(
+    ptr_param<const Permutation<IndexType>> row_permutation,
+    ptr_param<const Permutation<IndexType>> col_permutation, bool invert) const
+{
+    const auto exec = this->get_executor();
+    const auto size = this->get_size();
+    const auto nnz = this->get_num_stored_elements();
+    GKO_ASSERT_EQUAL_ROWS(this, row_permutation);
+    GKO_ASSERT_EQUAL_COLS(this, col_permutation);
+    auto result = Csr::create(exec, size, nnz, this->get_strategy()->copy());
+    auto local_row_permutation = make_temporary_clone(exec, row_permutation);
+    auto local_col_permutation = make_temporary_clone(exec, col_permutation);
+    if (invert) {
+        exec->run(csr::make_inv_nonsymm_permute(
+            local_row_permutation->get_const_permutation(),
+            local_col_permutation->get_const_permutation(), this,
+            result.get()));
+    } else {
+        const auto inv_row_perm = local_row_permutation->compute_inverse();
+        const auto inv_col_perm = local_col_permutation->compute_inverse();
+        exec->run(csr::make_inv_nonsymm_permute(
+            inv_row_perm->get_const_permutation(),
+            inv_col_perm->get_const_permutation(), this, result.get()));
+    }
+    result->make_srow();
+    result->sort_by_column_index();
+    return result;
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<Csr<ValueType, IndexType>>
+Csr<ValueType, IndexType>::scale_permute(
+    ptr_param<const ScaledPermutation<ValueType, IndexType>> permutation,
+    permute_mode mode) const
+{
+    const auto exec = this->get_executor();
+    const auto size = this->get_size();
+    const auto nnz = this->get_num_stored_elements();
+    validate_permute_dimensions(size, permutation->get_size(), mode);
+    if ((mode & permute_mode::symmetric) == permute_mode::none) {
+        return this->clone();
+    }
+    auto result = Csr::create(exec, size, nnz, this->get_strategy()->copy());
+    auto local_permutation = make_temporary_clone(exec, permutation);
+    std::unique_ptr<const ScaledPermutation<ValueType, IndexType>>
+        inv_permutation;
+    const auto perm_idxs = local_permutation->get_const_permutation();
+    const auto scale_factors = local_permutation->get_const_scaling_factors();
+    const ValueType* inv_scale_factors{};
+    const IndexType* inv_perm_idxs{};
+    // to permute columns, we need to know the inverse permutation
+    bool needs_inverse =
+        (mode & permute_mode::inverse_columns) == permute_mode::columns;
+    if (needs_inverse) {
+        inv_permutation = local_permutation->compute_inverse();
+        inv_scale_factors = inv_permutation->get_const_scaling_factors();
+        inv_perm_idxs = inv_permutation->get_const_permutation();
+    }
+    switch (mode) {
+    case permute_mode::rows:
+        exec->run(csr::make_row_scale_permute(scale_factors, perm_idxs, this,
+                                              result.get()));
+        break;
+    case permute_mode::columns:
+        exec->run(csr::make_inv_col_scale_permute(
+            inv_scale_factors, inv_perm_idxs, this, result.get()));
+        break;
+    case permute_mode::inverse_rows:
+        exec->run(csr::make_inv_row_scale_permute(scale_factors, perm_idxs,
+                                                  this, result.get()));
+        break;
+    case permute_mode::inverse_columns:
+        exec->run(csr::make_inv_col_scale_permute(scale_factors, perm_idxs,
+                                                  this, result.get()));
+        break;
+    case permute_mode::symmetric:
+        exec->run(csr::make_inv_symm_scale_permute(
+            inv_scale_factors, inv_perm_idxs, this, result.get()));
+        break;
+    case permute_mode::inverse_symmetric:
+        exec->run(csr::make_inv_symm_scale_permute(scale_factors, perm_idxs,
+                                                   this, result.get()));
+        break;
+    default:
+        GKO_INVALID_STATE("Invalid permute mode");
+    }
+    result->make_srow();
+    if ((mode & permute_mode::columns) == permute_mode::columns) {
+        result->sort_by_column_index();
+    }
+    return result;
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<Csr<ValueType, IndexType>>
+Csr<ValueType, IndexType>::scale_permute(
+    ptr_param<const ScaledPermutation<ValueType, IndexType>> row_permutation,
+    ptr_param<const ScaledPermutation<ValueType, IndexType>> col_permutation,
+    bool invert) const
+{
+    const auto exec = this->get_executor();
+    const auto size = this->get_size();
+    const auto nnz = this->get_num_stored_elements();
+    GKO_ASSERT_EQUAL_ROWS(this, row_permutation);
+    GKO_ASSERT_EQUAL_COLS(this, col_permutation);
+    auto result = Csr::create(exec, size, nnz, this->get_strategy()->copy());
+    auto local_row_permutation = make_temporary_clone(exec, row_permutation);
+    auto local_col_permutation = make_temporary_clone(exec, col_permutation);
+    if (invert) {
+        exec->run(csr::make_inv_nonsymm_scale_permute(
+            local_row_permutation->get_const_scaling_factors(),
+            local_row_permutation->get_const_permutation(),
+            local_col_permutation->get_const_scaling_factors(),
+            local_col_permutation->get_const_permutation(), this,
+            result.get()));
+    } else {
+        const auto inv_row_perm = local_row_permutation->compute_inverse();
+        const auto inv_col_perm = local_col_permutation->compute_inverse();
+        exec->run(csr::make_inv_nonsymm_scale_permute(
+            inv_row_perm->get_const_scaling_factors(),
+            inv_row_perm->get_const_permutation(),
+            inv_col_perm->get_const_scaling_factors(),
+            inv_col_perm->get_const_permutation(), this, result.get()));
+    }
+    result->make_srow();
+    result->sort_by_column_index();
+    return result;
+}
+
+
+template <typename IndexType>
+std::unique_ptr<const Permutation<IndexType>> create_permutation_view(
+    const array<IndexType>& indices)
+{
+    return Permutation<IndexType>::create_const(indices.get_executor(),
+                                                indices.as_const_view());
+}
+
+
 template <typename ValueType, typename IndexType>
 std::unique_ptr<LinOp> Csr<ValueType, IndexType>::permute(
     const array<IndexType>* permutation_indices) const
 {
-    GKO_ASSERT_IS_SQUARE_MATRIX(this);
-    GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]);
-    auto exec = this->get_executor();
-    auto permute_cpy =
-        Csr::create(exec, this->get_size(), this->get_num_stored_elements(),
-                    this->get_strategy());
-    array<IndexType> inv_permutation(exec, this->get_size()[1]);
-
-    exec->run(csr::make_invert_permutation(
-        this->get_size()[1],
-        make_temporary_clone(exec, permutation_indices)->get_const_data(),
-        inv_permutation.get_data()));
-    exec->run(csr::make_inv_symm_permute(inv_permutation.get_const_data(), this,
-                                         permute_cpy.get()));
-    permute_cpy->make_srow();
-    return std::move(permute_cpy);
+    return permute(create_permutation_view(*permutation_indices),
+                   permute_mode::symmetric);
 }
 
 
@@ -547,18 +740,8 @@ template <typename ValueType, typename IndexType>
 std::unique_ptr<LinOp> Csr<ValueType, IndexType>::inverse_permute(
     const array<IndexType>* permutation_indices) const
 {
-    GKO_ASSERT_IS_SQUARE_MATRIX(this);
-    GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]);
-    auto exec = this->get_executor();
-    auto permute_cpy =
-        Csr::create(exec, this->get_size(), this->get_num_stored_elements(),
-                    this->get_strategy());
-
-    exec->run(csr::make_inv_symm_permute(
-        make_temporary_clone(exec, permutation_indices)->get_const_data(), this,
-        permute_cpy.get()));
-    permute_cpy->make_srow();
-    return std::move(permute_cpy);
+    return permute(create_permutation_view(*permutation_indices),
+                   permute_mode::inverse_symmetric);
 }
 
 
@@ -566,17 +749,8 @@ template <typename ValueType, typename IndexType>
 std::unique_ptr<LinOp> Csr<ValueType, IndexType>::row_permute(
     const array<IndexType>* permutation_indices) const
 {
-    GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]);
-    auto exec = this->get_executor();
-    auto permute_cpy =
-        Csr::create(exec, this->get_size(), this->get_num_stored_elements(),
-                    this->get_strategy());
-
-    exec->run(csr::make_row_permute(
-        make_temporary_clone(exec, permutation_indices)->get_const_data(), this,
-        permute_cpy.get()));
-    permute_cpy->make_srow();
-    return std::move(permute_cpy);
+    return permute(create_permutation_view(*permutation_indices),
+                   permute_mode::rows);
 }
 
 
@@ -584,22 +758,8 @@ template <typename ValueType, typename IndexType>
 std::unique_ptr<LinOp> Csr<ValueType, IndexType>::column_permute(
     const array<IndexType>* permutation_indices) const
 {
-    GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[1]);
-    auto exec = this->get_executor();
-    auto permute_cpy =
-        Csr::create(exec, this->get_size(), this->get_num_stored_elements(),
-                    this->get_strategy());
-    array<IndexType> inv_permutation(exec, this->get_size()[1]);
-
-    exec->run(csr::make_invert_permutation(
-        this->get_size()[1],
-        make_temporary_clone(exec, permutation_indices)->get_const_data(),
-        inv_permutation.get_data()));
-    exec->run(csr::make_inverse_column_permute(inv_permutation.get_const_data(),
-                                               this, permute_cpy.get()));
-    permute_cpy->make_srow();
-    permute_cpy->sort_by_column_index();
-    return std::move(permute_cpy);
+    return permute(create_permutation_view(*permutation_indices),
+                   permute_mode::columns);
 }
 
 
@@ -607,17 +767,8 @@ template <typename ValueType, typename IndexType>
 std::unique_ptr<LinOp> Csr<ValueType, IndexType>::inverse_row_permute(
     const array<IndexType>* permutation_indices) const
 {
-    GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]);
-    auto exec = this->get_executor();
-    auto inverse_permute_cpy =
-        Csr::create(exec, this->get_size(), this->get_num_stored_elements(),
-                    this->get_strategy());
-
-    exec->run(csr::make_inverse_row_permute(
-        make_temporary_clone(exec, permutation_indices)->get_const_data(), this,
-        inverse_permute_cpy.get()));
-    inverse_permute_cpy->make_srow();
-    return std::move(inverse_permute_cpy);
+    return permute(create_permutation_view(*permutation_indices),
+                   permute_mode::inverse_rows);
 }
 
 
@@ -625,18 +776,8 @@ template <typename ValueType, typename IndexType>
 std::unique_ptr<LinOp> Csr<ValueType, IndexType>::inverse_column_permute(
     const array<IndexType>* permutation_indices) const
 {
-    GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[1]);
-    auto exec = this->get_executor();
-    auto inverse_permute_cpy =
-        Csr::create(exec, this->get_size(), this->get_num_stored_elements(),
-                    this->get_strategy());
-
-    exec->run(csr::make_inverse_column_permute(
-        make_temporary_clone(exec, permutation_indices)->get_const_data(), this,
-        inverse_permute_cpy.get()));
-    inverse_permute_cpy->make_srow();
-    inverse_permute_cpy->sort_by_column_index();
-    return std::move(inverse_permute_cpy);
+    return permute(create_permutation_view(*permutation_indices),
+                   permute_mode::inverse_columns);
 }
 
 
diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp
index 42a92ca1b84..26d80f93b8b 100644
--- a/core/matrix/csr_kernels.hpp
+++ b/core/matrix/csr_kernels.hpp
@@ -146,23 +146,61 @@ namespace kernels {
                      const matrix::Csr<ValueType, IndexType>* orig, \
                      matrix::Csr<ValueType, IndexType>* row_permuted)
 
-#define GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType)    \
-    void inverse_row_permute(std::shared_ptr<const DefaultExecutor> exec,   \
-                             const IndexType* permutation_indices,          \
+#define GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL(ValueType, IndexType)    \
+    void inv_row_permute(std::shared_ptr<const DefaultExecutor> exec,   \
+                         const IndexType* permutation_indices,          \
+                         const matrix::Csr<ValueType, IndexType>* orig, \
+                         matrix::Csr<ValueType, IndexType>* row_permuted)
+
+#define GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL(ValueType, IndexType)    \
+    void inv_col_permute(std::shared_ptr<const DefaultExecutor> exec,   \
+                         const IndexType* permutation_indices,          \
+                         const matrix::Csr<ValueType, IndexType>* orig, \
+                         matrix::Csr<ValueType, IndexType>* col_permuted)
+
+#define GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL(ValueType, IndexType)    \
+    void inv_nonsymm_permute(std::shared_ptr<const DefaultExecutor> exec,   \
+                             const IndexType* row_permutation_indices,      \
+                             const IndexType* column_permutation_indices,   \
                              const matrix::Csr<ValueType, IndexType>* orig, \
-                             matrix::Csr<ValueType, IndexType>* row_permuted)
-
-#define GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType) \
-    void inverse_column_permute(                                            \
-        std::shared_ptr<const DefaultExecutor> exec,                        \
-        const IndexType* permutation_indices,                               \
-        const matrix::Csr<ValueType, IndexType>* orig,                      \
-        matrix::Csr<ValueType, IndexType>* column_permuted)
-
-#define GKO_DECLARE_INVERT_PERMUTATION_KERNEL(IndexType)             \
-    void invert_permutation(                                         \
-        std::shared_ptr<const DefaultExecutor> exec, size_type size, \
-        const IndexType* permutation_indices, IndexType* inv_permutation)
+                             matrix::Csr<ValueType, IndexType>* permuted)
+
+#define GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL(ValueType, IndexType)    \
+    void inv_symm_scale_permute(std::shared_ptr<const DefaultExecutor> exec,   \
+                                const ValueType* scale,                        \
+                                const IndexType* permutation_indices,          \
+                                const matrix::Csr<ValueType, IndexType>* orig, \
+                                matrix::Csr<ValueType, IndexType>* permuted)
+
+#define GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL(ValueType, IndexType)    \
+    void row_scale_permute(std::shared_ptr<const DefaultExecutor> exec,   \
+                           const ValueType* scale,                        \
+                           const IndexType* permutation_indices,          \
+                           const matrix::Csr<ValueType, IndexType>* orig, \
+                           matrix::Csr<ValueType, IndexType>* row_permuted)
+
+#define GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL(ValueType, IndexType)   \
+    void inv_row_scale_permute(                                              \
+        std::shared_ptr<const DefaultExecutor> exec, const ValueType* scale, \
+        const IndexType* permutation_indices,                                \
+        const matrix::Csr<ValueType, IndexType>* orig,                       \
+        matrix::Csr<ValueType, IndexType>* row_permuted)
+
+#define GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL(ValueType, IndexType)   \
+    void inv_col_scale_permute(                                              \
+        std::shared_ptr<const DefaultExecutor> exec, const ValueType* scale, \
+        const IndexType* permutation_indices,                                \
+        const matrix::Csr<ValueType, IndexType>* orig,                       \
+        matrix::Csr<ValueType, IndexType>* col_permuted)
+
+#define GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL(ValueType, IndexType) \
+    void inv_nonsymm_scale_permute(                                            \
+        std::shared_ptr<const DefaultExecutor> exec,                           \
+        const ValueType* row_scale, const IndexType* row_permutation_indices,  \
+        const ValueType* column_scale,                                         \
+        const IndexType* column_permutation_indices,                           \
+        const matrix::Csr<ValueType, IndexType>* orig,                         \
+        matrix::Csr<ValueType, IndexType>* col_permuted)
 
 #define GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL(ValueType, IndexType)  \
     void calculate_nonzeros_per_row_in_span(                                   \
@@ -251,74 +289,84 @@ namespace kernels {
                           IndexType sample_size, IndexType* result)
 
 
-#define GKO_DECLARE_ALL_AS_TEMPLATES                                       \
-    template <typename MatrixValueType, typename InputValueType,           \
-              typename OutputValueType, typename IndexType>                \
-    GKO_DECLARE_CSR_SPMV_KERNEL(MatrixValueType, InputValueType,           \
-                                OutputValueType, IndexType);               \
-    template <typename MatrixValueType, typename InputValueType,           \
-              typename OutputValueType, typename IndexType>                \
-    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL(MatrixValueType, InputValueType,  \
-                                         OutputValueType, IndexType);      \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_SPGEMM_KERNEL(ValueType, IndexType);                   \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL(ValueType, IndexType);          \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_SPGEAM_KERNEL(ValueType, IndexType);                   \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType);            \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType);         \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType);        \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL(ValueType, IndexType);           \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL(ValueType, IndexType);         \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_TRANSPOSE_KERNEL(ValueType, IndexType);                \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType);           \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL(ValueType, IndexType);         \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL(ValueType, IndexType);              \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType);      \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType);   \
-    template <typename IndexType>                                          \
-    GKO_DECLARE_INVERT_PERMUTATION_KERNEL(IndexType);                      \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL(ValueType, IndexType); \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL(ValueType, IndexType);       \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL(ValueType,        \
-                                                         IndexType);       \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL(ValueType,    \
-                                                             IndexType);   \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType);            \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX(ValueType, IndexType);       \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_EXTRACT_DIAGONAL(ValueType, IndexType);                \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_SCALE_KERNEL(ValueType, IndexType);                    \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_INV_SCALE_KERNEL(ValueType, IndexType);                \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST(ValueType, IndexType);    \
-    template <typename ValueType, typename IndexType>                      \
-    GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL(ValueType, IndexType);      \
-    template <typename IndexType>                                          \
-    GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL(IndexType);                \
-    template <typename IndexType>                                          \
-    GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType);                        \
-    template <typename IndexType>                                          \
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                        \
+    template <typename MatrixValueType, typename InputValueType,            \
+              typename OutputValueType, typename IndexType>                 \
+    GKO_DECLARE_CSR_SPMV_KERNEL(MatrixValueType, InputValueType,            \
+                                OutputValueType, IndexType);                \
+    template <typename MatrixValueType, typename InputValueType,            \
+              typename OutputValueType, typename IndexType>                 \
+    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL(MatrixValueType, InputValueType,   \
+                                         OutputValueType, IndexType);       \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_SPGEMM_KERNEL(ValueType, IndexType);                    \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL(ValueType, IndexType);           \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_SPGEAM_KERNEL(ValueType, IndexType);                    \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType);             \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType);          \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType);         \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL(ValueType, IndexType);            \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL(ValueType, IndexType);          \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_TRANSPOSE_KERNEL(ValueType, IndexType);                 \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL(ValueType, IndexType);            \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL(ValueType, IndexType);          \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL(ValueType, IndexType);               \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL(ValueType, IndexType);           \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL(ValueType, IndexType);           \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL(ValueType, IndexType);       \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL(ValueType, IndexType);    \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL(ValueType, IndexType);         \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL(ValueType, IndexType);     \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL(ValueType, IndexType);     \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL(ValueType, IndexType); \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL(ValueType, IndexType);  \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL(ValueType, IndexType);        \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL(ValueType,         \
+                                                         IndexType);        \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL(ValueType,     \
+                                                             IndexType);    \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX(ValueType, IndexType);             \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX(ValueType, IndexType);        \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_EXTRACT_DIAGONAL(ValueType, IndexType);                 \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_SCALE_KERNEL(ValueType, IndexType);                     \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_INV_SCALE_KERNEL(ValueType, IndexType);                 \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST(ValueType, IndexType);     \
+    template <typename ValueType, typename IndexType>                       \
+    GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL(ValueType, IndexType);       \
+    template <typename IndexType>                                           \
+    GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL(IndexType);                 \
+    template <typename IndexType>                                           \
+    GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType);                         \
+    template <typename IndexType>                                           \
     GKO_DECLARE_CSR_BENCHMARK_LOOKUP_KERNEL(IndexType)
 
 
diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
index 17dec93c234..72f984aa27f 100644
--- a/core/matrix/dense.cpp
+++ b/core/matrix/dense.cpp
@@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/executor.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/precision_dispatch.hpp>
+#include <ginkgo/core/base/temporary_clone.hpp>
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
@@ -50,6 +51,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/ell.hpp>
 #include <ginkgo/core/matrix/fbcsr.hpp>
 #include <ginkgo/core/matrix/hybrid.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
+#include <ginkgo/core/matrix/scaled_permutation.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
@@ -58,6 +61,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/matrix/hybrid_kernels.hpp"
+#include "core/matrix/permutation.hpp"
 
 
 namespace gko {
@@ -80,6 +84,7 @@ GKO_REGISTER_OPERATION(compute_dot, dense::compute_dot_dispatch);
 GKO_REGISTER_OPERATION(compute_conj_dot, dense::compute_conj_dot_dispatch);
 GKO_REGISTER_OPERATION(compute_norm2, dense::compute_norm2_dispatch);
 GKO_REGISTER_OPERATION(compute_norm1, dense::compute_norm1);
+GKO_REGISTER_OPERATION(compute_mean, dense::compute_mean);
 GKO_REGISTER_OPERATION(compute_squared_norm2, dense::compute_squared_norm2);
 GKO_REGISTER_OPERATION(compute_sqrt, dense::compute_sqrt);
 GKO_REGISTER_OPERATION(compute_max_nnz_per_row, dense::compute_max_nnz_per_row);
@@ -95,11 +100,22 @@ GKO_REGISTER_OPERATION(transpose, dense::transpose);
 GKO_REGISTER_OPERATION(conj_transpose, dense::conj_transpose);
 GKO_REGISTER_OPERATION(symm_permute, dense::symm_permute);
 GKO_REGISTER_OPERATION(inv_symm_permute, dense::inv_symm_permute);
+GKO_REGISTER_OPERATION(nonsymm_permute, dense::nonsymm_permute);
+GKO_REGISTER_OPERATION(inv_nonsymm_permute, dense::inv_nonsymm_permute);
 GKO_REGISTER_OPERATION(row_gather, dense::row_gather);
 GKO_REGISTER_OPERATION(advanced_row_gather, dense::advanced_row_gather);
-GKO_REGISTER_OPERATION(column_permute, dense::column_permute);
-GKO_REGISTER_OPERATION(inverse_row_permute, dense::inverse_row_permute);
-GKO_REGISTER_OPERATION(inverse_column_permute, dense::inverse_column_permute);
+GKO_REGISTER_OPERATION(col_permute, dense::col_permute);
+GKO_REGISTER_OPERATION(inverse_row_permute, dense::inv_row_permute);
+GKO_REGISTER_OPERATION(inverse_col_permute, dense::inv_col_permute);
+GKO_REGISTER_OPERATION(symm_scale_permute, dense::symm_scale_permute);
+GKO_REGISTER_OPERATION(inv_symm_scale_permute, dense::inv_symm_scale_permute);
+GKO_REGISTER_OPERATION(nonsymm_scale_permute, dense::nonsymm_scale_permute);
+GKO_REGISTER_OPERATION(inv_nonsymm_scale_permute,
+                       dense::inv_nonsymm_scale_permute);
+GKO_REGISTER_OPERATION(row_scale_permute, dense::row_scale_permute);
+GKO_REGISTER_OPERATION(col_scale_permute, dense::col_scale_permute);
+GKO_REGISTER_OPERATION(inv_row_scale_permute, dense::inv_row_scale_permute);
+GKO_REGISTER_OPERATION(inv_col_scale_permute, dense::inv_col_scale_permute);
 GKO_REGISTER_OPERATION(fill_in_matrix_data, dense::fill_in_matrix_data);
 GKO_REGISTER_OPERATION(convert_to_coo, dense::convert_to_coo);
 GKO_REGISTER_OPERATION(convert_to_csr, dense::convert_to_csr);
@@ -496,6 +512,29 @@ void Dense<ValueType>::compute_squared_norm2(ptr_param<LinOp> result,
 }
 
 
+template <typename ValueType>
+void Dense<ValueType>::compute_mean(ptr_param<LinOp> result) const
+{
+    auto exec = this->get_executor();
+    this->compute_mean_impl(make_temporary_output_clone(exec, result).get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::compute_mean(ptr_param<LinOp> result,
+                                    array<char>& tmp) const
+{
+    GKO_ASSERT_EQUAL_COLS(result, this);
+    auto exec = this->get_executor();
+    if (tmp.get_executor() != exec) {
+        tmp.clear();
+        tmp.set_executor(exec);
+    }
+    auto dense_res = make_temporary_conversion<ValueType>(result);
+    exec->run(dense::make_compute_mean(this, dense_res.get(), tmp));
+}
+
+
 template <typename ValueType>
 void Dense<ValueType>::compute_squared_norm2_impl(LinOp* result) const
 {
@@ -506,6 +545,15 @@ void Dense<ValueType>::compute_squared_norm2_impl(LinOp* result) const
 }
 
 
+template <typename ValueType>
+void Dense<ValueType>::compute_mean_impl(LinOp* result) const
+{
+    auto exec = this->get_executor();
+    array<char> tmp{exec};
+    this->compute_mean(make_temporary_output_clone(exec, result).get(), tmp);
+}
+
+
 template <typename ValueType>
 Dense<ValueType>& Dense<ValueType>::operator=(const Dense& other)
 {
@@ -1080,48 +1128,158 @@ void Dense<ValueType>::conj_transpose(ptr_param<Dense<ValueType>> output) const
 
 template <typename ValueType>
 template <typename IndexType>
-void Dense<ValueType>::permute_impl(const array<IndexType>* permutation_indices,
-                                    Dense<ValueType>* output) const
+void Dense<ValueType>::permute_impl(const Permutation<IndexType>* permutation,
+                                    permute_mode mode, Dense* output) const
 {
-    GKO_ASSERT_IS_SQUARE_MATRIX(this);
+    const auto exec = this->get_executor();
+    const auto size = this->get_size();
     GKO_ASSERT_EQUAL_DIMENSIONS(this, output);
-    GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]);
-    auto exec = this->get_executor();
-
-    exec->run(dense::make_symm_permute(
-        make_temporary_clone(exec, permutation_indices).get(), this,
-        make_temporary_output_clone(exec, output).get()));
+    validate_permute_dimensions(size, permutation->get_size(), mode);
+    if ((mode & permute_mode::symmetric) == permute_mode::none) {
+        output->copy_from(this);
+        return;
+    }
+    auto local_output = make_temporary_output_clone(exec, output);
+    auto local_perm = make_temporary_clone(exec, permutation);
+    switch (mode) {
+    case permute_mode::rows:
+        exec->run(dense::make_row_gather(local_perm->get_const_permutation(),
+                                         this, local_output.get()));
+        break;
+    case permute_mode::columns:
+        exec->run(dense::make_col_permute(local_perm->get_const_permutation(),
+                                          this, local_output.get()));
+        break;
+    case permute_mode::symmetric:
+        exec->run(dense::make_symm_permute(local_perm->get_const_permutation(),
+                                           this, local_output.get()));
+        break;
+    case permute_mode::inverse_rows:
+        exec->run(dense::make_inverse_row_permute(
+            local_perm->get_const_permutation(), this, local_output.get()));
+        break;
+    case permute_mode::inverse_columns:
+        exec->run(dense::make_inverse_col_permute(
+            local_perm->get_const_permutation(), this, local_output.get()));
+        break;
+    case permute_mode::inverse_symmetric:
+        exec->run(dense::make_inv_symm_permute(
+            local_perm->get_const_permutation(), this, local_output.get()));
+        break;
+    default:
+        GKO_INVALID_STATE("Invalid permute mode");
+    }
 }
 
 
 template <typename ValueType>
 template <typename IndexType>
-void Dense<ValueType>::inverse_permute_impl(
-    const array<IndexType>* permutation_indices, Dense<ValueType>* output) const
+void Dense<ValueType>::permute_impl(
+    const Permutation<IndexType>* row_permutation,
+    const Permutation<IndexType>* col_permutation, bool invert,
+    Dense* output) const
 {
-    GKO_ASSERT_IS_SQUARE_MATRIX(this);
-    GKO_ASSERT_EQUAL_DIMENSIONS(this, output);
-    GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]);
     auto exec = this->get_executor();
-
-    exec->run(dense::make_inv_symm_permute(
-        make_temporary_clone(exec, permutation_indices).get(), this,
-        make_temporary_output_clone(exec, output).get()));
+    auto size = this->get_size();
+    GKO_ASSERT_EQUAL_DIMENSIONS(this, output);
+    GKO_ASSERT_EQUAL_ROWS(this, row_permutation);
+    GKO_ASSERT_EQUAL_COLS(this, col_permutation);
+    auto local_output = make_temporary_output_clone(exec, output);
+    auto local_row_perm = make_temporary_clone(exec, row_permutation);
+    auto local_col_perm = make_temporary_clone(exec, col_permutation);
+    if (invert) {
+        exec->run(dense::make_inv_nonsymm_permute(
+            local_row_perm->get_const_permutation(),
+            local_col_perm->get_const_permutation(), this, local_output.get()));
+    } else {
+        exec->run(dense::make_nonsymm_permute(
+            local_row_perm->get_const_permutation(),
+            local_col_perm->get_const_permutation(), this, local_output.get()));
+    }
 }
 
 
 template <typename ValueType>
 template <typename IndexType>
-void Dense<ValueType>::row_permute_impl(
-    const array<IndexType>* permutation_indices, Dense<ValueType>* output) const
+void Dense<ValueType>::scale_permute_impl(
+    const ScaledPermutation<ValueType, IndexType>* permutation,
+    permute_mode mode, Dense* output) const
 {
-    GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]);
+    const auto exec = this->get_executor();
+    const auto size = this->get_size();
     GKO_ASSERT_EQUAL_DIMENSIONS(this, output);
-    auto exec = this->get_executor();
+    validate_permute_dimensions(size, permutation->get_size(), mode);
+    if ((mode & permute_mode::symmetric) == permute_mode::none) {
+        output->copy_from(this);
+        return;
+    }
+    auto local_output = make_temporary_output_clone(exec, output);
+    auto local_perm = make_temporary_clone(exec, permutation);
+    switch (mode) {
+    case permute_mode::rows:
+        exec->run(dense::make_row_scale_permute(
+            local_perm->get_const_scaling_factors(),
+            local_perm->get_const_permutation(), this, local_output.get()));
+        break;
+    case permute_mode::columns:
+        exec->run(dense::make_col_scale_permute(
+            local_perm->get_const_scaling_factors(),
+            local_perm->get_const_permutation(), this, local_output.get()));
+        break;
+    case permute_mode::symmetric:
+        exec->run(dense::make_symm_scale_permute(
+            local_perm->get_const_scaling_factors(),
+            local_perm->get_const_permutation(), this, local_output.get()));
+        break;
+    case permute_mode::inverse_rows:
+        exec->run(dense::make_inv_row_scale_permute(
+            local_perm->get_const_scaling_factors(),
+            local_perm->get_const_permutation(), this, local_output.get()));
+        break;
+    case permute_mode::inverse_columns:
+        exec->run(dense::make_inv_col_scale_permute(
+            local_perm->get_const_scaling_factors(),
+            local_perm->get_const_permutation(), this, local_output.get()));
+        break;
+    case permute_mode::inverse_symmetric:
+        exec->run(dense::make_inv_symm_scale_permute(
+            local_perm->get_const_scaling_factors(),
+            local_perm->get_const_permutation(), this, local_output.get()));
+        break;
+    default:
+        GKO_INVALID_STATE("Invalid permute mode");
+    }
+}
 
-    exec->run(dense::make_row_gather(
-        make_temporary_clone(exec, permutation_indices).get(), this,
-        make_temporary_output_clone(exec, output).get()));
+
+template <typename ValueType>
+template <typename IndexType>
+void Dense<ValueType>::scale_permute_impl(
+    const ScaledPermutation<ValueType, IndexType>* row_permutation,
+    const ScaledPermutation<ValueType, IndexType>* col_permutation, bool invert,
+    Dense* output) const
+{
+    auto exec = this->get_executor();
+    auto size = this->get_size();
+    GKO_ASSERT_EQUAL_DIMENSIONS(this, output);
+    GKO_ASSERT_EQUAL_ROWS(this, row_permutation);
+    GKO_ASSERT_EQUAL_COLS(this, col_permutation);
+    auto local_output = make_temporary_output_clone(exec, output);
+    auto local_row_perm = make_temporary_clone(exec, row_permutation);
+    auto local_col_perm = make_temporary_clone(exec, col_permutation);
+    if (invert) {
+        exec->run(dense::make_inv_nonsymm_scale_permute(
+            local_row_perm->get_const_scaling_factors(),
+            local_row_perm->get_const_permutation(),
+            local_col_perm->get_const_scaling_factors(),
+            local_col_perm->get_const_permutation(), this, local_output.get()));
+    } else {
+        exec->run(dense::make_nonsymm_scale_permute(
+            local_row_perm->get_const_scaling_factors(),
+            local_row_perm->get_const_permutation(),
+            local_col_perm->get_const_scaling_factors(),
+            local_col_perm->get_const_permutation(), this, local_output.get()));
+    }
 }
 
 
@@ -1135,7 +1293,7 @@ void Dense<ValueType>::row_gather_impl(const array<IndexType>* row_idxs,
     GKO_ASSERT_EQUAL_DIMENSIONS(expected_dim, row_collection);
 
     exec->run(dense::make_row_gather(
-        make_temporary_clone(exec, row_idxs).get(), this,
+        make_temporary_clone(exec, row_idxs)->get_const_data(), this,
         make_temporary_output_clone(exec, row_collection).get()));
 }
 
@@ -1152,82 +1310,129 @@ void Dense<ValueType>::row_gather_impl(const Dense<ValueType>* alpha,
 
     exec->run(dense::make_advanced_row_gather(
         make_temporary_clone(exec, alpha).get(),
-        make_temporary_clone(exec, row_idxs).get(), this,
+        make_temporary_clone(exec, row_idxs)->get_const_data(), this,
         make_temporary_clone(exec, beta).get(),
         make_temporary_clone(exec, row_collection).get()));
 }
 
 
 template <typename ValueType>
-template <typename IndexType>
-void Dense<ValueType>::column_permute_impl(
-    const array<IndexType>* permutation_indices, Dense<ValueType>* output) const
+std::unique_ptr<LinOp> Dense<ValueType>::permute(
+    const array<int32>* permutation_indices) const
 {
-    GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[1]);
-    GKO_ASSERT_EQUAL_DIMENSIONS(this, output);
-    auto exec = this->get_executor();
-
-    exec->run(dense::make_column_permute(
-        make_temporary_clone(exec, permutation_indices).get(), this,
-        make_temporary_output_clone(exec, output).get()));
+    auto result = Dense::create(this->get_executor(), this->get_size());
+    this->permute(permutation_indices, result);
+    return result;
 }
 
 
 template <typename ValueType>
-template <typename IndexType>
-void Dense<ValueType>::inverse_row_permute_impl(
-    const array<IndexType>* permutation_indices, Dense<ValueType>* output) const
+std::unique_ptr<LinOp> Dense<ValueType>::permute(
+    const array<int64>* permutation_indices) const
 {
-    GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[0]);
-    GKO_ASSERT_EQUAL_DIMENSIONS(this, output);
-    auto exec = this->get_executor();
-
-    exec->run(dense::make_inverse_row_permute(
-        make_temporary_clone(exec, permutation_indices).get(), this,
-        make_temporary_output_clone(exec, output).get()));
+    auto result = Dense::create(this->get_executor(), this->get_size());
+    this->permute(permutation_indices, result);
+    return result;
 }
 
 
 template <typename ValueType>
-template <typename IndexType>
-void Dense<ValueType>::inverse_column_permute_impl(
-    const array<IndexType>* permutation_indices, Dense<ValueType>* output) const
+std::unique_ptr<Dense<ValueType>> Dense<ValueType>::permute(
+    ptr_param<const Permutation<int32>> permutation, permute_mode mode) const
 {
-    GKO_ASSERT_EQ(permutation_indices->get_num_elems(), this->get_size()[1]);
-    GKO_ASSERT_EQUAL_DIMENSIONS(this, output);
-    auto exec = this->get_executor();
+    auto result = Dense::create(this->get_executor(), this->get_size());
+    this->permute(permutation, result, mode);
+    return result;
+}
+
 
-    exec->run(dense::make_inverse_column_permute(
-        make_temporary_clone(exec, permutation_indices).get(), this,
-        make_temporary_output_clone(exec, output).get()));
+template <typename ValueType>
+std::unique_ptr<Dense<ValueType>> Dense<ValueType>::permute(
+    ptr_param<const Permutation<int64>> permutation, permute_mode mode) const
+{
+    auto result = Dense::create(this->get_executor(), this->get_size());
+    this->permute(permutation, result, mode);
+    return result;
 }
 
 
 template <typename ValueType>
-std::unique_ptr<LinOp> Dense<ValueType>::permute(
-    const array<int32>* permutation_indices) const
+std::unique_ptr<Dense<ValueType>> Dense<ValueType>::permute(
+    ptr_param<const Permutation<int32>> row_permutation,
+    ptr_param<const Permutation<int32>> col_permutation, bool invert) const
 {
     auto result = Dense::create(this->get_executor(), this->get_size());
-    this->permute(permutation_indices, result);
+    this->permute(row_permutation, col_permutation, result, invert);
     return result;
 }
 
 
 template <typename ValueType>
-std::unique_ptr<LinOp> Dense<ValueType>::permute(
-    const array<int64>* permutation_indices) const
+std::unique_ptr<Dense<ValueType>> Dense<ValueType>::permute(
+    ptr_param<const Permutation<int64>> row_permutation,
+    ptr_param<const Permutation<int64>> col_permutation, bool invert) const
 {
     auto result = Dense::create(this->get_executor(), this->get_size());
-    this->permute(permutation_indices, result);
+    this->permute(row_permutation, col_permutation, result, invert);
     return result;
 }
 
 
+template <typename ValueType>
+void Dense<ValueType>::permute(ptr_param<const Permutation<int32>> permutation,
+                               ptr_param<Dense<ValueType>> result,
+                               permute_mode mode) const
+{
+    this->permute_impl(permutation.get(), mode, result.get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::permute(ptr_param<const Permutation<int64>> permutation,
+                               ptr_param<Dense<ValueType>> result,
+                               permute_mode mode) const
+{
+    this->permute_impl(permutation.get(), mode, result.get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::permute(
+    ptr_param<const Permutation<int32>> row_permutation,
+    ptr_param<const Permutation<int32>> col_permutation,
+    ptr_param<Dense<ValueType>> result, bool invert) const
+{
+    this->permute_impl(row_permutation.get(), col_permutation.get(), invert,
+                       result.get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::permute(
+    ptr_param<const Permutation<int64>> row_permutation,
+    ptr_param<const Permutation<int64>> col_permutation,
+    ptr_param<Dense<ValueType>> result, bool invert) const
+{
+    this->permute_impl(row_permutation.get(), col_permutation.get(), invert,
+                       result.get());
+}
+
+
+template <typename IndexType>
+std::unique_ptr<const Permutation<IndexType>> create_permutation_view(
+    const array<IndexType>& indices)
+{
+    return Permutation<IndexType>::create_const(indices.get_executor(),
+                                                indices.as_const_view());
+}
+
+
 template <typename ValueType>
 void Dense<ValueType>::permute(const array<int32>* permutation_indices,
                                ptr_param<Dense<ValueType>> output) const
 {
-    this->permute_impl(permutation_indices, output.get());
+    this->permute_impl(create_permutation_view(*permutation_indices).get(),
+                       permute_mode::symmetric, output.get());
 }
 
 
@@ -1235,7 +1440,8 @@ template <typename ValueType>
 void Dense<ValueType>::permute(const array<int64>* permutation_indices,
                                ptr_param<Dense<ValueType>> output) const
 {
-    this->permute_impl(permutation_indices, output.get());
+    this->permute_impl(create_permutation_view(*permutation_indices).get(),
+                       permute_mode::symmetric, output.get());
 }
 
 
@@ -1263,7 +1469,8 @@ template <typename ValueType>
 void Dense<ValueType>::inverse_permute(const array<int32>* permutation_indices,
                                        ptr_param<Dense<ValueType>> output) const
 {
-    this->inverse_permute_impl(permutation_indices, output.get());
+    this->permute_impl(create_permutation_view(*permutation_indices).get(),
+                       permute_mode::inverse_symmetric, output.get());
 }
 
 
@@ -1271,7 +1478,8 @@ template <typename ValueType>
 void Dense<ValueType>::inverse_permute(const array<int64>* permutation_indices,
                                        ptr_param<Dense<ValueType>> output) const
 {
-    this->inverse_permute_impl(permutation_indices, output.get());
+    this->permute_impl(create_permutation_view(*permutation_indices).get(),
+                       permute_mode::inverse_symmetric, output.get());
 }
 
 
@@ -1299,7 +1507,8 @@ template <typename ValueType>
 void Dense<ValueType>::row_permute(const array<int32>* permutation_indices,
                                    ptr_param<Dense<ValueType>> output) const
 {
-    this->row_permute_impl(permutation_indices, output.get());
+    this->permute_impl(create_permutation_view(*permutation_indices).get(),
+                       permute_mode::rows, output.get());
 }
 
 
@@ -1307,7 +1516,8 @@ template <typename ValueType>
 void Dense<ValueType>::row_permute(const array<int64>* permutation_indices,
                                    ptr_param<Dense<ValueType>> output) const
 {
-    this->row_permute_impl(permutation_indices, output.get());
+    this->permute_impl(create_permutation_view(*permutation_indices).get(),
+                       permute_mode::rows, output.get());
 }
 
 
@@ -1434,7 +1644,8 @@ template <typename ValueType>
 void Dense<ValueType>::column_permute(const array<int32>* permutation_indices,
                                       ptr_param<Dense<ValueType>> output) const
 {
-    this->column_permute_impl(permutation_indices, output.get());
+    this->permute_impl(create_permutation_view(*permutation_indices).get(),
+                       permute_mode::columns, output.get());
 }
 
 
@@ -1442,7 +1653,8 @@ template <typename ValueType>
 void Dense<ValueType>::column_permute(const array<int64>* permutation_indices,
                                       ptr_param<Dense<ValueType>> output) const
 {
-    this->column_permute_impl(permutation_indices, output.get());
+    this->permute_impl(create_permutation_view(*permutation_indices).get(),
+                       permute_mode::columns, output.get());
 }
 
 
@@ -1471,7 +1683,8 @@ void Dense<ValueType>::inverse_row_permute(
     const array<int32>* permutation_indices,
     ptr_param<Dense<ValueType>> output) const
 {
-    this->inverse_row_permute_impl(permutation_indices, output.get());
+    this->permute_impl(create_permutation_view(*permutation_indices).get(),
+                       permute_mode::inverse_rows, output.get());
 }
 
 
@@ -1480,7 +1693,8 @@ void Dense<ValueType>::inverse_row_permute(
     const array<int64>* permutation_indices,
     ptr_param<Dense<ValueType>> output) const
 {
-    this->inverse_row_permute_impl(permutation_indices, output.get());
+    this->permute_impl(create_permutation_view(*permutation_indices).get(),
+                       permute_mode::inverse_rows, output.get());
 }
 
 
@@ -1509,7 +1723,8 @@ void Dense<ValueType>::inverse_column_permute(
     const array<int32>* permutation_indices,
     ptr_param<Dense<ValueType>> output) const
 {
-    this->inverse_column_permute_impl(permutation_indices, output.get());
+    this->permute_impl(create_permutation_view(*permutation_indices).get(),
+                       permute_mode::inverse_columns, output.get());
 }
 
 
@@ -1518,7 +1733,94 @@ void Dense<ValueType>::inverse_column_permute(
     const array<int64>* permutation_indices,
     ptr_param<Dense<ValueType>> output) const
 {
-    this->inverse_column_permute_impl(permutation_indices, output.get());
+    this->permute_impl(create_permutation_view(*permutation_indices).get(),
+                       permute_mode::inverse_columns, output.get());
+}
+
+
+template <typename ValueType>
+std::unique_ptr<Dense<ValueType>> Dense<ValueType>::scale_permute(
+    ptr_param<const ScaledPermutation<value_type, int32>> permutation,
+    permute_mode mode) const
+{
+    auto result = Dense::create(this->get_executor(), this->get_size());
+    this->scale_permute(permutation, result, mode);
+    return result;
+}
+
+
+template <typename ValueType>
+std::unique_ptr<Dense<ValueType>> Dense<ValueType>::scale_permute(
+    ptr_param<const ScaledPermutation<value_type, int64>> permutation,
+    permute_mode mode) const
+{
+    auto result = Dense::create(this->get_executor(), this->get_size());
+    this->scale_permute(permutation, result, mode);
+    return result;
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::scale_permute(
+    ptr_param<const ScaledPermutation<value_type, int32>> permutation,
+    ptr_param<Dense> output, permute_mode mode) const
+{
+    this->scale_permute_impl(permutation.get(), mode, output.get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::scale_permute(
+    ptr_param<const ScaledPermutation<value_type, int64>> permutation,
+    ptr_param<Dense> output, permute_mode mode) const
+{
+    this->scale_permute_impl(permutation.get(), mode, output.get());
+}
+
+
+template <typename ValueType>
+std::unique_ptr<Dense<ValueType>> Dense<ValueType>::scale_permute(
+    ptr_param<const ScaledPermutation<value_type, int32>> row_permutation,
+    ptr_param<const ScaledPermutation<value_type, int32>> col_permutation,
+    bool invert) const
+{
+    auto result = Dense::create(this->get_executor(), this->get_size());
+    this->scale_permute(row_permutation, col_permutation, result, invert);
+    return result;
+}
+
+
+template <typename ValueType>
+std::unique_ptr<Dense<ValueType>> Dense<ValueType>::scale_permute(
+    ptr_param<const ScaledPermutation<value_type, int64>> row_permutation,
+    ptr_param<const ScaledPermutation<value_type, int64>> col_permutation,
+    bool invert) const
+{
+    auto result = Dense::create(this->get_executor(), this->get_size());
+    this->scale_permute(row_permutation, col_permutation, result, invert);
+    return result;
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::scale_permute(
+    ptr_param<const ScaledPermutation<value_type, int32>> row_permutation,
+    ptr_param<const ScaledPermutation<value_type, int32>> col_permutation,
+    ptr_param<Dense> output, bool invert) const
+{
+    this->scale_permute_impl(row_permutation.get(), col_permutation.get(),
+                             invert, output.get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::scale_permute(
+    ptr_param<const ScaledPermutation<value_type, int64>> row_permutation,
+    ptr_param<const ScaledPermutation<value_type, int64>> col_permutation,
+    ptr_param<Dense> output, bool invert) const
+{
+    this->scale_permute_impl(row_permutation.get(), col_permutation.get(),
+                             invert, output.get());
 }
 
 
diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp
index 9a487fadeda..f315a393712 100644
--- a/core/matrix/dense_kernels.hpp
+++ b/core/matrix/dense_kernels.hpp
@@ -146,6 +146,11 @@ namespace kernels {
                        matrix::Dense<remove_complex<_type>>* result, \
                        array<char>& tmp)
 
+#define GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL(_type)               \
+    void compute_mean(std::shared_ptr<const DefaultExecutor> exec, \
+                      const matrix::Dense<_type>* x,               \
+                      matrix::Dense<_type>* result, array<char>& tmp)
+
 #define GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(_type, _prec)         \
     void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,  \
                              const device_matrix_data<_type, _prec>& data, \
@@ -232,50 +237,112 @@ namespace kernels {
                         const matrix::Dense<_type>* orig,            \
                         matrix::Dense<_type>* trans)
 
+#define GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL(_vtype, _itype)           \
+    void symm_scale_permute(                                                  \
+        std::shared_ptr<const DefaultExecutor> exec, const _vtype* scale,     \
+        const _itype* permutation_indices, const matrix::Dense<_vtype>* orig, \
+        matrix::Dense<_vtype>* permuted)
+
+#define GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL(_vtype, _itype)            \
+    void row_scale_permute(                                                   \
+        std::shared_ptr<const DefaultExecutor> exec, const _vtype* scale,     \
+        const _itype* permutation_indices, const matrix::Dense<_vtype>* orig, \
+        matrix::Dense<_vtype>* permuted)
+
+#define GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL(_vtype, _itype)            \
+    void col_scale_permute(                                                   \
+        std::shared_ptr<const DefaultExecutor> exec, const _vtype* scale,     \
+        const _itype* permutation_indices, const matrix::Dense<_vtype>* orig, \
+        matrix::Dense<_vtype>* permuted)
+
+#define GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL(_vtype, _itype)       \
+    void inv_symm_scale_permute(                                              \
+        std::shared_ptr<const DefaultExecutor> exec, const _vtype* scale,     \
+        const _itype* permutation_indices, const matrix::Dense<_vtype>* orig, \
+        matrix::Dense<_vtype>* permuted)
+
+#define GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL(_vtype, _itype)        \
+    void inv_row_scale_permute(                                               \
+        std::shared_ptr<const DefaultExecutor> exec, const _vtype* scale,     \
+        const _itype* permutation_indices, const matrix::Dense<_vtype>* orig, \
+        matrix::Dense<_vtype>* permuted)
+
+#define GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL(_vtype, _itype)        \
+    void inv_col_scale_permute(                                               \
+        std::shared_ptr<const DefaultExecutor> exec, const _vtype* scale,     \
+        const _itype* permutation_indices, const matrix::Dense<_vtype>* orig, \
+        matrix::Dense<_vtype>* permuted)
+
+#define GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL(_vtype, _itype)        \
+    void nonsymm_scale_permute(                                               \
+        std::shared_ptr<const DefaultExecutor> exec, const _vtype* row_scale, \
+        const _itype* row_permutation_indices, const _vtype* column_scale,    \
+        const _itype* column_permutation_indices,                             \
+        const matrix::Dense<_vtype>* orig, matrix::Dense<_vtype>* permuted)
+
+#define GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL(_vtype, _itype)    \
+    void inv_nonsymm_scale_permute(                                           \
+        std::shared_ptr<const DefaultExecutor> exec, const _vtype* row_scale, \
+        const _itype* row_permutation_indices, const _vtype* column_scale,    \
+        const _itype* column_permutation_indices,                             \
+        const matrix::Dense<_vtype>* orig, matrix::Dense<_vtype>* permuted)
+
 #define GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL(_vtype, _itype)      \
     void symm_permute(std::shared_ptr<const DefaultExecutor> exec, \
-                      const array<_itype>* permutation_indices,    \
+                      const _itype* permutation_indices,           \
                       const matrix::Dense<_vtype>* orig,           \
                       matrix::Dense<_vtype>* permuted)
 
 #define GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL(_vtype, _itype)      \
     void inv_symm_permute(std::shared_ptr<const DefaultExecutor> exec, \
-                          const array<_itype>* permutation_indices,    \
+                          const _itype* permutation_indices,           \
                           const matrix::Dense<_vtype>* orig,           \
                           matrix::Dense<_vtype>* permuted)
 
+#define GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL(_vtype, _itype)      \
+    void nonsymm_permute(std::shared_ptr<const DefaultExecutor> exec, \
+                         const _itype* row_permutation_indices,       \
+                         const _itype* column_permutation_indices,    \
+                         const matrix::Dense<_vtype>* orig,           \
+                         matrix::Dense<_vtype>* permuted)
+
+#define GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL(_vtype, _itype)      \
+    void inv_nonsymm_permute(std::shared_ptr<const DefaultExecutor> exec, \
+                             const _itype* row_permutation_indices,       \
+                             const _itype* column_permutation_indices,    \
+                             const matrix::Dense<_vtype>* orig,           \
+                             matrix::Dense<_vtype>* permuted)
+
 #define GKO_DECLARE_DENSE_ROW_GATHER_KERNEL(_vtype, _otype, _itype) \
     void row_gather(std::shared_ptr<const DefaultExecutor> exec,    \
-                    const array<_itype>* gather_indices,            \
+                    const _itype* gather_indices,                   \
                     const matrix::Dense<_vtype>* orig,              \
                     matrix::Dense<_otype>* row_collection)
 
-
-#define GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL(_vtype, _otype, _itype) \
-    void advanced_row_gather(std::shared_ptr<const DefaultExecutor> exec,    \
-                             const matrix::Dense<_vtype>* alpha,             \
-                             const array<_itype>* gather_indices,            \
-                             const matrix::Dense<_vtype>* orig,              \
-                             const matrix::Dense<_vtype>* beta,              \
-                             matrix::Dense<_otype>* row_collection)
-
-#define GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL(_vtype, _itype)      \
-    void column_permute(std::shared_ptr<const DefaultExecutor> exec, \
-                        const array<_itype>* permutation_indices,    \
-                        const matrix::Dense<_vtype>* orig,           \
-                        matrix::Dense<_vtype>* column_permuted)
-
-#define GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL(_vtype, _itype)          \
-    void inverse_row_permute(std::shared_ptr<const DefaultExecutor> exec, \
-                             const array<_itype>* permutation_indices,    \
-                             const matrix::Dense<_vtype>* orig,           \
-                             matrix::Dense<_vtype>* row_permuted)
-
-#define GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL(_vtype, _itype)          \
-    void inverse_column_permute(std::shared_ptr<const DefaultExecutor> exec, \
-                                const array<_itype>* permutation_indices,    \
-                                const matrix::Dense<_vtype>* orig,           \
-                                matrix::Dense<_vtype>* column_permuted)
+#define GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL(_vtype, _otype, _itype)  \
+    void advanced_row_gather(                                                 \
+        std::shared_ptr<const DefaultExecutor> exec,                          \
+        const matrix::Dense<_vtype>* alpha, const _itype* gather_indices,     \
+        const matrix::Dense<_vtype>* orig, const matrix::Dense<_vtype>* beta, \
+        matrix::Dense<_otype>* row_collection)
+
+#define GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL(_vtype, _itype)      \
+    void col_permute(std::shared_ptr<const DefaultExecutor> exec, \
+                     const _itype* permutation_indices,           \
+                     const matrix::Dense<_vtype>* orig,           \
+                     matrix::Dense<_vtype>* col_permuted)
+
+#define GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL(_vtype, _itype)      \
+    void inv_row_permute(std::shared_ptr<const DefaultExecutor> exec, \
+                         const _itype* permutation_indices,           \
+                         const matrix::Dense<_vtype>* orig,           \
+                         matrix::Dense<_vtype>* row_permuted)
+
+#define GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL(_vtype, _itype)      \
+    void inv_col_permute(std::shared_ptr<const DefaultExecutor> exec, \
+                         const _itype* permutation_indices,           \
+                         const matrix::Dense<_vtype>* orig,           \
+                         matrix::Dense<_vtype>* col_permuted)
 
 #define GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL(_vtype)              \
     void extract_diagonal(std::shared_ptr<const DefaultExecutor> exec, \
@@ -314,102 +381,124 @@ namespace kernels {
                              matrix::Dense<_vtype>* mtx)
 
 
-#define GKO_DECLARE_ALL_AS_TEMPLATES                                        \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL(ValueType);                       \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_APPLY_KERNEL(ValueType);                              \
-    template <typename InValueType, typename OutValueType>                  \
-    GKO_DECLARE_DENSE_COPY_KERNEL(InValueType, OutValueType);               \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_FILL_KERNEL(ValueType);                               \
-    template <typename ValueType, typename ScalarType>                      \
-    GKO_DECLARE_DENSE_SCALE_KERNEL(ValueType, ScalarType);                  \
-    template <typename ValueType, typename ScalarType>                      \
-    GKO_DECLARE_DENSE_INV_SCALE_KERNEL(ValueType, ScalarType);              \
-    template <typename ValueType, typename ScalarType>                      \
-    GKO_DECLARE_DENSE_ADD_SCALED_KERNEL(ValueType, ScalarType);             \
-    template <typename ValueType, typename ScalarType>                      \
-    GKO_DECLARE_DENSE_SUB_SCALED_KERNEL(ValueType, ScalarType);             \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL(ValueType);                    \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL(ValueType);                    \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(ValueType);                        \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL(ValueType);               \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(ValueType);                   \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL(ValueType);          \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(ValueType);                      \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType);             \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType);                      \
-    template <typename ValueType, typename IndexType>                       \
-    GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType);     \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL(ValueType);              \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL(ValueType);                       \
-    template <typename ValueType, typename IndexType>                       \
-    GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL(ValueType, IndexType);          \
-    template <typename ValueType, typename IndexType>                       \
-    GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL(ValueType, IndexType);          \
-    template <typename ValueType, typename IndexType>                       \
-    GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL(ValueType, IndexType);          \
-    template <typename ValueType, typename IndexType>                       \
-    GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL(ValueType, IndexType);        \
-    template <typename ValueType, typename IndexType>                       \
-    GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType);       \
-    template <typename ValueType, typename IndexType>                       \
-    GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType);        \
-    template <typename ValueType, typename IndexType>                       \
-    GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL(ValueType, IndexType); \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL(ValueType);            \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL(ValueType);                 \
-    template <typename ValueType, typename IndexType>                       \
-    GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType);  \
-    template <typename ValueType, typename IndexType>                       \
-    GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL(ValueType,        \
-                                                          IndexType);       \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_TRANSPOSE_KERNEL(ValueType);                          \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL(ValueType);                     \
-    template <typename ValueType, typename IndexType>                       \
-    GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL(ValueType, IndexType);            \
-    template <typename ValueType, typename IndexType>                       \
-    GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL(ValueType, IndexType);        \
-    template <typename ValueType, typename OutputType, typename IndexType>  \
-    GKO_DECLARE_DENSE_ROW_GATHER_KERNEL(ValueType, OutputType, IndexType);  \
-    template <typename ValueType, typename OutputType, typename IndexType>  \
-    GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL(ValueType, OutputType,     \
-                                                 IndexType);                \
-    template <typename ValueType, typename IndexType>                       \
-    GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType);          \
-    template <typename ValueType, typename IndexType>                       \
-    GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL(ValueType, IndexType);         \
-    template <typename ValueType, typename IndexType>                       \
-    GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL(ValueType, IndexType);      \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL(ValueType);                   \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL(ValueType);                   \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL(ValueType);                  \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_MAKE_COMPLEX_KERNEL(ValueType);                             \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_GET_REAL_KERNEL(ValueType);                                 \
-    template <typename ValueType>                                           \
-    GKO_DECLARE_GET_IMAG_KERNEL(ValueType);                                 \
-    template <typename ValueType, typename ScalarType>                      \
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                          \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL(ValueType);                         \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_APPLY_KERNEL(ValueType);                                \
+    template <typename InValueType, typename OutValueType>                    \
+    GKO_DECLARE_DENSE_COPY_KERNEL(InValueType, OutValueType);                 \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_FILL_KERNEL(ValueType);                                 \
+    template <typename ValueType, typename ScalarType>                        \
+    GKO_DECLARE_DENSE_SCALE_KERNEL(ValueType, ScalarType);                    \
+    template <typename ValueType, typename ScalarType>                        \
+    GKO_DECLARE_DENSE_INV_SCALE_KERNEL(ValueType, ScalarType);                \
+    template <typename ValueType, typename ScalarType>                        \
+    GKO_DECLARE_DENSE_ADD_SCALED_KERNEL(ValueType, ScalarType);               \
+    template <typename ValueType, typename ScalarType>                        \
+    GKO_DECLARE_DENSE_SUB_SCALED_KERNEL(ValueType, ScalarType);               \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL(ValueType);                      \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL(ValueType);                      \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(ValueType);                          \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL(ValueType);                 \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(ValueType);                     \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL(ValueType);            \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(ValueType);                        \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType);               \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType);                        \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL(ValueType);                         \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType);       \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL(ValueType);                \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL(ValueType);                         \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL(ValueType, IndexType);            \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL(ValueType, IndexType);            \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL(ValueType, IndexType);            \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_CONVERT_TO_FBCSR_KERNEL(ValueType, IndexType);          \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL(ValueType, IndexType);         \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL(ValueType, IndexType);          \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL(ValueType, IndexType);   \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL(ValueType);              \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL(ValueType);                   \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL(ValueType, IndexType);    \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_COUNT_NONZERO_BLOCKS_PER_ROW_KERNEL(ValueType,          \
+                                                          IndexType);         \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_TRANSPOSE_KERNEL(ValueType);                            \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL(ValueType);                       \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL(ValueType, IndexType);              \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL(ValueType, IndexType);          \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL(ValueType, IndexType);           \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL(ValueType, IndexType);       \
+    template <typename ValueType, typename OutputType, typename IndexType>    \
+    GKO_DECLARE_DENSE_ROW_GATHER_KERNEL(ValueType, OutputType, IndexType);    \
+    template <typename ValueType, typename OutputType, typename IndexType>    \
+    GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL(ValueType, OutputType,       \
+                                                 IndexType);                  \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL(ValueType, IndexType);               \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL(ValueType, IndexType);           \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL(ValueType, IndexType);           \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL(ValueType, IndexType);        \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL(ValueType, IndexType);    \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL(ValueType, IndexType);         \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL(ValueType, IndexType);         \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL(ValueType, IndexType);     \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL(ValueType, IndexType);     \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL(ValueType, IndexType);     \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL(ValueType, IndexType); \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL(ValueType);                     \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL(ValueType);                     \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL(ValueType);                    \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_MAKE_COMPLEX_KERNEL(ValueType);                               \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_GET_REAL_KERNEL(ValueType);                                   \
+    template <typename ValueType>                                             \
+    GKO_DECLARE_GET_IMAG_KERNEL(ValueType);                                   \
+    template <typename ValueType, typename ScalarType>                        \
     GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL(ValueType, ScalarType)
 
 
diff --git a/core/matrix/permutation.cpp b/core/matrix/permutation.cpp
index a641834f12c..3f13ef9711f 100644
--- a/core/matrix/permutation.cpp
+++ b/core/matrix/permutation.cpp
@@ -33,8 +33,260 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/permutation.hpp>
 
 
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/precision_dispatch.hpp>
+#include <ginkgo/core/base/temporary_clone.hpp>
+#include <ginkgo/core/base/utils_helper.hpp>
+
+
+#include "core/base/dispatch_helper.hpp"
+#include "core/matrix/permutation_kernels.hpp"
+
+
 namespace gko {
 namespace matrix {
+namespace permutation {
+
+
+GKO_REGISTER_OPERATION(invert, permutation::invert);
+GKO_REGISTER_OPERATION(compose, permutation::compose);
+
+
+}  // namespace permutation
+
+
+void validate_permute_dimensions(dim<2> size, dim<2> permutation_size,
+                                 permute_mode mode)
+{
+    if ((mode & permute_mode::symmetric) == permute_mode::symmetric) {
+        GKO_ASSERT_IS_SQUARE_MATRIX(size);
+    }
+    if ((mode & permute_mode::rows) == permute_mode::rows) {
+        if (size[0] != permutation_size[0]) {
+            throw DimensionMismatch(
+                __FILE__, __LINE__, __func__, "matrix", size[0], size[1],
+                "permutation", permutation_size[0], permutation_size[0],
+                "expected the permutation size to match the number of rows");
+        };
+    }
+    if ((mode & permute_mode::columns) == permute_mode::columns) {
+        if (size[1] != permutation_size[0]) {
+            throw DimensionMismatch(
+                __FILE__, __LINE__, __func__, "matrix", size[0], size[1],
+                "permutation", permutation_size[0], permutation_size[0],
+                "expected the permutation size to match the number of columns");
+        };
+    }
+}
+
+
+permute_mode operator|(permute_mode a, permute_mode b)
+{
+    return static_cast<permute_mode>(static_cast<unsigned>(a) |
+                                     static_cast<unsigned>(b));
+}
+
+
+permute_mode operator&(permute_mode a, permute_mode b)
+{
+    return static_cast<permute_mode>(static_cast<unsigned>(a) &
+                                     static_cast<unsigned>(b));
+}
+
+
+permute_mode operator^(permute_mode a, permute_mode b)
+{
+    return static_cast<permute_mode>(static_cast<unsigned>(a) ^
+                                     static_cast<unsigned>(b));
+}
+
+
+std::ostream& operator<<(std::ostream& stream, permute_mode mode)
+{
+    switch (mode) {
+    case permute_mode::none:
+        return stream << "none";
+    case permute_mode::rows:
+        return stream << "rows";
+    case permute_mode::columns:
+        return stream << "columns";
+    case permute_mode::symmetric:
+        return stream << "symmetric";
+    case permute_mode::inverse:
+        return stream << "inverse";
+    case permute_mode::inverse_rows:
+        return stream << "inverse_rows";
+    case permute_mode::inverse_columns:
+        return stream << "inverse_columns";
+    case permute_mode::inverse_symmetric:
+        return stream << "inverse_symmetric";
+    }
+    return stream;
+}
+
+
+template <typename IndexType>
+std::unique_ptr<const Permutation<IndexType>>
+Permutation<IndexType>::create_const(
+    std::shared_ptr<const Executor> exec, size_type size,
+    gko::detail::const_array_view<IndexType>&& perm_idxs,
+    mask_type enabled_permute)
+{
+    GKO_ASSERT_EQ(enabled_permute, row_permute);
+    GKO_ASSERT_EQ(size, perm_idxs.get_num_elems());
+    return create_const(std::move(exec), std::move(perm_idxs));
+}
+
+
+template <typename IndexType>
+std::unique_ptr<const Permutation<IndexType>>
+Permutation<IndexType>::create_const(
+    std::shared_ptr<const Executor> exec,
+    gko::detail::const_array_view<IndexType>&& perm_idxs)
+{
+    // cast const-ness away, but return a const object afterwards,
+    // so we can ensure that no modifications take place.
+    return std::unique_ptr<const Permutation<IndexType>>(
+        new Permutation<IndexType>{
+            exec, gko::detail::array_const_cast(std::move(perm_idxs))});
+}
+
+
+template <typename IndexType>
+Permutation<IndexType>::Permutation(std::shared_ptr<const Executor> exec,
+                                    size_type size)
+    : EnableLinOp<Permutation>(exec, size), permutation_{exec, size}
+{}
+
+
+template <typename IndexType>
+Permutation<IndexType>::Permutation(std::shared_ptr<const Executor> exec,
+                                    array<index_type> permutation_indices)
+    : EnableLinOp<Permutation>(exec, permutation_indices.get_num_elems()),
+      permutation_{exec, std::move(permutation_indices)}
+{}
+
+
+template <typename IndexType>
+Permutation<IndexType>::Permutation(std::shared_ptr<const Executor> exec,
+                                    const dim<2>& size)
+    : Permutation{exec, size[0]}
+{
+    GKO_ASSERT_IS_SQUARE_MATRIX(size);
+}
+
+
+template <typename IndexType>
+Permutation<IndexType>::Permutation(std::shared_ptr<const Executor> exec,
+                                    const dim<2>& size,
+                                    const mask_type& enabled_permute)
+    : Permutation{exec, size[0]}
+{
+    GKO_ASSERT_EQ(enabled_permute, row_permute);
+    GKO_ASSERT_IS_SQUARE_MATRIX(size);
+}
+
+
+template <typename IndexType>
+size_type Permutation<IndexType>::get_permutation_size() const noexcept
+{
+    return this->get_size()[0];
+}
+
+
+template <typename IndexType>
+mask_type Permutation<IndexType>::get_permute_mask() const
+{
+    return row_permute;
+}
+
+
+template <typename IndexType>
+void Permutation<IndexType>::set_permute_mask(mask_type permute_mask)
+{
+    GKO_ASSERT_EQ(permute_mask, row_permute);
+}
+
+
+template <typename IndexType>
+std::unique_ptr<Permutation<IndexType>>
+Permutation<IndexType>::compute_inverse() const
+{
+    const auto exec = this->get_executor();
+    const auto size = this->get_size()[0];
+    auto result = Permutation<IndexType>::create(exec, size);
+    exec->run(permutation::make_invert(this->get_const_permutation(), size,
+                                       result->get_permutation()));
+    return result;
+}
+
+
+template <typename IndexType>
+std::unique_ptr<Permutation<IndexType>> Permutation<IndexType>::compose(
+    ptr_param<const Permutation<IndexType>> other) const
+{
+    GKO_ASSERT_EQUAL_DIMENSIONS(this, other);
+    const auto exec = this->get_executor();
+    const auto size = this->get_size()[0];
+    const auto local_other = make_temporary_clone(exec, other);
+    auto result = Permutation<IndexType>::create(exec, size);
+    exec->run(permutation::make_compose(this->get_const_permutation(),
+                                        local_other->get_const_permutation(),
+                                        size, result->get_permutation()));
+    return result;
+}
+
+
+template <typename IndexType>
+void Permutation<IndexType>::write(
+    gko::matrix_data<value_type, index_type>& data) const
+{
+    const auto host_this =
+        make_temporary_clone(this->get_executor()->get_master(), this);
+    data.size = this->get_size();
+    data.nonzeros.clear();
+    data.nonzeros.reserve(data.size[0]);
+    for (IndexType row = 0; row < this->get_size()[0]; row++) {
+        data.nonzeros.emplace_back(row, host_this->get_const_permutation()[row],
+                                   one<value_type>());
+    }
+}
+
+
+template <typename Functor>
+void dispatch_dense(const LinOp* op, Functor fn)
+{
+    using matrix::Dense;
+    using std::complex;
+    run<const Dense<double>*, const Dense<float>*,
+        const Dense<complex<double>>*, const Dense<complex<float>>*>(op, fn);
+}
+
+
+template <typename IndexType>
+void Permutation<IndexType>::apply_impl(const LinOp* in, LinOp* out) const
+{
+    dispatch_dense(in, [&](auto dense_in) {
+        auto dense_out = make_temporary_conversion<
+            typename gko::detail::pointee<decltype(dense_in)>::value_type>(out);
+        dense_in->permute(this, dense_out.get(), permute_mode::rows);
+    });
+}
+
+
+template <typename IndexType>
+void Permutation<IndexType>::apply_impl(const LinOp* alpha, const LinOp* in,
+                                        const LinOp* beta, LinOp* out) const
+{
+    dispatch_dense(in, [&](auto dense_in) {
+        auto dense_out = make_temporary_conversion<
+            typename gko::detail::pointee<decltype(dense_in)>::value_type>(out);
+        auto tmp = dense_in->permute(this, permute_mode::rows);
+        dense_out->scale(beta);
+        dense_out->add_scaled(alpha, tmp);
+    });
+}
 
 
 #define GKO_DECLARE_PERMUTATION_MATRIX(_type) class Permutation<_type>
diff --git a/core/matrix/permutation.hpp b/core/matrix/permutation.hpp
new file mode 100644
index 00000000000..08a1d731b4e
--- /dev/null
+++ b/core/matrix/permutation.hpp
@@ -0,0 +1,59 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_MATRIX_PERMUTATION_HPP_
+#define GKO_CORE_MATRIX_PERMUTATION_HPP_
+
+
+#include <ginkgo/core/matrix/permutation.hpp>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+namespace gko {
+namespace matrix {
+
+
+/**
+ * Checks that the given input and permutation size are consistent with
+ * the given mode.
+ */
+void validate_permute_dimensions(dim<2> size, dim<2> permutation_size,
+                                 permute_mode mode);
+
+
+}  // namespace matrix
+}  // namespace gko
+
+
+#endif  // GKO_CORE_MATRIX_PERMUTATION_HPP_
diff --git a/core/matrix/permutation_kernels.hpp b/core/matrix/permutation_kernels.hpp
new file mode 100644
index 00000000000..957121f4c41
--- /dev/null
+++ b/core/matrix/permutation_kernels.hpp
@@ -0,0 +1,88 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_MATRIX_PERMUTATION_KERNELS_HPP_
+#define GKO_CORE_MATRIX_PERMUTATION_KERNELS_HPP_
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/index_set.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/coo.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/diagonal.hpp>
+#include <ginkgo/core/matrix/ell.hpp>
+#include <ginkgo/core/matrix/hybrid.hpp>
+#include <ginkgo/core/matrix/sellp.hpp>
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
+
+
+#include "core/base/kernel_declaration.hpp"
+#include "core/matrix/csr_lookup.hpp"
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_PERMUTATION_INVERT_KERNEL(IndexType)              \
+    void invert(std::shared_ptr<const DefaultExecutor> exec,          \
+                const IndexType* permutation_indices, size_type size, \
+                IndexType* inv_permutation)
+
+#define GKO_DECLARE_PERMUTATION_COMPOSE_KERNEL(IndexType)             \
+    void compose(std::shared_ptr<const DefaultExecutor> exec,         \
+                 const IndexType* first_permutation,                  \
+                 const IndexType* second_permutation, size_type size, \
+                 IndexType* combined_permutation)
+
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                  \
+    template <typename IndexType>                     \
+    GKO_DECLARE_PERMUTATION_INVERT_KERNEL(IndexType); \
+    template <typename IndexType>                     \
+    GKO_DECLARE_PERMUTATION_COMPOSE_KERNEL(IndexType)
+
+
+GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(permutation,
+                                        GKO_DECLARE_ALL_AS_TEMPLATES);
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CORE_MATRIX_PERMUTATION_KERNELS_HPP_
diff --git a/core/matrix/scaled_permutation.cpp b/core/matrix/scaled_permutation.cpp
new file mode 100644
index 00000000000..53296aee5b3
--- /dev/null
+++ b/core/matrix/scaled_permutation.cpp
@@ -0,0 +1,213 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/scaled_permutation.hpp>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/precision_dispatch.hpp>
+
+
+#include "core/matrix/scaled_permutation_kernels.hpp"
+
+
+namespace gko {
+namespace matrix {
+namespace scaled_permutation {
+namespace {
+
+
+GKO_REGISTER_OPERATION(invert, scaled_permutation::invert);
+GKO_REGISTER_OPERATION(compose, scaled_permutation::compose);
+
+
+}  // namespace
+}  // namespace scaled_permutation
+
+
+template <typename ValueType, typename IndexType>
+ScaledPermutation<ValueType, IndexType>::ScaledPermutation(
+    std::shared_ptr<const Executor> exec, size_type size)
+    : ScaledPermutation{exec, array<ValueType>{exec, size},
+                        array<IndexType>{exec, size}}
+{}
+
+
+template <typename ValueType, typename IndexType>
+ScaledPermutation<ValueType, IndexType>::ScaledPermutation(
+    std::shared_ptr<const Executor> exec, array<value_type> scaling_factors,
+    array<index_type> permutation_indices)
+    : EnableLinOp<ScaledPermutation>(exec,
+                                     dim<2>{scaling_factors.get_num_elems(),
+                                            scaling_factors.get_num_elems()}),
+      scale_{exec, std::move(scaling_factors)},
+      permutation_{exec, std::move(permutation_indices)}
+{
+    GKO_ASSERT_EQ(scale_.get_num_elems(), permutation_.get_num_elems());
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<ScaledPermutation<ValueType, IndexType>>
+ScaledPermutation<ValueType, IndexType>::create(
+    std::shared_ptr<const Executor> exec, size_type size)
+{
+    return std::unique_ptr<ScaledPermutation>(
+        new ScaledPermutation{exec, size});
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<ScaledPermutation<ValueType, IndexType>>
+ScaledPermutation<ValueType, IndexType>::create(
+    ptr_param<const Permutation<IndexType>> permutation)
+{
+    const auto exec = permutation->get_executor();
+    const auto size = permutation->get_size()[0];
+    array<value_type> scale{exec, size};
+    array<index_type> perm{exec, size};
+    exec->copy(size, permutation->get_const_permutation(), perm.get_data());
+    scale.fill(one<ValueType>());
+    return create(exec, std::move(scale), std::move(perm));
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<ScaledPermutation<ValueType, IndexType>>
+ScaledPermutation<ValueType, IndexType>::create(
+    std::shared_ptr<const Executor> exec, array<value_type> scaling_factors,
+    array<index_type> permutation_indices)
+{
+    return std::unique_ptr<ScaledPermutation>(new ScaledPermutation{
+        exec, std::move(scaling_factors), std::move(permutation_indices)});
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<const ScaledPermutation<ValueType, IndexType>>
+ScaledPermutation<ValueType, IndexType>::create_const(
+    std::shared_ptr<const Executor> exec,
+    gko::detail::const_array_view<value_type>&& scale,
+    gko::detail::const_array_view<index_type>&& perm_idxs)
+{
+    return create(exec, gko::detail::array_const_cast(std::move(scale)),
+                  gko::detail::array_const_cast(std::move(perm_idxs)));
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<ScaledPermutation<ValueType, IndexType>>
+ScaledPermutation<ValueType, IndexType>::compute_inverse() const
+{
+    const auto exec = this->get_executor();
+    const auto size = this->get_size()[0];
+    auto result = ScaledPermutation::create(exec, size);
+    exec->run(scaled_permutation::make_invert(
+        this->get_const_scaling_factors(), this->get_const_permutation(), size,
+        result->get_scaling_factors(), result->get_permutation()));
+    return result;
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<ScaledPermutation<ValueType, IndexType>>
+ScaledPermutation<ValueType, IndexType>::compose(
+    ptr_param<const ScaledPermutation> other) const
+{
+    GKO_ASSERT_EQUAL_DIMENSIONS(this, other);
+    const auto exec = this->get_executor();
+    const auto size = this->get_size()[0];
+    const auto local_other = make_temporary_clone(exec, other);
+    auto result = ScaledPermutation::create(exec, size);
+    exec->run(scaled_permutation::make_compose(
+        this->get_const_scaling_factors(), this->get_const_permutation(),
+        local_other->get_const_scaling_factors(),
+        local_other->get_const_permutation(), size,
+        result->get_scaling_factors(), result->get_permutation()));
+    return result;
+}
+
+
+template <typename ValueType, typename IndexType>
+void ScaledPermutation<ValueType, IndexType>::apply_impl(const LinOp* b,
+                                                         LinOp* x) const
+{
+    precision_dispatch_real_complex<ValueType>(
+        [this](auto dense_b, auto dense_x) {
+            dense_b->scale_permute(this, dense_x, permute_mode::rows);
+        },
+        b, x);
+}
+
+
+template <typename ValueType, typename IndexType>
+void ScaledPermutation<ValueType, IndexType>::apply_impl(const LinOp* alpha,
+                                                         const LinOp* b,
+                                                         const LinOp* beta,
+                                                         LinOp* x) const
+{
+    precision_dispatch_real_complex<ValueType>(
+        [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) {
+            auto tmp = dense_b->scale_permute(this, permute_mode::rows);
+            dense_x->scale(dense_beta);
+            dense_x->add_scaled(dense_alpha, tmp);
+        },
+        alpha, b, beta, x);
+}
+
+
+template <typename ValueType, typename IndexType>
+void ScaledPermutation<ValueType, IndexType>::write(
+    gko::matrix_data<value_type, index_type>& data) const
+{
+    const auto host_this =
+        make_temporary_clone(this->get_executor()->get_master(), this);
+    data.size = this->get_size();
+    data.nonzeros.clear();
+    data.nonzeros.reserve(data.size[0]);
+    for (IndexType row = 0; row < this->get_size()[0]; row++) {
+        auto col = host_this->get_const_permutation()[row];
+        data.nonzeros.emplace_back(row, col,
+                                   host_this->get_const_scaling_factors()[col]);
+    }
+}
+
+
+#define GKO_DECLARE_SCALED_PERMUTATION_MATRIX(ValueType, IndexType) \
+    class ScaledPermutation<ValueType, IndexType>
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SCALED_PERMUTATION_MATRIX);
+
+
+}  // namespace matrix
+}  // namespace gko
diff --git a/core/matrix/scaled_permutation_kernels.hpp b/core/matrix/scaled_permutation_kernels.hpp
new file mode 100644
index 00000000000..8f247ac33d1
--- /dev/null
+++ b/core/matrix/scaled_permutation_kernels.hpp
@@ -0,0 +1,80 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_MATRIX_SCALED_PERMUTATION_KERNELS_HPP_
+#define GKO_CORE_MATRIX_SCALED_PERMUTATION_KERNELS_HPP_
+
+
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/base/kernel_declaration.hpp"
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL(ValueType, IndexType) \
+    void invert(std::shared_ptr<const DefaultExecutor> exec,               \
+                const ValueType* input_scale,                              \
+                const IndexType* input_permutation, size_type size,        \
+                ValueType* output_scale, IndexType* output_permutation)
+
+#define GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL(ValueType, IndexType) \
+    void compose(std::shared_ptr<const DefaultExecutor> exec,               \
+                 const ValueType* first_scale,                              \
+                 const IndexType* first_permutation,                        \
+                 const ValueType* second_scale,                             \
+                 const IndexType* second_permutation, size_type size,       \
+                 ValueType* output_scale, IndexType* output_permutation)
+
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                    \
+    template <typename ValueType, typename IndexType>                   \
+    GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL(ValueType, IndexType); \
+    template <typename ValueType, typename IndexType>                   \
+    GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL(ValueType, IndexType)
+
+
+GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(scaled_permutation,
+                                        GKO_DECLARE_ALL_AS_TEMPLATES);
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CORE_MATRIX_SCALED_PERMUTATION_KERNELS_HPP_
diff --git a/core/preconditioner/isai.cpp b/core/preconditioner/isai.cpp
index 52fa9140853..4e0e2ea95d8 100644
--- a/core/preconditioner/isai.cpp
+++ b/core/preconditioner/isai.cpp
@@ -230,17 +230,15 @@ void Isai<IsaiType, ValueType, IndexType>::generate_inverse(
                 excess_solver_factory =
                     Gmres::build()
                         .with_preconditioner(
-                            Bj::build().with_max_block_size(32u).on(exec))
+                            Bj::build().with_max_block_size(32u))
                         .with_criteria(
-                            gko::stop::Iteration::build()
-                                .with_max_iters(excess_dim)
-                                .on(exec),
+                            gko::stop::Iteration::build().with_max_iters(
+                                excess_dim),
                             gko::stop::ResidualNorm<ValueType>::build()
                                 .with_baseline(gko::stop::mode::rhs_norm)
                                 .with_reduction_factor(
                                     remove_complex<ValueType>{
-                                        excess_solver_reduction})
-                                .on(exec))
+                                        excess_solver_reduction}))
                         .on(exec);
                 excess_solution->copy_from(excess_rhs);
             } else if (is_lower) {
diff --git a/core/reorder/amd.cpp b/core/reorder/amd.cpp
index fa955801c2b..a305a95293d 100644
--- a/core/reorder/amd.cpp
+++ b/core/reorder/amd.cpp
@@ -212,8 +212,7 @@ std::unique_ptr<LinOp> Amd<IndexType>::generate_impl(
         head, elen, degree, w));
 
     // permutation gets copied to device via gko::array constructor
-    return permutation_type::create(exec, dim<2>{num_rows, num_rows},
-                                    std::move(permutation));
+    return permutation_type::create(exec, std::move(permutation));
 }
 
 
diff --git a/core/reorder/mc64.cpp b/core/reorder/mc64.cpp
new file mode 100644
index 00000000000..b9b8daf382e
--- /dev/null
+++ b/core/reorder/mc64.cpp
@@ -0,0 +1,625 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/reorder/mc64.hpp>
+
+
+#include <chrono>
+#include <memory>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/polymorphic_object.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
+#include <ginkgo/core/matrix/scaled_permutation.hpp>
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
+
+
+#include "core/components/addressable_pq.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "core/matrix/csr_kernels.hpp"
+#include "core/reorder/mc64.hpp"
+
+
+namespace gko {
+namespace experimental {
+namespace reorder {
+namespace mc64 {
+
+
+template <typename ValueType, typename IndexType>
+void initialize_weights(const matrix::Csr<ValueType, IndexType>* host_mtx,
+                        array<remove_complex<ValueType>>& weights_array,
+                        array<remove_complex<ValueType>>& dual_u_array,
+                        array<remove_complex<ValueType>>& row_maxima_array,
+                        gko::experimental::reorder::mc64_strategy strategy)
+{
+    constexpr auto inf =
+        std::numeric_limits<remove_complex<ValueType>>::infinity();
+    const auto num_rows = host_mtx->get_size()[0];
+    const auto row_ptrs = host_mtx->get_const_row_ptrs();
+    const auto col_idxs = host_mtx->get_const_col_idxs();
+    const auto values = host_mtx->get_const_values();
+    auto weights = weights_array.get_data();
+    auto dual_u = dual_u_array.get_data();
+    auto row_maxima = row_maxima_array.get_data();
+    auto run_computation = [&](auto calculate_weight) {
+        for (IndexType row = 0; row < num_rows; row++) {
+            const auto row_begin = row_ptrs[row];
+            const auto row_end = row_ptrs[row + 1];
+            auto row_max = -inf;
+            for (IndexType idx = row_begin; idx < row_end; idx++) {
+                const auto weight = calculate_weight(values[idx]);
+                weights[idx] = weight;
+                row_max = std::max(weight, row_max);
+            }
+
+            row_maxima[row] = row_max;
+
+            for (IndexType idx = row_begin; idx < row_end; idx++) {
+                const auto weight = row_max - weights[idx];
+                weights[idx] = weight;
+                const auto col = col_idxs[idx];
+                dual_u[col] = std::min(weight, dual_u[col]);
+            }
+        }
+    };
+    if (strategy ==
+        gko::experimental::reorder::mc64_strategy::max_diagonal_sum) {
+        run_computation([](ValueType a) { return abs(a); });
+    } else {
+        run_computation([](ValueType a) { return std::log2(abs(a)); });
+    }
+}
+
+
+// Assume invalid_index in permutation and inv_permutation
+template <typename ValueType, typename IndexType>
+void initial_matching(
+    size_type num_rows, const IndexType* row_ptrs, const IndexType* col_idxs,
+    const array<ValueType>& weights_array, const array<ValueType>& dual_u_array,
+    array<IndexType>& permutation, array<IndexType>& inv_permutation,
+    array<IndexType>& matched_idxs_array,
+    array<IndexType>& unmatched_rows_array, ValueType tolerance)
+{
+    const auto nnz = row_ptrs[num_rows];
+    const auto weights = weights_array.get_const_data();
+    const auto dual_u = dual_u_array.get_const_data();
+    auto p = permutation.get_data();
+    auto ip = inv_permutation.get_data();
+    auto idxs = matched_idxs_array.get_data();
+    auto unmatched = unmatched_rows_array.get_data();
+    size_type um_count = 0;
+
+    // In the following comments, w(row, col) will refer to the reduced weight
+    // abs(weights(row, col) - dual_u(col)) where dual_u is a dual vector
+    // needed for non-negativity of all weights.
+    // For each row, look for an unmatched column col for which
+    // w(row, col) < tolerance. If one is found, add the edge (row, col) to the
+    // matching and move on to the next row.
+    for (IndexType row = 0; row < num_rows; row++) {
+        const auto row_begin = row_ptrs[row];
+        const auto row_end = row_ptrs[row + 1];
+        bool matched = false;
+        for (IndexType idx = row_begin; idx < row_end; idx++) {
+            const auto col = col_idxs[idx];
+            if (abs(weights[idx] - dual_u[col]) < tolerance &&
+                ip[col] == invalid_index<IndexType>()) {
+                p[row] = col;
+                ip[col] = row;
+                idxs[row] = idx;
+                matched = true;
+                break;
+            }
+        }
+        if (!matched) {
+            // Mark unmatched rows for later.
+            unmatched[um_count++] = row;
+        }
+    }
+
+    // For remaining unmatched rows, look for a matched column with i
+    // w(row, col) < tolerance that is matched to another row, row_1.
+    // If there is another column col_1 with w(row_1, col_1) < tolerance
+    // that is not yet matched, replace the matched edge (row_1, col)
+    // with the two new matched edges (row, col) and (row_1, col_1).
+    size_type um = 0;
+    auto row = unmatched[um];
+    // If row == 0 we passed the last unmatched row and reached the
+    // zero-initialized part of the array. Row 0 is always matched as the matrix
+    // is assumed to be nonsingular and the previous loop starts with row 0.
+    while (row != 0 && um < num_rows) {
+        const auto row_begin = row_ptrs[row];
+        const auto row_end = row_ptrs[row + 1];
+        bool found = [&] {
+            for (IndexType idx = row_begin; idx < row_end; idx++) {
+                const auto col = col_idxs[idx];
+                if (abs(weights[idx] - dual_u[col]) < tolerance) {
+                    const auto row_1 = ip[col];
+                    const auto row_1_begin = row_ptrs[row_1];
+                    const auto row_1_end = row_ptrs[row_1 + 1];
+                    for (IndexType idx_1 = row_1_begin; idx_1 < row_1_end;
+                         idx_1++) {
+                        const auto col_1 = col_idxs[idx_1];
+                        if (abs(weights[idx_1] - dual_u[col_1]) < tolerance &&
+                            ip[col_1] == invalid_index<IndexType>()) {
+                            p[row] = col;
+                            ip[col] = row;
+                            idxs[row] = idx;
+                            p[row_1] = col_1;
+                            ip[col_1] = row_1;
+                            idxs[row_1] = idx_1;
+                            return true;
+                        }
+                    }
+                }
+            }
+            return false;
+        }();
+        if (found) {
+            // Mark previously unmatched row as matched.
+            unmatched[um] = invalid_index<IndexType>();
+        }
+        row = unmatched[++um];
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void shortest_augmenting_path(
+    size_type num_rows, const IndexType* row_ptrs, const IndexType* col_idxs,
+    array<ValueType>& weights_array, array<ValueType>& dual_u_array,
+    array<ValueType>& distance_array, array<IndexType>& permutation,
+    array<IndexType>& inv_permutation, IndexType root,
+    array<IndexType>& parents_array, array<IndexType>& generation_array,
+    array<IndexType>& marked_cols_array, array<IndexType>& matched_idxs_array,
+    addressable_priority_queue<ValueType, IndexType>& queue,
+    std::vector<IndexType>& q_j, ValueType tolerance)
+{
+    constexpr auto inf = std::numeric_limits<ValueType>::infinity();
+    const auto nnz = row_ptrs[num_rows];
+    auto weights = weights_array.get_data();
+    auto dual_u = dual_u_array.get_data();
+    auto distance = distance_array.get_data();
+
+    auto p = permutation.get_data();
+    auto ip = inv_permutation.get_data();
+
+    auto parents = parents_array.get_data();
+    // Generation array to mark visited nodes.
+    // It can take four states:
+    //  - gen[col] = #rows + root: The distance to col is smaller than the
+    //      length of the currently shortest augmenting path.
+    //  - gen[col] = - #rows - root: The distance to col is within a tolerance
+    //      of the currently shortest distance to the root. In this case, col
+    //      is placed into the vector q_j holding the nodes with the shortest
+    //      known distance to the root.
+    //  - gen[col] = root: The distance to col is smaller than the length of
+    //      the currently shortest augmenting path but larger than the currently
+    //      shortest known distance to the root. In this case, col is placed
+    //      into the priority queue.
+    //  - gen[col] = - root: The shortest possible distance for col to the root
+    //      has been found. If encountered again, col does not need to be
+    //      considered another time.
+    auto generation = generation_array.get_data();
+    // Set of marked columns whose shortest alternating paths and distances to
+    // the root are known.
+    auto marked_cols = marked_cols_array.get_data();
+    // Indices of the nonzero entries corresponding to the matched column in
+    // each matched row. So, if row i is matched to column j, W(i,j) is found
+    // at weights[idxs[i]] where W is the weight matrix.
+    auto idxs = matched_idxs_array.get_data();
+
+    queue.reset();
+    q_j.clear();
+
+    // The length of the current path.
+    ValueType lsp = inf;
+    // The length of the currently shortest found augmenting path starting from
+    // root.
+    ValueType lsap = inf;
+    // The column at the end of the currently shortest found augmenting path.
+    auto jsap = invalid_index<IndexType>();
+
+    auto row = root;
+    IndexType marked_counter = 0;
+
+    const auto begin = row_ptrs[row];
+    const auto end = row_ptrs[row + 1];
+
+    // Look for matching candidates in the row corresponding to root.
+    // As root is not yet matched, the corresponding entry in the dual
+    // vector v is 0 so we do not have to compute it.
+    for (IndexType idx = begin; idx < end; idx++) {
+        const auto col = col_idxs[idx];
+        const ValueType dnew = weights[idx] - dual_u[col];
+
+        if (dnew < lsap) {
+            if (ip[col] == invalid_index<IndexType>()) {
+                // col is unmatched so we found an augmenting path.
+                lsap = dnew;
+                jsap = col;
+                parents[col] = row;
+            } else {
+                distance[col] = dnew;
+                parents[col] = row;
+                generation[col] = num_rows + root;
+                if (dnew < lsp) {
+                    lsp = dnew;
+                }
+            }
+        }
+    }
+
+    // Write the columns in the row corresponding to root with the
+    // smallest distance into q_j, other columns with distance
+    // smaller than lsap into the priority queue.
+    for (IndexType idx = begin; idx < end; idx++) {
+        const auto col = col_idxs[idx];
+        const auto dist = distance[col];
+        const auto gen = generation[col];
+        if (dist < lsap && gen == num_rows + root) {
+            if (abs(dist - lsp) < tolerance) {
+                generation[col] = -num_rows - root;
+                q_j.push_back(col);
+            } else {
+                generation[col] = root;
+                queue.insert(dist, col);
+            }
+        }
+    }
+
+    while (true) {
+        // Mark the column with the shortest known distance to the root
+        // and proceed in its matched row. If both q_j and queue are empty
+        // or if the current path becomes longer than the currently
+        // shortest augmenting path, we are done.
+        if (q_j.size() > 0) {
+            // q_j is known to contain only entries with shortest known
+            // distance to the root, so if it is not empty we do not
+            // have to operate on the priority queue.
+            if (lsap <= lsp) {
+                break;
+            }
+            const auto col = q_j.back();
+            q_j.pop_back();
+            generation[col] = -root;
+            marked_cols[marked_counter++] = col;
+            row = ip[col];
+        } else {
+            if (queue.empty()) {
+                break;
+            }
+            auto col = queue.min_node();
+            while (generation[col] == -root && !queue.empty()) {
+                // If col is already marked because it previously was in q_j
+                // we have to disregard it.
+                queue.pop_min();
+                col = queue.min_node();
+            }
+            if (queue.empty()) {
+                break;
+            }
+            lsp = distance[col];
+            if (lsap <= lsp) {
+                break;
+            }
+            generation[col] = -root;
+            marked_cols[marked_counter++] = col;
+            queue.pop_min();
+            row = ip[col];
+        }
+        const auto row_begin = row_ptrs[row];
+        const auto row_end = row_ptrs[row + 1];
+        // Compute the entry of the dual vector v corresponding to row.
+        const auto dual_vi = p[row] == invalid_index<IndexType>()
+                                 ? zero<ValueType>()
+                                 : weights[idxs[row]] - dual_u[p[row]];
+        for (IndexType idx = row_begin; idx < row_end; idx++) {
+            const auto col = col_idxs[idx];
+            const auto gen = generation[col];
+
+            // col is already marked. Note that root will never be 0 as this row
+            // is guaranteed to already be part of the initial matching.
+            if (gen == -root) {
+                continue;
+            }
+
+            const ValueType dnew = lsp + weights[idx] - dual_u[col] - dual_vi;
+
+            if (dnew < lsap) {
+                if (ip[col] == invalid_index<IndexType>()) {
+                    // col is unmatched so we found an augmenting path.
+                    lsap = dnew;
+                    jsap = col;
+                    parents[col] = row;
+                } else {
+                    if ((gen != root || dnew < distance[col]) &&
+                        gen != -num_rows - root) {
+                        distance[col] = dnew;
+                        parents[col] = row;
+                        if (abs(dnew - lsp) < tolerance) {
+                            // dnew is the shortest currently possible distance,
+                            // so col can be put into q_j and be marked
+                            // accordingly.
+                            generation[col] = -num_rows - root;
+                            q_j.push_back(col);
+                        } else if (gen != root) {
+                            // col was not encountered before.
+                            generation[col] = root;
+                            queue.insert(dnew, col);
+                        } else {
+                            // col was already encountered but with larger
+                            // distance on a different path.
+                            generation[col] = root;
+                            queue.update_key(dnew, col);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    if (lsap != inf) {
+        IndexType col = jsap;
+        // Update the matching along the shortest augmenting path.
+        do {
+            row = parents[col];
+            ip[col] = row;
+            auto idx = row_ptrs[row];
+            while (col_idxs[idx] != col) {
+                idx++;
+            }
+            idxs[row] = idx;
+            std::swap(col, p[row]);
+        } while (row != root);
+        // Update the dual vector u.
+        for (size_type i = 0; i < marked_counter; i++) {
+            const auto marked_col = marked_cols[i];
+            dual_u[marked_col] += distance[marked_col] - lsap;
+        }
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void augment_matching(const matrix::Csr<ValueType, IndexType>* host_mtx,
+                      array<remove_complex<ValueType>>& weights,
+                      array<remove_complex<ValueType>>& dual_u,
+                      array<remove_complex<ValueType>>& distance,
+                      array<IndexType>& permutation,
+                      array<IndexType>& inv_permutation,
+                      array<IndexType>& unmatched_rows,
+                      array<IndexType>& parents, array<IndexType>& generation,
+                      array<IndexType>& marked_cols,
+                      array<IndexType>& matched_idxs,
+                      remove_complex<ValueType> tolerance)
+{
+    const auto host_exec = host_mtx->get_executor();
+    const auto num_rows = host_mtx->get_size()[0];
+    const auto row_ptrs = host_mtx->get_const_row_ptrs();
+    const auto col_idxs = host_mtx->get_const_col_idxs();
+    addressable_priority_queue<remove_complex<ValueType>, IndexType> queue{
+        host_exec, num_rows};
+    // For each row that is not contained in the initial matching, search for
+    // an augmenting path, update the matching and compute the new entries
+    // of the dual vectors.
+    std::vector<IndexType> q_j{};
+    const auto unmatched = unmatched_rows.get_data();
+    auto root = unmatched[0];
+    for (size_type um = 1; root != 0 && um < num_rows; um++) {
+        if (root != invalid_index<IndexType>()) {
+            mc64::shortest_augmenting_path(
+                num_rows, row_ptrs, col_idxs, weights, dual_u, distance,
+                permutation, inv_permutation, root, parents, generation,
+                marked_cols, matched_idxs, queue, q_j, tolerance);
+        }
+        root = unmatched[um];
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+void compute_scaling(const matrix::Csr<ValueType, IndexType>* host_mtx,
+                     const array<remove_complex<ValueType>>& weights_array,
+                     const array<remove_complex<ValueType>>& dual_u_array,
+                     const array<remove_complex<ValueType>>& row_maxima_array,
+                     const array<IndexType>& permutation,
+                     const array<IndexType>& matched_idxs_array,
+                     mc64_strategy strategy, ValueType* row_scaling,
+                     ValueType* col_scaling)
+{
+    constexpr auto inf =
+        std::numeric_limits<remove_complex<ValueType>>::infinity();
+    const auto num_rows = host_mtx->get_size()[0];
+    const auto weights = weights_array.get_const_data();
+    const auto dual_u = dual_u_array.get_const_data();
+    const auto row_maxima = row_maxima_array.get_const_data();
+    const auto p = permutation.get_const_data();
+    const auto idxs = matched_idxs_array.get_const_data();
+
+    if (strategy == mc64_strategy::max_diagonal_product) {
+        for (size_type i = 0; i < num_rows; i++) {
+            const remove_complex<ValueType> u_val = std::exp2(dual_u[i]);
+            const remove_complex<ValueType> v_val =
+                std::exp2(weights[idxs[i]] - dual_u[p[i]] - row_maxima[i]);
+            col_scaling[i] = ValueType{u_val};
+            row_scaling[i] = ValueType{v_val};
+        }
+    } else {
+        for (size_type i = 0; i < num_rows; i++) {
+            col_scaling[i] = 1.;
+            row_scaling[i] = 1.;
+        }
+    }
+}
+
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_MC64_INITIALIZE_WEIGHTS);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_MC64_INITIAL_MATCHING);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_MC64_SHORTEST_AUGMENTING_PATH);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_MC64_COMPUTE_SCALING);
+
+
+}  // namespace mc64
+
+
+namespace {
+
+
+GKO_REGISTER_HOST_OPERATION(initialize_weights, mc64::initialize_weights);
+GKO_REGISTER_HOST_OPERATION(initial_matching, mc64::initial_matching);
+GKO_REGISTER_HOST_OPERATION(augment_matching, mc64::augment_matching);
+GKO_REGISTER_HOST_OPERATION(compute_scaling, mc64::compute_scaling);
+GKO_REGISTER_OPERATION(fill_seq_array, components::fill_seq_array);
+
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<Composition<ValueType>> Mc64<ValueType, IndexType>::generate(
+    std::shared_ptr<const LinOp> system_matrix) const
+{
+    auto product = std::unique_ptr<Composition<ValueType>>(
+        static_cast<Composition<ValueType>*>(
+            this->LinOpFactory::generate(std::move(system_matrix)).release()));
+    return product;
+}
+
+
+template <typename ValueType, typename IndexType>
+Mc64<ValueType, IndexType>::Mc64(std::shared_ptr<const Executor> exec,
+                                 const parameters_type& params)
+    : EnablePolymorphicObject<Mc64, LinOpFactory>(std::move(exec)),
+      parameters_{params}
+{}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<LinOp> Mc64<ValueType, IndexType>::generate_impl(
+    std::shared_ptr<const LinOp> system_matrix) const
+{
+    const auto exec = this->get_executor();
+    const auto host_exec = exec->get_master();
+    const auto mtx =
+        copy_and_convert_to<matrix_type>(host_exec, system_matrix.get());
+    const auto num_rows = mtx->get_size()[0];
+    const auto nnz = mtx->get_num_stored_elements();
+
+    // Real valued arrays with space for:
+    //     - nnz entries for weights
+    //     - num_rows entries each for the dual vector u, distance information
+    //       and the max weight per row
+    array<remove_complex<ValueType>> weights{host_exec, nnz};
+    array<remove_complex<ValueType>> dual_u{host_exec, num_rows};
+    array<remove_complex<ValueType>> distance{host_exec, num_rows};
+    array<remove_complex<ValueType>> row_maxima{host_exec, num_rows};
+    // Zero initialized index arrays with space for n entries each for parent
+    // information, priority queue handles, generation information, marked
+    // columns, indices corresponding to matched columns in the according row
+    // and still unmatched rows
+    array<IndexType> parents{host_exec, num_rows};
+    array<IndexType> generation{host_exec, num_rows};
+    array<IndexType> marked_cols{host_exec, num_rows};
+    array<IndexType> matched_idxs{host_exec, num_rows};
+    array<IndexType> unmatched_rows{host_exec, num_rows};
+    array<ValueType> row_scaling{host_exec, num_rows};
+    array<ValueType> col_scaling{host_exec, num_rows};
+    parents.fill(0);
+    generation.fill(0);
+    marked_cols.fill(0);
+    matched_idxs.fill(0);
+    unmatched_rows.fill(0);
+    constexpr auto inf =
+        std::numeric_limits<remove_complex<ValueType>>::infinity();
+    dual_u.fill(inf);
+    distance.fill(inf);
+
+    array<IndexType> permutation{host_exec, num_rows};
+    array<IndexType> inv_permutation{host_exec, num_rows};
+    permutation.fill(invalid_index<IndexType>());
+    inv_permutation.fill(invalid_index<IndexType>());
+
+    const auto row_ptrs = mtx->get_const_row_ptrs();
+    const auto col_idxs = mtx->get_const_col_idxs();
+
+    if (num_rows > 0) {
+        exec->run(make_initialize_weights(mtx.get(), weights, dual_u,
+                                          row_maxima, parameters_.strategy));
+
+        // Compute an initial maximum matching from the nonzero entries for
+        // which the reduced weight (W(i, j) - u(j) - v(i)) is zero. Here, W is
+        // the weight matrix and u and v are the dual vectors. Note that v
+        // initially only contains zeros and hence can still be ignored here.
+        exec->run(make_initial_matching(num_rows, row_ptrs, col_idxs, weights,
+                                        dual_u, permutation, inv_permutation,
+                                        matched_idxs, unmatched_rows,
+                                        parameters_.tolerance));
+
+        exec->run(make_augment_matching(
+            mtx.get(), weights, dual_u, distance, permutation, inv_permutation,
+            unmatched_rows, parents, generation, marked_cols, matched_idxs,
+            this->get_parameters().tolerance));
+
+        exec->run(make_compute_scaling(
+            mtx.get(), weights, dual_u, row_maxima, permutation, matched_idxs,
+            parameters_.strategy, row_scaling.get_data(),
+            col_scaling.get_data()));
+    }
+
+    array<index_type> identity_permutation{exec, num_rows};
+    exec->run(make_fill_seq_array(identity_permutation.get_data(), num_rows));
+
+    using perm_type = gko::matrix::ScaledPermutation<ValueType, IndexType>;
+    return result_type::create(
+        perm_type::create(exec, std::move(row_scaling),
+                          std::move(inv_permutation)),
+        perm_type::create(exec, std::move(col_scaling),
+                          std::move(identity_permutation)));
+}
+
+
+#define GKO_DECLARE_MC64(ValueType, IndexType) class Mc64<ValueType, IndexType>
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_MC64);
+
+
+}  // namespace reorder
+}  // namespace experimental
+}  // namespace gko
diff --git a/core/reorder/mc64.hpp b/core/reorder/mc64.hpp
new file mode 100644
index 00000000000..5ec0e94cd57
--- /dev/null
+++ b/core/reorder/mc64.hpp
@@ -0,0 +1,112 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_REORDER_MC64_HPP_
+#define GKO_CORE_REORDER_MC64_HPP_
+
+
+#include <ginkgo/core/reorder/mc64.hpp>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+
+#include "core/components/addressable_pq.hpp"
+
+
+namespace gko {
+namespace experimental {
+namespace reorder {
+namespace mc64 {
+
+
+#define GKO_DECLARE_MC64_INITIALIZE_WEIGHTS(ValueType, IndexType) \
+    void initialize_weights(                                      \
+        const matrix::Csr<ValueType, IndexType>* mtx,             \
+        array<remove_complex<ValueType>>& weights_array,          \
+        array<remove_complex<ValueType>>& dual_u_array,           \
+        array<remove_complex<ValueType>>& row_maxima_array,       \
+        gko::experimental::reorder::mc64_strategy strategy)
+
+#define GKO_DECLARE_MC64_INITIAL_MATCHING(ValueType, IndexType)              \
+    void initial_matching(                                                   \
+        size_type num_rows, const IndexType* row_ptrs,                       \
+        const IndexType* col_idxs, const array<ValueType>& weights_array,    \
+        const array<ValueType>& dual_u_array, array<IndexType>& permutation, \
+        array<IndexType>& inv_permutation,                                   \
+        array<IndexType>& matched_idxs_array,                                \
+        array<IndexType>& unmatched_rows_array, ValueType tolerance)
+
+#define GKO_DECLARE_MC64_SHORTEST_AUGMENTING_PATH(ValueType, IndexType)   \
+    void shortest_augmenting_path(                                        \
+        size_type num_rows, const IndexType* row_ptrs,                    \
+        const IndexType* col_idxs, array<ValueType>& weights_array,       \
+        array<ValueType>& dual_u_array, array<ValueType>& distance_array, \
+        array<IndexType>& permutation, array<IndexType>& inv_permutation, \
+        IndexType root, array<IndexType>& parents_array,                  \
+        array<IndexType>& generation_array,                               \
+        array<IndexType>& marked_cols_array,                              \
+        array<IndexType>& matched_idxs_array,                             \
+        addressable_priority_queue<ValueType, IndexType>& queue,          \
+        std::vector<IndexType>& q_j, ValueType tolerance)
+
+#define GKO_DECLARE_MC64_COMPUTE_SCALING(ValueType, IndexType)              \
+    void compute_scaling(                                                   \
+        const matrix::Csr<ValueType, IndexType>* mtx,                       \
+        const array<remove_complex<ValueType>>& weights_array,              \
+        const array<remove_complex<ValueType>>& dual_u_array,               \
+        const array<remove_complex<ValueType>>& row_maxima_array,           \
+        const array<IndexType>& permutation,                                \
+        const array<IndexType>& matched_idxs_array, mc64_strategy strategy, \
+        ValueType* row_scaling, ValueType* col_scaling)
+
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_MC64_INITIALIZE_WEIGHTS(ValueType, IndexType);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_MC64_INITIAL_MATCHING(ValueType, IndexType);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_MC64_SHORTEST_AUGMENTING_PATH(ValueType, IndexType);
+
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_MC64_COMPUTE_SCALING(ValueType, IndexType);
+
+
+}  // namespace mc64
+}  // namespace reorder
+}  // namespace experimental
+}  // namespace gko
+
+#endif  // GKO_CORE_REORDER_MC64_HPP_
diff --git a/core/reorder/nested_dissection.cpp b/core/reorder/nested_dissection.cpp
index caf85c979e5..2f501ad1d96 100644
--- a/core/reorder/nested_dissection.cpp
+++ b/core/reorder/nested_dissection.cpp
@@ -183,8 +183,7 @@ std::unique_ptr<LinOp> NestedDissection<ValueType, IndexType>::generate_impl(
                             inv_permutation.get_data()));
     permutation.set_executor(exec);
     // we discard the inverse permutation
-    return permutation_type::create(exec, dim<2>{num_rows, num_rows},
-                                    std::move(permutation));
+    return permutation_type::create(exec, std::move(permutation));
 }
 
 
diff --git a/core/reorder/rcm.cpp b/core/reorder/rcm.cpp
index ce4c26225a1..fcc76676871 100644
--- a/core/reorder/rcm.cpp
+++ b/core/reorder/rcm.cpp
@@ -66,22 +66,86 @@ GKO_REGISTER_OPERATION(get_degree_of_nodes, rcm::get_degree_of_nodes);
 
 
 template <typename ValueType, typename IndexType>
-void Rcm<ValueType, IndexType>::generate(
-    std::shared_ptr<const Executor>& exec,
-    std::unique_ptr<SparsityMatrix> adjacency_matrix) const
+void rcm_reorder(const matrix::SparsityCsr<ValueType, IndexType>* mtx,
+                 IndexType* permutation, IndexType* inv_permutation,
+                 starting_strategy strategy)
 {
-    const IndexType num_rows = adjacency_matrix->get_size()[0];
-    const auto mtx = adjacency_matrix.get();
-    auto degrees = array<IndexType>(exec, num_rows);
-    // RCM is only valid for symmetric matrices. Need to add an expensive check
-    // for symmetricity here ?
+    const auto exec = mtx->get_executor();
+    const IndexType num_rows = mtx->get_size()[0];
+    array<IndexType> degrees{exec, mtx->get_size()[0]};
     exec->run(rcm::make_get_degree_of_nodes(num_rows, mtx->get_const_row_ptrs(),
                                             degrees.get_data()));
     exec->run(rcm::make_get_permutation(
         num_rows, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
-        degrees.get_const_data(), permutation_->get_permutation(),
-        inv_permutation_.get() ? inv_permutation_->get_permutation() : nullptr,
-        parameters_.strategy));
+        degrees.get_const_data(), permutation, inv_permutation, strategy));
+}
+
+
+template <typename ValueType, typename IndexType>
+Rcm<ValueType, IndexType>::Rcm(std::shared_ptr<const Executor> exec)
+    : EnablePolymorphicObject<Rcm, ReorderingBase<IndexType>>(std::move(exec))
+{}
+
+
+template <typename ValueType, typename IndexType>
+Rcm<ValueType, IndexType>::Rcm(const Factory* factory,
+                               const ReorderingBaseArgs& args)
+    : EnablePolymorphicObject<Rcm, ReorderingBase<IndexType>>(
+          factory->get_executor()),
+      parameters_{factory->get_parameters()}
+{
+    // Always execute the reordering on the cpu.
+    const auto is_gpu_executor =
+        this->get_executor() != this->get_executor()->get_master();
+    auto cpu_exec = is_gpu_executor ? this->get_executor()->get_master()
+                                    : this->get_executor();
+
+    auto adjacency_matrix = SparsityMatrix::create(cpu_exec);
+    array<IndexType> degrees;
+
+    // The adjacency matrix has to be square.
+    GKO_ASSERT_IS_SQUARE_MATRIX(args.system_matrix);
+    // This is needed because it does not make sense to call the copy and
+    // convert if the existing matrix is empty.
+    if (args.system_matrix->get_size()) {
+        auto tmp =
+            copy_and_convert_to<SparsityMatrix>(cpu_exec, args.system_matrix);
+        // This function provided within the Sparsity matrix format removes
+        // the diagonal elements and outputs an adjacency matrix.
+        adjacency_matrix = tmp->to_adjacency_matrix();
+    }
+
+    auto const size = adjacency_matrix->get_size()[0];
+    permutation_ = PermutationMatrix::create(cpu_exec, size);
+
+    // To make it explicit.
+    inv_permutation_ = nullptr;
+    if (parameters_.construct_inverse_permutation) {
+        inv_permutation_ = PermutationMatrix::create(cpu_exec, size);
+    }
+
+    rcm_reorder(
+        adjacency_matrix.get(), permutation_->get_permutation(),
+        inv_permutation_ ? inv_permutation_->get_permutation() : nullptr,
+        parameters_.strategy);
+
+    // Copy back results to gpu if necessary.
+    if (is_gpu_executor) {
+        const auto gpu_exec = this->get_executor();
+        auto gpu_perm = share(PermutationMatrix::create(gpu_exec, size));
+        gpu_perm->copy_from(permutation_);
+        permutation_ = gpu_perm;
+        if (inv_permutation_) {
+            auto gpu_inv_perm =
+                share(PermutationMatrix::create(gpu_exec, size));
+            gpu_inv_perm->copy_from(inv_permutation_);
+            inv_permutation_ = gpu_inv_perm;
+        }
+    }
+    auto permutation_array =
+        make_array_view(this->get_executor(), permutation_->get_size()[0],
+                        permutation_->get_permutation());
+    this->set_permutation_array(permutation_array);
 }
 
 
@@ -90,4 +154,95 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_RCM);
 
 
 }  // namespace reorder
+
+
+namespace experimental {
+namespace reorder {
+
+
+template <typename IndexType>
+Rcm<IndexType>::Rcm(std::shared_ptr<const Executor> exec,
+                    const parameters_type& params)
+    : EnablePolymorphicObject<Rcm, LinOpFactory>(std::move(exec)),
+      parameters_{params}
+{}
+
+
+template <typename IndexType>
+std::unique_ptr<matrix::Permutation<IndexType>> Rcm<IndexType>::generate(
+    std::shared_ptr<const LinOp> system_matrix) const
+{
+    auto product =
+        std::unique_ptr<permutation_type>(static_cast<permutation_type*>(
+            this->LinOpFactory::generate(std::move(system_matrix)).release()));
+    return product;
+}
+
+
+template <typename IndexType>
+std::unique_ptr<LinOp> Rcm<IndexType>::generate_impl(
+    std::shared_ptr<const LinOp> system_matrix) const
+{
+    GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix);
+    const auto exec = this->get_executor();
+    const auto host_exec = exec->get_master();
+    const auto num_rows = system_matrix->get_size()[0];
+    using sparsity_mtx = matrix::SparsityCsr<float, IndexType>;
+    std::unique_ptr<LinOp> converted;
+    // extract row pointers and column indices
+    const IndexType* row_ptrs{};
+    const IndexType* col_idxs{};
+    size_type nnz{};
+    auto convert = [&](auto op, auto value_type) {
+        using ValueType = std::decay_t<decltype(value_type)>;
+        using Identity = matrix::Identity<ValueType>;
+        using Mtx = matrix::Csr<ValueType, IndexType>;
+        using Scalar = matrix::Dense<ValueType>;
+        auto conv_csr = Mtx::create(host_exec);
+        as<ConvertibleTo<Mtx>>(op)->convert_to(conv_csr);
+        if (!parameters_.skip_symmetrize) {
+            auto scalar = initialize<Scalar>({one<ValueType>()}, exec);
+            auto id = Identity::create(exec, conv_csr->get_size()[0]);
+            // compute A^T + A
+            conv_csr->transpose()->apply(scalar, id, scalar, conv_csr);
+        }
+        if (exec != host_exec) {
+            conv_csr = gko::clone(host_exec, std::move(conv_csr));
+        }
+        nnz = conv_csr->get_num_stored_elements();
+        row_ptrs = conv_csr->get_const_row_ptrs();
+        col_idxs = conv_csr->get_const_col_idxs();
+        converted = std::move(conv_csr);
+    };
+    if (auto convertible =
+            dynamic_cast<const ConvertibleTo<matrix::Csr<float, IndexType>>*>(
+                system_matrix.get())) {
+        convert(system_matrix, float{});
+    } else {
+        convert(system_matrix, std::complex<float>{});
+    }
+
+    array<IndexType> permutation(host_exec, num_rows);
+
+    // remove diagonal entries
+    auto pattern = sparsity_mtx::create_const(
+        host_exec, gko::dim<2>{num_rows, num_rows},
+        make_const_array_view(host_exec, nnz, col_idxs),
+        make_const_array_view(host_exec, num_rows + 1, row_ptrs));
+    pattern = pattern->to_adjacency_matrix();
+    rcm_reorder(pattern.get(), permutation.get_data(),
+                static_cast<IndexType*>(nullptr), parameters_.strategy);
+
+    // permutation gets copied to device via gko::array constructor
+    return permutation_type::create(exec, std::move(permutation));
+}
+
+
+#undef GKO_DECLARE_RCM
+#define GKO_DECLARE_RCM(IndexType) class Rcm<IndexType>
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_RCM);
+
+
+}  // namespace reorder
+}  // namespace experimental
 }  // namespace gko
diff --git a/core/solver/batch_bicgstab.cpp b/core/solver/batch_bicgstab.cpp
new file mode 100644
index 00000000000..b4219bbc5fa
--- /dev/null
+++ b/core/solver/batch_bicgstab.cpp
@@ -0,0 +1,79 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/solver/batch_bicgstab.hpp>
+
+
+#include <ginkgo/core/base/batch_lin_op.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "core/base/batch_multi_vector_kernels.hpp"
+#include "core/solver/batch_bicgstab_kernels.hpp"
+
+
+namespace gko {
+namespace batch {
+namespace solver {
+namespace bicgstab {
+
+
+GKO_REGISTER_OPERATION(apply, batch_bicgstab::apply);
+
+
+}  // namespace bicgstab
+
+
+template <typename ValueType>
+void Bicgstab<ValueType>::solver_apply(
+    const MultiVector<ValueType>* b, MultiVector<ValueType>* x,
+    log::detail::log_data<remove_complex<ValueType>>* log_data) const
+{
+    using MVec = MultiVector<ValueType>;
+    const kernels::batch_bicgstab::settings<remove_complex<ValueType>> settings{
+        this->max_iterations_, static_cast<real_type>(this->residual_tol_),
+        parameters_.tolerance_type};
+    auto exec = this->get_executor();
+    exec->run(bicgstab::make_apply(settings, this->system_matrix_.get(),
+                                   this->preconditioner_.get(), b, x,
+                                   *log_data));
+}
+
+
+#define GKO_DECLARE_BATCH_BICGSTAB(_type) class Bicgstab<_type>
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB);
+
+
+}  // namespace solver
+}  // namespace batch
+}  // namespace gko
diff --git a/core/solver/batch_bicgstab_kernels.hpp b/core/solver/batch_bicgstab_kernels.hpp
new file mode 100644
index 00000000000..32291562afd
--- /dev/null
+++ b/core/solver/batch_bicgstab_kernels.hpp
@@ -0,0 +1,230 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
+#define GKO_CORE_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/log/batch_logger.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+#include <ginkgo/core/stop/batch_stop_enum.hpp>
+
+
+#include "core/base/kernel_declaration.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace batch_bicgstab {
+
+
+/**
+ * Options controlling the batch Bicgstab solver.
+ */
+template <typename RealType>
+struct settings {
+    static_assert(std::is_same<RealType, remove_complex<RealType>>::value,
+                  "Template parameter must be a real type");
+    int max_iterations;
+    RealType residual_tol;
+    ::gko::batch::stop::tolerance_type tol_type;
+};
+
+
+/**
+ * Calculates the amount of in-solver storage needed by batch-Bicgstab.
+ *
+ * The calculation includes multivectors for
+ * - r
+ * - r_hat
+ * - p
+ * - p_hat
+ * - v
+ * - s
+ * - s_hat
+ * - t
+ * - x
+ * Note: small arrays for
+ * - rho_old
+ * - rho_new
+ * - omega
+ * - alpha
+ * - temp
+ * - rhs_norms
+ * - res_norms
+ * are currently not accounted for as they are in static shared memory.
+ */
+template <typename ValueType>
+inline int local_memory_requirement(const int num_rows, const int num_rhs)
+{
+    return (9 * num_rows * num_rhs) * sizeof(ValueType);
+}
+
+
+struct storage_config {
+    // preconditioner storage
+    bool prec_shared;
+    // total number of shared vectors
+    int n_shared;
+    // number of vectors in global memory
+    int n_global;
+    // global stride from one batch entry to the next
+    int gmem_stride_bytes;
+    // padded vector length
+    int padded_vec_len;
+};
+
+
+template <int align_bytes>
+void set_gmem_stride_bytes(storage_config& sconf,
+                           const int multi_vector_size_bytes,
+                           const int prec_storage_bytes)
+{
+    int gmem_stride = sconf.n_global * multi_vector_size_bytes;
+    if (!sconf.prec_shared) {
+        gmem_stride += prec_storage_bytes;
+    }
+    // align global memory chunks
+    sconf.gmem_stride_bytes =
+        gmem_stride > 0 ? ceildiv(gmem_stride, align_bytes) * align_bytes : 0;
+}
+
+
+/**
+ * Calculates the amount of in-solver storage needed by batch-Bicgstab and
+ * the split between shared and global memory.
+ *
+ * The calculation includes multivectors for
+ * - r
+ * - r_hat
+ * - p
+ * - p_hat
+ * - v
+ * - s
+ * - s_hat
+ * - t
+ * - x
+ * In addition, small arrays are needed for
+ * - rho_old
+ * - rho_new
+ * - omega
+ * - alpha
+ * - temp
+ * - rhs_norms
+ * - res_norms
+ *
+ * @param available_shared_mem  The amount of shared memory per block to use
+ * for keeping intermediate vectors. In case keeping the matrix in L1 cache etc.
+ *   should be prioritized, the cache configuration must be updated separately
+ *   and the needed space should be subtracted before passing to this
+ *   function.
+ * @param num_rows  Size of the matrix.
+ * @param num_nz  Number of nonzeros in the matrix
+ * @param num_rhs  Number of right-hand-sides in the vectors.
+ * @return  A struct containing allocation information specific to Bicgstab.
+ */
+template <typename Prectype, typename ValueType, int align_bytes = 32>
+storage_config compute_shared_storage(const int available_shared_mem,
+                                      const int num_rows, const int num_nz,
+                                      const int num_rhs)
+{
+    using real_type = remove_complex<ValueType>;
+    const int vec_size = num_rows * num_rhs * sizeof(ValueType);
+    const int num_main_vecs = 9;
+    const int prec_storage =
+        Prectype::dynamic_work_size(num_rows, num_nz) * sizeof(ValueType);
+    int rem_shared = available_shared_mem;
+    // Set default values. Initially all vecs are in global memory.
+    // {prec_shared, n_shared, n_global, gmem_stride_bytes, padded_vec_len}
+    storage_config sconf{false, 0, num_main_vecs, 0, num_rows};
+    // If available shared mem is zero, set all vecs to global.
+    if (rem_shared <= 0) {
+        set_gmem_stride_bytes<align_bytes>(sconf, vec_size, prec_storage);
+        return sconf;
+    }
+    // Compute the number of vecs that can be stored in shared memory and assign
+    // the rest to global memory.
+    const int initial_vecs_available = rem_shared / vec_size;
+    const int num_vecs_shared = min(initial_vecs_available, num_main_vecs);
+    sconf.n_shared += num_vecs_shared;
+    sconf.n_global -= num_vecs_shared;
+    rem_shared -= num_vecs_shared * vec_size;
+    // Set the storage configuration with preconditioner workspace in global if
+    // there are any vectors in global memory.
+    if (sconf.n_global > 0) {
+        set_gmem_stride_bytes<align_bytes>(sconf, vec_size, prec_storage);
+        return sconf;
+    }
+    // If more shared memory space is available and preconditioner workspace is
+    // needed, enable preconditioner workspace to use shared memory.
+    if (rem_shared >= prec_storage && prec_storage > 0) {
+        sconf.prec_shared = true;
+        rem_shared -= prec_storage;
+    }
+    // Set the global storage config and align to align_bytes bytes.
+    set_gmem_stride_bytes<align_bytes>(sconf, vec_size, prec_storage);
+    return sconf;
+}
+
+
+}  // namespace batch_bicgstab
+
+
+#define GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL(_type)                       \
+    void apply(                                                              \
+        std::shared_ptr<const DefaultExecutor> exec,                         \
+        const gko::kernels::batch_bicgstab::settings<remove_complex<_type>>& \
+            options,                                                         \
+        const batch::BatchLinOp* a, const batch::BatchLinOp* preconditioner, \
+        const batch::MultiVector<_type>* b, batch::MultiVector<_type>* x,    \
+        gko::batch::log::detail::log_data<remove_complex<_type>>& logdata)
+
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES \
+    template <typename ValueType>    \
+    GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL(ValueType)
+
+
+GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(batch_bicgstab,
+                                        GKO_DECLARE_ALL_AS_TEMPLATES);
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CORE_SOLVER_BATCH_BICGSTAB_KERNELS_HPP_
diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp
new file mode 100644
index 00000000000..84afd45cb1a
--- /dev/null
+++ b/core/solver/batch_dispatch.hpp
@@ -0,0 +1,342 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_SOLVER_BATCH_DISPATCH_HPP_
+#define GKO_CORE_SOLVER_BATCH_DISPATCH_HPP_
+
+
+#include <ginkgo/core/base/batch_lin_op.hpp>
+#include <ginkgo/core/log/batch_logger.hpp>
+#include <ginkgo/core/matrix/batch_identity.hpp>
+#include <ginkgo/core/stop/batch_stop_enum.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+
+
+#if defined GKO_COMPILING_CUDA
+
+
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/log/batch_logger.cuh"
+#include "cuda/matrix/batch_struct.hpp"
+#include "cuda/preconditioner/batch_preconditioners.cuh"
+#include "cuda/stop/batch_criteria.cuh"
+
+
+namespace gko {
+namespace batch {
+namespace solver {
+
+
+namespace device = gko::kernels::cuda;
+
+
+template <typename ValueType>
+using DeviceValueType = typename gko::kernels::cuda::cuda_type<ValueType>;
+
+
+}  // namespace solver
+}  // namespace batch
+}  // namespace gko
+
+
+#elif defined GKO_COMPILING_HIP
+
+
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/log/batch_logger.hip.hpp"
+#include "hip/matrix/batch_struct.hip.hpp"
+#include "hip/preconditioner/batch_preconditioners.hip.hpp"
+#include "hip/stop/batch_criteria.hip.hpp"
+
+
+namespace gko {
+namespace batch {
+namespace solver {
+
+
+namespace device = gko::kernels::hip;
+
+
+template <typename ValueType>
+using DeviceValueType = gko::kernels::hip::hip_type<ValueType>;
+
+
+}  // namespace solver
+}  // namespace batch
+}  // namespace gko
+
+
+#elif defined GKO_COMPILING_DPCPP
+
+
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/log/batch_logger.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+#include "dpcpp/preconditioner/batch_preconditioners.hpp"
+#include "dpcpp/stop/batch_criteria.hpp"
+
+
+namespace gko {
+namespace batch {
+namespace solver {
+
+
+namespace device = gko::kernels::dpcpp;
+
+
+template <typename ValueType>
+using DeviceValueType = ValueType;
+
+
+}  // namespace solver
+}  // namespace batch
+}  // namespace gko
+
+
+#else
+
+
+#include "reference/base/batch_struct.hpp"
+#include "reference/log/batch_logger.hpp"
+#include "reference/matrix/batch_struct.hpp"
+#include "reference/preconditioner/batch_identity.hpp"
+#include "reference/stop/batch_criteria.hpp"
+
+
+namespace gko {
+namespace batch {
+namespace solver {
+
+
+namespace device = gko::kernels::host;
+
+
+template <typename ValueType>
+using DeviceValueType = ValueType;
+
+
+}  // namespace solver
+}  // namespace batch
+}  // namespace gko
+
+
+#endif
+
+
+namespace gko {
+namespace batch {
+namespace solver {
+
+
+template <typename DValueType>
+class kernel_caller_interface {
+public:
+    template <typename BatchMatrixType, typename PrecType, typename StopType,
+              typename LogType>
+    void call_kernel(LogType logger, const BatchMatrixType& mat,
+                     const multi_vector::uniform_batch<DValueType>& b,
+                     const multi_vector::uniform_batch<DValueType>& x) const
+    {}
+};
+
+
+namespace log {
+namespace detail {
+/**
+ *
+ * Types of batch loggers available.
+ */
+enum class log_type { simple_convergence_completion };
+
+
+}  // namespace detail
+}  // namespace log
+
+
+/**
+ * Handles dispatching to the correct instantiation of a batched solver
+ * depending on runtime parameters.
+ *
+ * @tparam ValueType  The user-facing value type.
+ * @tparam KernelCaller  Class with an interface like kernel_caller_interface,
+ *   that is responsible for finally calling the templated backend-specific
+ *   kernel.
+ * @tparam SettingsType  Structure type of options for the particular solver to
+ * be used.
+ */
+template <typename ValueType, typename KernelCaller, typename SettingsType>
+class batch_solver_dispatch {
+public:
+    using value_type = ValueType;
+    using device_value_type = DeviceValueType<ValueType>;
+    using real_type = remove_complex<value_type>;
+
+    batch_solver_dispatch(
+        const KernelCaller& kernel_caller, const SettingsType& settings,
+        const BatchLinOp* const matrix, const BatchLinOp* const preconditioner,
+        const log::detail::log_type logger_type =
+            log::detail::log_type::simple_convergence_completion)
+        : caller_{kernel_caller},
+          settings_{settings},
+          mat_{matrix},
+          precond_{preconditioner},
+          logger_type_{logger_type}
+    {}
+
+    template <typename PrecType, typename BatchMatrixType, typename LogType>
+    void dispatch_on_stop(
+        const LogType& logger, const BatchMatrixType& mat_item,
+        PrecType precond,
+        const multi_vector::uniform_batch<const device_value_type>& b_item,
+        const multi_vector::uniform_batch<device_value_type>& x_item)
+    {
+        if (settings_.tol_type == stop::tolerance_type::absolute) {
+            caller_.template call_kernel<
+                BatchMatrixType, PrecType,
+                device::batch_stop::SimpleAbsResidual<device_value_type>,
+                LogType>(logger, mat_item, precond, b_item, x_item);
+        } else if (settings_.tol_type == stop::tolerance_type::relative) {
+            caller_.template call_kernel<
+                BatchMatrixType, PrecType,
+                device::batch_stop::SimpleRelResidual<device_value_type>,
+                LogType>(logger, mat_item, precond, b_item, x_item);
+        } else {
+            GKO_NOT_IMPLEMENTED;
+        }
+    }
+
+    template <typename BatchMatrixType, typename LogType>
+    void dispatch_on_preconditioner(
+        const LogType& logger, const BatchMatrixType& mat_item,
+        const multi_vector::uniform_batch<const device_value_type>& b_item,
+        const multi_vector::uniform_batch<device_value_type>& x_item)
+    {
+        if (!precond_ ||
+            dynamic_cast<const matrix::Identity<value_type>*>(precond_)) {
+            dispatch_on_stop<
+                device::batch_preconditioner::Identity<device_value_type>>(
+                logger, mat_item,
+                device::batch_preconditioner::Identity<device_value_type>(),
+                b_item, x_item);
+        } else {
+            GKO_NOT_IMPLEMENTED;
+        }
+    }
+
+    template <typename BatchMatrixType>
+    void dispatch_on_logger(
+        const BatchMatrixType& amat,
+        const multi_vector::uniform_batch<const device_value_type>& b_item,
+        const multi_vector::uniform_batch<device_value_type>& x_item,
+        batch::log::detail::log_data<real_type>& log_data)
+    {
+        if (logger_type_ ==
+            log::detail::log_type::simple_convergence_completion) {
+            device::batch_log::SimpleFinalLogger<real_type> logger(
+                log_data.res_norms.get_data(), log_data.iter_counts.get_data());
+            dispatch_on_preconditioner(logger, amat, b_item, x_item);
+        } else {
+            GKO_NOT_IMPLEMENTED;
+        }
+    }
+
+    void dispatch_on_matrix(
+        const multi_vector::uniform_batch<const device_value_type>& b_item,
+        const multi_vector::uniform_batch<device_value_type>& x_item,
+        batch::log::detail::log_data<real_type>& log_data)
+    {
+        if (auto batch_mat =
+                dynamic_cast<const batch::matrix::Ell<ValueType, int32>*>(
+                    mat_)) {
+            auto mat_item = device::get_batch_struct(batch_mat);
+            dispatch_on_logger(mat_item, b_item, x_item, log_data);
+        } else if (auto batch_mat =
+                       dynamic_cast<const batch::matrix::Dense<ValueType>*>(
+                           mat_)) {
+            auto mat_item = device::get_batch_struct(batch_mat);
+            dispatch_on_logger(mat_item, b_item, x_item, log_data);
+        } else {
+            GKO_NOT_SUPPORTED(mat_);
+        }
+    }
+
+    /**
+     * Solves a linear system from the given data and kernel caller.
+     *
+     * @note The correct backend-specific get_batch_struct function needs to be
+     * available in the current scope.
+     */
+    void apply(const MultiVector<ValueType>* const b,
+               MultiVector<ValueType>* const x,
+               batch::log::detail::log_data<real_type>& log_data)
+    {
+        const auto x_item = device::get_batch_struct(x);
+        const auto b_item = device::get_batch_struct(b);
+
+        dispatch_on_matrix(b_item, x_item, log_data);
+    }
+
+private:
+    const KernelCaller caller_;
+    const SettingsType settings_;
+    const BatchLinOp* mat_;
+    const BatchLinOp* precond_;
+    const log::detail::log_type logger_type_;
+};
+
+
+/**
+ * Convenient function to create a dispatcher. Infers most template arguments.
+ */
+template <typename ValueType, typename KernelCaller, typename SettingsType>
+batch_solver_dispatch<ValueType, KernelCaller, SettingsType> create_dispatcher(
+    const KernelCaller& kernel_caller, const SettingsType& settings,
+    const BatchLinOp* const matrix, const BatchLinOp* const preconditioner,
+    const log::detail::log_type logger_type =
+        log::detail::log_type::simple_convergence_completion)
+{
+    return batch_solver_dispatch<ValueType, KernelCaller, SettingsType>(
+        kernel_caller, settings, matrix, preconditioner, logger_type);
+}
+
+
+}  // namespace solver
+}  // namespace batch
+}  // namespace gko
+
+
+#endif  // GKO_CORE_SOLVER_BATCH_DISPATCH_HPP_
diff --git a/core/solver/gcr.cpp b/core/solver/gcr.cpp
index e1df71491e5..4b767ad40ad 100644
--- a/core/solver/gcr.cpp
+++ b/core/solver/gcr.cpp
@@ -186,7 +186,7 @@ void Gcr<ValueType>::apply_dense_impl(const VectorType* dense_b,
     size_type restart_iter = 0;
 
     /* Memory movement summary for average iteration with krylov_dim d:
-     * (4d+22+4/d)n+(d+1+1/d) * values + matrix/preconditioner stroage
+     * (4d+22+4/d)n+(d+1+1/d) * values + matrix/preconditioner storage
      * 1x SpMV:                       2n * values + storage
      * 1x Preconditioner:             2n * values + storage
      * 1x step 1       (scal, axpys)  6n
diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp
index 7a521f5f53e..84afc1666cc 100644
--- a/core/solver/multigrid.cpp
+++ b/core/solver/multigrid.cpp
@@ -182,8 +182,8 @@ namespace multigrid {
 
 
 /**
- * The enum class is to combine the cycle infomation  It's legal to use a binary
- * or(|) operation to combine several properties.
+ * The enum class is to combine the cycle information  It's legal to use a
+ * binary or(|) operation to combine several properties.
  */
 enum class cycle_mode {
     /**
@@ -197,7 +197,7 @@ enum class cycle_mode {
     first_of_cycle = 2,
 
     /**
-     * current procees is the end one of the cycle
+     * current process is the end one of the cycle
      */
     end_of_cycle = 4
 };
@@ -569,21 +569,18 @@ void Multigrid::generate()
                     using absolute_value_type = remove_complex<value_type>;
                     return solver::Gmres<value_type>::build()
                         .with_criteria(
-                            stop::Iteration::build()
-                                .with_max_iters(matrix->get_size()[0])
-                                .on(exec),
+                            stop::Iteration::build().with_max_iters(
+                                matrix->get_size()[0]),
                             stop::ResidualNorm<value_type>::build()
                                 .with_reduction_factor(
                                     std::numeric_limits<
                                         absolute_value_type>::epsilon() *
-                                    absolute_value_type{10})
-                                .on(exec))
+                                    absolute_value_type{10}))
                         .with_krylov_dim(
                             std::min(size_type(100), matrix->get_size()[0]))
                         .with_preconditioner(
                             preconditioner::Jacobi<value_type>::build()
-                                .with_max_block_size(1u)
-                                .on(exec))
+                                .with_max_block_size(1u))
                         .on(exec)
                         ->generate(matrix);
                 } else {
@@ -591,8 +588,7 @@ void Multigrid::generate()
                                                         int32>::build()
                         .with_factorization(
                             experimental::factorization::Lu<value_type,
-                                                            int32>::build()
-                                .on(exec))
+                                                            int32>::build())
                         .on(exec)
                         ->generate(matrix);
                 }
diff --git a/core/test/CMakeLists.txt b/core/test/CMakeLists.txt
index b330a493b38..69f7ddd749e 100644
--- a/core/test/CMakeLists.txt
+++ b/core/test/CMakeLists.txt
@@ -1,5 +1,6 @@
 include(${PROJECT_SOURCE_DIR}/cmake/create_test.cmake)
 
+add_subdirectory(gtest)
 add_subdirectory(accessor)
 add_subdirectory(base)
 add_subdirectory(components)
diff --git a/core/test/accessor/reduced_row_major_ginkgo.cpp b/core/test/accessor/reduced_row_major_ginkgo.cpp
index b12fba6ad0f..d72245d9882 100644
--- a/core/test/accessor/reduced_row_major_ginkgo.cpp
+++ b/core/test/accessor/reduced_row_major_ginkgo.cpp
@@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "accessor/reduced_row_major.hpp"
 #include "accessor/utils.hpp"
 #include "core/base/extended_float.hpp"  // necessary for gko::half
+#include "core/test/utils.hpp"
 
 
 namespace {
@@ -156,7 +157,8 @@ using ReducedStorage3dTypes =
                      std::tuple<std::complex<double>, std::complex<float>>,
                      std::tuple<std::complex<float>, std::complex<float>>>;
 
-TYPED_TEST_SUITE(ReducedStorage3d, ReducedStorage3dTypes);
+TYPED_TEST_SUITE(ReducedStorage3d, ReducedStorage3dTypes,
+                 PairTypenameNameGenerator);
 
 
 TYPED_TEST(ReducedStorage3d, CorrectLengths)
diff --git a/core/test/base/CMakeLists.txt b/core/test/base/CMakeLists.txt
index aa79ca3ed92..4e611852be5 100644
--- a/core/test/base/CMakeLists.txt
+++ b/core/test/base/CMakeLists.txt
@@ -1,9 +1,13 @@
 ginkgo_create_test(abstract_factory)
 ginkgo_create_test(allocator)
 ginkgo_create_test(array)
-ginkgo_create_test(dense_cache)
+ginkgo_create_test(batch_dim)
+ginkgo_create_test(batch_lin_op)
+ginkgo_create_test(batch_multi_vector)
 ginkgo_create_test(combination)
 ginkgo_create_test(composition)
+ginkgo_create_test(deferred_factory)
+ginkgo_create_test(dense_cache)
 ginkgo_create_test(dim)
 ginkgo_create_test(exception)
 ginkgo_create_test(exception_helpers)
diff --git a/core/test/base/batch_dim.cpp b/core/test/base/batch_dim.cpp
new file mode 100644
index 00000000000..e8722530fba
--- /dev/null
+++ b/core/test/base/batch_dim.cpp
@@ -0,0 +1,92 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/batch_dim.hpp>
+
+
+#include <memory>
+
+
+#include <gtest/gtest.h>
+
+
+TEST(BatchDim, ConstructsCorrectUniformObject)
+{
+    gko::batch_dim<2> d{4, gko::dim<2>(5)};
+
+    ASSERT_EQ(d.get_num_batch_items(), 4);
+    ASSERT_EQ(d.get_common_size(), gko::dim<2>(5));
+}
+
+
+TEST(BatchDim, ConstructsNullObject)
+{
+    gko::batch_dim<2> d{};
+
+    ASSERT_EQ(d.get_num_batch_items(), 0);
+    ASSERT_EQ(d.get_common_size(), gko::dim<2>{});
+}
+
+
+TEST(BatchDim, EqualityReturnsTrueWhenEqual)
+{
+    ASSERT_TRUE(gko::batch_dim<2>(2, gko::dim<2>{3}) ==
+                gko::batch_dim<2>(2, gko::dim<2>{3}));
+}
+
+
+TEST(BatchDim, EqualityReturnsFalseWhenDifferentNumBatches)
+{
+    ASSERT_FALSE(gko::batch_dim<2>(3, gko::dim<2>{3}) ==
+                 gko::batch_dim<2>(2, gko::dim<2>{3}));
+}
+
+
+TEST(BatchDim, EqualityReturnsFalseWhenDifferentBatchSizes)
+{
+    ASSERT_FALSE(gko::batch_dim<2>(3, gko::dim<2>{3}) ==
+                 gko::batch_dim<2>(3, gko::dim<2>{4}));
+}
+
+
+TEST(BatchDim, NotEqualWorks)
+{
+    ASSERT_TRUE(gko::batch_dim<2>(3, gko::dim<2>{3}) !=
+                gko::batch_dim<2>(3, gko::dim<2>{4}));
+}
+
+
+TEST(BatchDim, TransposesBatchDimensions)
+{
+    ASSERT_EQ(gko::transpose(gko::batch_dim<2>(2, gko::dim<2>{4, 2})),
+              gko::batch_dim<2>(2, gko::dim<2>{2, 4}));
+}
diff --git a/core/test/base/batch_lin_op.cpp b/core/test/base/batch_lin_op.cpp
new file mode 100644
index 00000000000..61dcf89f109
--- /dev/null
+++ b/core/test/base/batch_lin_op.cpp
@@ -0,0 +1,233 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/batch_lin_op.hpp>
+
+
+#include <complex>
+#include <memory>
+#include <type_traits>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/log/logger.hpp>
+
+
+namespace {
+
+
+struct DummyLogger : gko::log::Logger {
+    DummyLogger()
+        : gko::log::Logger(gko::log::Logger::batch_linop_factory_events_mask)
+    {}
+
+    void on_batch_linop_factory_generate_started(
+        const gko::batch::BatchLinOpFactory*,
+        const gko::batch::BatchLinOp*) const override
+    {
+        batch_linop_factory_generate_started++;
+    }
+
+    void on_batch_linop_factory_generate_completed(
+        const gko::batch::BatchLinOpFactory*, const gko::batch::BatchLinOp*,
+        const gko::batch::BatchLinOp*) const override
+    {
+        batch_linop_factory_generate_completed++;
+    }
+
+    int mutable batch_linop_factory_generate_started = 0;
+    int mutable batch_linop_factory_generate_completed = 0;
+};
+
+
+class DummyBatchLinOp : public gko::batch::EnableBatchLinOp<DummyBatchLinOp>,
+                        public gko::EnableCreateMethod<DummyBatchLinOp> {
+public:
+    DummyBatchLinOp(std::shared_ptr<const gko::Executor> exec,
+                    gko::batch_dim<2> size = gko::batch_dim<2>{})
+        : gko::batch::EnableBatchLinOp<DummyBatchLinOp>(exec, size)
+    {}
+};
+
+
+class EnableBatchLinOp : public ::testing::Test {
+protected:
+    EnableBatchLinOp()
+        : ref{gko::ReferenceExecutor::create()},
+          op{DummyBatchLinOp::create(ref,
+                                     gko::batch_dim<2>(1, gko::dim<2>{3, 5}))}
+    {}
+
+    std::shared_ptr<const gko::ReferenceExecutor> ref;
+    std::unique_ptr<DummyBatchLinOp> op;
+};
+
+
+TEST_F(EnableBatchLinOp, KnowsNumBatchItems)
+{
+    ASSERT_EQ(op->get_num_batch_items(), 1);
+}
+
+
+TEST_F(EnableBatchLinOp, KnowsItsSizes)
+{
+    auto op1_sizes = gko::batch_dim<2>(1, gko::dim<2>{3, 5});
+    ASSERT_EQ(op->get_size(), op1_sizes);
+}
+
+
+template <typename T = int>
+class DummyBatchLinOpWithFactory
+    : public gko::batch::EnableBatchLinOp<DummyBatchLinOpWithFactory<T>> {
+public:
+    DummyBatchLinOpWithFactory(std::shared_ptr<const gko::Executor> exec)
+        : gko::batch::EnableBatchLinOp<DummyBatchLinOpWithFactory>(exec)
+    {}
+
+    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
+    {
+        T GKO_FACTORY_PARAMETER_SCALAR(value, T{5});
+    };
+    GKO_ENABLE_BATCH_LIN_OP_FACTORY(DummyBatchLinOpWithFactory, parameters,
+                                    Factory);
+    GKO_ENABLE_BUILD_METHOD(Factory);
+
+    DummyBatchLinOpWithFactory(const Factory* factory,
+                               std::shared_ptr<const gko::batch::BatchLinOp> op)
+        : gko::batch::EnableBatchLinOp<DummyBatchLinOpWithFactory>(
+              factory->get_executor()),
+          parameters_{factory->get_parameters()},
+          op_{op}
+    {}
+
+    std::shared_ptr<const gko::batch::BatchLinOp> op_;
+};
+
+
+class EnableBatchLinOpFactory : public ::testing::Test {
+protected:
+    EnableBatchLinOpFactory()
+        : ref{gko::ReferenceExecutor::create()},
+          logger{std::make_shared<DummyLogger>()}
+
+    {}
+
+    std::shared_ptr<const gko::ReferenceExecutor> ref;
+    std::shared_ptr<DummyLogger> logger;
+};
+
+
+TEST_F(EnableBatchLinOpFactory, CreatesDefaultFactory)
+{
+    auto factory = DummyBatchLinOpWithFactory<>::build().on(ref);
+
+    ASSERT_EQ(factory->get_parameters().value, 5);
+    ASSERT_EQ(factory->get_executor(), ref);
+}
+
+
+TEST_F(EnableBatchLinOpFactory, CreatesFactoryWithParameters)
+{
+    auto factory = DummyBatchLinOpWithFactory<>::build().with_value(7).on(ref);
+
+    ASSERT_EQ(factory->get_parameters().value, 7);
+    ASSERT_EQ(factory->get_executor(), ref);
+}
+
+
+TEST_F(EnableBatchLinOpFactory, PassesParametersToBatchLinOp)
+{
+    auto dummy = gko::share(
+        DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{3, 5})));
+    auto factory = DummyBatchLinOpWithFactory<>::build().with_value(6).on(ref);
+
+    auto op = factory->generate(dummy);
+
+    ASSERT_EQ(op->get_executor(), ref);
+    ASSERT_EQ(op->get_parameters().value, 6);
+    ASSERT_EQ(op->op_.get(), dummy.get());
+}
+
+
+TEST_F(EnableBatchLinOpFactory, FactoryGenerateIsLogged)
+{
+    auto before_logger = *logger;
+    auto factory = DummyBatchLinOpWithFactory<>::build().on(ref);
+    factory->add_logger(logger);
+    factory->generate(
+        DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{3, 5})));
+
+    ASSERT_EQ(logger->batch_linop_factory_generate_started,
+              before_logger.batch_linop_factory_generate_started + 1);
+    ASSERT_EQ(logger->batch_linop_factory_generate_completed,
+              before_logger.batch_linop_factory_generate_completed + 1);
+}
+
+
+TEST_F(EnableBatchLinOpFactory, WithLoggersWorksAndPropagates)
+{
+    auto before_logger = *logger;
+    auto factory =
+        DummyBatchLinOpWithFactory<>::build().with_loggers(logger).on(ref);
+    auto op = factory->generate(
+        DummyBatchLinOp::create(ref, gko::batch_dim<2>(1, gko::dim<2>{3, 5})));
+
+    ASSERT_EQ(logger->batch_linop_factory_generate_started,
+              before_logger.batch_linop_factory_generate_started + 1);
+    ASSERT_EQ(logger->batch_linop_factory_generate_completed,
+              before_logger.batch_linop_factory_generate_completed + 1);
+}
+
+
+TEST_F(EnableBatchLinOpFactory, CopiesLinOpToOtherExecutor)
+{
+    auto ref2 = gko::ReferenceExecutor::create();
+    auto dummy = gko::share(
+        DummyBatchLinOp::create(ref2, gko::batch_dim<2>(1, gko::dim<2>{3, 5})));
+    auto factory = DummyBatchLinOpWithFactory<>::build().with_value(6).on(ref);
+
+    auto op = factory->generate(dummy);
+
+    ASSERT_EQ(op->get_executor(), ref);
+    ASSERT_EQ(op->get_parameters().value, 6);
+    ASSERT_EQ(op->op_->get_executor(), ref);
+    ASSERT_NE(op->op_.get(), dummy.get());
+    ASSERT_TRUE(dynamic_cast<const DummyBatchLinOp*>(op->op_.get()));
+}
+
+
+}  // namespace
diff --git a/core/test/base/batch_multi_vector.cpp b/core/test/base/batch_multi_vector.cpp
new file mode 100644
index 00000000000..1041096a5ed
--- /dev/null
+++ b/core/test/base/batch_multi_vector.cpp
@@ -0,0 +1,480 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/range.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/base/batch_utilities.hpp"
+#include "core/test/utils.hpp"
+#include "core/test/utils/batch_helpers.hpp"
+
+
+template <typename T>
+class MultiVector : public ::testing::Test {
+protected:
+    using value_type = T;
+    using DenseMtx = gko::matrix::Dense<value_type>;
+    using size_type = gko::size_type;
+    MultiVector()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx(gko::batch::initialize<gko::batch::MultiVector<value_type>>(
+              {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+               {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}},
+              exec)),
+          dense_mtx(gko::initialize<gko::matrix::Dense<value_type>>(
+              {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, exec))
+    {}
+
+
+    static void assert_equal_to_original_mtx(
+        const gko::batch::MultiVector<value_type>* m)
+    {
+        ASSERT_NE(m->get_const_values(), nullptr);
+        EXPECT_EQ(m->get_const_values()[0], value_type{-1.0});
+        ASSERT_EQ(m->get_num_batch_items(), 2);
+        ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3));
+        EXPECT_EQ(m->at(0, 0, 0), value_type{-1.0});
+        EXPECT_EQ(m->at(0, 0, 1), value_type{2.0});
+        EXPECT_EQ(m->at(0, 0, 2), value_type{3.0});
+        EXPECT_EQ(m->at(0, 1, 0), value_type{-1.5});
+        EXPECT_EQ(m->at(0, 1, 1), value_type{2.5});
+        ASSERT_EQ(m->at(0, 1, 2), value_type{3.5});
+        EXPECT_EQ(m->at(1, 0, 0), value_type{1.0});
+        EXPECT_EQ(m->at(1, 0, 1), value_type{2.5});
+        EXPECT_EQ(m->at(1, 0, 2), value_type{3.0});
+        EXPECT_EQ(m->at(1, 1, 0), value_type{1.0});
+        EXPECT_EQ(m->at(1, 1, 1), value_type{2.0});
+        ASSERT_EQ(m->at(1, 1, 2), value_type{3.0});
+    }
+
+    static void assert_empty(gko::batch::MultiVector<value_type>* m)
+    {
+        ASSERT_EQ(m->get_num_batch_items(), 0);
+        ASSERT_EQ(m->get_common_size(), gko::dim<2>{});
+        ASSERT_EQ(m->get_const_values(), nullptr);
+    }
+
+    std::shared_ptr<const gko::Executor> exec;
+    std::unique_ptr<gko::batch::MultiVector<value_type>> mtx;
+    std::unique_ptr<gko::matrix::Dense<value_type>> dense_mtx;
+};
+
+TYPED_TEST_SUITE(MultiVector, gko::test::ValueTypes, TypenameNameGenerator);
+
+
+TYPED_TEST(MultiVector, CanBeEmpty)
+{
+    auto empty = gko::batch::MultiVector<TypeParam>::create(this->exec);
+
+    this->assert_empty(empty.get());
+}
+
+
+TYPED_TEST(MultiVector, KnowsItsSizeAndValues)
+{
+    ASSERT_NE(this->mtx->get_const_values(), nullptr);
+
+    this->assert_equal_to_original_mtx(this->mtx.get());
+}
+
+
+TYPED_TEST(MultiVector, CanGetValuesForEntry)
+{
+    using value_type = typename TestFixture::value_type;
+
+    ASSERT_EQ(this->mtx->get_values_for_item(1)[0], value_type{1.0});
+}
+
+
+TYPED_TEST(MultiVector, CanCreateDenseItemView)
+{
+    GKO_ASSERT_MTX_NEAR(this->mtx->create_view_for_item(1), this->dense_mtx,
+                        0.0);
+}
+
+
+TYPED_TEST(MultiVector, CanBeCopied)
+{
+    auto mtx_copy = gko::batch::MultiVector<TypeParam>::create(this->exec);
+
+    mtx_copy->copy_from(this->mtx.get());
+
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->mtx->at(0, 0, 0) = 7;
+    this->mtx->at(0, 1) = 7;
+    this->assert_equal_to_original_mtx(mtx_copy.get());
+}
+
+
+TYPED_TEST(MultiVector, CanBeMoved)
+{
+    auto mtx_copy = gko::batch::MultiVector<TypeParam>::create(this->exec);
+
+    this->mtx->move_to(mtx_copy.get());
+
+    this->assert_equal_to_original_mtx(mtx_copy.get());
+}
+
+
+TYPED_TEST(MultiVector, CanBeCloned)
+{
+    auto mtx_clone = this->mtx->clone();
+
+    this->assert_equal_to_original_mtx(
+        dynamic_cast<decltype(this->mtx.get())>(mtx_clone.get()));
+}
+
+
+TYPED_TEST(MultiVector, CanBeCleared)
+{
+    this->mtx->clear();
+
+    this->assert_empty(this->mtx.get());
+}
+
+
+TYPED_TEST(MultiVector, CanBeConstructedWithSize)
+{
+    using size_type = gko::size_type;
+
+    auto m = gko::batch::MultiVector<TypeParam>::create(
+        this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 4)));
+
+    ASSERT_EQ(m->get_num_batch_items(), 2);
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 4));
+}
+
+
+TYPED_TEST(MultiVector, CanBeConstructedFromExistingData)
+{
+    using value_type = typename TestFixture::value_type;
+    using size_type = gko::size_type;
+    // clang-format off
+    value_type data[] = {
+       1.0,  2.0,
+      -1.0,  3.0,
+       4.0, -1.0,
+       3.0,  5.0,
+       1.0,  5.0,
+       6.0, -3.0};
+    // clang-format on
+
+    auto m = gko::batch::MultiVector<TypeParam>::create(
+        this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)),
+        gko::array<value_type>::view(this->exec, 8, data));
+
+    ASSERT_EQ(m->get_const_values(), data);
+    ASSERT_EQ(m->at(0, 0, 0), value_type{1.0});
+    ASSERT_EQ(m->at(0, 0, 1), value_type{2.0});
+    ASSERT_EQ(m->at(0, 1, 0), value_type{-1.0});
+    ASSERT_EQ(m->at(0, 1, 1), value_type{3.0});
+    ASSERT_EQ(m->at(1, 0, 0), value_type{4.0});
+    ASSERT_EQ(m->at(1, 0, 1), value_type{-1.0});
+    ASSERT_EQ(m->at(1, 1, 0), value_type{3.0});
+    ASSERT_EQ(m->at(1, 1, 1), value_type{5.0});
+}
+
+
+TYPED_TEST(MultiVector, CanBeConstructedFromExistingConstData)
+{
+    using value_type = typename TestFixture::value_type;
+    using size_type = gko::size_type;
+    // clang-format off
+    value_type data[] = {
+       1.0,  2.0,
+      -1.0,  3.0,
+       4.0, -1.0,
+       3.0,  5.0,
+       1.0,  5.0,
+       6.0, -3.0};
+    // clang-format on
+
+    auto m = gko::batch::MultiVector<TypeParam>::create_const(
+        this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)),
+        gko::array<value_type>::const_view(this->exec, 8, data));
+
+    ASSERT_EQ(m->get_const_values(), data);
+    ASSERT_EQ(m->at(0, 0, 0), value_type{1.0});
+    ASSERT_EQ(m->at(0, 0, 1), value_type{2.0});
+    ASSERT_EQ(m->at(0, 1, 0), value_type{-1.0});
+    ASSERT_EQ(m->at(0, 1, 1), value_type{3.0});
+    ASSERT_EQ(m->at(1, 0, 0), value_type{4.0});
+    ASSERT_EQ(m->at(1, 0, 1), value_type{-1.0});
+    ASSERT_EQ(m->at(1, 1, 0), value_type{3.0});
+    ASSERT_EQ(m->at(1, 1, 1), value_type{5.0});
+}
+
+
+TYPED_TEST(MultiVector, CanBeConstructedFromDenseMatrices)
+{
+    using value_type = typename TestFixture::value_type;
+    using DenseMtx = typename TestFixture::DenseMtx;
+    using size_type = gko::size_type;
+    auto mat1 = gko::initialize<DenseMtx>({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+                                          this->exec);
+    auto mat2 = gko::initialize<DenseMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}},
+                                          this->exec);
+
+    auto m = gko::batch::create_from_item<gko::batch::MultiVector<value_type>>(
+        this->exec, std::vector<DenseMtx*>{mat1.get(), mat2.get()});
+
+    this->assert_equal_to_original_mtx(m.get());
+}
+
+
+TYPED_TEST(MultiVector, CanBeConstructedFromDenseMatricesByDuplication)
+{
+    using value_type = typename TestFixture::value_type;
+    using DenseMtx = typename TestFixture::DenseMtx;
+    using size_type = gko::size_type;
+    auto mat1 = gko::initialize<DenseMtx>({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+                                          this->exec);
+    auto mat2 = gko::initialize<DenseMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}},
+                                          this->exec);
+
+    auto bat_m =
+        gko::batch::create_from_item<gko::batch::MultiVector<value_type>>(
+            this->exec,
+            std::vector<DenseMtx*>{mat1.get(), mat1.get(), mat1.get()});
+    auto m = gko::batch::create_from_item<gko::batch::MultiVector<value_type>>(
+        this->exec, 3, mat1.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 1e-14);
+}
+
+
+TYPED_TEST(MultiVector, CanBeConstructedByDuplicatingMultiVectors)
+{
+    using value_type = typename TestFixture::value_type;
+    using DenseMtx = typename TestFixture::DenseMtx;
+    using size_type = gko::size_type;
+    auto mat1 = gko::initialize<DenseMtx>({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+                                          this->exec);
+    auto mat2 = gko::initialize<DenseMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}},
+                                          this->exec);
+    auto m = gko::batch::create_from_item<gko::batch::MultiVector<value_type>>(
+        this->exec, std::vector<DenseMtx*>{mat1.get(), mat2.get()});
+    auto m_ref =
+        gko::batch::create_from_item<gko::batch::MultiVector<value_type>>(
+            this->exec,
+            std::vector<DenseMtx*>{mat1.get(), mat2.get(), mat1.get(),
+                                   mat2.get(), mat1.get(), mat2.get()});
+
+    auto m2 = gko::batch::duplicate<gko::batch::MultiVector<value_type>>(
+        this->exec, 3, m.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14);
+}
+
+
+TYPED_TEST(MultiVector, CanBeListConstructed)
+{
+    using value_type = typename TestFixture::value_type;
+
+    auto m = gko::batch::initialize<gko::batch::MultiVector<TypeParam>>(
+        {{1.0, 2.0}, {1.0, 3.0}}, this->exec);
+
+    ASSERT_EQ(m->get_num_batch_items(), 2);
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1));
+    EXPECT_EQ(m->at(0, 0), value_type{1});
+    EXPECT_EQ(m->at(0, 1), value_type{2});
+    EXPECT_EQ(m->at(1, 0), value_type{1});
+    EXPECT_EQ(m->at(1, 1), value_type{3});
+}
+
+
+TYPED_TEST(MultiVector, CanBeListConstructedByCopies)
+{
+    using value_type = typename TestFixture::value_type;
+
+    auto m = gko::batch::initialize<gko::batch::MultiVector<TypeParam>>(
+        2, I<value_type>({1.0, 2.0}), this->exec);
+
+    ASSERT_EQ(m->get_num_batch_items(), 2);
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1));
+    EXPECT_EQ(m->at(0, 0, 0), value_type{1.0});
+    EXPECT_EQ(m->at(0, 0, 1), value_type{2.0});
+    EXPECT_EQ(m->at(1, 0, 0), value_type{1.0});
+    EXPECT_EQ(m->at(1, 0, 1), value_type{2.0});
+}
+
+
+TYPED_TEST(MultiVector, CanBeDoubleListConstructed)
+{
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+
+    auto m = gko::batch::initialize<gko::batch::MultiVector<TypeParam>>(
+        {{I<T>{1.0, 1.0, 0.0}, I<T>{2.0, 4.0, 3.0}, I<T>{3.0, 6.0, 1.0}},
+         {I<T>{1.0, 2.0, -1.0}, I<T>{3.0, 4.0, -2.0}, I<T>{5.0, 6.0, -3.0}}},
+        this->exec);
+
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(3, 3));
+    EXPECT_EQ(m->at(0, 0), value_type{1.0});
+    EXPECT_EQ(m->at(0, 1), value_type{1.0});
+    EXPECT_EQ(m->at(0, 2), value_type{0.0});
+    ASSERT_EQ(m->at(0, 3), value_type{2.0});
+    EXPECT_EQ(m->at(0, 4), value_type{4.0});
+    EXPECT_EQ(m->at(1, 0), value_type{1.0});
+    EXPECT_EQ(m->at(1, 1), value_type{2.0});
+    EXPECT_EQ(m->at(1, 2), value_type{-1.0});
+    ASSERT_EQ(m->at(1, 3), value_type{3.0});
+    EXPECT_EQ(m->at(1, 4), value_type{4.0});
+}
+
+
+TYPED_TEST(MultiVector, CanBeFilledWithValue)
+{
+    using value_type = typename TestFixture::value_type;
+    auto m = gko::batch::MultiVector<TypeParam>::create(
+        this->exec, gko::batch_dim<2>(2, gko::dim<2>(3, 1)));
+
+    m->fill(value_type(2.0));
+
+    ASSERT_EQ(m->get_num_batch_items(), 2);
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(3, 1));
+    EXPECT_EQ(m->at(0, 0, 0), value_type{2.0});
+    EXPECT_EQ(m->at(0, 0, 1), value_type{2.0});
+    EXPECT_EQ(m->at(0, 0, 2), value_type{2.0});
+    EXPECT_EQ(m->at(1, 0, 0), value_type{2.0});
+    EXPECT_EQ(m->at(1, 0, 1), value_type{2.0});
+    EXPECT_EQ(m->at(1, 0, 2), value_type{2.0});
+}
+
+
+TYPED_TEST(MultiVector, CanBeUnbatchedIntoDenseMatrices)
+{
+    using value_type = typename TestFixture::value_type;
+    using DenseMtx = typename TestFixture::DenseMtx;
+    using size_type = gko::size_type;
+    auto mat1 = gko::initialize<DenseMtx>({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+                                          this->exec);
+    auto mat2 = gko::initialize<DenseMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}},
+                                          this->exec);
+
+    auto dense_mats = gko::batch::unbatch<gko::batch::MultiVector<value_type>>(
+        this->mtx.get());
+
+    ASSERT_EQ(dense_mats.size(), 2);
+    GKO_ASSERT_MTX_NEAR(dense_mats[0].get(), mat1.get(), 0.);
+    GKO_ASSERT_MTX_NEAR(dense_mats[1].get(), mat2.get(), 0.);
+}
+
+
+TYPED_TEST(MultiVector, CanBeReadFromMatrixData)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = int;
+
+    auto vec_data = std::vector<gko::matrix_data<value_type, index_type>>{};
+    vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
+        {2, 2}, {{0, 0, 1.0}, {0, 1, 3.0}, {1, 0, 0.0}, {1, 1, 5.0}}));
+    vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
+        {2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 0, 0.0}, {1, 1, 9.0}}));
+
+    auto m = gko::batch::read<value_type, index_type,
+                              gko::batch::MultiVector<value_type>>(this->exec,
+                                                                   vec_data);
+
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2));
+    EXPECT_EQ(m->at(0, 0, 0), value_type{1.0});
+    EXPECT_EQ(m->at(0, 0, 1), value_type{3.0});
+    EXPECT_EQ(m->at(0, 1, 0), value_type{0.0});
+    EXPECT_EQ(m->at(0, 1, 1), value_type{5.0});
+    EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0});
+    EXPECT_EQ(m->at(1, 0, 1), value_type{0.5});
+    EXPECT_EQ(m->at(1, 1, 0), value_type{0.0});
+    EXPECT_EQ(m->at(1, 1, 1), value_type{9.0});
+}
+
+
+TYPED_TEST(MultiVector, CanBeReadFromSparseMatrixData)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = int;
+    auto vec_data = std::vector<gko::matrix_data<value_type, index_type>>{};
+    vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
+        {2, 2}, {{0, 0, 1.0}, {0, 1, 3.0}, {1, 1, 5.0}}));
+    vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
+        {2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 1, 9.0}}));
+
+    auto m = gko::batch::read<value_type, index_type,
+                              gko::batch::MultiVector<value_type>>(this->exec,
+                                                                   vec_data);
+
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2));
+    EXPECT_EQ(m->at(0, 0, 0), value_type{1.0});
+    EXPECT_EQ(m->at(0, 0, 1), value_type{3.0});
+    EXPECT_EQ(m->at(0, 1, 0), value_type{0.0});
+    EXPECT_EQ(m->at(0, 1, 1), value_type{5.0});
+    EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0});
+    EXPECT_EQ(m->at(1, 0, 1), value_type{0.5});
+    EXPECT_EQ(m->at(1, 1, 0), value_type{0.0});
+    EXPECT_EQ(m->at(1, 1, 1), value_type{9.0});
+}
+
+
+TYPED_TEST(MultiVector, GeneratesCorrectMatrixData)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = int;
+    using tpl = typename gko::matrix_data<TypeParam>::nonzero_type;
+
+    auto data =
+        gko::batch::write<value_type, index_type,
+                          gko::batch::MultiVector<value_type>>(this->mtx.get());
+
+    ASSERT_EQ(data[0].size, gko::dim<2>(2, 3));
+    ASSERT_EQ(data[0].nonzeros.size(), 6);
+    EXPECT_EQ(data[0].nonzeros[0], tpl(0, 0, value_type{-1.0}));
+    EXPECT_EQ(data[0].nonzeros[1], tpl(0, 1, value_type{2.0}));
+    EXPECT_EQ(data[0].nonzeros[2], tpl(0, 2, value_type{3.0}));
+    EXPECT_EQ(data[0].nonzeros[3], tpl(1, 0, value_type{-1.5}));
+    EXPECT_EQ(data[0].nonzeros[4], tpl(1, 1, value_type{2.5}));
+    EXPECT_EQ(data[0].nonzeros[5], tpl(1, 2, value_type{3.5}));
+    ASSERT_EQ(data[1].size, gko::dim<2>(2, 3));
+    ASSERT_EQ(data[1].nonzeros.size(), 6);
+    EXPECT_EQ(data[1].nonzeros[0], tpl(0, 0, value_type{1.0}));
+    EXPECT_EQ(data[1].nonzeros[1], tpl(0, 1, value_type{2.5}));
+    EXPECT_EQ(data[1].nonzeros[2], tpl(0, 2, value_type{3.0}));
+    EXPECT_EQ(data[1].nonzeros[3], tpl(1, 0, value_type{1.0}));
+    EXPECT_EQ(data[1].nonzeros[4], tpl(1, 1, value_type{2.0}));
+    EXPECT_EQ(data[1].nonzeros[5], tpl(1, 2, value_type{3.0}));
+}
diff --git a/core/test/base/deferred_factory.cpp b/core/test/base/deferred_factory.cpp
new file mode 100644
index 00000000000..ef2e592ab03
--- /dev/null
+++ b/core/test/base/deferred_factory.cpp
@@ -0,0 +1,433 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/abstract_factory.hpp>
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+template <typename T>
+using dfp = gko::deferred_factory_parameter<T>;
+
+
+// Note: the following Factory structure is not identical to Ginkgo Factory
+// structure, but it is easier setup without too many dependencies and
+// inheritances.
+struct DummyBaseFactory {
+    virtual ~DummyBaseFactory() = default;
+    struct param {
+        std::unique_ptr<DummyBaseFactory> on(
+            std::shared_ptr<const gko::Executor>) const
+        {
+            return std::make_unique<DummyBaseFactory>();
+        }
+    };
+};
+
+
+struct DummyFactory : DummyBaseFactory {
+    struct param {
+        std::unique_ptr<DummyFactory> on(
+            std::shared_ptr<const gko::Executor>) const
+        {
+            return std::make_unique<DummyFactory>();
+        }
+    };
+};
+
+
+struct DummyFactory2 : DummyBaseFactory {
+    struct param : public gko::enable_parameters_type<param, DummyFactory2> {
+        using parameters_type = param;
+        std::vector<std::shared_ptr<const DummyBaseFactory>>
+            GKO_DEFERRED_FACTORY_VECTOR_PARAMETER(const_factory_list);
+
+        std::vector<std::shared_ptr<DummyBaseFactory>>
+            GKO_DEFERRED_FACTORY_VECTOR_PARAMETER(factory_list);
+    };
+
+    const param& get_parameters() const noexcept { return parameters_; };
+
+    void add_logger(std::shared_ptr<const gko::log::Logger> logger) {}
+
+    DummyFactory2(std::shared_ptr<const gko::Executor>, const param& parameters)
+        : parameters_(parameters)
+    {}
+
+private:
+    param parameters_;
+};
+
+
+using DF = DummyFactory;
+using DBF = DummyBaseFactory;
+using DF2 = DummyFactory2;
+
+
+// used to distinguish specialization for function check
+struct DummyFlag {};
+
+
+// test_impl is to check the constructor available or not in the compile time.
+// Note. It only checks the signature with the template and SFINAE. If the
+// compilation error is only in the function/constructor definition, it will
+// still give the true back.
+template <typename, typename T, typename...>
+struct test_impl : std::false_type {};
+
+// specialization for constructor
+template <typename T, typename... Args>
+struct test_impl<gko::xstd::void_t<decltype(T(std::declval<Args>()...))>, T,
+                 Args...> : std::true_type {};
+
+// specialization for DF2 with_factory_list
+template <typename... Args>
+struct test_impl<gko::xstd::void_t<decltype(
+                     DF2::param{}.with_factory_list(std::declval<Args>()...))>,
+                 DummyFlag, Args...> : std::true_type {};
+
+// test the object can be constructable or not with Args.
+template <typename T, typename... Args>
+using test = test_impl<void, T, Args...>;
+
+// test the DF2::param{}.with_factory_list can be called or not with Args.
+template <typename... Args>
+using test_with_factory = test_impl<void, DummyFlag, Args...>;
+
+
+class DeferredFactoryParameter : public ::testing::Test {
+protected:
+    DF::param df_param{};
+    DBF::param dbf_param{};
+    const DF::param const_df_param{};
+    const DBF::param const_dbf_param{};
+    std::shared_ptr<DF> shared_df = std::make_shared<DF>();
+    std::shared_ptr<DBF> shared_dbf = std::make_shared<DBF>();
+    std::shared_ptr<const DF> shared_const_df = std::make_shared<DF>();
+    std::shared_ptr<const DBF> shared_const_dbf = std::make_shared<DBF>();
+    dfp<DF> dfp_df{std::make_shared<DF>()};
+    dfp<DBF> dfp_dbf{std::make_shared<DBF>()};
+    dfp<const DF> dfp_const_df{std::make_shared<DF>()};
+    dfp<const DBF> dfp_const_dbf{std::make_shared<DBF>()};
+    dfp<DF> nest_dfp_df{dfp_df};
+    dfp<DBF> nest_dfp_dbf{dfp_dbf};
+    dfp<const DF> nest_dfp_const_df{dfp_const_df};
+    dfp<const DBF> nest_dfp_const_dbf{dfp_const_dbf};
+};
+
+
+TEST_F(DeferredFactoryParameter, CanBeDefaultConstructed)
+{
+    auto fact = dfp<DBF>();
+    auto fact2 = dfp<const DBF>();
+
+    ASSERT_TRUE(fact.is_empty());
+    ASSERT_THROW(fact.on(nullptr), gko::NotSupported);
+    ASSERT_TRUE(fact2.is_empty());
+    ASSERT_THROW(fact2.on(nullptr), gko::NotSupported);
+}
+
+
+TEST_F(DeferredFactoryParameter, CanBeConstructedFromNullptr)
+{
+    auto fact = dfp<DBF>(nullptr);
+    auto fact2 = dfp<const DBF>(nullptr);
+
+    ASSERT_FALSE(fact.is_empty());
+    ASSERT_EQ(fact.on(nullptr), nullptr);
+    ASSERT_FALSE(fact2.is_empty());
+    ASSERT_EQ(fact2.on(nullptr), nullptr);
+}
+
+
+TEST_F(DeferredFactoryParameter, CheckNonConstConstructor)
+{
+    // Itself
+    // shared_ptr
+    auto fact0 = dfp<DBF>(this->shared_dbf);
+    // unique_ptr
+    auto fact1 = dfp<DBF>(this->dbf_param.on(nullptr));
+    // const param
+    auto fact2 = dfp<DBF>(this->const_dbf_param);
+    // param
+    auto fact3 = dfp<DBF>(this->dbf_param);
+    // deferred_factory_parameter
+    auto fact4 = dfp<DBF>(this->dfp_dbf);
+    // Childtype
+    auto fact5 = dfp<DBF>(this->shared_df);
+    auto fact6 = dfp<DBF>(this->df_param.on(nullptr));
+    auto fact7 = dfp<DBF>(this->const_df_param);
+    auto fact8 = dfp<DBF>(this->df_param);
+    auto fact9 = dfp<DBF>(this->dfp_df);
+
+    for (auto& fact : {fact0, fact1, fact2, fact3, fact4}) {
+        ASSERT_TRUE(std::dynamic_pointer_cast<DBF>(fact.on(nullptr)));
+    }
+    for (auto& fact : {fact5, fact6, fact7, fact8, fact9}) {
+        ASSERT_TRUE(std::dynamic_pointer_cast<DF>(fact.on(nullptr)));
+    }
+}
+
+
+TEST_F(DeferredFactoryParameter, CheckConstConstructor)
+{
+    // Itself
+    // shared_ptr
+    auto fact0 = dfp<const DBF>(this->shared_dbf);
+    // shared_ptr const
+    auto fact1 = dfp<const DBF>(this->shared_const_dbf);
+    // unique_ptr
+    auto fact2 = dfp<const DBF>(this->dbf_param.on(nullptr));
+    // unique_ptr const
+    auto fact3 = dfp<const DBF>(
+        static_cast<std::unique_ptr<const DBF>>(this->dbf_param.on(nullptr)));
+    // const param
+    auto fact4 = dfp<const DBF>(this->const_dbf_param);
+    // param
+    auto fact5 = dfp<const DBF>(this->dbf_param);
+    // deferred_factory_parameter
+    auto fact6 = dfp<const DBF>(this->dfp_dbf);
+    // deferred_factory_parameter const
+    auto fact7 = dfp<const DBF>(this->dfp_const_dbf);
+    // Childtype
+    auto fact_child0 = dfp<const DBF>(this->shared_df);
+    auto fact_child1 = dfp<const DBF>(this->shared_const_df);
+    auto fact_child2 = dfp<const DBF>(this->df_param.on(nullptr));
+    auto fact_child3 = dfp<const DBF>(
+        static_cast<std::unique_ptr<const DF>>(this->df_param.on(nullptr)));
+    auto fact_child4 = dfp<const DBF>(this->const_df_param);
+    auto fact_child5 = dfp<const DBF>(this->df_param);
+    auto fact_child6 = dfp<const DBF>(this->dfp_df);
+    auto fact_child7 = dfp<const DBF>(this->dfp_const_df);
+
+    for (auto& fact :
+         {fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7}) {
+        ASSERT_TRUE(std::dynamic_pointer_cast<const DBF>(fact.on(nullptr)));
+    }
+    for (auto& fact : {fact_child0, fact_child1, fact_child2, fact_child3,
+                       fact_child4, fact_child5, fact_child6, fact_child7}) {
+        ASSERT_TRUE(std::dynamic_pointer_cast<const DF>(fact.on(nullptr)));
+    }
+}
+
+
+TEST_F(DeferredFactoryParameter, ValidateNotAllowedFromNonConstConstructor)
+{
+    ASSERT_TRUE((test<dfp<DBF>, std::shared_ptr<DBF>>::value));
+    // The following can not be constructed. Using the corresponding constructor
+    // leads to a compile-time error.
+    ASSERT_FALSE((test<dfp<DBF>, std::shared_ptr<const DBF>>::value));
+    ASSERT_FALSE((test<dfp<DBF>, std::unique_ptr<const DBF>>::value));
+    ASSERT_FALSE((test<dfp<DBF>, dfp<const DBF>>::value));
+    ASSERT_FALSE((test<dfp<DF>, dfp<DBF>>::value));
+    ASSERT_FALSE((test<dfp<DF>, std::shared_ptr<DBF>>::value));
+    ASSERT_FALSE((test<dfp<DF>, dfp<DF2>>::value));
+    ASSERT_FALSE((test<dfp<DF>, std::shared_ptr<DF2>>::value));
+}
+
+
+TEST_F(DeferredFactoryParameter, CheckMacroWithConstList)
+{
+    auto result =
+        DummyFactory2::param{}
+            .with_const_factory_list(
+                this->df_param, this->const_df_param, this->shared_df,
+                this->shared_const_df, this->df_param.on(nullptr), this->dfp_df,
+                this->dfp_const_df, this->nest_dfp_df, this->nest_dfp_const_df)
+            .on(nullptr);
+    auto result_base =
+        DummyFactory2::param{}
+            .with_const_factory_list(this->dbf_param, this->const_dbf_param,
+                                     this->shared_dbf, this->shared_const_dbf,
+                                     this->dbf_param.on(nullptr), this->dfp_dbf,
+                                     this->dfp_const_dbf, this->nest_dfp_dbf,
+                                     this->nest_dfp_const_dbf)
+            .on(nullptr);
+
+    auto& factory_list = result->get_parameters().const_factory_list;
+    auto& base_factory_list = result_base->get_parameters().const_factory_list;
+    const auto num = factory_list.size();
+    ASSERT_EQ(num, 9);
+    ASSERT_EQ(base_factory_list.size(), 9);
+    for (int i = 0; i < num; i++) {
+        // The list requires const DummyBaseFactory, so they must be const
+        ASSERT_TRUE(std::dynamic_pointer_cast<const DF>(factory_list.at(i)));
+        ASSERT_TRUE(
+            std::dynamic_pointer_cast<const DBF>(base_factory_list.at(i)));
+    }
+}
+
+
+TEST_F(DeferredFactoryParameter, CheckMacroWithNonConstList)
+{
+    auto result =
+        DummyFactory2::param{}
+            .with_factory_list(this->df_param, this->const_df_param,
+                               this->shared_df, this->df_param.on(nullptr),
+                               this->dfp_df, this->nest_dfp_df)
+            .on(nullptr);
+    auto result_base =
+        DummyFactory2::param{}
+            .with_factory_list(this->dbf_param, this->const_dbf_param,
+                               this->shared_dbf, this->dbf_param.on(nullptr),
+                               this->dfp_dbf, this->nest_dfp_dbf)
+            .on(nullptr);
+
+    auto& factory_list = result->get_parameters().factory_list;
+    auto& base_factory_list = result_base->get_parameters().factory_list;
+    const auto num = factory_list.size();
+    ASSERT_EQ(num, 6);
+    ASSERT_EQ(base_factory_list.size(), 6);
+    for (int i = 0; i < num; i++) {
+        // The list requires DummyBaseFactory, so they must be non-const
+        ASSERT_TRUE(std::dynamic_pointer_cast<DF>(factory_list.at(i)));
+        ASSERT_TRUE(std::dynamic_pointer_cast<DBF>(base_factory_list.at(i)));
+    }
+}
+
+
+TEST_F(DeferredFactoryParameter, CheckMacroWithConstVector)
+{
+    auto const_dbf_vec = std::vector<std::shared_ptr<const DBF>>{
+        this->shared_dbf, this->shared_dbf};
+    auto dbf_vec =
+        std::vector<std::shared_ptr<DBF>>{this->shared_dbf, this->shared_dbf};
+    auto dfp_const_dbf_vec =
+        std::vector<dfp<const DBF>>{this->dbf_param, this->shared_dbf};
+    auto dfp_dbf_vec = std::vector<dfp<DBF>>{this->dbf_param, this->shared_dbf};
+    auto dbf_param_vec =
+        std::vector<DBF::param>{this->dbf_param, this->dbf_param};
+    // child
+    auto const_df_vec = std::vector<std::shared_ptr<const DF>>{this->shared_df,
+                                                               this->shared_df};
+    auto df_vec =
+        std::vector<std::shared_ptr<DF>>{this->shared_df, this->shared_df};
+    auto dfp_const_df_vec =
+        std::vector<dfp<const DF>>{this->df_param, this->shared_df};
+    auto dfp_df_vec = std::vector<dfp<DF>>{this->df_param, this->shared_df};
+    auto df_param_vec = std::vector<DF::param>{this->df_param, this->df_param};
+    std::vector<std::shared_ptr<DF2>> result_base_vector;
+    std::vector<std::shared_ptr<DF2>> result_vector;
+
+    result_base_vector.emplace_back(
+        DF2::param{}.with_const_factory_list(const_dbf_vec).on(nullptr));
+    result_base_vector.emplace_back(
+        DF2::param{}.with_const_factory_list(dbf_vec).on(nullptr));
+    result_base_vector.emplace_back(
+        DF2::param{}.with_const_factory_list(dfp_const_dbf_vec).on(nullptr));
+    result_base_vector.emplace_back(
+        DF2::param{}.with_const_factory_list(dfp_dbf_vec).on(nullptr));
+    result_base_vector.emplace_back(
+        DF2::param{}.with_const_factory_list(dbf_param_vec).on(nullptr));
+    // For child input
+    result_vector.emplace_back(
+        DF2::param{}.with_const_factory_list(const_df_vec).on(nullptr));
+    result_vector.emplace_back(
+        DF2::param{}.with_const_factory_list(df_vec).on(nullptr));
+    result_vector.emplace_back(
+        DF2::param{}.with_const_factory_list(dfp_const_df_vec).on(nullptr));
+    result_vector.emplace_back(
+        DF2::param{}.with_const_factory_list(dfp_df_vec).on(nullptr));
+    result_vector.emplace_back(
+        DF2::param{}.with_const_factory_list(df_param_vec).on(nullptr));
+
+    for (const auto& result : result_base_vector) {
+        auto& factory_list = result->get_parameters().const_factory_list;
+        ASSERT_TRUE(std::dynamic_pointer_cast<const DBF>(factory_list.at(0)));
+        ASSERT_TRUE(std::dynamic_pointer_cast<const DBF>(factory_list.at(1)));
+    }
+    for (const auto& result : result_vector) {
+        auto& factory_list = result->get_parameters().const_factory_list;
+        ASSERT_TRUE(std::dynamic_pointer_cast<const DF>(factory_list.at(0)));
+        ASSERT_TRUE(std::dynamic_pointer_cast<const DF>(factory_list.at(1)));
+    }
+}
+
+
+TEST_F(DeferredFactoryParameter, CheckMacroWithNonConstVector)
+{
+    auto dbf_vec =
+        std::vector<std::shared_ptr<DBF>>{this->shared_dbf, this->shared_dbf};
+    auto dfp_dbf_vec = std::vector<dfp<DBF>>{this->dbf_param, this->shared_dbf};
+    auto dbf_param_vec =
+        std::vector<DBF::param>{this->dbf_param, this->dbf_param};
+    // child
+    auto df_vec =
+        std::vector<std::shared_ptr<DF>>{this->shared_df, this->shared_df};
+    auto dfp_df_vec = std::vector<dfp<DF>>{this->df_param, this->shared_df};
+    auto df_param_vec = std::vector<DF::param>{this->df_param, this->df_param};
+    std::vector<std::shared_ptr<DF2>> result_base_vector;
+    std::vector<std::shared_ptr<DF2>> result_vector;
+
+    result_base_vector.emplace_back(
+        DF2::param{}.with_factory_list(dbf_vec).on(nullptr));
+    result_base_vector.emplace_back(
+        DF2::param{}.with_factory_list(dfp_dbf_vec).on(nullptr));
+    result_base_vector.emplace_back(
+        DF2::param{}.with_factory_list(dbf_param_vec).on(nullptr));
+    // For child input
+    result_vector.emplace_back(
+        DF2::param{}.with_factory_list(df_vec).on(nullptr));
+    result_vector.emplace_back(
+        DF2::param{}.with_factory_list(dfp_df_vec).on(nullptr));
+    result_vector.emplace_back(
+        DF2::param{}.with_factory_list(df_param_vec).on(nullptr));
+
+    for (const auto& result : result_base_vector) {
+        auto& factory_list = result->get_parameters().factory_list;
+        ASSERT_TRUE(std::dynamic_pointer_cast<DBF>(factory_list.at(0)));
+        ASSERT_TRUE(std::dynamic_pointer_cast<DBF>(factory_list.at(1)));
+    }
+    for (const auto& result : result_vector) {
+        auto& factory_list = result->get_parameters().factory_list;
+        ASSERT_TRUE(std::dynamic_pointer_cast<DF>(factory_list.at(0)));
+        ASSERT_TRUE(std::dynamic_pointer_cast<DF>(factory_list.at(1)));
+    }
+}
+
+
+TEST_F(DeferredFactoryParameter, ValidateNotAllowedFromMacroWithNonConst)
+{
+    ASSERT_TRUE((test_with_factory<std::vector<std::shared_ptr<DBF>>>::value));
+    ASSERT_TRUE(
+        (test_with_factory<std::shared_ptr<DBF>, std::shared_ptr<DF>>::value));
+    ASSERT_FALSE(
+        (test_with_factory<std::vector<std::shared_ptr<const DBF>>>::value));
+    ASSERT_FALSE((test_with_factory<std::vector<dfp<const DBF>>>::value));
+    ASSERT_FALSE(
+        (test_with_factory<std::vector<std::shared_ptr<DummyFlag>>>::value));
+    ASSERT_FALSE((test_with_factory<std::shared_ptr<const DBF>,
+                                    std::shared_ptr<const DF>>::value));
+}
diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp
index 0d64dfcf3cf..a331d8f3485 100644
--- a/core/test/base/executor.cpp
+++ b/core/test/base/executor.cpp
@@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/memory.hpp>
 
 
 namespace {
@@ -247,111 +248,53 @@ TEST(ReferenceExecutor, IsItsOwnMaster)
 
 TEST(CudaExecutor, KnowsItsMaster)
 {
-    auto omp = gko::OmpExecutor::create();
-    exec_ptr cuda = gko::CudaExecutor::create(0, omp);
+    auto ref = gko::ReferenceExecutor::create();
+    exec_ptr cuda = gko::CudaExecutor::create(0, ref);
 
-    ASSERT_EQ(omp, cuda->get_master());
+    ASSERT_EQ(ref, cuda->get_master());
 }
 
 
 TEST(CudaExecutor, KnowsItsDeviceId)
 {
-    auto omp = gko::OmpExecutor::create();
-    auto cuda = gko::CudaExecutor::create(0, omp);
+    auto ref = gko::ReferenceExecutor::create();
+    auto cuda = gko::CudaExecutor::create(0, ref);
 
     ASSERT_EQ(0, cuda->get_device_id());
 }
 
 
-TEST(CudaExecutor, CanGetDeviceResetBoolean)
-{
-    auto omp = gko::OmpExecutor::create();
-    auto cuda = gko::CudaExecutor::create(0, omp);
-
-    ASSERT_EQ(false, cuda->get_device_reset());
-}
-
-
-TEST(CudaExecutor, CanSetDefaultDeviceResetBoolean)
-{
-    auto omp = gko::OmpExecutor::create();
-    auto cuda = gko::CudaExecutor::create(0, omp, true);
-
-    ASSERT_EQ(true, cuda->get_device_reset());
-}
-
-
-TEST(CudaExecutor, CanSetDeviceResetBoolean)
-{
-    auto omp = gko::OmpExecutor::create();
-    auto cuda = gko::CudaExecutor::create(0, omp);
-
-    cuda->set_device_reset(true);
-
-    ASSERT_EQ(true, cuda->get_device_reset());
-}
-
-
 TEST(HipExecutor, KnowsItsMaster)
 {
-    auto omp = gko::OmpExecutor::create();
-    exec_ptr hip = gko::HipExecutor::create(0, omp);
+    auto ref = gko::ReferenceExecutor::create();
+    exec_ptr hip = gko::HipExecutor::create(0, ref);
 
-    ASSERT_EQ(omp, hip->get_master());
+    ASSERT_EQ(ref, hip->get_master());
 }
 
 
 TEST(HipExecutor, KnowsItsDeviceId)
 {
-    auto omp = gko::OmpExecutor::create();
-    auto hip = gko::HipExecutor::create(0, omp);
+    auto ref = gko::ReferenceExecutor::create();
+    auto hip = gko::HipExecutor::create(0, ref);
 
     ASSERT_EQ(0, hip->get_device_id());
 }
 
 
-TEST(HipExecutor, CanGetDeviceResetBoolean)
-{
-    auto omp = gko::OmpExecutor::create();
-    auto hip = gko::HipExecutor::create(0, omp);
-
-    ASSERT_EQ(false, hip->get_device_reset());
-}
-
-
-TEST(HipExecutor, CanSetDefaultDeviceResetBoolean)
-{
-    auto omp = gko::OmpExecutor::create();
-    auto hip = gko::HipExecutor::create(0, omp, true);
-
-    ASSERT_EQ(true, hip->get_device_reset());
-}
-
-
-TEST(HipExecutor, CanSetDeviceResetBoolean)
-{
-    auto omp = gko::OmpExecutor::create();
-    auto hip = gko::HipExecutor::create(0, omp);
-
-    hip->set_device_reset(true);
-
-    ASSERT_EQ(true, hip->get_device_reset());
-}
-
-
 TEST(DpcppExecutor, KnowsItsMaster)
 {
-    auto omp = gko::OmpExecutor::create();
-    exec_ptr dpcpp = gko::DpcppExecutor::create(0, omp);
+    auto ref = gko::ReferenceExecutor::create();
+    exec_ptr dpcpp = gko::DpcppExecutor::create(0, ref);
 
-    ASSERT_EQ(omp, dpcpp->get_master());
+    ASSERT_EQ(ref, dpcpp->get_master());
 }
 
 
 TEST(DpcppExecutor, KnowsItsDeviceId)
 {
-    auto omp = gko::OmpExecutor::create();
-    auto dpcpp = gko::DpcppExecutor::create(0, omp);
+    auto ref = gko::ReferenceExecutor::create();
+    auto dpcpp = gko::DpcppExecutor::create(0, ref);
 
     ASSERT_EQ(0, dpcpp->get_device_id());
 }
@@ -361,13 +304,13 @@ TEST(Executor, CanVerifyMemory)
 {
     auto ref = gko::ReferenceExecutor::create();
     auto omp = gko::OmpExecutor::create();
-    auto hip = gko::HipExecutor::create(0, omp);
-    auto cuda = gko::CudaExecutor::create(0, omp);
+    auto hip = gko::HipExecutor::create(0, ref);
+    auto cuda = gko::CudaExecutor::create(0, ref);
     auto omp2 = gko::OmpExecutor::create();
-    auto hip2 = gko::HipExecutor::create(0, omp);
-    auto cuda2 = gko::CudaExecutor::create(0, omp);
-    auto hip_1 = gko::HipExecutor::create(1, omp);
-    auto cuda_1 = gko::CudaExecutor::create(1, omp);
+    auto hip2 = gko::HipExecutor::create(0, ref);
+    auto cuda2 = gko::CudaExecutor::create(0, ref);
+    auto hip_1 = gko::HipExecutor::create(1, ref);
+    auto cuda_1 = gko::CudaExecutor::create(1, ref);
     std::shared_ptr<gko::DpcppExecutor> host_dpcpp;
     std::shared_ptr<gko::DpcppExecutor> cpu_dpcpp;
     std::shared_ptr<gko::DpcppExecutor> gpu_dpcpp;
@@ -375,16 +318,16 @@ TEST(Executor, CanVerifyMemory)
     std::shared_ptr<gko::DpcppExecutor> cpu_dpcpp_dup;
     std::shared_ptr<gko::DpcppExecutor> gpu_dpcpp_dup;
     if (gko::DpcppExecutor::get_num_devices("host")) {
-        host_dpcpp = gko::DpcppExecutor::create(0, omp, "host");
-        host_dpcpp_dup = gko::DpcppExecutor::create(0, omp, "host");
+        host_dpcpp = gko::DpcppExecutor::create(0, ref, "host");
+        host_dpcpp_dup = gko::DpcppExecutor::create(0, ref, "host");
     }
     if (gko::DpcppExecutor::get_num_devices("cpu")) {
-        cpu_dpcpp = gko::DpcppExecutor::create(0, omp, "cpu");
-        cpu_dpcpp_dup = gko::DpcppExecutor::create(0, omp, "cpu");
+        cpu_dpcpp = gko::DpcppExecutor::create(0, ref, "cpu");
+        cpu_dpcpp_dup = gko::DpcppExecutor::create(0, ref, "cpu");
     }
     if (gko::DpcppExecutor::get_num_devices("gpu")) {
-        gpu_dpcpp = gko::DpcppExecutor::create(0, omp, "gpu");
-        gpu_dpcpp_dup = gko::DpcppExecutor::create(0, omp, "gpu");
+        gpu_dpcpp = gko::DpcppExecutor::create(0, ref, "gpu");
+        gpu_dpcpp_dup = gko::DpcppExecutor::create(0, ref, "gpu");
     }
 
     ASSERT_EQ(false, ref->memory_accessible(omp));
@@ -442,20 +385,11 @@ TEST(Executor, CanVerifyMemory)
 }
 
 
-template <typename T>
-struct mock_free : T {
-    /**
-     * @internal Due to a bug with gcc 5.3, the constructor needs to be called
-     * with `()` operator instead of `{}`.
-     */
-    template <typename... Params>
-    mock_free(Params&&... params) : T(std::forward<Params>(params)...)
-    {}
-
-    void raw_free(void* ptr) const noexcept override
+struct MockAllocator : gko::CpuAllocator {
+    void deallocate(void* ptr) noexcept override
     {
         called_free = true;
-        T::raw_free(ptr);
+        CpuAllocator::deallocate(ptr);
     }
 
     mutable bool called_free{false};
@@ -464,12 +398,13 @@ struct mock_free : T {
 
 TEST(ExecutorDeleter, DeletesObject)
 {
-    auto ref = std::make_shared<mock_free<gko::ReferenceExecutor>>();
+    auto alloc = std::make_shared<MockAllocator>();
+    auto ref = gko::ReferenceExecutor::create(alloc);
     auto x = ref->alloc<int>(5);
 
     gko::executor_deleter<int>{ref}(x);
 
-    ASSERT_TRUE(ref->called_free);
+    ASSERT_TRUE(alloc->called_free);
 }
 
 
diff --git a/core/test/base/iterator_factory.cpp b/core/test/base/iterator_factory.cpp
index 68ed87e07cb..4db597e399a 100644
--- a/core/test/base/iterator_factory.cpp
+++ b/core/test/base/iterator_factory.cpp
@@ -67,13 +67,13 @@ namespace {
 
 
 template <typename ValueIndexType>
-class IteratorFactory : public ::testing::Test {
+class ZipIterator : public ::testing::Test {
 protected:
     using value_type =
         typename std::tuple_element<0, decltype(ValueIndexType())>::type;
     using index_type =
         typename std::tuple_element<1, decltype(ValueIndexType())>::type;
-    IteratorFactory()
+    ZipIterator()
         : reversed_index{100, 50, 10, 9, 8, 7, 5, 5, 4, 3, 2, 1, 0, -1, -2},
           ordered_index{-2, -1, 0, 1, 2, 3, 4, 5, 5, 7, 8, 9, 10, 50, 100},
           reversed_value{15., 14., 13., 12., 11., 10., 9., 7.,
@@ -109,11 +109,11 @@ class IteratorFactory : public ::testing::Test {
     const std::vector<value_type> ordered_value;
 };
 
-TYPED_TEST_SUITE(IteratorFactory, gko::test::ValueIndexTypes,
+TYPED_TEST_SUITE(ZipIterator, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
-TYPED_TEST(IteratorFactory, EmptyIterator)
+TYPED_TEST(ZipIterator, EmptyIterator)
 {
     using index_type = typename TestFixture::index_type;
     using value_type = typename TestFixture::value_type;
@@ -125,7 +125,7 @@ TYPED_TEST(IteratorFactory, EmptyIterator)
 }
 
 
-TYPED_TEST(IteratorFactory, SortingReversedWithIterator)
+TYPED_TEST(ZipIterator, SortingReversedWithIterator)
 {
     using index_type = typename TestFixture::index_type;
     using value_type = typename TestFixture::value_type;
@@ -140,7 +140,7 @@ TYPED_TEST(IteratorFactory, SortingReversedWithIterator)
 }
 
 
-TYPED_TEST(IteratorFactory, SortingAlreadySortedWithIterator)
+TYPED_TEST(ZipIterator, SortingAlreadySortedWithIterator)
 {
     using index_type = typename TestFixture::index_type;
     using value_type = typename TestFixture::value_type;
@@ -155,7 +155,7 @@ TYPED_TEST(IteratorFactory, SortingAlreadySortedWithIterator)
 }
 
 
-TYPED_TEST(IteratorFactory, IteratorReferenceOperatorSmaller)
+TYPED_TEST(ZipIterator, IteratorReferenceOperatorSmaller)
 {
     using index_type = typename TestFixture::index_type;
     using value_type = typename TestFixture::value_type;
@@ -170,7 +170,7 @@ TYPED_TEST(IteratorFactory, IteratorReferenceOperatorSmaller)
 }
 
 
-TYPED_TEST(IteratorFactory, IteratorReferenceOperatorSmaller2)
+TYPED_TEST(ZipIterator, IteratorReferenceOperatorSmaller2)
 {
     using index_type = typename TestFixture::index_type;
     using value_type = typename TestFixture::value_type;
@@ -185,7 +185,7 @@ TYPED_TEST(IteratorFactory, IteratorReferenceOperatorSmaller2)
 }
 
 
-TYPED_TEST(IteratorFactory, IncreasingIterator)
+TYPED_TEST(ZipIterator, IncreasingIterator)
 {
     using index_type = typename TestFixture::index_type;
     using value_type = typename TestFixture::value_type;
@@ -262,7 +262,7 @@ bool check_assertion_exit_code(int exit_code)
 }
 
 
-TYPED_TEST(IteratorFactory, IncompatibleIteratorDeathTest)
+TYPED_TEST(ZipIterator, IncompatibleIteratorDeathTest)
 {
     using index_type = typename TestFixture::index_type;
     using value_type = typename TestFixture::value_type;
@@ -273,20 +273,20 @@ TYPED_TEST(IteratorFactory, IncompatibleIteratorDeathTest)
 
     // a set of operations that return inconsistent results for the two
     // different iterators
-    EXPECT_EXIT(it2 - it1, check_assertion_exit_code, "");
-    EXPECT_EXIT(it2 == it1, check_assertion_exit_code, "");
-    EXPECT_EXIT(it2 != it1, check_assertion_exit_code, "");
-    EXPECT_EXIT(it1 < it2, check_assertion_exit_code, "");
-    EXPECT_EXIT(it2 <= it1, check_assertion_exit_code, "");
-    EXPECT_EXIT(it2 > it1, check_assertion_exit_code, "");
-    EXPECT_EXIT(it1 >= it2, check_assertion_exit_code, "");
+    EXPECT_EXIT((void)(it2 - it1), check_assertion_exit_code, "");
+    EXPECT_EXIT((void)(it2 == it1), check_assertion_exit_code, "");
+    EXPECT_EXIT((void)(it2 != it1), check_assertion_exit_code, "");
+    EXPECT_EXIT((void)(it1 < it2), check_assertion_exit_code, "");
+    EXPECT_EXIT((void)(it2 <= it1), check_assertion_exit_code, "");
+    EXPECT_EXIT((void)(it2 > it1), check_assertion_exit_code, "");
+    EXPECT_EXIT((void)(it1 >= it2), check_assertion_exit_code, "");
 }
 
 
 #endif
 
 
-TYPED_TEST(IteratorFactory, DecreasingIterator)
+TYPED_TEST(ZipIterator, DecreasingIterator)
 {
     using index_type = typename TestFixture::index_type;
     using value_type = typename TestFixture::value_type;
@@ -316,7 +316,7 @@ TYPED_TEST(IteratorFactory, DecreasingIterator)
 }
 
 
-TYPED_TEST(IteratorFactory, CorrectDereferencing)
+TYPED_TEST(ZipIterator, CorrectDereferencing)
 {
     using index_type_it = typename TestFixture::index_type;
     using value_type_it = typename TestFixture::value_type;
@@ -337,7 +337,7 @@ TYPED_TEST(IteratorFactory, CorrectDereferencing)
 }
 
 
-TYPED_TEST(IteratorFactory, CorrectSwapping)
+TYPED_TEST(ZipIterator, CorrectSwapping)
 {
     using index_type = typename TestFixture::index_type;
     using value_type = typename TestFixture::value_type;
@@ -361,7 +361,7 @@ TYPED_TEST(IteratorFactory, CorrectSwapping)
 }
 
 
-TYPED_TEST(IteratorFactory, CorrectHandWrittenSwapping)
+TYPED_TEST(ZipIterator, CorrectHandWrittenSwapping)
 {
     using index_type = typename TestFixture::index_type;
     using value_type = typename TestFixture::value_type;
@@ -388,4 +388,155 @@ TYPED_TEST(IteratorFactory, CorrectHandWrittenSwapping)
 }
 
 
+template <typename ValueType>
+class PermuteIterator : public ::testing::Test {
+protected:
+    using value_type = ValueType;
+};
+
+TYPED_TEST_SUITE(PermuteIterator, gko::test::ValueAndIndexTypes,
+                 TypenameNameGenerator);
+
+
+TYPED_TEST(PermuteIterator, EmptyIterator)
+{
+    auto test_iter = gko::detail::make_permute_iterator<TypeParam*>(
+        nullptr, [](int i) { return i; });
+
+    ASSERT_NO_THROW(std::sort(test_iter, test_iter));
+}
+
+
+TYPED_TEST(PermuteIterator, SortingWithIdentityPermutation)
+{
+    std::vector<TypeParam> vec{6, 2, 5, 2, 4};
+    std::vector<TypeParam> sorted{2, 2, 4, 5, 6};
+    auto test_iter = gko::detail::make_permute_iterator(
+        vec.begin(), [](int i) { return i; });
+
+    std::sort(test_iter, test_iter + vec.size());
+
+    ASSERT_EQ(vec, sorted);
+}
+
+
+TYPED_TEST(PermuteIterator, SortingWithReversePermutation)
+{
+    std::vector<TypeParam> vec{6, 2, 5, 2, 4};
+    std::vector<TypeParam> sorted{6, 5, 4, 2, 2};
+    auto test_iter = gko::detail::make_permute_iterator(
+        vec.begin(),
+        [size = vec.size()](int i) { return static_cast<int>(size) - 1 - i; });
+
+    std::sort(test_iter, test_iter + vec.size());
+
+    ASSERT_EQ(vec, sorted);
+}
+
+
+TYPED_TEST(PermuteIterator, SortingWithStridedPermutation)
+{
+    std::vector<TypeParam> vec{6, 8, 2, 9, 5, 1, 2, 7, 4, 0};
+    std::vector<TypeParam> sorted{2, 8, 2, 9, 4, 1, 5, 7, 6, 0};
+    auto test_iter = gko::detail::make_permute_iterator(
+        vec.begin(), [](int i) { return 2 * i; });
+
+    std::sort(test_iter, test_iter + vec.size() / 2);
+
+    ASSERT_EQ(vec, sorted);
+}
+
+
+TYPED_TEST(PermuteIterator, IncreasingIterator)
+{
+    std::vector<TypeParam> vec{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+    auto perm = [size = vec.size()](int i) {
+        return static_cast<int>(size) - 1 - i;
+    };
+
+    auto test_iter = gko::detail::make_permute_iterator(vec.begin(), perm);
+    auto begin = test_iter;
+    auto plus_2 = begin + 2;
+    auto plus_2_rev = 2 + begin;
+    auto plus_minus_2 = plus_2 - 2;
+    auto increment_pre_2 = begin;
+    ++increment_pre_2;
+    ++increment_pre_2;
+    auto increment_post_2 = begin;
+    increment_post_2++;
+    increment_post_2++;
+    auto increment_pre_test = begin;
+    auto increment_post_test = begin;
+
+    // check results for equality
+    ASSERT_TRUE(begin == plus_minus_2);
+    ASSERT_TRUE(plus_2 == increment_pre_2);
+    ASSERT_TRUE(plus_2_rev == increment_pre_2);
+    ASSERT_TRUE(increment_pre_2 == increment_post_2);
+    ASSERT_TRUE(begin == increment_post_test++);
+    ASSERT_TRUE(begin + 1 == ++increment_pre_test);
+    ASSERT_TRUE(*plus_2 == vec[perm(2)]);
+    // check other comparison operators and difference
+    std::vector<gko::detail::permute_iterator<
+        typename std::vector<TypeParam>::iterator, decltype(perm)>>
+        its{begin,
+            plus_2,
+            plus_2_rev,
+            plus_minus_2,
+            increment_pre_2,
+            increment_post_2,
+            increment_pre_test,
+            increment_post_test,
+            begin + 5,
+            begin + 9};
+    std::sort(its.begin(), its.end());
+    std::vector<int> dists;
+    std::vector<int> ref_dists{0, 1, 0, 1, 0, 0, 0, 3, 4};
+    for (int i = 0; i < its.size() - 1; i++) {
+        SCOPED_TRACE(i);
+        dists.push_back(its[i + 1] - its[i]);
+        auto equal = dists.back() > 0;
+        ASSERT_EQ(its[i + 1] > its[i], equal);
+        ASSERT_EQ(its[i] < its[i + 1], equal);
+        ASSERT_EQ(its[i] != its[i + 1], equal);
+        ASSERT_EQ(its[i] == its[i + 1], !equal);
+        ASSERT_EQ(its[i] >= its[i + 1], !equal);
+        ASSERT_EQ(its[i + 1] <= its[i], !equal);
+        ASSERT_TRUE(its[i + 1] >= its[i]);
+        ASSERT_TRUE(its[i] <= its[i + 1]);
+    }
+    ASSERT_EQ(dists, ref_dists);
+}
+
+
+TYPED_TEST(PermuteIterator, DecreasingIterator)
+{
+    std::vector<TypeParam> vec{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+    auto perm = [size = vec.size()](int i) {
+        return static_cast<int>(size) - 1 - i;
+    };
+
+    auto test_iter = gko::detail::make_permute_iterator(vec.begin(), perm);
+
+    auto iter = test_iter + 5;
+    auto minus_2 = iter - 2;
+    auto minus_plus_2 = minus_2 + 2;
+    auto decrement_pre_2 = iter;
+    --decrement_pre_2;
+    --decrement_pre_2;
+    auto decrement_post_2 = iter;
+    decrement_post_2--;
+    decrement_post_2--;
+    auto decrement_pre_test = iter;
+    auto decrement_post_test = iter;
+
+    ASSERT_TRUE(iter == minus_plus_2);
+    ASSERT_TRUE(minus_2 == decrement_pre_2);
+    ASSERT_TRUE(decrement_pre_2 == decrement_post_2);
+    ASSERT_TRUE(iter == decrement_post_test--);
+    ASSERT_TRUE(iter - 1 == --decrement_pre_test);
+    ASSERT_TRUE(*minus_2 == vec[perm(3)]);
+}
+
+
 }  // namespace
diff --git a/core/test/base/mtx_io.cpp b/core/test/base/mtx_io.cpp
index a1029bd9d12..a25f462556a 100644
--- a/core/test/base/mtx_io.cpp
+++ b/core/test/base/mtx_io.cpp
@@ -286,7 +286,7 @@ TEST(MtxReader, ReadsSparseRealMtx)
 }
 
 
-TEST(MtxReader, ReadsSparseRealSymetricMtx)
+TEST(MtxReader, ReadsSparseRealSymmetricMtx)
 {
     using tpl = gko::matrix_data<double, gko::int32>::nonzero_type;
     std::istringstream iss(
@@ -310,7 +310,7 @@ TEST(MtxReader, ReadsSparseRealSymetricMtx)
 }
 
 
-TEST(MtxReader, ReadsSparseRealSkewSymetricMtx)
+TEST(MtxReader, ReadsSparseRealSkewSymmetricMtx)
 {
     using tpl = gko::matrix_data<double, gko::int32>::nonzero_type;
     std::istringstream iss(
@@ -330,7 +330,7 @@ TEST(MtxReader, ReadsSparseRealSkewSymetricMtx)
 }
 
 
-TEST(MtxReader, ReadsSparseRealSkewSymetricMtxWithExplicitDiagonal)
+TEST(MtxReader, ReadsSparseRealSkewSymmetricMtxWithExplicitDiagonal)
 {
     using tpl = gko::matrix_data<double, gko::int32>::nonzero_type;
     std::istringstream iss(
diff --git a/core/test/components/CMakeLists.txt b/core/test/components/CMakeLists.txt
index a04f47199a5..e88b373d246 100644
--- a/core/test/components/CMakeLists.txt
+++ b/core/test/components/CMakeLists.txt
@@ -1 +1,2 @@
+ginkgo_create_test(addressable_pq)
 ginkgo_create_test(disjoint_sets)
diff --git a/core/test/components/addressable_pq.cpp b/core/test/components/addressable_pq.cpp
new file mode 100644
index 00000000000..87bb40e6570
--- /dev/null
+++ b/core/test/components/addressable_pq.cpp
@@ -0,0 +1,152 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/addressable_pq.hpp"
+
+
+#include <algorithm>
+#include <type_traits>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+template <typename ValueIndexType>
+class AddressablePriorityQueue : public ::testing::Test {
+protected:
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using pq_type2 = gko::addressable_priority_queue<value_type, index_type, 2>;
+    using pq_type4 = gko::addressable_priority_queue<value_type, index_type, 4>;
+
+    AddressablePriorityQueue() : exec(gko::ReferenceExecutor::create()) {}
+
+    template <typename PQType>
+    void assert_min(const PQType& pq, value_type key, index_type val)
+    {
+        ASSERT_FALSE(pq.empty());
+        ASSERT_EQ(pq.min_key(), key);
+        ASSERT_EQ(pq.min_node(), val);
+        ASSERT_TRUE((pq.min() == std::pair<value_type, index_type>{key, val}));
+    }
+
+    template <typename PQType>
+    void test_pq_functionality()
+    {
+        PQType pq{exec, 8};
+
+        pq.insert(value_type{.5}, 1);
+        ASSERT_EQ(pq.size(), 1);
+        assert_min(pq, .5, 1);
+
+        // insert larger key
+        pq.insert(value_type{1.}, 7);
+        ASSERT_EQ(pq.size(), 2);
+        assert_min(pq, .5, 1);
+
+        // insert min key
+        pq.insert(value_type{.1}, 4);
+        ASSERT_EQ(pq.size(), 3);
+        assert_min(pq, .1, 4);
+
+        // update key to have different min
+        pq.update_key(value_type{.7}, 4);
+        ASSERT_EQ(pq.size(), 3);
+        assert_min(pq, .5, 1);
+
+        // insert same key as min
+        pq.insert(value_type{.5}, 2);
+        ASSERT_EQ(pq.size(), 4);
+        assert_min(pq, .5, 1);
+
+        // update max to new min key
+        pq.update_key(value_type{.2}, 7);
+        ASSERT_EQ(pq.size(), 4);
+        assert_min(pq, .2, 7);
+
+        // insert intermediate key
+        pq.insert(value_type{.3}, 5);
+        ASSERT_EQ(pq.size(), 5);
+        assert_min(pq, .2, 7);
+
+        // pop min works
+        pq.pop_min();
+        ASSERT_EQ(pq.size(), 4);
+        assert_min(pq, .3, 5);
+
+        // reset works
+        pq.reset();
+        ASSERT_EQ(pq.size(), 0);
+        ASSERT_TRUE(pq.empty());
+    }
+
+    std::shared_ptr<const gko::Executor> exec;
+};
+
+TYPED_TEST_SUITE(AddressablePriorityQueue, gko::test::RealValueIndexTypes,
+                 TypenameNameGenerator);
+
+
+TYPED_TEST(AddressablePriorityQueue, InitializesCorrectly)
+{
+    using pq_type = typename TestFixture::pq_type2;
+    pq_type pq{this->exec, 0};
+
+    ASSERT_EQ(pq.size(), 0);
+    ASSERT_TRUE(pq.empty());
+}
+
+
+TYPED_TEST(AddressablePriorityQueue, WorksWithDegree2)
+{
+    this->template test_pq_functionality<typename TestFixture::pq_type2>();
+}
+
+
+TYPED_TEST(AddressablePriorityQueue, WorksWithDegree4)
+{
+    this->template test_pq_functionality<typename TestFixture::pq_type4>();
+}
+
+
+}  // namespace
diff --git a/core/test/gtest/CMakeLists.txt b/core/test/gtest/CMakeLists.txt
new file mode 100644
index 00000000000..cdfc67fafdf
--- /dev/null
+++ b/core/test/gtest/CMakeLists.txt
@@ -0,0 +1,32 @@
+function(add_gtest_main suffix definitions)
+    add_library(ginkgo_gtest_main${suffix} STATIC ginkgo_main.cpp resources.cpp)
+    target_link_libraries(ginkgo_gtest_main${suffix} PUBLIC Ginkgo::ginkgo GTest::GTest)
+    target_compile_definitions(ginkgo_gtest_main${suffix} PRIVATE ${definitions})
+    ginkgo_compile_features(ginkgo_gtest_main${suffix})
+    if (GINKGO_BUILD_MPI)
+        add_library(ginkgo_gtest_main_mpi${suffix} STATIC ginkgo_mpi_main.cpp resources.cpp)
+        target_link_libraries(ginkgo_gtest_main_mpi${suffix} PUBLIC Ginkgo::ginkgo GTest::GTest MPI::MPI_CXX)
+        target_compile_definitions(ginkgo_gtest_main_mpi${suffix} PRIVATE ${definitions})
+        ginkgo_compile_features(ginkgo_gtest_main_mpi${suffix})
+    endif()
+endfunction()
+
+add_gtest_main("" "")
+add_library(ginkgo_gtest_main_reference ALIAS ginkgo_gtest_main)
+add_library(ginkgo_gtest_main_cpu ALIAS ginkgo_gtest_main)
+if (GINKGO_BUILD_MPI)
+    add_library(ginkgo_gtest_main_mpi_reference ALIAS ginkgo_gtest_main_mpi)
+    add_library(ginkgo_gtest_main_mpi_cpu ALIAS ginkgo_gtest_main_mpi)
+endif()
+if (GINKGO_BUILD_OMP)
+    add_gtest_main("_omp" "GKO_COMPILING_OMP")
+endif()
+if (GINKGO_BUILD_CUDA)
+    add_gtest_main("_cuda" "GKO_COMPILING_CUDA")
+endif()
+if (GINKGO_BUILD_HIP)
+    add_gtest_main("_hip" "GKO_COMPILING_HIP")
+endif()
+if (GINKGO_BUILD_SYCL)
+    add_gtest_main("_dpcpp" "GKO_COMPILING_DPCPP")
+endif()
diff --git a/core/test/gtest/environments.hpp b/core/test/gtest/environments.hpp
new file mode 100644
index 00000000000..78c5a40f8a5
--- /dev/null
+++ b/core/test/gtest/environments.hpp
@@ -0,0 +1,137 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_TEST_GTEST_ENVIRONMENTS_HPP_
+#define GKO_CORE_TEST_GTEST_ENVIRONMENTS_HPP_
+
+
+#include <algorithm>
+#include <regex>
+#include <sstream>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/mpi.hpp>
+
+
+#include "core/test/gtest/resources.hpp"
+
+
+#ifdef GKO_COMPILING_OMP
+#include <omp.h>
+#endif
+
+
+#ifdef GKO_COMPILING_CUDA
+#include "cuda/base/device.hpp"
+#endif
+
+
+#ifdef GKO_COMPILING_HIP
+#include "hip/base/device.hpp"
+#endif
+
+
+#if GKO_COMPILING_DPCPP
+#include "dpcpp/base/device.hpp"
+#endif
+
+
+class DeviceEnvironment : public ::testing::Environment {
+public:
+    explicit DeviceEnvironment(int rank) : rank_(rank) { print_environment(); }
+
+#ifdef GKO_COMPILING_OMP
+    void print_environment() const
+    {
+        if (ResourceEnvironment::omp_threads > 0) {
+            omp_set_num_threads(ResourceEnvironment::omp_threads);
+        }
+        std::stringstream ss;
+        ss << "Rank " << rank_ << ": OMP threads " << omp_get_max_threads()
+           << std::endl;
+        std::cerr << ss.str();
+    }
+#elif defined(GKO_COMPILING_CUDA)
+    void print_environment() const
+    {
+        auto device_id = ResourceEnvironment::cuda_device_id;
+        std::stringstream ss;
+        ss << "Rank " << rank_ << ": CUDA device "
+           << gko::kernels::cuda::get_device_name(device_id) << " ID "
+           << device_id << std::endl;
+        std::cerr << ss.str();
+    }
+
+    void TearDown() override
+    {
+        gko::kernels::cuda::reset_device(ResourceEnvironment::cuda_device_id);
+    }
+#elif defined(GKO_COMPILING_HIP)
+    void print_environment() const
+    {
+        auto device_id = ResourceEnvironment::hip_device_id;
+        std::stringstream ss;
+        ss << "Rank " << rank_ << ": HIP device "
+           << gko::kernels::hip::get_device_name(device_id) << " ID "
+           << device_id << std::endl;
+        std::cerr << ss.str();
+    }
+
+    void TearDown() override
+    {
+        gko::kernels::hip::reset_device(ResourceEnvironment::hip_device_id);
+    }
+#elif defined(GKO_COMPILING_DPCPP)
+    void print_environment() const
+    {
+        auto device_id = ResourceEnvironment::sycl_device_id;
+        std::stringstream ss;
+        ss << "Rank " << rank_ << ": SYCL device "
+           << gko::kernels::dpcpp::get_device_name(device_id) << " ID "
+           << device_id << std::endl;
+        std::cerr << ss.str();
+    }
+#else
+    void print_environment() const {}
+#endif
+
+private:
+    int rank_;
+};
+
+
+#endif  // GKO_CORE_TEST_GTEST_ENVIRONMENTS_HPP_
diff --git a/core/test/gtest/ginkgo_main.cpp b/core/test/gtest/ginkgo_main.cpp
new file mode 100644
index 00000000000..b8458dbc0b0
--- /dev/null
+++ b/core/test/gtest/ginkgo_main.cpp
@@ -0,0 +1,53 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <gtest/gtest.h>
+
+
+#include "core/test/gtest/environments.hpp"
+
+
+int ResourceEnvironment::omp_threads = 0;
+int ResourceEnvironment::cuda_device_id = 0;
+int ResourceEnvironment::hip_device_id = 0;
+int ResourceEnvironment::sycl_device_id = 0;
+
+
+int main(int argc, char** argv)
+{
+    ::testing::InitGoogleTest(&argc, argv);
+
+    ::testing::AddGlobalTestEnvironment(new ResourceEnvironment);
+    ::testing::AddGlobalTestEnvironment(new DeviceEnvironment(0));
+    int result = RUN_ALL_TESTS();
+    return result;
+}
diff --git a/core/test/mpi/gtest/mpi_listener.cpp b/core/test/gtest/ginkgo_mpi_main.cpp
similarity index 95%
rename from core/test/mpi/gtest/mpi_listener.cpp
rename to core/test/gtest/ginkgo_mpi_main.cpp
index 66c9e6cd319..6853a12c940 100644
--- a/core/test/mpi/gtest/mpi_listener.cpp
+++ b/core/test/gtest/ginkgo_mpi_main.cpp
@@ -51,6 +51,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
+#include <ginkgo/core/base/mpi.hpp>
+
+
+#include "core/test/gtest/environments.hpp"
+
+
 namespace GTestMPIListener {
 
 // This class sets up the global test environment, which is needed
@@ -92,7 +98,6 @@ class MPIEnvironment : public ::testing::Environment {
 private:
     // Disallow copying
     MPIEnvironment(const MPIEnvironment& env) {}
-
 };  // class MPIEnvironment
 
 
@@ -373,17 +378,33 @@ class MPIWrapperPrinter : public ::testing::TestEventListener {
 }  // namespace GTestMPIListener
 
 
+int ResourceEnvironment::omp_threads = 0;
+int ResourceEnvironment::cuda_device_id = 0;
+int ResourceEnvironment::hip_device_id = 0;
+int ResourceEnvironment::sycl_device_id = 0;
+
+
 int main(int argc, char** argv)
 {
     ::testing::InitGoogleTest(&argc, argv);
+
     MPI_Init(&argc, &argv);
+    MPI_Comm comm(MPI_COMM_WORLD);
+    int rank;
+    int size;
+    MPI_Comm_rank(comm, &rank);
+    MPI_Comm_size(comm, &size);
+
     ::testing::AddGlobalTestEnvironment(new GTestMPIListener::MPIEnvironment);
+    ::testing::AddGlobalTestEnvironment(new ResourceEnvironment(rank, size));
+    ::testing::AddGlobalTestEnvironment(new DeviceEnvironment(rank));
+    MPI_Barrier(comm);
+
     ::testing::TestEventListeners& listeners =
         ::testing::UnitTest::GetInstance()->listeners();
     ::testing::TestEventListener* l =
         listeners.Release(listeners.default_result_printer());
-    listeners.Append(
-        new GTestMPIListener::MPIWrapperPrinter(l, MPI_COMM_WORLD));
+    listeners.Append(new GTestMPIListener::MPIWrapperPrinter(l, comm));
     int result = RUN_ALL_TESTS();
     return result;
 }
diff --git a/core/test/gtest/resources.cpp b/core/test/gtest/resources.cpp
new file mode 100644
index 00000000000..0dd427b75ee
--- /dev/null
+++ b/core/test/gtest/resources.cpp
@@ -0,0 +1,145 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <algorithm>
+#include <regex>
+#include <sstream>
+
+
+#include "core/test/gtest/resources.hpp"
+
+
+#ifdef GKO_COMPILING_OMP
+#include <omp.h>
+#endif
+
+
+#ifdef GKO_COMPILING_CUDA
+#include "cuda/base/device.hpp"
+#endif
+
+
+#ifdef GKO_COMPILING_HIP
+#include "hip/base/device.hpp"
+#endif
+
+
+#if GKO_COMPILING_DPCPP
+#include "dpcpp/base/device.hpp"
+#endif
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/mpi.hpp>
+
+
+struct ctest_resource {
+    int id;
+    int slots;
+};
+
+
+char* get_ctest_group(std::string resource_type, int group_id)
+{
+    std::transform(resource_type.begin(), resource_type.end(),
+                   resource_type.begin(),
+                   [](auto c) { return std::toupper(c); });
+    std::string rs_group_env = "CTEST_RESOURCE_GROUP_" +
+                               std::to_string(group_id) + "_" + resource_type;
+    return std::getenv(rs_group_env.c_str());
+}
+
+
+ctest_resource parse_ctest_resources(std::string resource)
+{
+    std::regex re(R"(id\:(\d+),slots\:(\d+))");
+    std::smatch match;
+
+    if (!std::regex_match(resource, match, re)) {
+        GKO_INVALID_STATE("Can't parse ctest_resource string: " + resource);
+    }
+
+    return ctest_resource{std::stoi(match[1]), std::stoi(match[2])};
+}
+
+
+ResourceEnvironment::ResourceEnvironment(int rank, int size)
+{
+#if GINKGO_BUILD_MPI
+    if (size > 1) {
+        cuda_device_id = gko::experimental::mpi::map_rank_to_device_id(
+            MPI_COMM_WORLD, std::max(gko::CudaExecutor::get_num_devices(), 1));
+        hip_device_id = gko::experimental::mpi::map_rank_to_device_id(
+            MPI_COMM_WORLD, std::max(gko::HipExecutor::get_num_devices(), 1));
+        sycl_device_id = gko::experimental::mpi::map_rank_to_device_id(
+            MPI_COMM_WORLD,
+            std::max(gko::DpcppExecutor::get_num_devices("gpu"), 1));
+    }
+#endif
+
+    auto rs_count_env = std::getenv("CTEST_RESOURCE_GROUP_COUNT");
+    auto rs_count = rs_count_env ? std::stoi(rs_count_env) : 0;
+    if (rs_count == 0) {
+        if (rank == 0) {
+            std::cerr << "Running without CTest ctest_resource configuration"
+                      << std::endl;
+        }
+        return;
+    }
+    if (rs_count != size) {
+        GKO_INVALID_STATE("Invalid resource group count: " +
+                          std::to_string(rs_count));
+    }
+
+    // parse CTest ctest_resource group descriptions
+    // OpenMP CPU threads
+    if (auto rs_omp_env = get_ctest_group("cpu", rank)) {
+        auto resource = parse_ctest_resources(rs_omp_env);
+        omp_threads = resource.slots;
+    }
+    // CUDA GPUs
+    if (auto rs_cuda_env = get_ctest_group("cudagpu", rank)) {
+        auto resource = parse_ctest_resources(rs_cuda_env);
+        cuda_device_id = resource.id;
+    }
+    // HIP GPUs
+    if (auto rs_hip_env = get_ctest_group("hipgpu", rank)) {
+        auto resource = parse_ctest_resources(rs_hip_env);
+        hip_device_id = resource.id;
+    }
+    // SYCL GPUs (no other devices!)
+    if (auto rs_sycl_env = get_ctest_group("sycl", rank)) {
+        auto resource = parse_ctest_resources(rs_sycl_env);
+        sycl_device_id = resource.id;
+    }
+}
diff --git a/core/test/gtest/resources.hpp b/core/test/gtest/resources.hpp
new file mode 100644
index 00000000000..a88280f29c7
--- /dev/null
+++ b/core/test/gtest/resources.hpp
@@ -0,0 +1,51 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_TEST_GTEST_RESOURCES_HPP_
+#define GKO_CORE_TEST_GTEST_RESOURCES_HPP_
+
+
+#include <gtest/gtest.h>
+
+
+class ResourceEnvironment : public ::testing::Environment {
+public:
+    explicit ResourceEnvironment(int rank = 0, int size = 1);
+
+    static int omp_threads;
+    static int cuda_device_id;
+    static int hip_device_id;
+    static int sycl_device_id;
+};
+
+
+#endif  // GKO_CORE_TEST_GTEST_RESOURCES_HPP_
diff --git a/core/test/log/CMakeLists.txt b/core/test/log/CMakeLists.txt
index 964572bd48c..8efd7fafc46 100644
--- a/core/test/log/CMakeLists.txt
+++ b/core/test/log/CMakeLists.txt
@@ -1,7 +1,7 @@
 ginkgo_create_test(convergence)
 ginkgo_create_test(logger)
 if (GINKGO_HAVE_PAPI_SDE)
-    ginkgo_create_test(papi PAPI::PAPI)
+    ginkgo_create_test(papi ADDITIONAL_LIBRARIES PAPI::PAPI)
 endif()
 ginkgo_create_test(performance_hint)
 ginkgo_create_test(profiler_hook)
diff --git a/core/test/log/convergence.cpp b/core/test/log/convergence.cpp
index f6294d08cd4..746e8603865 100644
--- a/core/test/log/convergence.cpp
+++ b/core/test/log/convergence.cpp
@@ -68,8 +68,7 @@ class Convergence : public ::testing::Test {
         gko::initialize<AbsoluteDense>({6}, exec);
     std::unique_ptr<gko::LinOp> system =
         gko::solver::Ir<T>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(1u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
             .on(exec)
             ->generate(gko::initialize<Dense>(I<I<T>>{{1, 2}, {0, 3}}, exec));
     std::unique_ptr<Dense> rhs = gko::initialize<Dense>({15, 25}, exec);
diff --git a/core/test/log/logger.cpp b/core/test/log/logger.cpp
index ff84f9a78c2..c739067eeb2 100644
--- a/core/test/log/logger.cpp
+++ b/core/test/log/logger.cpp
@@ -30,18 +30,13 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include <ginkgo/core/log/logger.hpp>
-
+// force-top: on
+#include <ginkgo/core/base/types.hpp>
+GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
+// force-top: off
 
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 
-#endif
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 5211, 4973, 4974)
-#endif
+#include <ginkgo/core/log/logger.hpp>
 
 
 #include <memory>
@@ -362,9 +357,4 @@ TEST(IterationCompleteOverload, CanLogCurrent)
 }  // namespace
 
 
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
+GKO_END_DISABLE_DEPRECATION_WARNINGS
diff --git a/core/test/log/papi.cpp b/core/test/log/papi.cpp
index 8ab0bb6421d..0928f35d1ba 100644
--- a/core/test/log/papi.cpp
+++ b/core/test/log/papi.cpp
@@ -71,7 +71,12 @@ class Papi : public ::testing::Test {
         }
     }
 
-    void TearDown() { eventset = PAPI_NULL; }
+    void TearDown()
+    {
+        logger = nullptr;
+        PAPI_destroy_eventset(&eventset);
+        PAPI_shutdown();
+    }
 
     template <typename U>
     const std::string init(const gko::log::Logger::mask_type& event,
@@ -466,8 +471,7 @@ TYPED_TEST(Papi, CatchesLinOpFactoryGenerateStarted)
 {
     auto factory =
         gko::solver::Bicgstab<TypeParam>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec);
     auto str = this->init(gko::log::Logger::linop_factory_generate_started_mask,
                           "linop_factory_generate_started", factory.get());
@@ -487,8 +491,7 @@ TYPED_TEST(Papi, CatchesLinOpFactoryGenerateCompleted)
 {
     auto factory =
         gko::solver::Bicgstab<TypeParam>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec);
     TypeParam dummy;
     auto str =
diff --git a/core/test/log/profiler_hook.cpp b/core/test/log/profiler_hook.cpp
index 281eed2d70b..cd6e1b0a3ce 100644
--- a/core/test/log/profiler_hook.cpp
+++ b/core/test/log/profiler_hook.cpp
@@ -202,8 +202,7 @@ TEST(ProfilerHook, LogsIteration)
     auto alpha = gko::share(gko::initialize<Vec>({1.0}, exec));
     auto solver =
         gko::solver::Ir<>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(1u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
             .on(exec)
             ->generate(mtx);
     logger->set_object_name(solver, "solver");
diff --git a/core/test/log/record.cpp b/core/test/log/record.cpp
index 0aeca2b3df7..098f93ad523 100644
--- a/core/test/log/record.cpp
+++ b/core/test/log/record.cpp
@@ -440,8 +440,7 @@ TEST(Record, CatchesLinopFactoryGenerateStarted)
         gko::log::Logger::linop_factory_generate_started_mask);
     auto factory =
         gko::solver::Bicgstab<>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(exec);
     auto input = factory->generate(gko::matrix::Dense<>::create(exec));
 
@@ -462,8 +461,7 @@ TEST(Record, CatchesLinopFactoryGenerateCompleted)
         gko::log::Logger::linop_factory_generate_completed_mask);
     auto factory =
         gko::solver::Bicgstab<>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(exec);
     auto input = factory->generate(gko::matrix::Dense<>::create(exec));
     auto output = factory->generate(gko::matrix::Dense<>::create(exec));
@@ -569,8 +567,7 @@ TEST(Record, CatchesIterations)
         gko::log::Record::create(gko::log::Logger::iteration_complete_mask);
     auto factory =
         gko::solver::Bicgstab<>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(exec);
     auto solver = factory->generate(gko::initialize<Dense>({1.1}, exec));
     auto right_hand_side = gko::initialize<Dense>({-5.5}, exec);
diff --git a/core/test/log/stream.cpp b/core/test/log/stream.cpp
index 3558a7d5564..721273ca468 100644
--- a/core/test/log/stream.cpp
+++ b/core/test/log/stream.cpp
@@ -606,8 +606,7 @@ TYPED_TEST(Stream, CatchesLinopFactoryGenerateStarted)
         gko::log::Logger::linop_factory_generate_started_mask, out);
     auto factory =
         gko::solver::Bicgstab<TypeParam>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(exec);
     auto input = factory->generate(gko::matrix::Dense<TypeParam>::create(exec));
     std::stringstream ptrstream_factory;
@@ -633,8 +632,7 @@ TYPED_TEST(Stream, CatchesLinopFactoryGenerateCompleted)
         gko::log::Logger::linop_factory_generate_completed_mask, out);
     auto factory =
         gko::solver::Bicgstab<TypeParam>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(exec);
     auto input = factory->generate(gko::matrix::Dense<TypeParam>::create(exec));
     auto output =
@@ -815,8 +813,7 @@ TYPED_TEST(Stream, CatchesIterationsWithVerbose)
 
     auto factory =
         gko::solver::Bicgstab<TypeParam>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(exec);
     auto solver = factory->generate(gko::initialize<Dense>({1.1}, exec));
     auto right_hand_side = gko::initialize<Dense>({-5.5}, exec);
diff --git a/core/test/matrix/CMakeLists.txt b/core/test/matrix/CMakeLists.txt
index 433361a054f..43d3b74ff0f 100644
--- a/core/test/matrix/CMakeLists.txt
+++ b/core/test/matrix/CMakeLists.txt
@@ -1,3 +1,6 @@
+ginkgo_create_test(batch_dense)
+ginkgo_create_test(batch_ell)
+ginkgo_create_test(batch_identity)
 ginkgo_create_test(coo)
 ginkgo_create_test(coo_builder)
 ginkgo_create_test(csr)
diff --git a/core/test/matrix/batch_dense.cpp b/core/test/matrix/batch_dense.cpp
new file mode 100644
index 00000000000..adeddbcc994
--- /dev/null
+++ b/core/test/matrix/batch_dense.cpp
@@ -0,0 +1,480 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/range.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/base/batch_utilities.hpp"
+#include "core/test/utils.hpp"
+#include "core/test/utils/batch_helpers.hpp"
+
+
+template <typename T>
+class Dense : public ::testing::Test {
+protected:
+    using value_type = T;
+    using DenseMtx = gko::matrix::Dense<value_type>;
+    using size_type = gko::size_type;
+    Dense()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx(gko::batch::initialize<gko::batch::matrix::Dense<value_type>>(
+              {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+               {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}},
+              exec)),
+          mvec(gko::batch::initialize<gko::batch::MultiVector<value_type>>(
+              {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+               {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}},
+              exec)),
+          dense_mtx(gko::initialize<gko::matrix::Dense<value_type>>(
+              {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}, exec))
+    {}
+
+
+    static void assert_equal_to_original_mtx(
+        gko::batch::matrix::Dense<value_type>* m)
+    {
+        ASSERT_EQ(m->get_num_batch_items(), 2);
+        ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3));
+        ASSERT_EQ(m->get_num_stored_elements(), 2 * (2 * 3));
+        EXPECT_EQ(m->at(0, 0, 0), value_type{-1.0});
+        EXPECT_EQ(m->at(0, 0, 1), value_type{2.0});
+        EXPECT_EQ(m->at(0, 0, 2), value_type{3.0});
+        EXPECT_EQ(m->at(0, 1, 0), value_type{-1.5});
+        EXPECT_EQ(m->at(0, 1, 1), value_type{2.5});
+        ASSERT_EQ(m->at(0, 1, 2), value_type{3.5});
+        EXPECT_EQ(m->at(1, 0, 0), value_type{1.0});
+        EXPECT_EQ(m->at(1, 0, 1), value_type{2.5});
+        EXPECT_EQ(m->at(1, 0, 2), value_type{3.0});
+        EXPECT_EQ(m->at(1, 1, 0), value_type{1.0});
+        EXPECT_EQ(m->at(1, 1, 1), value_type{2.0});
+        ASSERT_EQ(m->at(1, 1, 2), value_type{3.0});
+    }
+
+    static void assert_empty(gko::batch::matrix::Dense<value_type>* m)
+    {
+        ASSERT_EQ(m->get_num_batch_items(), 0);
+        ASSERT_EQ(m->get_num_stored_elements(), 0);
+    }
+
+    std::shared_ptr<const gko::Executor> exec;
+    std::unique_ptr<gko::batch::matrix::Dense<value_type>> mtx;
+    std::unique_ptr<gko::batch::MultiVector<value_type>> mvec;
+    std::unique_ptr<gko::matrix::Dense<value_type>> dense_mtx;
+};
+
+TYPED_TEST_SUITE(Dense, gko::test::ValueTypes, TypenameNameGenerator);
+
+
+TYPED_TEST(Dense, KnowsItsSizeAndValues)
+{
+    this->assert_equal_to_original_mtx(this->mtx.get());
+}
+
+
+TYPED_TEST(Dense, CanBeEmpty)
+{
+    auto empty = gko::batch::matrix::Dense<TypeParam>::create(this->exec);
+    this->assert_empty(empty.get());
+}
+
+
+TYPED_TEST(Dense, ReturnsNullValuesArrayWhenEmpty)
+{
+    auto empty = gko::batch::matrix::Dense<TypeParam>::create(this->exec);
+    ASSERT_EQ(empty->get_const_values(), nullptr);
+}
+
+
+TYPED_TEST(Dense, CanGetValuesForEntry)
+{
+    using value_type = typename TestFixture::value_type;
+
+    ASSERT_EQ(this->mtx->get_values_for_item(1)[0], value_type{1.0});
+}
+
+
+TYPED_TEST(Dense, CanCreateDenseItemView)
+{
+    GKO_ASSERT_MTX_NEAR(this->mtx->create_view_for_item(1), this->dense_mtx,
+                        0.0);
+}
+
+
+TYPED_TEST(Dense, CanBeCopied)
+{
+    auto mtx_copy = gko::batch::matrix::Dense<TypeParam>::create(this->exec);
+
+    mtx_copy->copy_from(this->mtx.get());
+
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->mtx->at(0, 0, 0) = 7;
+    this->mtx->at(0, 1) = 7;
+    this->assert_equal_to_original_mtx(mtx_copy.get());
+}
+
+
+TYPED_TEST(Dense, CanBeMoved)
+{
+    auto mtx_copy = gko::batch::matrix::Dense<TypeParam>::create(this->exec);
+
+    this->mtx->move_to(mtx_copy);
+
+    this->assert_equal_to_original_mtx(mtx_copy.get());
+}
+
+
+TYPED_TEST(Dense, CanBeCloned)
+{
+    auto mtx_clone = this->mtx->clone();
+
+    this->assert_equal_to_original_mtx(
+        dynamic_cast<decltype(this->mtx.get())>(mtx_clone.get()));
+}
+
+
+TYPED_TEST(Dense, CanBeCleared)
+{
+    this->mtx->clear();
+
+    this->assert_empty(this->mtx.get());
+}
+
+
+TYPED_TEST(Dense, CanBeConstructedWithSize)
+{
+    using size_type = gko::size_type;
+
+    auto m = gko::batch::matrix::Dense<TypeParam>::create(
+        this->exec, gko::batch_dim<2>(2, gko::dim<2>{5, 3}));
+
+    ASSERT_EQ(m->get_num_batch_items(), 2);
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(5, 3));
+    ASSERT_EQ(m->get_num_stored_elements(), 30);
+}
+
+
+TYPED_TEST(Dense, CanBeConstructedFromExistingData)
+{
+    using value_type = typename TestFixture::value_type;
+    using size_type = gko::size_type;
+    // clang-format off
+    value_type data[] = {
+       1.0,  2.0,
+      -1.0,  3.0,
+       4.0, -1.0,
+       3.0,  5.0,
+       1.0,  5.0,
+       6.0, -3.0};
+    // clang-format on
+
+    auto m = gko::batch::matrix::Dense<TypeParam>::create(
+        this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)),
+        gko::array<value_type>::view(this->exec, 8, data));
+
+    ASSERT_EQ(m->get_const_values(), data);
+    ASSERT_EQ(m->at(0, 0, 0), value_type{1.0});
+    ASSERT_EQ(m->at(0, 0, 1), value_type{2.0});
+    ASSERT_EQ(m->at(0, 1, 0), value_type{-1.0});
+    ASSERT_EQ(m->at(0, 1, 1), value_type{3.0});
+    ASSERT_EQ(m->at(1, 0, 0), value_type{4.0});
+    ASSERT_EQ(m->at(1, 0, 1), value_type{-1.0});
+    ASSERT_EQ(m->at(1, 1, 0), value_type{3.0});
+    ASSERT_EQ(m->at(1, 1, 1), value_type{5.0});
+}
+
+
+TYPED_TEST(Dense, CanBeConstructedFromExistingConstData)
+{
+    using value_type = typename TestFixture::value_type;
+    using size_type = gko::size_type;
+    // clang-format off
+    const value_type data[] = {
+       1.0,  2.0,
+      -1.0,  3.0,
+       4.0, -1.0,
+       3.0,  5.0,
+       1.0,  5.0,
+       6.0, -3.0};
+    // clang-format on
+
+    auto m = gko::batch::matrix::Dense<TypeParam>::create_const(
+        this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 2)),
+        gko::array<value_type>::const_view(this->exec, 8, data));
+
+    ASSERT_EQ(m->get_const_values(), data);
+    ASSERT_EQ(m->at(0, 0, 0), value_type{1.0});
+    ASSERT_EQ(m->at(0, 0, 1), value_type{2.0});
+    ASSERT_EQ(m->at(0, 1, 0), value_type{-1.0});
+    ASSERT_EQ(m->at(0, 1, 1), value_type{3.0});
+    ASSERT_EQ(m->at(1, 0, 0), value_type{4.0});
+    ASSERT_EQ(m->at(1, 0, 1), value_type{-1.0});
+    ASSERT_EQ(m->at(1, 1, 0), value_type{3.0});
+    ASSERT_EQ(m->at(1, 1, 1), value_type{5.0});
+}
+
+
+TYPED_TEST(Dense, CanBeConstructedFromDenseMatrices)
+{
+    using value_type = typename TestFixture::value_type;
+    using DenseMtx = typename TestFixture::DenseMtx;
+    using size_type = gko::size_type;
+    auto mat1 = gko::initialize<DenseMtx>({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+                                          this->exec);
+    auto mat2 = gko::initialize<DenseMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}},
+                                          this->exec);
+
+    auto m =
+        gko::batch::create_from_item<gko::batch::matrix::Dense<value_type>>(
+            this->exec, std::vector<DenseMtx*>{mat1.get(), mat2.get()});
+
+    this->assert_equal_to_original_mtx(m.get());
+}
+
+
+TYPED_TEST(Dense, CanBeConstructedFromDenseMatricesByDuplication)
+{
+    using value_type = typename TestFixture::value_type;
+    using DenseMtx = typename TestFixture::DenseMtx;
+    using size_type = gko::size_type;
+    auto mat1 = gko::initialize<DenseMtx>(
+        4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec);
+    auto mat2 = gko::initialize<DenseMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}},
+                                          this->exec);
+    auto bat_m =
+        gko::batch::create_from_item<gko::batch::matrix::Dense<value_type>>(
+            this->exec,
+            std::vector<DenseMtx*>{mat1.get(), mat1.get(), mat1.get()});
+
+    auto m =
+        gko::batch::create_from_item<gko::batch::matrix::Dense<value_type>>(
+            this->exec, 3, mat1.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 0);
+}
+
+
+TYPED_TEST(Dense, CanBeConstructedByDuplicatingDenseMatrices)
+{
+    using value_type = typename TestFixture::value_type;
+    using DenseMtx = typename TestFixture::DenseMtx;
+    using size_type = gko::size_type;
+    auto mat1 = gko::initialize<DenseMtx>({{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+                                          this->exec);
+    auto mat2 = gko::initialize<DenseMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}},
+                                          this->exec);
+    auto m =
+        gko::batch::create_from_item<gko::batch::matrix::Dense<value_type>>(
+            this->exec, std::vector<DenseMtx*>{mat1.get(), mat2.get()});
+    auto m_ref =
+        gko::batch::create_from_item<gko::batch::matrix::Dense<value_type>>(
+            this->exec,
+            std::vector<DenseMtx*>{mat1.get(), mat2.get(), mat1.get(),
+                                   mat2.get(), mat1.get(), mat2.get()});
+
+    auto m2 = gko::batch::duplicate<gko::batch::matrix::Dense<value_type>>(
+        this->exec, 3, m.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 0);
+}
+
+
+TYPED_TEST(Dense, CanBeUnbatchedIntoDenseMatrices)
+{
+    using value_type = typename TestFixture::value_type;
+    using DenseMtx = typename TestFixture::DenseMtx;
+    using size_type = gko::size_type;
+    auto mat1 = gko::initialize<DenseMtx>(
+        4, {{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}}, this->exec);
+    auto mat2 = gko::initialize<DenseMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}},
+                                          this->exec);
+
+    auto dense_mats =
+        gko::batch::unbatch<gko::batch::matrix::Dense<value_type>>(
+            this->mtx.get());
+
+    GKO_ASSERT_MTX_NEAR(dense_mats[0].get(), mat1.get(), 0.);
+    GKO_ASSERT_MTX_NEAR(dense_mats[1].get(), mat2.get(), 0.);
+}
+
+
+TYPED_TEST(Dense, CanBeListConstructed)
+{
+    using value_type = typename TestFixture::value_type;
+
+    auto m = gko::batch::initialize<gko::batch::matrix::Dense<TypeParam>>(
+        {{1.0, 2.0}, {1.0, 3.0}}, this->exec);
+
+    ASSERT_EQ(m->get_num_batch_items(), 2);
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1));
+    EXPECT_EQ(m->at(0, 0), value_type{1});
+    EXPECT_EQ(m->at(0, 1), value_type{2});
+    EXPECT_EQ(m->at(1, 0), value_type{1});
+    EXPECT_EQ(m->at(1, 1), value_type{3});
+}
+
+
+TYPED_TEST(Dense, CanBeListConstructedByCopies)
+{
+    using value_type = typename TestFixture::value_type;
+
+    auto m = gko::batch::initialize<gko::batch::matrix::Dense<TypeParam>>(
+        2, I<value_type>({1.0, 2.0}), this->exec);
+
+    ASSERT_EQ(m->get_num_batch_items(), 2);
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1));
+    EXPECT_EQ(m->at(0, 0, 0), value_type{1.0});
+    EXPECT_EQ(m->at(0, 0, 1), value_type{2.0});
+    EXPECT_EQ(m->at(1, 0, 0), value_type{1.0});
+    EXPECT_EQ(m->at(1, 0, 1), value_type{2.0});
+}
+
+
+TYPED_TEST(Dense, CanBeDoubleListConstructed)
+{
+    using value_type = typename TestFixture::value_type;
+    using T = value_type;
+
+    auto m = gko::batch::initialize<gko::batch::matrix::Dense<TypeParam>>(
+        {{I<T>{1.0, 1.0, 0.0}, I<T>{2.0, 4.0, 3.0}, I<T>{3.0, 6.0, 1.0}},
+         {I<T>{1.0, 2.0, -1.0}, I<T>{3.0, 4.0, -2.0}, I<T>{5.0, 6.0, -3.0}}},
+        this->exec);
+
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(3, 3));
+    EXPECT_EQ(m->at(0, 0), value_type{1.0});
+    EXPECT_EQ(m->at(0, 1), value_type{1.0});
+    EXPECT_EQ(m->at(0, 2), value_type{0.0});
+    EXPECT_EQ(m->at(0, 3), value_type{2.0});
+    EXPECT_EQ(m->at(0, 4), value_type{4.0});
+    EXPECT_EQ(m->at(0, 5), value_type{3.0});
+    EXPECT_EQ(m->at(0, 6), value_type{3.0});
+    EXPECT_EQ(m->at(0, 7), value_type{6.0});
+    EXPECT_EQ(m->at(0, 8), value_type{1.0});
+    EXPECT_EQ(m->at(1, 0), value_type{1.0});
+    EXPECT_EQ(m->at(1, 1), value_type{2.0});
+    EXPECT_EQ(m->at(1, 2), value_type{-1.0});
+    EXPECT_EQ(m->at(1, 3), value_type{3.0});
+    EXPECT_EQ(m->at(1, 4), value_type{4.0});
+    EXPECT_EQ(m->at(1, 5), value_type{-2.0});
+    EXPECT_EQ(m->at(1, 6), value_type{5.0});
+    EXPECT_EQ(m->at(1, 7), value_type{6.0});
+    EXPECT_EQ(m->at(1, 8), value_type{-3.0});
+}
+
+
+TYPED_TEST(Dense, CanBeReadFromMatrixData)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = int;
+    auto vec_data = std::vector<gko::matrix_data<value_type, index_type>>{};
+    vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
+        {2, 2}, {{0, 0, 1.0}, {0, 1, 3.0}, {1, 0, 0.0}, {1, 1, 5.0}}));
+    vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
+        {2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 0, 0.0}, {1, 1, 9.0}}));
+
+    auto m = gko::batch::read<value_type, index_type,
+                              gko::batch::matrix::Dense<value_type>>(this->exec,
+                                                                     vec_data);
+
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2));
+    EXPECT_EQ(m->at(0, 0, 0), value_type{1.0});
+    EXPECT_EQ(m->at(0, 0, 1), value_type{3.0});
+    EXPECT_EQ(m->at(0, 1, 0), value_type{0.0});
+    EXPECT_EQ(m->at(0, 1, 1), value_type{5.0});
+    EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0});
+    EXPECT_EQ(m->at(1, 0, 1), value_type{0.5});
+    EXPECT_EQ(m->at(1, 1, 0), value_type{0.0});
+    EXPECT_EQ(m->at(1, 1, 1), value_type{9.0});
+}
+
+
+TYPED_TEST(Dense, CanBeReadFromSparseMatrixData)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = int;
+    auto vec_data = std::vector<gko::matrix_data<value_type, index_type>>{};
+    vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
+        {2, 2}, {{0, 0, 1.0}, {0, 1, 3.0}, {1, 1, 5.0}}));
+    vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
+        {2, 2}, {{0, 0, -1.0}, {0, 1, 0.5}, {1, 1, 9.0}}));
+
+    auto m = gko::batch::read<value_type, index_type,
+                              gko::batch::matrix::Dense<value_type>>(this->exec,
+                                                                     vec_data);
+
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 2));
+    EXPECT_EQ(m->at(0, 0, 0), value_type{1.0});
+    EXPECT_EQ(m->at(0, 0, 1), value_type{3.0});
+    EXPECT_EQ(m->at(0, 1, 0), value_type{0.0});
+    EXPECT_EQ(m->at(0, 1, 1), value_type{5.0});
+    EXPECT_EQ(m->at(1, 0, 0), value_type{-1.0});
+    EXPECT_EQ(m->at(1, 0, 1), value_type{0.5});
+    EXPECT_EQ(m->at(1, 1, 0), value_type{0.0});
+    EXPECT_EQ(m->at(1, 1, 1), value_type{9.0});
+}
+
+
+TYPED_TEST(Dense, GeneratesCorrectMatrixData)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = int;
+    using tpl = typename gko::matrix_data<TypeParam>::nonzero_type;
+
+    auto data = gko::batch::write<value_type, index_type,
+                                  gko::batch::matrix::Dense<value_type>>(
+        this->mtx.get());
+
+    ASSERT_EQ(data[0].size, gko::dim<2>(2, 3));
+    ASSERT_EQ(data[0].nonzeros.size(), 6);
+    EXPECT_EQ(data[0].nonzeros[0], tpl(0, 0, value_type{-1.0}));
+    EXPECT_EQ(data[0].nonzeros[1], tpl(0, 1, value_type{2.0}));
+    EXPECT_EQ(data[0].nonzeros[2], tpl(0, 2, value_type{3.0}));
+    EXPECT_EQ(data[0].nonzeros[3], tpl(1, 0, value_type{-1.5}));
+    EXPECT_EQ(data[0].nonzeros[4], tpl(1, 1, value_type{2.5}));
+    EXPECT_EQ(data[0].nonzeros[5], tpl(1, 2, value_type{3.5}));
+    ASSERT_EQ(data[1].size, gko::dim<2>(2, 3));
+    ASSERT_EQ(data[1].nonzeros.size(), 6);
+    EXPECT_EQ(data[1].nonzeros[0], tpl(0, 0, value_type{1.0}));
+    EXPECT_EQ(data[1].nonzeros[1], tpl(0, 1, value_type{2.5}));
+    EXPECT_EQ(data[1].nonzeros[2], tpl(0, 2, value_type{3.0}));
+    EXPECT_EQ(data[1].nonzeros[3], tpl(1, 0, value_type{1.0}));
+    EXPECT_EQ(data[1].nonzeros[4], tpl(1, 1, value_type{2.0}));
+    EXPECT_EQ(data[1].nonzeros[5], tpl(1, 2, value_type{3.0}));
+}
diff --git a/core/test/matrix/batch_ell.cpp b/core/test/matrix/batch_ell.cpp
new file mode 100644
index 00000000000..a42a18f5faf
--- /dev/null
+++ b/core/test/matrix/batch_ell.cpp
@@ -0,0 +1,525 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/ell.hpp>
+
+
+#include "core/base/batch_utilities.hpp"
+#include "core/test/utils.hpp"
+#include "core/test/utils/batch_helpers.hpp"
+
+
+template <typename T>
+class Ell : public ::testing::Test {
+protected:
+    using value_type = T;
+    using index_type = gko::int32;
+    using BatchEllMtx = gko::batch::matrix::Ell<value_type, index_type>;
+    using EllMtx = gko::matrix::Ell<value_type, index_type>;
+    using size_type = gko::size_type;
+    Ell()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx(gko::batch::initialize<BatchEllMtx>(
+              {{{-1.0, 2.0, 3.0}, {-1.5, 2.5, 3.5}},
+               {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}},
+              exec, 3)),
+          sp_mtx(gko::batch::initialize<BatchEllMtx>(
+              {{{-1.0, 0.0, 0.0}, {0.0, 2.5, 3.5}},
+               {{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}},
+              exec, 2)),
+          ell_mtx(gko::initialize<EllMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}},
+                                          exec, gko::dim<2>(2, 3), 3)),
+          sp_ell_mtx(gko::initialize<EllMtx>({{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}},
+                                             exec, gko::dim<2>(2, 3), 2))
+    {}
+
+    static void assert_equal_to_original_sparse_mtx(const BatchEllMtx* m)
+    {
+        ASSERT_EQ(m->get_num_batch_items(), 2);
+        ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3));
+        ASSERT_EQ(m->get_num_stored_elements(), 2 * (2 * 2));
+        ASSERT_EQ(m->get_num_stored_elements_per_row(), 2);
+        EXPECT_EQ(m->get_const_values()[0], value_type{-1.0});
+        EXPECT_EQ(m->get_const_values()[1], value_type{2.5});
+        EXPECT_EQ(m->get_const_values()[2], value_type{0.0});
+        EXPECT_EQ(m->get_const_values()[3], value_type{3.5});
+        EXPECT_EQ(m->get_const_values()[4], value_type{1.0});
+        EXPECT_EQ(m->get_const_values()[5], value_type{2.0});
+        EXPECT_EQ(m->get_const_values()[6], value_type{0.0});
+        EXPECT_EQ(m->get_const_values()[7], value_type{3.0});
+        EXPECT_EQ(m->get_const_col_idxs()[0], index_type{0});
+        EXPECT_EQ(m->get_const_col_idxs()[1], index_type{1});
+        EXPECT_EQ(m->get_const_col_idxs()[2], index_type{-1});
+        ASSERT_EQ(m->get_const_col_idxs()[3], index_type{2});
+    }
+
+    static void assert_equal_to_original_mtx(const BatchEllMtx* m)
+    {
+        ASSERT_EQ(m->get_num_batch_items(), 2);
+        ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 3));
+        ASSERT_EQ(m->get_num_stored_elements(), 2 * (2 * 3));
+        ASSERT_EQ(m->get_num_stored_elements_per_row(), 3);
+        EXPECT_EQ(m->get_const_values()[0], value_type{-1.0});
+        EXPECT_EQ(m->get_const_values()[1], value_type{-1.5});
+        EXPECT_EQ(m->get_const_values()[2], value_type{2.0});
+        EXPECT_EQ(m->get_const_values()[3], value_type{2.5});
+        EXPECT_EQ(m->get_const_values()[4], value_type{3.0});
+        EXPECT_EQ(m->get_const_values()[5], value_type{3.5});
+        EXPECT_EQ(m->get_const_values()[6], value_type{1.0});
+        EXPECT_EQ(m->get_const_values()[7], value_type{1.0});
+        EXPECT_EQ(m->get_const_values()[8], value_type{2.5});
+        EXPECT_EQ(m->get_const_values()[9], value_type{2.0});
+        EXPECT_EQ(m->get_const_values()[10], value_type{3.0});
+        ASSERT_EQ(m->get_const_values()[11], value_type{3.0});
+    }
+
+    static void assert_empty(BatchEllMtx* m)
+    {
+        ASSERT_EQ(m->get_num_batch_items(), 0);
+        ASSERT_EQ(m->get_num_stored_elements(), 0);
+        ASSERT_EQ(m->get_num_stored_elements_per_row(), 0);
+    }
+
+    std::shared_ptr<const gko::Executor> exec;
+    std::unique_ptr<BatchEllMtx> mtx;
+    std::unique_ptr<BatchEllMtx> sp_mtx;
+    std::unique_ptr<EllMtx> ell_mtx;
+    std::unique_ptr<EllMtx> sp_ell_mtx;
+};
+
+TYPED_TEST_SUITE(Ell, gko::test::ValueTypes, TypenameNameGenerator);
+
+
+TYPED_TEST(Ell, KnowsItsSizeAndValues)
+{
+    this->assert_equal_to_original_mtx(this->mtx.get());
+}
+
+
+TYPED_TEST(Ell, SparseMtxKnowsItsSizeAndValues)
+{
+    this->assert_equal_to_original_sparse_mtx(this->sp_mtx.get());
+}
+
+
+TYPED_TEST(Ell, CanBeEmpty)
+{
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+
+    auto empty = BatchEllMtx::create(this->exec);
+
+    this->assert_empty(empty.get());
+    ASSERT_EQ(empty->get_const_values(), nullptr);
+}
+
+
+TYPED_TEST(Ell, CanGetValuesForEntry)
+{
+    using value_type = typename TestFixture::value_type;
+
+    ASSERT_EQ(this->mtx->get_values_for_item(1)[0], value_type{1.0});
+}
+
+
+TYPED_TEST(Ell, CanCreateEllItemView)
+{
+    GKO_ASSERT_MTX_NEAR(this->mtx->create_view_for_item(1), this->ell_mtx, 0.0);
+}
+
+
+TYPED_TEST(Ell, CanCreateSpEllItemView)
+{
+    GKO_ASSERT_MTX_NEAR(this->sp_mtx->create_view_for_item(1), this->sp_ell_mtx,
+                        0.0);
+}
+
+
+TYPED_TEST(Ell, CanBeCopied)
+{
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+
+    auto mtx_copy = BatchEllMtx::create(this->exec);
+
+    mtx_copy->copy_from(this->mtx.get());
+
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->mtx->get_values()[0] = 7;
+    this->assert_equal_to_original_mtx(mtx_copy.get());
+}
+
+
+TYPED_TEST(Ell, CanBeMoved)
+{
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+
+    auto mtx_copy = BatchEllMtx::create(this->exec);
+
+    this->mtx->move_to(mtx_copy);
+
+    this->assert_equal_to_original_mtx(mtx_copy.get());
+}
+
+
+TYPED_TEST(Ell, CanBeCloned)
+{
+    auto mtx_clone = this->mtx->clone();
+
+    this->assert_equal_to_original_mtx(
+        dynamic_cast<decltype(this->mtx.get())>(mtx_clone.get()));
+}
+
+
+TYPED_TEST(Ell, CanBeCleared)
+{
+    this->mtx->clear();
+
+    this->assert_empty(this->mtx.get());
+}
+
+
+TYPED_TEST(Ell, CanBeConstructedWithSize)
+{
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+
+    auto m = BatchEllMtx::create(this->exec,
+                                 gko::batch_dim<2>(2, gko::dim<2>{5, 3}), 2);
+
+    ASSERT_EQ(m->get_num_batch_items(), 2);
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(5, 3));
+    ASSERT_EQ(m->get_num_stored_elements_per_row(), 2);
+    ASSERT_EQ(m->get_num_stored_elements(), 20);
+}
+
+
+TYPED_TEST(Ell, CanBeConstructedFromExistingData)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+    // clang-format off
+    value_type values[] = {
+       -1.0,  2.5,
+        0.0,  3.5,
+        1.0,  2.0,
+        0.0,  3.0};
+    index_type col_idxs[] = {
+       0, 1,
+      -1, 2};
+    // clang-format on
+
+    auto m = BatchEllMtx::create(
+        this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 3)), 2,
+        gko::array<value_type>::view(this->exec, 8, values),
+        gko::array<index_type>::view(this->exec, 4, col_idxs));
+
+    this->assert_equal_to_original_sparse_mtx(m.get());
+}
+
+
+TYPED_TEST(Ell, CanBeConstructedFromExistingConstData)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+    // clang-format off
+    value_type values[] = {
+       -1.0,  2.5,
+        0.0,  3.5,
+        1.0,  2.0,
+        0.0,  3.0};
+    index_type col_idxs[] = {
+       0, 1,
+      -1, 2};
+    // clang-format on
+
+    auto m = BatchEllMtx::create_const(
+        this->exec, gko::batch_dim<2>(2, gko::dim<2>(2, 3)), 2,
+        gko::array<value_type>::const_view(this->exec, 8, values),
+        gko::array<index_type>::const_view(this->exec, 4, col_idxs));
+
+    this->assert_equal_to_original_sparse_mtx(m.get());
+}
+
+
+TYPED_TEST(Ell, CanBeConstructedFromEllMatrices)
+{
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+    using EllMtx = typename TestFixture::EllMtx;
+    auto mat1 = gko::initialize<EllMtx>({{-1.0, 0.0, 0.0}, {0.0, 2.5, 3.5}},
+                                        this->exec);
+    auto mat2 =
+        gko::initialize<EllMtx>({{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}, this->exec);
+
+    auto m = gko::batch::create_from_item<BatchEllMtx>(
+        this->exec, std::vector<EllMtx*>{mat1.get(), mat2.get()},
+        mat1->get_num_stored_elements_per_row());
+
+    this->assert_equal_to_original_sparse_mtx(m.get());
+}
+
+
+TYPED_TEST(Ell, CanBeConstructedFromEllMatricesByDuplication)
+{
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+    using EllMtx = typename TestFixture::EllMtx;
+    auto mat1 =
+        gko::initialize<EllMtx>({{1.0, 0.0, 0.0}, {0.0, 2.0, 0.0}}, this->exec);
+    auto bat_m = gko::batch::create_from_item<BatchEllMtx>(
+        this->exec, std::vector<EllMtx*>{mat1.get(), mat1.get(), mat1.get()},
+        mat1->get_num_stored_elements_per_row());
+
+    auto m = gko::batch::create_from_item<BatchEllMtx>(
+        this->exec, 3, mat1.get(), mat1->get_num_stored_elements_per_row());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(bat_m.get(), m.get(), 1e-14);
+}
+
+
+TYPED_TEST(Ell, CanBeConstructedByDuplicatingEllMatrices)
+{
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+    using EllMtx = typename TestFixture::EllMtx;
+    auto mat1 = gko::initialize<EllMtx>({{-1.0, 0.0, 0.0}, {0.0, 2.5, 0.0}},
+                                        this->exec);
+    auto mat2 =
+        gko::initialize<EllMtx>({{1.0, 0.0, 0.0}, {0.0, 2.0, 0.0}}, this->exec);
+
+    auto m = gko::batch::create_from_item<BatchEllMtx>(
+        this->exec, std::vector<EllMtx*>{mat1.get(), mat2.get()},
+        mat1->get_num_stored_elements_per_row());
+    auto m_ref = gko::batch::create_from_item<BatchEllMtx>(
+        this->exec,
+        std::vector<EllMtx*>{mat1.get(), mat2.get(), mat1.get(), mat2.get(),
+                             mat1.get(), mat2.get()},
+        mat1->get_num_stored_elements_per_row());
+
+    auto m2 = gko::batch::duplicate<BatchEllMtx>(
+        this->exec, 3, m.get(), mat1->get_num_stored_elements_per_row());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(m2.get(), m_ref.get(), 1e-14);
+}
+
+
+TYPED_TEST(Ell, CanBeUnbatchedIntoEllMatrices)
+{
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+    using EllMtx = typename TestFixture::EllMtx;
+    auto mat1 = gko::initialize<EllMtx>({{-1.0, 0.0, 0.0}, {0.0, 2.5, 3.5}},
+                                        this->exec);
+    auto mat2 =
+        gko::initialize<EllMtx>({{1.0, 0.0, 0.0}, {0.0, 2.0, 3.0}}, this->exec);
+
+    auto ell_mats = gko::batch::unbatch<BatchEllMtx>(this->sp_mtx.get());
+
+    GKO_ASSERT_MTX_NEAR(ell_mats[0].get(), mat1.get(), 0.);
+    GKO_ASSERT_MTX_NEAR(ell_mats[1].get(), mat2.get(), 0.);
+}
+
+
+TYPED_TEST(Ell, CanBeListConstructed)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+    using EllMtx = typename TestFixture::EllMtx;
+
+    auto m = gko::batch::initialize<BatchEllMtx>({{0.0, -1.0}, {0.0, -5.0}},
+                                                 this->exec);
+
+    ASSERT_EQ(m->get_num_batch_items(), 2);
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1));
+    ASSERT_EQ(m->get_num_stored_elements(), 4);
+    ASSERT_EQ(m->get_num_stored_elements_per_row(), 1);
+    EXPECT_EQ(m->get_values()[0], value_type{0.0});
+    EXPECT_EQ(m->get_values()[1], value_type{-1.0});
+    EXPECT_EQ(m->get_values()[2], value_type{0.0});
+    EXPECT_EQ(m->get_values()[3], value_type{-5.0});
+    EXPECT_EQ(m->get_col_idxs()[0], index_type{-1});
+    EXPECT_EQ(m->get_col_idxs()[1], index_type{0});
+}
+
+
+TYPED_TEST(Ell, CanBeListConstructedByCopies)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+
+    auto m = gko::batch::initialize<BatchEllMtx>(2, I<value_type>({0.0, -1.0}),
+                                                 this->exec, 1);
+
+    ASSERT_EQ(m->get_num_batch_items(), 2);
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(2, 1));
+    ASSERT_EQ(m->get_num_stored_elements(), 4);
+    ASSERT_EQ(m->get_num_stored_elements_per_row(), 1);
+    EXPECT_EQ(m->get_values()[0], value_type{0.0});
+    EXPECT_EQ(m->get_values()[1], value_type{-1.0});
+    EXPECT_EQ(m->get_values()[2], value_type{0.0});
+    EXPECT_EQ(m->get_values()[3], value_type{-1.0});
+    EXPECT_EQ(m->get_col_idxs()[0], index_type{-1});
+    EXPECT_EQ(m->get_col_idxs()[1], index_type{0});
+}
+
+
+TYPED_TEST(Ell, CanBeDoubleListConstructed)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+    using T = value_type;
+
+    auto m = gko::batch::initialize<BatchEllMtx>(
+        // clang-format off
+        {{I<T>{1.0, 0.0, 0.0},
+          I<T>{2.0, 0.0, 3.0},
+          I<T>{3.0, 6.0, 0.0}},
+         {I<T>{1.0, 0.0, 0.0},
+          I<T>{3.0, 0.0, -2.0},
+          I<T>{5.0, 8.0, 0.0}}},
+        // clang-format on
+        this->exec, 2);
+
+    ASSERT_EQ(m->get_num_batch_items(), 2);
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(3, 3));
+    ASSERT_EQ(m->get_num_stored_elements(), 2 * (2 * 3));
+    ASSERT_EQ(m->get_num_stored_elements_per_row(), 2);
+    EXPECT_EQ(m->get_values()[0], value_type{1.0});
+    EXPECT_EQ(m->get_values()[1], value_type{2.0});
+    EXPECT_EQ(m->get_values()[2], value_type{3.0});
+    EXPECT_EQ(m->get_values()[3], value_type{0.0});
+    EXPECT_EQ(m->get_values()[4], value_type{3.0});
+    EXPECT_EQ(m->get_values()[5], value_type{6.0});
+    EXPECT_EQ(m->get_values()[6], value_type{1.0});
+    EXPECT_EQ(m->get_values()[7], value_type{3.0});
+    EXPECT_EQ(m->get_values()[8], value_type{5.0});
+    EXPECT_EQ(m->get_values()[9], value_type{0.0});
+    EXPECT_EQ(m->get_values()[10], value_type{-2.0});
+    EXPECT_EQ(m->get_values()[11], value_type{8.0});
+    EXPECT_EQ(m->get_col_idxs()[0], index_type{0});
+    EXPECT_EQ(m->get_col_idxs()[1], index_type{0});
+    EXPECT_EQ(m->get_col_idxs()[2], index_type{0});
+    EXPECT_EQ(m->get_col_idxs()[3], index_type{-1});
+    EXPECT_EQ(m->get_col_idxs()[4], index_type{2});
+    EXPECT_EQ(m->get_col_idxs()[5], index_type{1});
+}
+
+
+TYPED_TEST(Ell, CanBeReadFromMatrixData)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+    auto vec_data = std::vector<gko::matrix_data<value_type, index_type>>{};
+    vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
+        {2, 3}, {{0, 0, -1.0}, {1, 1, 2.5}, {1, 2, 3.5}}));
+    vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
+        {2, 3}, {{0, 0, 1.0}, {1, 1, 2.0}, {1, 2, 3.0}}));
+
+    auto m = gko::batch::read<value_type, index_type, BatchEllMtx>(this->exec,
+                                                                   vec_data, 2);
+
+    this->assert_equal_to_original_sparse_mtx(m.get());
+}
+
+
+TYPED_TEST(Ell, ThrowsForDataWithDifferentNnz)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+    auto vec_data = std::vector<gko::matrix_data<value_type, index_type>>{};
+    vec_data.emplace_back(
+        gko::matrix_data<value_type, index_type>({2, 3}, {
+                                                             {0, 0, -1.0},
+                                                             {1, 1, 2.5},
+                                                             {1, 2, 0.5},
+                                                             {2, 2, -3.0},
+                                                         }));
+    vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
+        {2, 3}, {{0, 0, 1.0}, {1, 1, 2.0}, {1, 2, 3.0}}));
+
+    EXPECT_THROW(
+        gko::batch::detail::assert_same_sparsity_in_batched_data(vec_data),
+        gko::NotImplemented);
+}
+
+
+TYPED_TEST(Ell, ThrowsForDataWithDifferentSparsity)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+    auto vec_data = std::vector<gko::matrix_data<value_type, index_type>>{};
+    vec_data.emplace_back(
+        gko::matrix_data<value_type, index_type>({2, 3}, {
+                                                             {0, 0, -1.0},
+                                                             {1, 1, 2.5},
+                                                             {2, 2, -3.0},
+                                                         }));
+    vec_data.emplace_back(gko::matrix_data<value_type, index_type>(
+        {2, 3}, {{0, 0, 1.0}, {1, 1, 2.0}, {1, 2, 3.0}}));
+
+    EXPECT_THROW(
+        gko::batch::detail::assert_same_sparsity_in_batched_data(vec_data),
+        gko::NotImplemented);
+}
+
+
+TYPED_TEST(Ell, GeneratesCorrectMatrixData)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using BatchEllMtx = typename TestFixture::BatchEllMtx;
+    using tpl = typename gko::matrix_data<TypeParam>::nonzero_type;
+
+    auto data = gko::batch::write<value_type, index_type, BatchEllMtx>(
+        this->sp_mtx.get());
+
+    ASSERT_EQ(data[0].size, gko::dim<2>(2, 3));
+    ASSERT_EQ(data[0].nonzeros.size(), 3);
+    EXPECT_EQ(data[0].nonzeros[0], tpl(0, 0, value_type{-1.0}));
+    EXPECT_EQ(data[0].nonzeros[1], tpl(1, 1, value_type{2.5}));
+    EXPECT_EQ(data[0].nonzeros[2], tpl(1, 2, value_type{3.5}));
+    ASSERT_EQ(data[1].size, gko::dim<2>(2, 3));
+    ASSERT_EQ(data[1].nonzeros.size(), 3);
+    EXPECT_EQ(data[1].nonzeros[0], tpl(0, 0, value_type{1.0}));
+    EXPECT_EQ(data[1].nonzeros[1], tpl(1, 1, value_type{2.0}));
+    EXPECT_EQ(data[1].nonzeros[2], tpl(1, 2, value_type{3.0}));
+}
diff --git a/core/test/matrix/batch_identity.cpp b/core/test/matrix/batch_identity.cpp
new file mode 100644
index 00000000000..9a812fa35cd
--- /dev/null
+++ b/core/test/matrix/batch_identity.cpp
@@ -0,0 +1,186 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/batch_identity.hpp>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/range.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/base/batch_utilities.hpp"
+#include "core/test/utils.hpp"
+#include "core/test/utils/batch_helpers.hpp"
+
+
+template <typename T>
+class Identity : public ::testing::Test {
+protected:
+    using value_type = T;
+    using MVec = gko::batch::MultiVector<value_type>;
+    using size_type = gko::size_type;
+    Identity()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx(gko::batch::matrix::Identity<value_type>::create(
+              exec, gko::batch_dim<2>(2, gko::dim<2>(3, 3)))),
+          mvec(gko::batch::initialize<gko::batch::MultiVector<value_type>>(
+              {{{-1.0, 2.0, 3.0}, {-1.0, 8.0, 3.0}, {-1.5, 2.5, 3.5}},
+               {{-1.0, 3.0, 2.0}, {8.0, 5.5, 7.0}, {1.0, 2.0, 5.0}}},
+              exec))
+    {}
+
+
+    static void assert_equal_to_original_mtx(
+        gko::batch::matrix::Identity<value_type>* m)
+    {
+        ASSERT_EQ(m->get_num_batch_items(), 2);
+        ASSERT_EQ(m->get_common_size(), gko::dim<2>(3, 3));
+    }
+
+    static void assert_empty(gko::batch::matrix::Identity<value_type>* m)
+    {
+        ASSERT_EQ(m->get_num_batch_items(), 0);
+    }
+
+    std::shared_ptr<const gko::Executor> exec;
+    std::unique_ptr<gko::batch::matrix::Identity<value_type>> mtx;
+    std::unique_ptr<gko::batch::MultiVector<value_type>> mvec;
+};
+
+TYPED_TEST_SUITE(Identity, gko::test::ValueTypes);
+
+
+TYPED_TEST(Identity, KnowsItsSizeAndValues)
+{
+    this->assert_equal_to_original_mtx(this->mtx.get());
+}
+
+
+TYPED_TEST(Identity, CanBeEmpty)
+{
+    auto empty = gko::batch::matrix::Identity<TypeParam>::create(this->exec);
+    this->assert_empty(empty.get());
+}
+
+
+TYPED_TEST(Identity, CanBeCopied)
+{
+    auto mtx_copy = gko::batch::matrix::Identity<TypeParam>::create(this->exec);
+
+    mtx_copy->copy_from(this->mtx.get());
+
+    this->assert_equal_to_original_mtx(this->mtx.get());
+    this->assert_equal_to_original_mtx(mtx_copy.get());
+}
+
+
+TYPED_TEST(Identity, CanBeMoved)
+{
+    auto mtx_copy = gko::batch::matrix::Identity<TypeParam>::create(this->exec);
+
+    this->mtx->move_to(mtx_copy);
+
+    this->assert_equal_to_original_mtx(mtx_copy.get());
+}
+
+
+TYPED_TEST(Identity, CanBeCloned)
+{
+    auto mtx_clone = this->mtx->clone();
+
+    this->assert_equal_to_original_mtx(
+        dynamic_cast<decltype(this->mtx.get())>(mtx_clone.get()));
+}
+
+
+TYPED_TEST(Identity, CanBeCleared)
+{
+    this->mtx->clear();
+
+    this->assert_empty(this->mtx.get());
+}
+
+
+TYPED_TEST(Identity, CanBeConstructedWithSize)
+{
+    auto m = gko::batch::matrix::Identity<TypeParam>::create(
+        this->exec, gko::batch_dim<2>(4, gko::dim<2>{4, 4}));
+
+    ASSERT_EQ(m->get_num_batch_items(), 4);
+    ASSERT_EQ(m->get_common_size(), gko::dim<2>(4, 4));
+}
+
+
+TYPED_TEST(Identity, FailsToConstructForRectangularSizes)
+{
+    ASSERT_THROW(gko::batch::matrix::Identity<TypeParam>::create(
+                     this->exec, gko::batch_dim<2>(4, gko::dim<2>{3, 4})),
+                 gko::BadDimension);
+}
+
+
+TYPED_TEST(Identity, CanApplytoMultiVector)
+{
+    using MVec = typename TestFixture::MVec;
+    using value_type = typename TestFixture::value_type;
+    auto x = this->mvec->clone();
+    x->fill(gko::zero<value_type>());
+    ASSERT_EQ(x->at(0, 0, 0), value_type{0.0});
+
+    this->mtx->apply(this->mvec, x);
+
+    GKO_ASSERT_BATCH_MTX_NEAR(this->mvec, x, 0.0);
+}
+
+
+TYPED_TEST(Identity, CanAdvancedApplytoMultiVector)
+{
+    using MVec = typename TestFixture::MVec;
+    using value_type = typename TestFixture::value_type;
+    auto x = this->mvec->clone();
+    x->fill(gko::one<value_type>());
+    ASSERT_EQ(x->at(0, 0, 0), value_type{1.0});
+    auto alpha = gko::batch::initialize<MVec>({{1.0}, {-1.0}}, this->exec);
+    auto beta = gko::batch::initialize<MVec>({{2.0}, {-4.0}}, this->exec);
+    auto axpby = x->clone();
+    axpby->scale(beta);
+    axpby->add_scaled(alpha, this->mvec);
+
+    this->mtx->apply(alpha, this->mvec, beta, x);
+
+    GKO_ASSERT_BATCH_MTX_NEAR(axpby, x, 0.0);
+}
diff --git a/core/test/matrix/permutation.cpp b/core/test/matrix/permutation.cpp
index 09ef5e4701a..4879d1a8402 100644
--- a/core/test/matrix/permutation.cpp
+++ b/core/test/matrix/permutation.cpp
@@ -51,39 +51,37 @@ namespace {
 template <typename ValueIndexType>
 class Permutation : public ::testing::Test {
 protected:
-    using v_type =
+    using value_type =
         typename std::tuple_element<0, decltype(ValueIndexType())>::type;
-    using i_type =
+    using index_type =
         typename std::tuple_element<1, decltype(ValueIndexType())>::type;
-    using Vec = gko::matrix::Dense<v_type>;
-    using Csr = gko::matrix::Csr<v_type, i_type>;
+    using Vec = gko::matrix::Dense<value_type>;
+    using Csr = gko::matrix::Csr<value_type, index_type>;
     Permutation()
         : exec(gko::ReferenceExecutor::create()),
-          mtx(gko::matrix::Permutation<i_type>::create(
-              exec, gko::dim<2>{4, 3}, gko::array<i_type>{exec, {1, 0, 2, 3}}))
+          mtx(gko::matrix::Permutation<index_type>::create(
+              exec, gko::array<index_type>{exec, {1, 0, 2, 3}}))
     {}
 
 
     static void assert_equal_to_original_mtx(
-        gko::ptr_param<gko::matrix::Permutation<i_type>> m)
+        gko::ptr_param<gko::matrix::Permutation<index_type>> m)
     {
         auto perm = m->get_permutation();
-        ASSERT_EQ(m->get_size(), gko::dim<2>(4, 3));
-        ASSERT_EQ(m->get_permutation_size(), 4);
+        ASSERT_EQ(m->get_size(), gko::dim<2>(4, 4));
         ASSERT_EQ(perm[0], 1);
         ASSERT_EQ(perm[1], 0);
         ASSERT_EQ(perm[2], 2);
         ASSERT_EQ(perm[3], 3);
     }
 
-    static void assert_empty(gko::matrix::Permutation<i_type>* m)
+    static void assert_empty(gko::matrix::Permutation<index_type>* m)
     {
         ASSERT_EQ(m->get_size(), gko::dim<2>(0, 0));
-        ASSERT_EQ(m->get_permutation_size(), 0);
     }
 
     std::shared_ptr<const gko::Executor> exec;
-    std::unique_ptr<gko::matrix::Permutation<i_type>> mtx;
+    std::unique_ptr<gko::matrix::Permutation<index_type>> mtx;
 };
 
 TYPED_TEST_SUITE(Permutation, gko::test::ValueIndexTypes,
@@ -92,8 +90,8 @@ TYPED_TEST_SUITE(Permutation, gko::test::ValueIndexTypes,
 
 TYPED_TEST(Permutation, CanBeEmpty)
 {
-    using i_type = typename TestFixture::i_type;
-    auto empty = gko::matrix::Permutation<i_type>::create(this->exec);
+    using index_type = typename TestFixture::index_type;
+    auto empty = gko::matrix::Permutation<index_type>::create(this->exec);
 
     this->assert_empty(empty.get());
 }
@@ -101,8 +99,8 @@ TYPED_TEST(Permutation, CanBeEmpty)
 
 TYPED_TEST(Permutation, ReturnsNullValuesArrayWhenEmpty)
 {
-    using i_type = typename TestFixture::i_type;
-    auto empty = gko::matrix::Permutation<i_type>::create(this->exec);
+    using index_type = typename TestFixture::index_type;
+    auto empty = gko::matrix::Permutation<index_type>::create(this->exec);
 
     ASSERT_EQ(empty->get_const_permutation(), nullptr);
 }
@@ -110,33 +108,20 @@ TYPED_TEST(Permutation, ReturnsNullValuesArrayWhenEmpty)
 
 TYPED_TEST(Permutation, CanBeConstructedWithSize)
 {
-    using i_type = typename TestFixture::i_type;
-    auto m =
-        gko::matrix::Permutation<i_type>::create(this->exec, gko::dim<2>{2, 3});
+    using index_type = typename TestFixture::index_type;
+    auto m = gko::matrix::Permutation<index_type>::create(this->exec, 2);
 
-    ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(m->get_permutation_size(), 2);
-}
-
-
-TYPED_TEST(Permutation, FactorySetsCorrectPermuteMask)
-{
-    using i_type = typename TestFixture::i_type;
-    auto m = gko::matrix::Permutation<i_type>::create(this->exec);
-    auto mask = m->get_permute_mask();
-
-    ASSERT_EQ(mask, gko::matrix::row_permute);
+    ASSERT_EQ(m->get_size(), gko::dim<2>(2, 2));
 }
 
 
 TYPED_TEST(Permutation, PermutationCanBeConstructedFromExistingData)
 {
-    using i_type = typename TestFixture::i_type;
-    i_type data[] = {1, 0, 2};
+    using index_type = typename TestFixture::index_type;
+    index_type data[] = {1, 0, 2};
 
-    auto m = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3, 5},
-        gko::make_array_view(this->exec, 3, data));
+    auto m = gko::matrix::Permutation<index_type>::create(
+        this->exec, gko::make_array_view(this->exec, 3, data));
 
     ASSERT_EQ(m->get_const_permutation(), data);
 }
@@ -144,93 +129,17 @@ TYPED_TEST(Permutation, PermutationCanBeConstructedFromExistingData)
 
 TYPED_TEST(Permutation, PermutationCanBeConstructedFromExistingConstData)
 {
-    using i_type = typename TestFixture::i_type;
-    using i_type = typename TestFixture::i_type;
-    const i_type data[] = {1, 0, 2};
-
-    auto m = gko::matrix::Permutation<i_type>::create_const(
-        this->exec, 3, gko::array<i_type>::const_view(this->exec, 3, data));
-
-    ASSERT_EQ(m->get_const_permutation(), data);
-}
-
-
-TYPED_TEST(Permutation, CanBeConstructedWithSizeAndMask)
-{
-    using i_type = typename TestFixture::i_type;
-    auto m = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{2, 3}, gko::matrix::column_permute);
-
-    ASSERT_EQ(m->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(m->get_permutation_size(), 2);
-    ASSERT_EQ(m->get_permute_mask(), gko::matrix::column_permute);
-}
-
-
-TYPED_TEST(Permutation, CanExplicitlyOverrideSetPermuteMask)
-{
-    using i_type = typename TestFixture::i_type;
-    auto m = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{2, 3}, gko::matrix::column_permute);
-
-    auto mask = m->get_permute_mask();
-    ASSERT_EQ(mask, gko::matrix::column_permute);
-
-    m->set_permute_mask(gko::matrix::row_permute |
-                        gko::matrix::inverse_permute);
-
-    auto s_mask = m->get_permute_mask();
-    ASSERT_EQ(s_mask, gko::matrix::row_permute | gko::matrix::inverse_permute);
-}
-
-
-TYPED_TEST(Permutation, PermutationThrowsforWrongRowPermDimensions)
-{
-    using i_type = typename TestFixture::i_type;
-    i_type data[] = {0, 2, 1};
+    using index_type = typename TestFixture::index_type;
+    using index_type = typename TestFixture::index_type;
+    const index_type data[] = {1, 0, 2};
 
-    ASSERT_THROW(gko::matrix::Permutation<i_type>::create(
-                     this->exec, gko::dim<2>{4, 2},
-                     gko::make_array_view(this->exec, 3, data)),
-                 gko::ValueMismatch);
-}
-
-
-TYPED_TEST(Permutation, SettingMaskDoesNotModifyData)
-{
-    using i_type = typename TestFixture::i_type;
-    i_type data[] = {1, 0, 2};
-
-    auto m = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3, 5},
-        gko::make_array_view(this->exec, 3, data));
-
-    auto mask = m->get_permute_mask();
-    ASSERT_EQ(m->get_const_permutation(), data);
-    ASSERT_EQ(mask, gko::matrix::row_permute);
-
-    m->set_permute_mask(gko::matrix::row_permute |
-                        gko::matrix::inverse_permute);
+    auto m = gko::matrix::Permutation<index_type>::create_const(
+        this->exec, gko::array<index_type>::const_view(this->exec, 3, data));
 
-    auto s_mask = m->get_permute_mask();
-    ASSERT_EQ(s_mask, gko::matrix::row_permute | gko::matrix::inverse_permute);
     ASSERT_EQ(m->get_const_permutation(), data);
 }
 
 
-TYPED_TEST(Permutation, PermutationThrowsforWrongColPermDimensions)
-{
-    using i_type = typename TestFixture::i_type;
-    i_type data[] = {0, 2, 1};
-
-    ASSERT_THROW(gko::matrix::Permutation<i_type>::create(
-                     this->exec, gko::dim<2>{3, 4},
-                     gko::make_array_view(this->exec, 3, data),
-                     gko::matrix::column_permute),
-                 gko::ValueMismatch);
-}
-
-
 TYPED_TEST(Permutation, KnowsItsSizeAndValues)
 {
     this->assert_equal_to_original_mtx(this->mtx);
@@ -239,8 +148,8 @@ TYPED_TEST(Permutation, KnowsItsSizeAndValues)
 
 TYPED_TEST(Permutation, CanBeCopied)
 {
-    using i_type = typename TestFixture::i_type;
-    auto mtx_copy = gko::matrix::Permutation<i_type>::create(this->exec);
+    using index_type = typename TestFixture::index_type;
+    auto mtx_copy = gko::matrix::Permutation<index_type>::create(this->exec);
 
     mtx_copy->copy_from(this->mtx);
 
@@ -252,8 +161,8 @@ TYPED_TEST(Permutation, CanBeCopied)
 
 TYPED_TEST(Permutation, CanBeMoved)
 {
-    using i_type = typename TestFixture::i_type;
-    auto mtx_copy = gko::matrix::Permutation<i_type>::create(this->exec);
+    using index_type = typename TestFixture::index_type;
+    auto mtx_copy = gko::matrix::Permutation<index_type>::create(this->exec);
 
     mtx_copy->move_from(this->mtx);
 
@@ -261,32 +170,6 @@ TYPED_TEST(Permutation, CanBeMoved)
 }
 
 
-TYPED_TEST(Permutation, CopyingPreservesMask)
-{
-    using i_type = typename TestFixture::i_type;
-    auto mtx_copy = gko::matrix::Permutation<i_type>::create(this->exec);
-
-    mtx_copy->copy_from(this->mtx);
-
-    auto o_mask = this->mtx->get_permute_mask();
-    auto n_mask = mtx_copy->get_permute_mask();
-    ASSERT_EQ(o_mask, gko::matrix::row_permute);
-    ASSERT_EQ(o_mask, n_mask);
-
-    this->mtx->set_permute_mask(gko::matrix::column_permute);
-
-    o_mask = this->mtx->get_permute_mask();
-    n_mask = mtx_copy->get_permute_mask();
-    ASSERT_EQ(o_mask, gko::matrix::column_permute);
-    ASSERT_NE(o_mask, n_mask);
-
-    mtx_copy->copy_from(this->mtx);
-
-    n_mask = mtx_copy->get_permute_mask();
-    ASSERT_EQ(o_mask, n_mask);
-}
-
-
 TYPED_TEST(Permutation, CanBeCloned)
 {
     auto mtx_clone = this->mtx->clone();
diff --git a/core/test/mpi/base/polymorphic_object.cpp b/core/test/mpi/base/polymorphic_object.cpp
index 88bcb756f4b..1cacc5d52f4 100644
--- a/core/test/mpi/base/polymorphic_object.cpp
+++ b/core/test/mpi/base/polymorphic_object.cpp
@@ -152,7 +152,7 @@ class EnableDistributedPolymorphicObject : public testing::Test {
 protected:
     std::shared_ptr<gko::ReferenceExecutor> ref{
         gko::ReferenceExecutor::create()};
-    // TDOD: We can't rely on Omp module being available in this test!
+    // TODO: We can't rely on Omp module being available in this test!
     std::shared_ptr<gko::OmpExecutor> omp{gko::OmpExecutor::create()};
     gko::experimental::mpi::communicator comm{MPI_COMM_WORLD};
     gko::experimental::mpi::communicator split_comm{comm.get(), comm.rank() < 2,
diff --git a/core/test/mpi/distributed/preconditioner/schwarz.cpp b/core/test/mpi/distributed/preconditioner/schwarz.cpp
index ff1cd0d45e5..16b0af91b74 100644
--- a/core/test/mpi/distributed/preconditioner/schwarz.cpp
+++ b/core/test/mpi/distributed/preconditioner/schwarz.cpp
@@ -67,7 +67,7 @@ class SchwarzFactory : public ::testing::Test {
           mtx(Mtx::create(exec, MPI_COMM_WORLD))
     {
         schwarz = Schwarz::build()
-                      .with_local_solver_factory(jacobi_factory)
+                      .with_local_solver(jacobi_factory)
                       .on(exec)
                       ->generate(mtx);
     }
@@ -83,8 +83,8 @@ class SchwarzFactory : public ::testing::Test {
                              gko::ptr_param<const Schwarz> b)
     {
         ASSERT_EQ(a->get_size(), b->get_size());
-        ASSERT_EQ(a->get_parameters().local_solver_factory,
-                  b->get_parameters().local_solver_factory);
+        ASSERT_EQ(a->get_parameters().local_solver,
+                  b->get_parameters().local_solver);
     }
 
     std::shared_ptr<const gko::Executor> exec;
@@ -105,7 +105,7 @@ TYPED_TEST(SchwarzFactory, KnowsItsExecutor)
 
 TYPED_TEST(SchwarzFactory, CanSetLocalFactory)
 {
-    ASSERT_EQ(this->schwarz->get_parameters().local_solver_factory,
+    ASSERT_EQ(this->schwarz->get_parameters().local_solver,
               this->jacobi_factory);
 }
 
@@ -123,9 +123,8 @@ TYPED_TEST(SchwarzFactory, CanBeCopied)
     using Jacobi = typename TestFixture::Jacobi;
     using Schwarz = typename TestFixture::Schwarz;
     using Mtx = typename TestFixture::Mtx;
-    auto bj = gko::share(Jacobi::build().on(this->exec));
     auto copy = Schwarz::build()
-                    .with_local_solver_factory(bj)
+                    .with_local_solver(Jacobi::build())
                     .on(this->exec)
                     ->generate(Mtx::create(this->exec, MPI_COMM_WORLD));
 
@@ -141,9 +140,8 @@ TYPED_TEST(SchwarzFactory, CanBeMoved)
     using Schwarz = typename TestFixture::Schwarz;
     using Mtx = typename TestFixture::Mtx;
     auto tmp = clone(this->schwarz);
-    auto bj = gko::share(Jacobi::build().on(this->exec));
     auto copy = Schwarz::build()
-                    .with_local_solver_factory(bj)
+                    .with_local_solver(Jacobi::build())
                     .on(this->exec)
                     ->generate(Mtx::create(this->exec, MPI_COMM_WORLD));
 
@@ -158,7 +156,20 @@ TYPED_TEST(SchwarzFactory, CanBeCleared)
     this->schwarz->clear();
 
     ASSERT_EQ(this->schwarz->get_size(), gko::dim<2>(0, 0));
-    ASSERT_EQ(this->schwarz->get_parameters().local_solver_factory, nullptr);
+    ASSERT_EQ(this->schwarz->get_parameters().local_solver, nullptr);
+}
+
+
+TYPED_TEST(SchwarzFactory, PassExplicitFactory)
+{
+    using Jacobi = typename TestFixture::Jacobi;
+    using Schwarz = typename TestFixture::Schwarz;
+    auto jacobi_factory = gko::share(Jacobi::build().on(this->exec));
+
+    auto factory =
+        Schwarz::build().with_local_solver(jacobi_factory).on(this->exec);
+
+    ASSERT_EQ(factory->get_parameters().local_solver, jacobi_factory);
 }
 
 
diff --git a/core/test/multigrid/fixed_coarsening.cpp b/core/test/multigrid/fixed_coarsening.cpp
index 376c58e3c25..c65a5e7f3ca 100644
--- a/core/test/multigrid/fixed_coarsening.cpp
+++ b/core/test/multigrid/fixed_coarsening.cpp
@@ -86,7 +86,7 @@ TYPED_TEST(FixedCoarseningFactory, DefaultSetting)
     using MgLevel = typename TestFixture::MgLevel;
     auto factory = MgLevel::build().on(this->exec);
 
-    ASSERT_EQ(factory->get_parameters().coarse_rows.get_data(), nullptr);
+    ASSERT_EQ(factory->get_parameters().coarse_rows.get_const_data(), nullptr);
     ASSERT_EQ(factory->get_parameters().skip_sorting, false);
 }
 
diff --git a/core/test/preconditioner/ic.cpp b/core/test/preconditioner/ic.cpp
index efd54ee9ebc..654bbab610e 100644
--- a/core/test/preconditioner/ic.cpp
+++ b/core/test/preconditioner/ic.cpp
@@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/preconditioner/ic.hpp>
 
 
+GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
+
+
 #include <memory>
 
 
@@ -44,6 +47,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/bicgstab.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
@@ -77,9 +83,8 @@ TEST_F(IcFactory, KnowsItsExecutor)
 
 TEST_F(IcFactory, CanSetLSolverFactory)
 {
-    auto ic_factory = ic_prec_type::build()
-                          .with_l_solver_factory(this->l_factory)
-                          .on(this->exec);
+    auto ic_factory =
+        ic_prec_type::build().with_l_solver(this->l_factory).on(this->exec);
 
     ASSERT_EQ(ic_factory->get_parameters().l_solver_factory, this->l_factory);
 }
@@ -88,7 +93,7 @@ TEST_F(IcFactory, CanSetLSolverFactory)
 TEST_F(IcFactory, CanSetFactorizationFactory)
 {
     auto ic_factory = ic_prec_type::build()
-                          .with_factorization_factory(this->fact_factory)
+                          .with_factorization(this->fact_factory)
                           .on(this->exec);
 
     ASSERT_EQ(ic_factory->get_parameters().factorization_factory,
@@ -96,4 +101,34 @@ TEST_F(IcFactory, CanSetFactorizationFactory)
 }
 
 
+TEST_F(IcFactory, DeprecatedFactoryParameter)
+{
+    auto ilu_factory = ic_prec_type::build()
+                           .with_l_solver_factory(this->l_factory)
+                           .with_factorization_factory(this->fact_factory)
+                           .on(this->exec);
+
+    ASSERT_EQ(ilu_factory->get_parameters().l_solver_factory, this->l_factory);
+    ASSERT_EQ(ilu_factory->get_parameters().factorization_factory,
+              this->fact_factory);
+}
+
+
+TEST_F(IcFactory, DeferredFactoryParameter)
+{
+    auto ic_factory = ic_prec_type::build()
+                          .with_l_solver(solver_type::build())
+                          .with_factorization(ic_type::build())
+                          .on(this->exec);
+
+    GKO_ASSERT_DYNAMIC_TYPE(ic_factory->get_parameters().l_solver_factory,
+                            solver_type::Factory);
+    GKO_ASSERT_DYNAMIC_TYPE(ic_factory->get_parameters().factorization_factory,
+                            ic_type::Factory);
+}
+
+
 }  // namespace
+
+
+GKO_END_DISABLE_DEPRECATION_WARNINGS
diff --git a/core/test/preconditioner/ilu.cpp b/core/test/preconditioner/ilu.cpp
index c7b72e09b09..b9c8884a6f6 100644
--- a/core/test/preconditioner/ilu.cpp
+++ b/core/test/preconditioner/ilu.cpp
@@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/preconditioner/ilu.hpp>
 
 
+GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
+
+
 #include <memory>
 
 
@@ -44,6 +47,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/bicgstab.hpp>
 
 
+#include "core/test/utils.hpp"
+
+
 namespace {
 
 
@@ -81,9 +87,8 @@ TEST_F(IluFactory, KnowsItsExecutor)
 
 TEST_F(IluFactory, CanSetLSolverFactory)
 {
-    auto ilu_factory = ilu_prec_type::build()
-                           .with_l_solver_factory(this->l_factory)
-                           .on(this->exec);
+    auto ilu_factory =
+        ilu_prec_type::build().with_l_solver(this->l_factory).on(this->exec);
 
     ASSERT_EQ(ilu_factory->get_parameters().l_solver_factory, this->l_factory);
 }
@@ -91,9 +96,8 @@ TEST_F(IluFactory, CanSetLSolverFactory)
 
 TEST_F(IluFactory, CanSetUSolverFactory)
 {
-    auto ilu_factory = ilu_prec_type::build()
-                           .with_u_solver_factory(this->u_factory)
-                           .on(this->exec);
+    auto ilu_factory =
+        ilu_prec_type::build().with_u_solver(this->u_factory).on(this->exec);
 
     ASSERT_EQ(ilu_factory->get_parameters().u_solver_factory, this->u_factory);
 }
@@ -102,12 +106,47 @@ TEST_F(IluFactory, CanSetUSolverFactory)
 TEST_F(IluFactory, CanSetFactorizationFactory)
 {
     auto ilu_factory = ilu_prec_type::build()
+                           .with_factorization(this->fact_factory)
+                           .on(this->exec);
+
+    ASSERT_EQ(ilu_factory->get_parameters().factorization_factory,
+              this->fact_factory);
+}
+
+
+TEST_F(IluFactory, DeprecatedFactoryParameter)
+{
+    auto ilu_factory = ilu_prec_type::build()
+                           .with_l_solver_factory(this->l_factory)
+                           .with_u_solver_factory(this->u_factory)
                            .with_factorization_factory(this->fact_factory)
                            .on(this->exec);
 
+    ASSERT_EQ(ilu_factory->get_parameters().l_solver_factory, this->l_factory);
+    ASSERT_EQ(ilu_factory->get_parameters().u_solver_factory, this->u_factory);
     ASSERT_EQ(ilu_factory->get_parameters().factorization_factory,
               this->fact_factory);
 }
 
 
+TEST_F(IluFactory, DeferredFactoryParameter)
+{
+    auto ilu_factory = ilu_prec_type::build()
+                           .with_l_solver(l_solver_type::build())
+                           .with_u_solver(u_solver_type::build())
+                           .with_factorization(ilu_type::build())
+                           .on(this->exec);
+
+    GKO_ASSERT_DYNAMIC_TYPE(ilu_factory->get_parameters().l_solver_factory,
+                            l_solver_type::Factory);
+    GKO_ASSERT_DYNAMIC_TYPE(ilu_factory->get_parameters().u_solver_factory,
+                            u_solver_type::Factory);
+    GKO_ASSERT_DYNAMIC_TYPE(ilu_factory->get_parameters().factorization_factory,
+                            ilu_type::Factory);
+}
+
+
 }  // namespace
+
+
+GKO_END_DISABLE_DEPRECATION_WARNINGS
diff --git a/core/test/reorder/rcm.cpp b/core/test/reorder/rcm.cpp
index dfd90fe137a..8a8450aa441 100644
--- a/core/test/reorder/rcm.cpp
+++ b/core/test/reorder/rcm.cpp
@@ -47,11 +47,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 namespace {
 
+
 class Rcm : public ::testing::Test {
 protected:
     using v_type = double;
     using i_type = int;
     using reorder_type = gko::reorder::Rcm<v_type, i_type>;
+    using new_reorder_type = gko::experimental::reorder::Rcm<i_type>;
 
     Rcm()
         : exec(gko::ReferenceExecutor::create()),
@@ -62,9 +64,32 @@ class Rcm : public ::testing::Test {
     std::unique_ptr<reorder_type::Factory> rcm_factory;
 };
 
+
 TEST_F(Rcm, RcmFactoryKnowsItsExecutor)
 {
     ASSERT_EQ(this->rcm_factory->get_executor(), this->exec);
 }
 
+
+TEST_F(Rcm, NewInterfaceDefaults)
+{
+    auto param = new_reorder_type::build();
+
+    ASSERT_EQ(param.skip_symmetrize, false);
+    ASSERT_EQ(param.strategy,
+              gko::reorder::starting_strategy::pseudo_peripheral);
+}
+
+
+TEST_F(Rcm, NewInterfaceSetParameters)
+{
+    auto param =
+        new_reorder_type::build().with_skip_symmetrize(true).with_strategy(
+            gko::reorder::starting_strategy::minimum_degree);
+
+    ASSERT_EQ(param.skip_symmetrize, true);
+    ASSERT_EQ(param.strategy, gko::reorder::starting_strategy::minimum_degree);
+}
+
+
 }  // namespace
diff --git a/core/test/reorder/scaled_reordered.cpp b/core/test/reorder/scaled_reordered.cpp
index 183bfd7ef3d..461b5e15b4d 100644
--- a/core/test/reorder/scaled_reordered.cpp
+++ b/core/test/reorder/scaled_reordered.cpp
@@ -46,6 +46,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/bicgstab.hpp>
 
 
+GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
+
+
 namespace {
 
 
@@ -132,3 +135,6 @@ TEST_F(ScaledReorderedFactory, CanSetColScaling)
 
 
 }  // namespace
+
+
+GKO_END_DISABLE_DEPRECATION_WARNINGS
diff --git a/core/test/solver/CMakeLists.txt b/core/test/solver/CMakeLists.txt
index 4ca8763e2ee..828d8cceb6a 100644
--- a/core/test/solver/CMakeLists.txt
+++ b/core/test/solver/CMakeLists.txt
@@ -1,7 +1,9 @@
+ginkgo_create_test(batch_bicgstab)
 ginkgo_create_test(bicg)
 ginkgo_create_test(bicgstab)
 ginkgo_create_test(cg)
 ginkgo_create_test(cgs)
+ginkgo_create_test(direct)
 ginkgo_create_test(fcg)
 ginkgo_create_test(gcr)
 ginkgo_create_test(gmres)
diff --git a/core/test/solver/batch_bicgstab.cpp b/core/test/solver/batch_bicgstab.cpp
new file mode 100644
index 00000000000..07b94fd2617
--- /dev/null
+++ b/core/test/solver/batch_bicgstab.cpp
@@ -0,0 +1,305 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/solver/batch_bicgstab.hpp>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+
+#include "core/base/batch_utilities.hpp"
+#include "core/test/utils.hpp"
+#include "core/test/utils/batch_helpers.hpp"
+
+
+namespace {
+
+
+template <typename T>
+class BatchBicgstab : public ::testing::Test {
+protected:
+    using value_type = T;
+    using real_type = gko::remove_complex<T>;
+    using Mtx = gko::batch::matrix::Dense<value_type>;
+    using MVec = gko::batch::MultiVector<value_type>;
+    using Solver = gko::batch::solver::Bicgstab<value_type>;
+
+    BatchBicgstab()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx(gko::share(gko::test::generate_3pt_stencil_batch_matrix<Mtx>(
+              this->exec->get_master(), num_batch_items, num_rows))),
+          solver_factory(Solver::build()
+                             .with_max_iterations(def_max_iters)
+                             .with_tolerance(def_abs_res_tol)
+                             .with_tolerance_type(def_tol_type)
+                             .on(exec)),
+          solver(solver_factory->generate(mtx))
+    {}
+
+    std::shared_ptr<const gko::Executor> exec;
+    const gko::size_type num_batch_items = 3;
+    const int num_rows = 5;
+    std::shared_ptr<const Mtx> mtx;
+    const int def_max_iters = 100;
+    const real_type def_abs_res_tol = 1e-11;
+    const gko::batch::stop::tolerance_type def_tol_type =
+        gko::batch::stop::tolerance_type::absolute;
+    std::unique_ptr<typename Solver::Factory> solver_factory;
+    std::unique_ptr<gko::batch::BatchLinOp> solver;
+};
+
+TYPED_TEST_SUITE(BatchBicgstab, gko::test::ValueTypes);
+
+
+TYPED_TEST(BatchBicgstab, FactoryKnowsItsExecutor)
+{
+    ASSERT_EQ(this->solver_factory->get_executor(), this->exec);
+}
+
+
+TYPED_TEST(BatchBicgstab, FactoryHasCorrectDefaults)
+{
+    using Solver = typename TestFixture::Solver;
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+
+    auto solver_factory = Solver::build().on(this->exec);
+    auto solver = solver_factory->generate(Mtx::create(this->exec));
+
+    ASSERT_NE(solver->get_system_matrix(), nullptr);
+    ASSERT_NE(solver->get_preconditioner(), nullptr);
+    ASSERT_NO_THROW(gko::as<gko::batch::matrix::Identity<value_type>>(
+        solver->get_preconditioner()));
+    ASSERT_EQ(solver->get_tolerance(), 1e-11);
+    ASSERT_EQ(solver->get_max_iterations(), 100);
+    ASSERT_EQ(solver->get_tolerance_type(),
+              gko::batch::stop::tolerance_type::absolute);
+}
+
+
+TYPED_TEST(BatchBicgstab, FactoryCreatesCorrectSolver)
+{
+    using Solver = typename TestFixture::Solver;
+    ASSERT_EQ(this->solver->get_common_size(),
+              gko::dim<2>(this->num_rows, this->num_rows));
+
+    auto solver = gko::as<Solver>(this->solver.get());
+
+    ASSERT_NE(solver->get_system_matrix(), nullptr);
+    ASSERT_EQ(solver->get_system_matrix(), this->mtx);
+}
+
+
+TYPED_TEST(BatchBicgstab, CanBeCopied)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy = this->solver_factory->generate(Mtx::create(this->exec));
+
+    copy->copy_from(this->solver.get());
+
+    ASSERT_EQ(copy->get_common_size(),
+              gko::dim<2>(this->num_rows, this->num_rows));
+    ASSERT_EQ(copy->get_num_batch_items(), this->num_batch_items);
+    auto copy_mtx = gko::as<Solver>(copy.get())->get_system_matrix();
+    const auto copy_batch_mtx = gko::as<const Mtx>(copy_mtx.get());
+    GKO_ASSERT_BATCH_MTX_NEAR(this->mtx.get(), copy_batch_mtx, 0.0);
+}
+
+
+TYPED_TEST(BatchBicgstab, CanBeMoved)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    auto copy = this->solver_factory->generate(Mtx::create(this->exec));
+
+    copy->move_from(this->solver);
+
+    ASSERT_EQ(copy->get_common_size(),
+              gko::dim<2>(this->num_rows, this->num_rows));
+    ASSERT_EQ(copy->get_num_batch_items(), this->num_batch_items);
+    auto copy_mtx = gko::as<Solver>(copy.get())->get_system_matrix();
+    const auto copy_batch_mtx = gko::as<const Mtx>(copy_mtx.get());
+    GKO_ASSERT_BATCH_MTX_NEAR(this->mtx.get(), copy_batch_mtx, 0.0);
+}
+
+
+TYPED_TEST(BatchBicgstab, CanBeCloned)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+
+    auto clone = this->solver->clone();
+
+    ASSERT_EQ(clone->get_common_size(),
+              gko::dim<2>(this->num_rows, this->num_rows));
+    ASSERT_EQ(clone->get_num_batch_items(), this->num_batch_items);
+    auto clone_mtx = gko::as<Solver>(clone.get())->get_system_matrix();
+    const auto clone_batch_mtx = gko::as<const Mtx>(clone_mtx.get());
+    GKO_ASSERT_BATCH_MTX_NEAR(this->mtx.get(), clone_batch_mtx, 0.0);
+}
+
+
+TYPED_TEST(BatchBicgstab, CanBeCleared)
+{
+    using Solver = typename TestFixture::Solver;
+
+    this->solver->clear();
+
+    ASSERT_EQ(this->solver->get_num_batch_items(), 0);
+    auto solver_mtx = gko::as<Solver>(this->solver.get())->get_system_matrix();
+    ASSERT_EQ(solver_mtx, nullptr);
+}
+
+
+TYPED_TEST(BatchBicgstab, CanSetCriteriaInFactory)
+{
+    using Solver = typename TestFixture::Solver;
+    using real_type = typename TestFixture::real_type;
+
+    auto solver_factory =
+        Solver::build()
+            .with_max_iterations(22)
+            .with_tolerance(static_cast<real_type>(0.25))
+            .with_tolerance_type(gko::batch::stop::tolerance_type::relative)
+            .on(this->exec);
+
+    auto solver = solver_factory->generate(this->mtx);
+    ASSERT_EQ(solver->get_parameters().max_iterations, 22);
+    ASSERT_EQ(solver->get_parameters().tolerance, 0.25);
+    ASSERT_EQ(solver->get_parameters().tolerance_type,
+              gko::batch::stop::tolerance_type::relative);
+}
+
+
+TYPED_TEST(BatchBicgstab, CanSetResidualTol)
+{
+    using Solver = typename TestFixture::Solver;
+    using real_type = typename TestFixture::real_type;
+    auto solver_factory =
+        Solver::build()
+            .with_max_iterations(22)
+            .with_tolerance(static_cast<real_type>(0.25))
+            .with_tolerance_type(gko::batch::stop::tolerance_type::relative)
+            .on(this->exec);
+    auto solver = solver_factory->generate(this->mtx);
+
+    solver->reset_tolerance(0.5);
+
+    ASSERT_EQ(solver->get_parameters().max_iterations, 22);
+    ASSERT_EQ(solver->get_parameters().tolerance, 0.25);
+    ASSERT_EQ(solver->get_parameters().tolerance_type,
+              gko::batch::stop::tolerance_type::relative);
+    ASSERT_EQ(solver->get_tolerance(), 0.5);
+}
+
+
+TYPED_TEST(BatchBicgstab, CanSetMaxIterations)
+{
+    using Solver = typename TestFixture::Solver;
+    using real_type = typename TestFixture::real_type;
+    auto solver_factory =
+        Solver::build()
+            .with_max_iterations(22)
+            .with_tolerance(static_cast<real_type>(0.25))
+            .with_tolerance_type(gko::batch::stop::tolerance_type::relative)
+            .on(this->exec);
+    auto solver = solver_factory->generate(this->mtx);
+
+    solver->reset_max_iterations(10);
+
+    ASSERT_EQ(solver->get_parameters().tolerance, 0.25);
+    ASSERT_EQ(solver->get_parameters().max_iterations, 22);
+    ASSERT_EQ(solver->get_parameters().tolerance_type,
+              gko::batch::stop::tolerance_type::relative);
+    ASSERT_EQ(solver->get_max_iterations(), 10);
+}
+
+
+TYPED_TEST(BatchBicgstab, CanSetTolType)
+{
+    using Solver = typename TestFixture::Solver;
+    using real_type = typename TestFixture::real_type;
+    auto solver_factory =
+        Solver::build()
+            .with_max_iterations(22)
+            .with_tolerance(static_cast<real_type>(0.25))
+            .with_tolerance_type(gko::batch::stop::tolerance_type::relative)
+            .on(this->exec);
+    auto solver = solver_factory->generate(this->mtx);
+
+    solver->reset_tolerance_type(gko::batch::stop::tolerance_type::absolute);
+
+    ASSERT_EQ(solver->get_parameters().max_iterations, 22);
+    ASSERT_EQ(solver->get_parameters().tolerance, 0.25);
+    ASSERT_EQ(solver->get_parameters().tolerance_type,
+              gko::batch::stop::tolerance_type::relative);
+    ASSERT_EQ(solver->get_tolerance_type(),
+              gko::batch::stop::tolerance_type::absolute);
+}
+
+
+TYPED_TEST(BatchBicgstab, ThrowsOnRectangularMatrixInFactory)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<Mtx> rectangular_mtx =
+        Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{3, 5}));
+
+    ASSERT_THROW(this->solver_factory->generate(rectangular_mtx),
+                 gko::BadDimension);
+}
+
+
+TYPED_TEST(BatchBicgstab, ThrowsForMultipleRhs)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using MVec = typename TestFixture::MVec;
+    using Solver = typename TestFixture::Solver;
+    std::shared_ptr<MVec> b =
+        MVec::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{3, 2}));
+    std::shared_ptr<MVec> x =
+        MVec::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{3, 2}));
+    std::shared_ptr<Mtx> mtx =
+        Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{3, 2}));
+
+    ASSERT_THROW(this->solver_factory->generate(mtx)->apply(b, x),
+                 gko::BadDimension);
+}
+
+
+}  // namespace
diff --git a/core/test/solver/bicg.cpp b/core/test/solver/bicg.cpp
index 4c7421f63e1..c13070fad1e 100644
--- a/core/test/solver/bicg.cpp
+++ b/core/test/solver/bicg.cpp
@@ -66,10 +66,9 @@ class Bicg : public ::testing::Test {
           bicg_factory(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(3u).on(exec),
+                      gko::stop::Iteration::build().with_max_iters(3u),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(gko::remove_complex<T>{1e-6})
-                          .on(exec))
+                          .with_reduction_factor(gko::remove_complex<T>{1e-6}))
                   .on(exec)),
           solver(bicg_factory->generate(mtx))
     {}
@@ -165,18 +164,12 @@ TYPED_TEST(Bicg, CanSetPreconditionerGenerator)
     using value_type = typename TestFixture::value_type;
     auto bicg_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(
-                        gko::remove_complex<value_type>(1e-6))
-                    .on(this->exec))
-            .with_preconditioner(
-                Solver::build()
-                    .with_criteria(
-                        gko::stop::Iteration::build().with_max_iters(3u).on(
-                            this->exec))
-                    .on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(
+                                   gko::remove_complex<value_type>(1e-6)))
+            .with_preconditioner(Solver::build().with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u)))
             .on(this->exec);
     auto solver = bicg_factory->generate(this->mtx);
     auto precond = dynamic_cast<const gko::solver::Bicg<value_type>*>(
@@ -195,15 +188,13 @@ TYPED_TEST(Bicg, CanSetPreconditionerInFactory)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> bicg_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto bicg_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_preconditioner(bicg_precond)
             .on(this->exec);
     auto solver = bicg_factory->generate(this->mtx);
@@ -246,15 +237,13 @@ TYPED_TEST(Bicg, ThrowsOnWrongPreconditionerInFactory)
         Mtx::create(this->exec, gko::dim<2>{2, 2});
     std::shared_ptr<Solver> bicg_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(wrong_sized_mtx);
 
     auto bicg_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_preconditioner(bicg_precond)
             .on(this->exec);
 
@@ -279,15 +268,13 @@ TYPED_TEST(Bicg, CanSetPreconditioner)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> bicg_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto bicg_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec);
     auto solver = bicg_factory->generate(this->mtx);
     solver->set_preconditioner(bicg_precond);
@@ -298,4 +285,21 @@ TYPED_TEST(Bicg, CanSetPreconditioner)
 }
 
 
+TYPED_TEST(Bicg, PassExplicitFactory)
+{
+    using Solver = typename TestFixture::Solver;
+    auto stop_factory = gko::share(
+        gko::stop::Iteration::build().with_max_iters(1u).on(this->exec));
+    auto precond_factory = gko::share(Solver::build().on(this->exec));
+
+    auto factory = Solver::build()
+                       .with_criteria(stop_factory)
+                       .with_preconditioner(precond_factory)
+                       .on(this->exec);
+
+    ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory);
+    ASSERT_EQ(factory->get_parameters().preconditioner, precond_factory);
+}
+
+
 }  // namespace
diff --git a/core/test/solver/bicgstab.cpp b/core/test/solver/bicgstab.cpp
index c42cd7db2af..b420ccfc49e 100644
--- a/core/test/solver/bicgstab.cpp
+++ b/core/test/solver/bicgstab.cpp
@@ -64,10 +64,9 @@ class Bicgstab : public ::testing::Test {
           bicgstab_factory(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(3u).on(exec),
+                      gko::stop::Iteration::build().with_max_iters(3u),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(gko::remove_complex<T>{1e-6})
-                          .on(exec))
+                          .with_reduction_factor(gko::remove_complex<T>{1e-6}))
                   .on(exec)),
           solver(bicgstab_factory->generate(mtx))
     {}
@@ -160,14 +159,9 @@ TYPED_TEST(Bicgstab, CanSetPreconditionerGenerator)
     using value_type = typename TestFixture::value_type;
     auto bicgstab_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
-            .with_preconditioner(
-                Solver::build()
-                    .with_criteria(
-                        gko::stop::Iteration::build().with_max_iters(3u).on(
-                            this->exec))
-                    .on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
+            .with_preconditioner(Solver::build().with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u)))
             .on(this->exec);
 
     auto solver = bicgstab_factory->generate(this->mtx);
@@ -208,15 +202,13 @@ TYPED_TEST(Bicgstab, CanSetPreconditionerInFactory)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> bicgstab_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto bicgstab_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_preconditioner(bicgstab_precond)
             .on(this->exec);
     auto solver = bicgstab_factory->generate(this->mtx);
@@ -235,15 +227,13 @@ TYPED_TEST(Bicgstab, ThrowsOnWrongPreconditionerInFactory)
         Mtx::create(this->exec, gko::dim<2>{2, 2});
     std::shared_ptr<Solver> bicgstab_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(wrong_sized_mtx);
 
     auto bicgstab_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_preconditioner(bicgstab_precond)
             .on(this->exec);
 
@@ -268,15 +258,13 @@ TYPED_TEST(Bicgstab, CanSetPreconditioner)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> bicgstab_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto bicgstab_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec);
     auto solver = bicgstab_factory->generate(this->mtx);
     solver->set_preconditioner(bicgstab_precond);
@@ -287,4 +275,21 @@ TYPED_TEST(Bicgstab, CanSetPreconditioner)
 }
 
 
+TYPED_TEST(Bicgstab, PassExplicitFactory)
+{
+    using Solver = typename TestFixture::Solver;
+    auto stop_factory = gko::share(
+        gko::stop::Iteration::build().with_max_iters(1u).on(this->exec));
+    auto precond_factory = gko::share(Solver::build().on(this->exec));
+
+    auto factory = Solver::build()
+                       .with_criteria(stop_factory)
+                       .with_preconditioner(precond_factory)
+                       .on(this->exec);
+
+    ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory);
+    ASSERT_EQ(factory->get_parameters().preconditioner, precond_factory);
+}
+
+
 }  // namespace
diff --git a/core/test/solver/cb_gmres.cpp b/core/test/solver/cb_gmres.cpp
index b81d84f8b1e..434544b3ca2 100644
--- a/core/test/solver/cb_gmres.cpp
+++ b/core/test/solver/cb_gmres.cpp
@@ -72,23 +72,20 @@ class CbGmres : public ::testing::Test {
               Solver::build()
                   .with_storage_precision(storage_precision)
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(3u).on(exec),
+                      gko::stop::Iteration::build().with_max_iters(3u),
                       gko::stop::ResidualNorm<value_type>::build()
                           .with_baseline(gko::stop::mode::initial_resnorm)
-                          .with_reduction_factor(nc_value_type{1e-6})
-                          .on(exec))
+                          .with_reduction_factor(nc_value_type{1e-6}))
                   .on(exec)),
           solver(cb_gmres_factory->generate(mtx)),
           cb_gmres_big_factory(
               Solver::build()
                   .with_storage_precision(storage_precision)
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(128u).on(
-                          exec),
+                      gko::stop::Iteration::build().with_max_iters(128u),
                       gko::stop::ResidualNorm<value_type>::build()
                           .with_baseline(gko::stop::mode::initial_resnorm)
-                          .with_reduction_factor(nc_value_type{1e-6})
-                          .on(exec))
+                          .with_reduction_factor(nc_value_type{1e-6}))
                   .on(exec)),
           big_solver(cb_gmres_big_factory->generate(mtx))
     {}
@@ -226,18 +223,12 @@ TYPED_TEST(CbGmres, CanSetPreconditionerGenerator)
     using Solver = typename TestFixture::Solver;
     auto cb_gmres_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_baseline(gko::stop::mode::initial_resnorm)
-                    .with_reduction_factor(nc_value_type{1e-6})
-                    .on(this->exec))
-            .with_preconditioner(
-                Solver::build()
-                    .with_criteria(
-                        gko::stop::Iteration::build().with_max_iters(3u).on(
-                            this->exec))
-                    .on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_baseline(gko::stop::mode::initial_resnorm)
+                               .with_reduction_factor(nc_value_type{1e-6}))
+            .with_preconditioner(Solver::build().with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u)))
             .on(this->exec);
     auto solver = cb_gmres_factory->generate(this->mtx);
     auto precond =
@@ -259,8 +250,7 @@ TYPED_TEST(CbGmres, CanSetKrylovDim)
     auto cb_gmres_factory =
         Solver::build()
             .with_krylov_dim(new_krylov_dim)
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(4u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(4u))
             .on(this->exec);
     auto solver = cb_gmres_factory->generate(this->mtx);
 
@@ -279,8 +269,7 @@ TYPED_TEST(CbGmres, CanUseSetKrylovDim)
     const gko::size_type new_krylov_dim{40u};
     auto cb_gmres_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(4u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(4u))
             .on(this->exec);
     auto solver = cb_gmres_factory->generate(this->mtx);
 
@@ -298,15 +287,13 @@ TYPED_TEST(CbGmres, CanSetPreconditionerInFactory)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> cb_gmres_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto cb_gmres_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_preconditioner(cb_gmres_precond)
             .on(this->exec);
     auto solver = cb_gmres_factory->generate(this->mtx);
@@ -325,15 +312,13 @@ TYPED_TEST(CbGmres, ThrowsOnWrongPreconditionerInFactory)
         Mtx::create(this->exec, gko::dim<2>{2, 2});
     std::shared_ptr<Solver> cb_gmres_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(wrong_sized_mtx);
 
     auto cb_gmres_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_preconditioner(cb_gmres_precond)
             .on(this->exec);
 
@@ -346,15 +331,13 @@ TYPED_TEST(CbGmres, CanSetPreconditioner)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> cb_gmres_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto cb_gmres_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec);
     auto solver = cb_gmres_factory->generate(this->mtx);
     solver->set_preconditioner(cb_gmres_precond);
@@ -365,4 +348,21 @@ TYPED_TEST(CbGmres, CanSetPreconditioner)
 }
 
 
+TYPED_TEST(CbGmres, PassExplicitFactory)
+{
+    using Solver = typename TestFixture::Solver;
+    auto stop_factory = gko::share(
+        gko::stop::Iteration::build().with_max_iters(1u).on(this->exec));
+    auto precond_factory = gko::share(Solver::build().on(this->exec));
+
+    auto factory = Solver::build()
+                       .with_criteria(stop_factory)
+                       .with_preconditioner(precond_factory)
+                       .on(this->exec);
+
+    ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory);
+    ASSERT_EQ(factory->get_parameters().preconditioner, precond_factory);
+}
+
+
 }  // namespace
diff --git a/core/test/solver/cg.cpp b/core/test/solver/cg.cpp
index 5daf43bc160..f94694e775b 100644
--- a/core/test/solver/cg.cpp
+++ b/core/test/solver/cg.cpp
@@ -66,10 +66,9 @@ class Cg : public ::testing::Test {
           cg_factory(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(3u).on(exec),
+                      gko::stop::Iteration::build().with_max_iters(3u),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(gko::remove_complex<T>{1e-6})
-                          .on(exec))
+                          .with_reduction_factor(gko::remove_complex<T>{1e-6}))
                   .on(exec)),
           solver(cg_factory->generate(mtx))
     {}
@@ -164,18 +163,12 @@ TYPED_TEST(Cg, CanSetPreconditionerGenerator)
     using value_type = typename TestFixture::value_type;
     auto cg_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(
-                        gko::remove_complex<value_type>(1e-6))
-                    .on(this->exec))
-            .with_preconditioner(
-                Solver::build()
-                    .with_criteria(
-                        gko::stop::Iteration::build().with_max_iters(3u).on(
-                            this->exec))
-                    .on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(
+                                   gko::remove_complex<value_type>(1e-6)))
+            .with_preconditioner(Solver::build().with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u)))
             .on(this->exec);
     auto solver = cg_factory->generate(this->mtx);
     auto precond = dynamic_cast<const gko::solver::Cg<value_type>*>(
@@ -194,15 +187,13 @@ TYPED_TEST(Cg, CanSetPreconditionerInFactory)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> cg_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto cg_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_preconditioner(cg_precond)
             .on(this->exec);
     auto solver = cg_factory->generate(this->mtx);
@@ -245,15 +236,13 @@ TYPED_TEST(Cg, ThrowsOnWrongPreconditionerInFactory)
         Mtx::create(this->exec, gko::dim<2>{2, 2});
     std::shared_ptr<Solver> cg_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(wrong_sized_mtx);
 
     auto cg_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_preconditioner(cg_precond)
             .on(this->exec);
 
@@ -278,15 +267,13 @@ TYPED_TEST(Cg, CanSetPreconditioner)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> cg_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto cg_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec);
     auto solver = cg_factory->generate(this->mtx);
     solver->set_preconditioner(cg_precond);
@@ -297,4 +284,21 @@ TYPED_TEST(Cg, CanSetPreconditioner)
 }
 
 
+TYPED_TEST(Cg, PassExplicitFactory)
+{
+    using Solver = typename TestFixture::Solver;
+    auto stop_factory = gko::share(
+        gko::stop::Iteration::build().with_max_iters(1u).on(this->exec));
+    auto precond_factory = gko::share(Solver::build().on(this->exec));
+
+    auto factory = Solver::build()
+                       .with_criteria(stop_factory)
+                       .with_preconditioner(precond_factory)
+                       .on(this->exec);
+
+    ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory);
+    ASSERT_EQ(factory->get_parameters().preconditioner, precond_factory);
+}
+
+
 }  // namespace
diff --git a/core/test/solver/cgs.cpp b/core/test/solver/cgs.cpp
index c23dc7b2e3b..6216899d898 100644
--- a/core/test/solver/cgs.cpp
+++ b/core/test/solver/cgs.cpp
@@ -66,10 +66,9 @@ class Cgs : public ::testing::Test {
           cgs_factory(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(3u).on(exec),
+                      gko::stop::Iteration::build().with_max_iters(3u),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(gko::remove_complex<T>{1e-6})
-                          .on(exec))
+                          .with_reduction_factor(gko::remove_complex<T>{1e-6}))
                   .on(exec)),
           solver(cgs_factory->generate(mtx))
     {}
@@ -164,18 +163,12 @@ TYPED_TEST(Cgs, CanSetPreconditionerGenerator)
     using value_type = typename TestFixture::value_type;
     auto cgs_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(
-                        gko::remove_complex<value_type>(1e-6))
-                    .on(this->exec))
-            .with_preconditioner(
-                Solver::build()
-                    .with_criteria(
-                        gko::stop::Iteration::build().with_max_iters(3u).on(
-                            this->exec))
-                    .on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(
+                                   gko::remove_complex<value_type>(1e-6)))
+            .with_preconditioner(Solver::build().with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u)))
             .on(this->exec);
     auto solver = cgs_factory->generate(this->mtx);
     auto precond = dynamic_cast<const gko::solver::Cgs<value_type>*>(
@@ -218,15 +211,13 @@ TYPED_TEST(Cgs, CanSetPreconditionerInFactory)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> cgs_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto cgs_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_preconditioner(cgs_precond)
             .on(this->exec);
     auto solver = cgs_factory->generate(this->mtx);
@@ -245,15 +236,13 @@ TYPED_TEST(Cgs, ThrowsOnWrongPreconditionerInFactory)
         Mtx::create(this->exec, gko::dim<2>{2, 2});
     std::shared_ptr<Solver> cgs_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(wrong_sized_mtx);
 
     auto cgs_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_preconditioner(cgs_precond)
             .on(this->exec);
 
@@ -278,15 +267,13 @@ TYPED_TEST(Cgs, CanSetPreconditioner)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> cgs_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto cgs_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec);
     auto solver = cgs_factory->generate(this->mtx);
     solver->set_preconditioner(cgs_precond);
@@ -297,4 +284,21 @@ TYPED_TEST(Cgs, CanSetPreconditioner)
 }
 
 
+TYPED_TEST(Cgs, PassExplicitFactory)
+{
+    using Solver = typename TestFixture::Solver;
+    auto stop_factory = gko::share(
+        gko::stop::Iteration::build().with_max_iters(1u).on(this->exec));
+    auto precond_factory = gko::share(Solver::build().on(this->exec));
+
+    auto factory = Solver::build()
+                       .with_criteria(stop_factory)
+                       .with_preconditioner(precond_factory)
+                       .on(this->exec);
+
+    ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory);
+    ASSERT_EQ(factory->get_parameters().preconditioner, precond_factory);
+}
+
+
 }  // namespace
diff --git a/core/test/solver/direct.cpp b/core/test/solver/direct.cpp
new file mode 100644
index 00000000000..a4110c8c18d
--- /dev/null
+++ b/core/test/solver/direct.cpp
@@ -0,0 +1,105 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/solver/direct.hpp>
+
+
+#include <memory>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/factorization/lu.hpp>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+template <typename ValueIndexType>
+class Direct : public ::testing::Test {
+protected:
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Solver = gko::experimental::solver::Direct<value_type, index_type>;
+    using Lu = gko::experimental::factorization::Lu<value_type, index_type>;
+
+    Direct()
+        : exec(gko::ReferenceExecutor::create()),
+          factory(Solver::build().with_factorization(Lu::build()).on(exec))
+    {}
+
+    std::shared_ptr<const gko::Executor> exec;
+    std::unique_ptr<typename Solver::Factory> factory;
+};
+
+TYPED_TEST_SUITE(Direct, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+
+
+TYPED_TEST(Direct, FactoryKnowsItsExecutor)
+{
+    ASSERT_EQ(this->factory->get_executor(), this->exec);
+}
+
+
+TYPED_TEST(Direct, ThrowsOnRectangularMatrixInFactory)
+{
+    using Mtx = gko::matrix::Csr<typename TestFixture::value_type,
+                                 typename TestFixture::index_type>;
+    std::shared_ptr<Mtx> rectangular_matrix =
+        Mtx::create(this->exec, gko::dim<2>{1, 2}, 0);
+
+    ASSERT_THROW(this->factory->generate(rectangular_matrix),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(Direct, PassExplicitFactory)
+{
+    using Solver = typename TestFixture::Solver;
+    using Lu = typename TestFixture::Lu;
+    auto lu_factory = gko::share(Lu::build().on(this->exec));
+
+    auto factory =
+        Solver::build().with_factorization(lu_factory).on(this->exec);
+
+    ASSERT_EQ(factory->get_parameters().factorization, lu_factory);
+}
+
+
+}  // namespace
diff --git a/core/test/solver/fcg.cpp b/core/test/solver/fcg.cpp
index 59bb5e0cdee..87f27c2bacd 100644
--- a/core/test/solver/fcg.cpp
+++ b/core/test/solver/fcg.cpp
@@ -63,10 +63,9 @@ class Fcg : public ::testing::Test {
           fcg_factory(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(3u).on(exec),
+                      gko::stop::Iteration::build().with_max_iters(3u),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(gko::remove_complex<T>{1e-6})
-                          .on(exec))
+                          .with_reduction_factor(gko::remove_complex<T>{1e-6}))
                   .on(exec)),
           solver(fcg_factory->generate(mtx))
     {}
@@ -163,18 +162,12 @@ TYPED_TEST(Fcg, CanSetPreconditionerGenerator)
     using value_type = typename TestFixture::value_type;
     auto fcg_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(
-                        gko::remove_complex<value_type>(1e-6))
-                    .on(this->exec))
-            .with_preconditioner(
-                Solver::build()
-                    .with_criteria(
-                        gko::stop::Iteration::build().with_max_iters(3u).on(
-                            this->exec))
-                    .on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(
+                                   gko::remove_complex<value_type>(1e-6)))
+            .with_preconditioner(Solver::build().with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u)))
             .on(this->exec);
     auto solver = fcg_factory->generate(this->mtx);
     auto precond = dynamic_cast<const gko::solver::Fcg<value_type>*>(
@@ -217,15 +210,13 @@ TYPED_TEST(Fcg, CanSetPreconditionerInFactory)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> fcg_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto fcg_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_preconditioner(fcg_precond)
             .on(this->exec);
     auto solver = fcg_factory->generate(this->mtx);
@@ -244,15 +235,13 @@ TYPED_TEST(Fcg, ThrowsOnWrongPreconditionerInFactory)
         Mtx::create(this->exec, gko::dim<2>{2, 2});
     std::shared_ptr<Solver> fcg_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(wrong_sized_mtx);
 
     auto fcg_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_preconditioner(fcg_precond)
             .on(this->exec);
 
@@ -277,15 +266,13 @@ TYPED_TEST(Fcg, CanSetPreconditioner)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> fcg_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto fcg_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec);
     auto solver = fcg_factory->generate(this->mtx);
     solver->set_preconditioner(fcg_precond);
@@ -296,4 +283,21 @@ TYPED_TEST(Fcg, CanSetPreconditioner)
 }
 
 
+TYPED_TEST(Fcg, PassExplicitFactory)
+{
+    using Solver = typename TestFixture::Solver;
+    auto stop_factory = gko::share(
+        gko::stop::Iteration::build().with_max_iters(1u).on(this->exec));
+    auto precond_factory = gko::share(Solver::build().on(this->exec));
+
+    auto factory = Solver::build()
+                       .with_criteria(stop_factory)
+                       .with_preconditioner(precond_factory)
+                       .on(this->exec);
+
+    ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory);
+    ASSERT_EQ(factory->get_parameters().preconditioner, precond_factory);
+}
+
+
 }  // namespace
diff --git a/core/test/solver/gcr.cpp b/core/test/solver/gcr.cpp
index f7ba80ebba1..4c08863f09b 100644
--- a/core/test/solver/gcr.cpp
+++ b/core/test/solver/gcr.cpp
@@ -67,23 +67,19 @@ class Gcr : public ::testing::Test {
         : exec(gko::ReferenceExecutor::create()),
           mtx(gko::initialize<Mtx>(
               {{1.0, 2.0, 3.0}, {3.0, 2.0, -1.0}, {0.0, -1.0, 2}}, exec)),
-          gcr_factory(
-              Solver::build()
-                  .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(3u).on(exec),
-                      gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(reduction_factor)
-                          .on(exec))
-                  .on(exec)),
+          gcr_factory(Solver::build()
+                          .with_criteria(
+                              gko::stop::Iteration::build().with_max_iters(3u),
+                              gko::stop::ResidualNorm<value_type>::build()
+                                  .with_reduction_factor(reduction_factor))
+                          .on(exec)),
           solver(gcr_factory->generate(mtx)),
           gcr_big_factory(
               Big_solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(128u).on(
-                          exec),
+                      gko::stop::Iteration::build().with_max_iters(128u),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(reduction_factor)
-                          .on(exec))
+                          .with_reduction_factor(reduction_factor))
                   .on(exec)),
           big_solver(gcr_big_factory->generate(mtx))
     {}
@@ -198,16 +194,11 @@ TYPED_TEST(Gcr, CanSetPreconditionerGenerator)
     auto gcr_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec),
+                gko::stop::Iteration::build().with_max_iters(3u),
                 gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(TestFixture::reduction_factor)
-                    .on(this->exec))
-            .with_preconditioner(
-                Solver::build()
-                    .with_criteria(
-                        gko::stop::Iteration::build().with_max_iters(3u).on(
-                            this->exec))
-                    .on(this->exec))
+                    .with_reduction_factor(TestFixture::reduction_factor))
+            .with_preconditioner(Solver::build().with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u)))
             .on(this->exec);
     auto solver = gcr_factory->generate(this->mtx);
     auto precond = dynamic_cast<const gko::solver::Gcr<value_type>*>(
@@ -253,10 +244,9 @@ TYPED_TEST(Gcr, CanSetKrylovDim)
         Solver::build()
             .with_krylov_dim(4u)
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(4u).on(this->exec),
+                gko::stop::Iteration::build().with_max_iters(4u),
                 gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(TestFixture::reduction_factor)
-                    .on(this->exec))
+                    .with_reduction_factor(TestFixture::reduction_factor))
             .on(this->exec);
     auto solver = gcr_factory->generate(this->mtx);
     auto krylov_dim = solver->get_krylov_dim();
@@ -289,15 +279,13 @@ TYPED_TEST(Gcr, CanSetPreconditionerInFactory)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> gcr_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto gcr_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_preconditioner(gcr_precond)
             .on(this->exec);
     auto solver = gcr_factory->generate(this->mtx);
@@ -316,15 +304,13 @@ TYPED_TEST(Gcr, ThrowsOnWrongPreconditionerInFactory)
         Mtx::create(this->exec, gko::dim<2>{2, 2});
     std::shared_ptr<Solver> gcr_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(wrong_sized_mtx);
 
     auto gcr_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_preconditioner(gcr_precond)
             .on(this->exec);
 
@@ -349,15 +335,13 @@ TYPED_TEST(Gcr, CanSetPreconditioner)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> gcr_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto gcr_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec);
     auto solver = gcr_factory->generate(this->mtx);
     solver->set_preconditioner(gcr_precond);
@@ -368,4 +352,21 @@ TYPED_TEST(Gcr, CanSetPreconditioner)
 }
 
 
+TYPED_TEST(Gcr, PassExplicitFactory)
+{
+    using Solver = typename TestFixture::Solver;
+    auto stop_factory = gko::share(
+        gko::stop::Iteration::build().with_max_iters(1u).on(this->exec));
+    auto precond_factory = gko::share(Solver::build().on(this->exec));
+
+    auto factory = Solver::build()
+                       .with_criteria(stop_factory)
+                       .with_preconditioner(precond_factory)
+                       .on(this->exec);
+
+    ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory);
+    ASSERT_EQ(factory->get_parameters().preconditioner, precond_factory);
+}
+
+
 }  // namespace
diff --git a/core/test/solver/gmres.cpp b/core/test/solver/gmres.cpp
index 11cafe2c86f..2464bb7273d 100644
--- a/core/test/solver/gmres.cpp
+++ b/core/test/solver/gmres.cpp
@@ -70,20 +70,17 @@ class Gmres : public ::testing::Test {
           gmres_factory(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(3u).on(exec),
+                      gko::stop::Iteration::build().with_max_iters(3u),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(reduction_factor)
-                          .on(exec))
+                          .with_reduction_factor(reduction_factor))
                   .on(exec)),
           solver(gmres_factory->generate(mtx)),
           gmres_big_factory(
               Big_solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(128u).on(
-                          exec),
+                      gko::stop::Iteration::build().with_max_iters(128u),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(reduction_factor)
-                          .on(exec))
+                          .with_reduction_factor(reduction_factor))
                   .on(exec)),
           big_solver(gmres_big_factory->generate(mtx))
     {}
@@ -183,16 +180,11 @@ TYPED_TEST(Gmres, CanSetPreconditionerGenerator)
     auto gmres_factory =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec),
+                gko::stop::Iteration::build().with_max_iters(3u),
                 gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(TestFixture::reduction_factor)
-                    .on(this->exec))
-            .with_preconditioner(
-                Solver::build()
-                    .with_criteria(
-                        gko::stop::Iteration::build().with_max_iters(3u).on(
-                            this->exec))
-                    .on(this->exec))
+                    .with_reduction_factor(TestFixture::reduction_factor))
+            .with_preconditioner(Solver::build().with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u)))
             .on(this->exec);
     auto solver = gmres_factory->generate(this->mtx);
     auto precond = dynamic_cast<const gko::solver::Gmres<value_type>*>(
@@ -239,10 +231,9 @@ TYPED_TEST(Gmres, CanSetKrylovDim)
         Solver::build()
             .with_krylov_dim(4u)
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(4u).on(this->exec),
+                gko::stop::Iteration::build().with_max_iters(4u),
                 gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(TestFixture::reduction_factor)
-                    .on(this->exec))
+                    .with_reduction_factor(TestFixture::reduction_factor))
             .on(this->exec);
     auto solver = gmres_factory->generate(this->mtx);
     auto krylov_dim = solver->get_krylov_dim();
@@ -275,15 +266,13 @@ TYPED_TEST(Gmres, CanSetPreconditionerInFactory)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> gmres_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto gmres_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_preconditioner(gmres_precond)
             .on(this->exec);
     auto solver = gmres_factory->generate(this->mtx);
@@ -302,15 +291,13 @@ TYPED_TEST(Gmres, ThrowsOnWrongPreconditionerInFactory)
         Mtx::create(this->exec, gko::dim<2>{2, 2});
     std::shared_ptr<Solver> gmres_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(wrong_sized_mtx);
 
     auto gmres_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_preconditioner(gmres_precond)
             .on(this->exec);
 
@@ -335,15 +322,13 @@ TYPED_TEST(Gmres, CanSetPreconditioner)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> gmres_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto gmres_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec);
     auto solver = gmres_factory->generate(this->mtx);
     solver->set_preconditioner(gmres_precond);
@@ -354,4 +339,21 @@ TYPED_TEST(Gmres, CanSetPreconditioner)
 }
 
 
+TYPED_TEST(Gmres, PassExplicitFactory)
+{
+    using Solver = typename TestFixture::Solver;
+    auto stop_factory = gko::share(
+        gko::stop::Iteration::build().with_max_iters(1u).on(this->exec));
+    auto precond_factory = gko::share(Solver::build().on(this->exec));
+
+    auto factory = Solver::build()
+                       .with_criteria(stop_factory)
+                       .with_preconditioner(precond_factory)
+                       .on(this->exec);
+
+    ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory);
+    ASSERT_EQ(factory->get_parameters().preconditioner, precond_factory);
+}
+
+
 }  // namespace
diff --git a/core/test/solver/idr.cpp b/core/test/solver/idr.cpp
index f9109acb69e..5552f6f1c0a 100644
--- a/core/test/solver/idr.cpp
+++ b/core/test/solver/idr.cpp
@@ -64,10 +64,9 @@ class Idr : public ::testing::Test {
           idr_factory(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(3u).on(exec),
+                      gko::stop::Iteration::build().with_max_iters(3u),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(gko::remove_complex<T>{1e-6})
-                          .on(exec))
+                          .with_reduction_factor(gko::remove_complex<T>{1e-6}))
                   .on(exec)),
           solver(idr_factory->generate(mtx))
     {}
@@ -162,14 +161,9 @@ TYPED_TEST(Idr, CanSetPreconditionerGenerator)
     using value_type = typename TestFixture::value_type;
     auto idr_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
-            .with_preconditioner(
-                Solver::build()
-                    .with_criteria(
-                        gko::stop::Iteration::build().with_max_iters(3u).on(
-                            this->exec))
-                    .on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
+            .with_preconditioner(Solver::build().with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u)))
             .on(this->exec);
 
     auto solver = idr_factory->generate(this->mtx);
@@ -209,15 +203,13 @@ TYPED_TEST(Idr, CanSetPreconditionerInFactory)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> idr_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto idr_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_preconditioner(idr_precond)
             .on(this->exec);
     auto solver = idr_factory->generate(this->mtx);
@@ -236,15 +228,13 @@ TYPED_TEST(Idr, ThrowsOnWrongPreconditionerInFactory)
         Mtx::create(this->exec, gko::dim<2>{2, 2});
     std::shared_ptr<Solver> idr_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(wrong_sized_mtx);
 
     auto idr_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_preconditioner(idr_precond)
             .on(this->exec);
 
@@ -257,15 +247,13 @@ TYPED_TEST(Idr, CanSetPreconditioner)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> idr_precond =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto idr_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec);
     auto solver = idr_factory->generate(this->mtx);
     solver->set_preconditioner(idr_precond);
@@ -283,8 +271,7 @@ TYPED_TEST(Idr, CanSetSubspaceDim)
     auto idr_factory =
         Solver::build()
             .with_subspace_dim(8u)
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(4u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(4u))
             .on(this->exec);
     auto solver = idr_factory->generate(this->mtx);
     auto subspace_dim = solver->get_subspace_dim();
@@ -320,8 +307,7 @@ TYPED_TEST(Idr, CanSetKappa)
     auto idr_factory =
         Solver::build()
             .with_kappa(real_type{0.05})
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(4u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(4u))
             .on(this->exec);
     auto solver = idr_factory->generate(this->mtx);
     auto kappa = solver->get_kappa();
@@ -359,8 +345,7 @@ TYPED_TEST(Idr, CanSetDeterministic)
     auto idr_factory =
         Solver::build()
             .with_deterministic(true)
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(4u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(4u))
             .on(this->exec);
     auto solver = idr_factory->generate(this->mtx);
     auto deterministic = solver->get_deterministic();
@@ -396,8 +381,7 @@ TYPED_TEST(Idr, CanSetComplexSubspace)
     auto idr_factory =
         Solver::build()
             .with_complex_subspace(true)
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(4u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(4u))
             .on(this->exec);
     auto solver = idr_factory->generate(this->mtx);
     auto complex_subspace = solver->get_complex_subspace();
@@ -420,10 +404,27 @@ TYPED_TEST(Idr, CanSetComplexSubspaceAgain)
 
     auto solver = idr_factory->generate(this->mtx);
 
-    solver->set_complex_subpsace(false);
+    solver->set_complex_subspace(false);
 
     ASSERT_EQ(solver->get_complex_subspace(), false);
 }
 
 
+TYPED_TEST(Idr, PassExplicitFactory)
+{
+    using Solver = typename TestFixture::Solver;
+    auto stop_factory = gko::share(
+        gko::stop::Iteration::build().with_max_iters(1u).on(this->exec));
+    auto precond_factory = gko::share(Solver::build().on(this->exec));
+
+    auto factory = Solver::build()
+                       .with_criteria(stop_factory)
+                       .with_preconditioner(precond_factory)
+                       .on(this->exec);
+
+    ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory);
+    ASSERT_EQ(factory->get_parameters().preconditioner, precond_factory);
+}
+
+
 }  // namespace
diff --git a/core/test/solver/ir.cpp b/core/test/solver/ir.cpp
index 5fdcd55af14..171c0c92b00 100644
--- a/core/test/solver/ir.cpp
+++ b/core/test/solver/ir.cpp
@@ -64,14 +64,12 @@ class Ir : public ::testing::Test {
         : exec(gko::ReferenceExecutor::create()),
           mtx(gko::initialize<Mtx>(
               {{2, -1.0, 0.0}, {-1.0, 2, -1.0}, {0.0, -1.0, 2}}, exec)),
-          ir_factory(
-              Solver::build()
-                  .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(3u).on(exec),
-                      gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
-                  .on(exec)),
+          ir_factory(Solver::build()
+                         .with_criteria(
+                             gko::stop::Iteration::build().with_max_iters(3u),
+                             gko::stop::ResidualNorm<value_type>::build()
+                                 .with_reduction_factor(r<value_type>::value))
+                         .on(exec)),
           solver(ir_factory->generate(mtx))
     {}
 
@@ -164,17 +162,11 @@ TYPED_TEST(Ir, CanSetInnerSolverInFactory)
     using value_type = typename TestFixture::value_type;
     auto ir_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(r<value_type>::value)
-                    .on(this->exec))
-            .with_solver(
-                Solver::build()
-                    .with_criteria(
-                        gko::stop::Iteration::build().with_max_iters(3u).on(
-                            this->exec))
-                    .on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(r<value_type>::value))
+            .with_solver(Solver::build().with_criteria(
+                gko::stop::Iteration::build().with_max_iters(3u)))
             .on(this->exec);
     auto solver = ir_factory->generate(this->mtx);
     auto inner_solver = dynamic_cast<const Solver*>(
@@ -191,15 +183,13 @@ TYPED_TEST(Ir, CanSetGeneratedInnerSolverInFactory)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> ir_solver =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto ir_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_solver(ir_solver)
             .on(this->exec);
     auto solver = ir_factory->generate(this->mtx);
@@ -242,15 +232,13 @@ TYPED_TEST(Ir, ThrowsOnWrongInnerSolverInFactory)
         Mtx::create(this->exec, gko::dim<2>{2, 2});
     std::shared_ptr<Solver> ir_solver =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(wrong_sized_mtx);
 
     auto ir_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_generated_solver(ir_solver)
             .on(this->exec);
 
@@ -263,15 +251,13 @@ TYPED_TEST(Ir, CanSetInnerSolver)
     using Solver = typename TestFixture::Solver;
     std::shared_ptr<Solver> ir_solver =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(this->mtx);
 
     auto ir_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec);
     auto solver = ir_factory->generate(this->mtx);
     solver->set_solver(ir_solver);
@@ -291,9 +277,7 @@ TYPED_TEST(Ir, CanSetApplyWithInitialGuessMode)
                        initial_guess_mode::zero}) {
         auto ir_factory =
             Solver::build()
-                .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(3u).on(
-                        this->exec))
+                .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
                 .with_default_initial_guess(guess)
                 .on(this->exec);
         auto solver = ir_factory->generate(this->mtx);
@@ -312,15 +296,13 @@ TYPED_TEST(Ir, ThrowOnWrongInnerSolverSet)
         Mtx::create(this->exec, gko::dim<2>{2, 2});
     std::shared_ptr<Solver> ir_solver =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec)
             ->generate(wrong_sized_mtx);
 
     auto ir_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .on(this->exec);
     auto solver = ir_factory->generate(this->mtx);
 
@@ -347,11 +329,9 @@ TYPED_TEST(Ir, DefaultRelaxationFactor)
 
     auto richardson =
         gko::solver::Richardson<value_type>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(r<value_type>::value)
-                    .on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(r<value_type>::value))
             .on(this->exec)
             ->generate(this->mtx);
 
@@ -366,11 +346,9 @@ TYPED_TEST(Ir, UseAsRichardson)
 
     auto richardson =
         gko::solver::Richardson<value_type>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(r<value_type>::value)
-                    .on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(r<value_type>::value))
             .with_relaxation_factor(relaxation_factor)
             .on(this->exec)
             ->generate(this->mtx);
@@ -497,4 +475,21 @@ TYPED_TEST(Ir, RunResidualNormCheckCorrectTimes)
 }
 
 
+TYPED_TEST(Ir, PassExplicitFactory)
+{
+    using Solver = typename TestFixture::Solver;
+    auto stop_factory = gko::share(
+        gko::stop::Iteration::build().with_max_iters(1u).on(this->exec));
+    auto inner_solver_factory = gko::share(Solver::build().on(this->exec));
+
+    auto factory = Solver::build()
+                       .with_criteria(stop_factory)
+                       .with_solver(inner_solver_factory)
+                       .on(this->exec);
+
+    ASSERT_EQ(factory->get_parameters().criteria.front(), stop_factory);
+    ASSERT_EQ(factory->get_parameters().solver, inner_solver_factory);
+}
+
+
 }  // namespace
diff --git a/core/test/solver/multigrid.cpp b/core/test/solver/multigrid.cpp
index 856f9651ebe..bab6bcaf863 100644
--- a/core/test/solver/multigrid.cpp
+++ b/core/test/solver/multigrid.cpp
@@ -153,11 +153,10 @@ class Multigrid : public ::testing::Test {
         multigrid_factory =
             Solver::build()
                 .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(3u).on(exec),
+                    gko::stop::Iteration::build().with_max_iters(3u),
                     gko::stop::ResidualNorm<value_type>::build()
                         .with_baseline(gko::stop::mode::initial_resnorm)
-                        .with_reduction_factor(gko::remove_complex<T>{1e-6})
-                        .on(exec))
+                        .with_reduction_factor(gko::remove_complex<T>{1e-6}))
                 .with_max_levels(2u)
                 .with_coarsest_solver(lo_factory)
                 .with_pre_smoother(lo_factory)
@@ -287,8 +286,7 @@ TYPED_TEST(Multigrid, ApplyUsesInitialGuessReturnsFalseWhenZeroGuess)
     using Solver = typename TestFixture::Solver;
     auto multigrid_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(3u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(3u))
             .with_max_levels(2u)
             .with_coarsest_solver(this->lo_factory)
             .with_pre_smoother(this->lo_factory)
@@ -904,4 +902,32 @@ TYPED_TEST(Multigrid, CustomCoarsestSolverSelector)
 }
 
 
+TYPED_TEST(Multigrid, DeferredFactoryParameter)
+{
+    using Solver = typename TestFixture::Solver;
+    using DummyRPFactory = typename TestFixture::DummyRPFactory;
+    using DummyFactory = typename TestFixture::DummyFactory;
+
+    auto solver = Solver::build()
+                      .with_mg_level(DummyRPFactory::build())
+                      .with_pre_smoother(DummyFactory::build())
+                      .with_mid_smoother(DummyFactory::build())
+                      .with_post_smoother(DummyFactory::build())
+                      .with_criteria(gko::stop::Iteration::build())
+                      .with_coarsest_solver(DummyFactory::build())
+                      .on(this->exec);
+
+    GKO_ASSERT_DYNAMIC_TYPE(solver->get_parameters().mg_level[0],
+                            typename DummyRPFactory::Factory);
+    GKO_ASSERT_DYNAMIC_TYPE(solver->get_parameters().pre_smoother[0],
+                            typename DummyFactory::Factory);
+    GKO_ASSERT_DYNAMIC_TYPE(solver->get_parameters().mid_smoother[0],
+                            typename DummyFactory::Factory);
+    GKO_ASSERT_DYNAMIC_TYPE(solver->get_parameters().post_smoother[0],
+                            typename DummyFactory::Factory);
+    GKO_ASSERT_DYNAMIC_TYPE(solver->get_parameters().coarsest_solver[0],
+                            typename DummyFactory::Factory);
+}
+
+
 }  // namespace
diff --git a/core/test/solver/workspace.cpp b/core/test/solver/workspace.cpp
index ffbab815dc6..3dc53fb6abe 100644
--- a/core/test/solver/workspace.cpp
+++ b/core/test/solver/workspace.cpp
@@ -256,8 +256,8 @@ TEST_F(Workspace, CanCreateOperators)
     ASSERT_EQ(op2->get_size(), size2);
     ASSERT_EQ(op1->get_stride(), stride1);
     ASSERT_EQ(op2->get_stride(), stride2);
-    ASSERT_EQ(typeid(*op1), typeid(DummyLinOp));
-    ASSERT_EQ(typeid(*op2), typeid(DummyLinOp2));
+    GKO_ASSERT_DYNAMIC_TYPE(op1, DummyLinOp);
+    GKO_ASSERT_DYNAMIC_TYPE(op2, DummyLinOp2);
     ASSERT_EQ(op1, ws.get_op(1));
     ASSERT_EQ(op2, ws.get_op(0));
 }
@@ -288,7 +288,7 @@ TEST_F(Workspace, ChecksExactOperatorType)
         0, [&] { return std::make_unique<DerivedDummyLinOp>(exec); },
         typeid(DerivedDummyLinOp), {}, 0);
 
-    ASSERT_EQ(typeid(*op1), typeid(DerivedDummyLinOp));
+    GKO_ASSERT_DYNAMIC_TYPE(op1, DerivedDummyLinOp);
 }
 
 
diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp
index e0ec27b8624..40034883078 100644
--- a/core/test/utils/assertions.hpp
+++ b/core/test/utils/assertions.hpp
@@ -43,17 +43,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <initializer_list>
 #include <string>
 #include <type_traits>
+#include <typeinfo>
 
 
 #include <gtest/gtest.h>
 
 
 #include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/mtx_io.hpp>
+#include <ginkgo/core/base/name_demangling.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/base/batch_utilities.hpp"
 #include "core/base/extended_float.hpp"
 
 
@@ -315,6 +319,86 @@ double get_relative_error(const MatrixData1& first, const MatrixData2& second)
 }
 
 
+template <typename MatrixData1, typename MatrixData2>
+::testing::AssertionResult batch_matrices_near_impl(
+    const std::string& first_expression, const std::string& second_expression,
+    const std::string& tolerance_expression, const MatrixData1& first,
+    const MatrixData2& second, double tolerance)
+{
+    std::vector<double> err;
+    for (size_type b = 0; b < first.size(); ++b) {
+        if (first.size() != second.size()) {
+            return ::testing::AssertionFailure()
+                   << "Expected matrices of equal size\n\t" << first_expression
+                   << " is of size [" << first[b].size[0] << " x "
+                   << first[b].size[1] << "]\n\t" << second_expression
+                   << " is of size [" << second[b].size[0] << " x "
+                   << second[b].size[1] << "]"
+                   << " for batch " << b;
+        }
+
+        err.push_back(detail::get_relative_error(first[b], second[b]));
+    }
+
+    auto bat = std::find_if(err.begin(), err.end(),
+                            [&](double& e) { return !(e <= tolerance); });
+    if (bat == err.end()) {
+        return ::testing::AssertionSuccess();
+    } else {
+        const auto b_pos = static_cast<ptrdiff_t>(bat - err.begin());
+        auto num_rows = first[b_pos].size[0];
+        auto num_cols = first[b_pos].size[1];
+        auto fail = ::testing::AssertionFailure();
+        fail << "Error for batch: " << b_pos << "\n Relative error between "
+             << first_expression << " and " << second_expression << " is "
+             << err[b_pos] << "\n"
+             << "\twhich is larger than " << tolerance_expression
+             << " (which is " << tolerance << ")\n";
+        if (num_rows * num_cols <= 1000) {
+            fail << first_expression << " is:\n";
+            detail::print_matrix(fail, first[b_pos]);
+            fail << second_expression << " is:\n";
+            detail::print_matrix(fail, second[b_pos]);
+            fail << "component-wise relative error is:\n";
+            detail::print_componentwise_error(fail, first[b_pos],
+                                              second[b_pos]);
+        } else {
+            // build output filenames
+            auto test_case_info =
+                ::testing::UnitTest::GetInstance()->current_test_info();
+            auto testname =
+                test_case_info ? std::string{test_case_info->test_case_name()} +
+                                     "." + test_case_info->name()
+                               : std::string{"null"};
+            auto firstfile = testname + "." + first_expression + ".mtx";
+            auto secondfile = testname + "." + second_expression + ".mtx";
+            auto to_remove = [](char c) {
+                return !std::isalnum(c) && c != '_' && c != '.' && c != '-' &&
+                       c != '<' && c != '>';
+            };
+            // remove all but alphanumerical and _.-<> characters from
+            // expressions
+            firstfile.erase(
+                std::remove_if(firstfile.begin(), firstfile.end(), to_remove),
+                firstfile.end());
+            secondfile.erase(
+                std::remove_if(secondfile.begin(), secondfile.end(), to_remove),
+                secondfile.end());
+            // save matrices
+            std::ofstream first_stream{firstfile};
+            gko::write_raw(first_stream, first[b_pos],
+                           gko::layout_type::coordinate);
+            std::ofstream second_stream{secondfile};
+            gko::write_raw(second_stream, second[b_pos],
+                           gko::layout_type::coordinate);
+            fail << first_expression << " saved as " << firstfile << "\n";
+            fail << second_expression << " saved as " << secondfile << "\n";
+        }
+        return fail;
+    }
+}
+
+
 template <typename MatrixData1, typename MatrixData2>
 ::testing::AssertionResult matrices_near_impl(
     const std::string& first_expression, const std::string& second_expression,
@@ -600,6 +684,81 @@ ::testing::AssertionResult values_near<std::complex<half>, std::complex<half>>(
 }
 
 
+/**
+ * This is a gtest predicate which checks if two batch matrices are relatively
+ * near.
+ *
+ * More formally, it checks whether the following equation holds for each of the
+ * matrices in the batch:
+ *
+ * ```
+ * ||first - second|| <= tolerance * max(||first||, ||second||)
+ * ```
+ *
+ * This function should not be called directly, but used in conjunction with
+ * `ASSERT_PRED_FORMAT3` as follows:
+ *
+ * ```
+ * // Check if first and second are near
+ * ASSERT_PRED_FORMAT3(gko::test::assertions::batch_matrices_near,
+ *                     first, second, tolerance);
+ * // Check if first and second are far
+ * ASSERT_PRED_FORMAT3(!gko::test::assertions::batch_matrices_near,
+ *                     first, second, tolerance);
+ * ```
+ *
+ * @see GKO_ASSERT_BATCH_MTX_NEAR
+ * @see GKO_EXPECT_BATCH_MTX_NEAR
+ */
+template <typename Mat1, typename Mat2>
+::testing::AssertionResult batch_matrices_near(
+    const std::string& first_expression, const std::string& second_expression,
+    const std::string& tolerance_expression, const Mat1* first,
+    const Mat2* second, double tolerance)
+{
+    auto exec = first->get_executor()->get_master();
+    using value_type1 = typename Mat1::value_type;
+    using value_type2 = typename Mat2::value_type;
+
+    auto first_data = gko::batch::write<value_type1, int, Mat1>(first);
+    auto second_data = gko::batch::write<value_type2, int, Mat2>(second);
+
+    if (first_data.size() != second_data.size()) {
+        return ::testing::AssertionFailure()
+               << "Expected same batch sizes for " << first_expression
+               << " and " << second_expression << ", but got batch size "
+               << first_data.size() << " for " << first_expression
+               << " and batch size " << second_data.size() << " for "
+               << second_expression;
+    }
+
+    for (size_type b = 0; b < first_data.size(); ++b) {
+        first_data[b].ensure_row_major_order();
+        second_data[b].ensure_row_major_order();
+    }
+
+    return detail::batch_matrices_near_impl(
+        detail::remove_pointer_wrapper(first_expression),
+        detail::remove_pointer_wrapper(second_expression), tolerance_expression,
+        first_data, second_data, tolerance);
+}
+
+
+template <typename Mat1, typename T>
+::testing::AssertionResult batch_matrices_near(
+    const std::string& first_expression, const std::string& second_expression,
+    const std::string& tolerance_expression, const Mat1* first,
+    std::initializer_list<std::initializer_list<T>> second, double tolerance)
+{
+    auto second_mtx =
+        batch::initialize<batch::MultiVector<detail::remove_container<T>>>(
+            second, first->get_executor()->get_master());
+    return batch_matrices_near(
+        first_expression, detail::remove_list_wrapper(second_expression),
+        tolerance_expression, first, second_mtx.get(), tolerance);
+}
+
+
 /**
  * This is a gtest predicate which checks if two matrices are relatively near.
  *
@@ -853,6 +1012,45 @@ ::testing::AssertionResult matrices_equal_sparsity(
 }
 
 
+template <typename Ptr1, typename Ptr2>
+::testing::AssertionResult dynamic_type_eq(const std::string& expr1,
+                                           const std::string& expr2,
+                                           const Ptr1& ptr1, const Ptr2& ptr2)
+{
+    auto& ref1 = *ptr1;
+    auto& ref2 = *ptr2;
+    if (typeid(ref1) == typeid(ref2)) {
+        return ::testing::AssertionSuccess();
+    } else {
+        return ::testing::AssertionFailure()
+               << "mismatching dynamic types\n"
+               << expr1 << " is\n\t"
+               << gko::name_demangling::get_type_name(typeid(ref1)) << "\n"
+               << expr2 << " is\n\t"
+               << gko::name_demangling::get_type_name(typeid(ref2)) << "\n";
+    }
+}
+
+
+template <typename Ptr>
+::testing::AssertionResult dynamic_type_is(const std::string& expr,
+                                           const std::string&, const Ptr& ptr,
+                                           const std::type_info& type)
+{
+    auto& ref = *ptr;
+    if (typeid(ref) == type) {
+        return ::testing::AssertionSuccess();
+    } else {
+        return ::testing::AssertionFailure()
+               << "unexpected dynamic type\n"
+               << expr << " is\n\t"
+               << gko::name_demangling::get_type_name(typeid(ref)) << "\n"
+               << "but we expected\n\t"
+               << gko::name_demangling::get_type_name(type) << "\n";
+    }
+}
+
+
 namespace detail {
 
 
@@ -940,6 +1138,44 @@ T* plain_ptr(T* ptr)
     }
 
 
+/**
+ * Checks if two batched matrices are near each other.
+ *
+ * More formally, it checks whether the following equation holds:
+ *
+ * ```
+ * ||_mtx1 - _mtx2|| <= _tol * max(||_mtx1||, ||_mtx2||)
+ * ```
+ * for all batches
+ *
+ * Has to be called from within a google test unit test.
+ * Internally calls gko::test::assertions::batch_matrices_near().
+ *
+ * @param _mtx1  first matrix
+ * @param _mtx2  second matrix
+ * @param _tol  tolerance level
+ */
+#define GKO_ASSERT_BATCH_MTX_NEAR(_mtx1, _mtx2, _tol)                     \
+    {                                                                     \
+        using ::gko::test::assertions::detail::l;                         \
+        using ::gko::test::assertions::detail::plain_ptr;                 \
+        ASSERT_PRED_FORMAT3(::gko::test::assertions::batch_matrices_near, \
+                            plain_ptr(_mtx1), plain_ptr(_mtx2), _tol);    \
+    }
+
+
+/**
+ * @copydoc GKO_ASSERT_MTX_NEAR
+ */
+#define GKO_EXPECT_BATCH_MTX_NEAR(_mtx1, _mtx2, _tol)                     \
+    {                                                                     \
+        using ::gko::test::assertions::detail::l;                         \
+        using ::gko::test::assertions::detail::plain_ptr;                 \
+        EXPECT_PRED_FORMAT3(::gko::test::assertions::batch_matrices_near, \
+                            plain_ptr(_mtx1), plain_ptr(_mtx2), _tol);    \
+    }
+
+
 /**
  * Checks if two matrices are near each other.
  *
@@ -1054,4 +1290,51 @@ T* plain_ptr(T* ptr)
     }
 
 
+/**
+ * Checks if the dynamic types of the objects referenced by two pointers are
+ * equal.
+ *
+ * @param _ptr1  the first pointer
+ * @param _ptr2  the second pointer
+ */
+#define GKO_ASSERT_DYNAMIC_TYPE_EQ(_ptr1, _ptr2)                             \
+    {                                                                        \
+        ASSERT_PRED_FORMAT2(::gko::test::assertions::dynamic_type_eq, _ptr1, \
+                            _ptr2);                                          \
+    }
+
+
+/**
+ * @copydoc GKO_ASSERT_DYNAMIC_TYPE_EQ
+ */
+#define GKO_EXPECT_DYNAMIC_TYPE_EQ(_ptr1, _ptr2)                             \
+    {                                                                        \
+        EXPECT_PRED_FORMAT2(::gko::test::assertions::dynamic_type_eq, _ptr1, \
+                            _ptr2);                                          \
+    }
+
+
+/**
+ * Checks if the dynamic type of a pointer to an object matches a given type
+ *
+ * @param _ptr  the pointer
+ * @param _type  the expected type
+ */
+#define GKO_ASSERT_DYNAMIC_TYPE(_ptr, _type)                                \
+    {                                                                       \
+        ASSERT_PRED_FORMAT2(::gko::test::assertions::dynamic_type_is, _ptr, \
+                            typeid(_type));                                 \
+    }
+
+
+/**
+ * @copydoc GKO_ASSERT_DYNAMIC_TYPE
+ */
+#define GKO_EXPECT_DYNAMIC_TYPE(_ptr, _type)                                \
+    {                                                                       \
+        EXPECT_PRED_FORMAT2(::gko::test::assertions::dynamic_type_is, _ptr, \
+                            typeid(_type));                                 \
+    }
+
+
 #endif  // GKO_CORE_TEST_UTILS_ASSERTIONS_HPP_
diff --git a/core/test/utils/assertions_test.cpp b/core/test/utils/assertions_test.cpp
index 2e3cbefaaf6..029af45e076 100644
--- a/core/test/utils/assertions_test.cpp
+++ b/core/test/utils/assertions_test.cpp
@@ -98,7 +98,7 @@ class MatricesNear : public ::testing::Test {
 };
 
 
-TEST_F(MatricesNear, SuceedsIfSame)
+TEST_F(MatricesNear, SucceedsIfSame)
 {
     ASSERT_PRED_FORMAT3(gko::test::assertions::matrices_near, mtx1.get(),
                         mtx1.get(), 0.0);
diff --git a/core/test/utils/batch_helpers.hpp b/core/test/utils/batch_helpers.hpp
new file mode 100644
index 00000000000..eee31050505
--- /dev/null
+++ b/core/test/utils/batch_helpers.hpp
@@ -0,0 +1,390 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_TEST_UTILS_BATCH_HELPERS_HPP_
+#define GKO_CORE_TEST_UTILS_BATCH_HELPERS_HPP_
+
+
+#include <random>
+#include <vector>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/device_matrix_data.hpp>
+#include <ginkgo/core/base/matrix_data.hpp>
+#include <ginkgo/core/log/batch_logger.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/test/utils/assertions.hpp"
+#include "core/test/utils/matrix_generator.hpp"
+#include "core/utils/matrix_utils.hpp"
+
+
+namespace gko {
+namespace test {
+
+
+/**
+ * Converts a vector of unique pointers to a vector of shared pointers.
+ */
+template <typename T>
+std::vector<std::shared_ptr<T>> share(std::vector<std::unique_ptr<T>>&& objs)
+{
+    std::vector<std::shared_ptr<T>> out;
+    out.reserve(objs.size());
+    for (auto& obj : objs) {
+        out.push_back(std::move(obj));
+    }
+    return out;
+}
+
+
+/**
+ * Generates a batch of random matrices of the specified type.
+ */
+template <typename MatrixType, typename NonzeroDistribution,
+          typename ValueDistribution, typename Engine, typename... MatrixArgs>
+std::unique_ptr<MatrixType> generate_random_batch_matrix(
+    const size_type num_batch_items, const size_type num_rows,
+    const size_type num_cols, NonzeroDistribution&& nonzero_dist,
+    ValueDistribution&& value_dist, Engine&& engine,
+    std::shared_ptr<const Executor> exec, MatrixArgs&&... args)
+{
+    using value_type = typename MatrixType::value_type;
+    using index_type = typename MatrixType::index_type;
+    auto result = MatrixType::create(
+        exec, batch_dim<2>(num_batch_items, dim<2>(num_rows, num_cols)),
+        std::forward<MatrixArgs>(args)...);
+    auto sp_mat = generate_random_device_matrix_data<value_type, index_type>(
+        num_rows, num_cols, nonzero_dist, value_dist, engine,
+        exec->get_master());
+    auto row_idxs = gko::array<index_type>::const_view(
+                        exec->get_master(), sp_mat.get_num_elems(),
+                        sp_mat.get_const_row_idxs())
+                        .copy_to_array();
+    auto col_idxs = gko::array<index_type>::const_view(
+                        exec->get_master(), sp_mat.get_num_elems(),
+                        sp_mat.get_const_col_idxs())
+                        .copy_to_array();
+
+    for (size_type b = 0; b < num_batch_items; b++) {
+        auto rand_mat =
+            fill_random_matrix<typename MatrixType::unbatch_type, index_type>(
+                num_rows, num_cols, row_idxs, col_idxs, value_dist, engine,
+                exec);
+        result->create_view_for_item(b)->copy_from(rand_mat.get());
+    }
+
+    return result;
+}
+
+
+/**
+ * Generate a batch of 1D Poisson (3pt stencil, {-1, 5, -1}) matrices in the
+ * given input matrix format.
+ *
+ * @tparam MatrixType  The concrete type of the output matrix.
+ *
+ * @param exec  The executor.
+ * @param num_rows  The size (number of rows) of the generated matrix
+ * @param num_batch_items  The number of Poisson matrices in the batch
+ * @param args  The create args to be forwarded to the matrix
+ */
+template <typename MatrixType, typename... MatrixArgs>
+std::unique_ptr<const MatrixType> generate_3pt_stencil_batch_matrix(
+    std::shared_ptr<const Executor> exec, const size_type num_batch_items,
+    const int num_rows, MatrixArgs&&... args)
+{
+    using value_type = typename MatrixType::value_type;
+    using index_type = typename MatrixType::index_type;
+    const int num_cols = num_rows;
+    gko::matrix_data<value_type, index_type> data{
+        gko::dim<2>{static_cast<size_type>(num_rows),
+                    static_cast<size_type>(num_cols)},
+        {}};
+    for (int row = 0; row < num_rows; ++row) {
+        if (row > 0) {
+            data.nonzeros.emplace_back(row - 1, row, value_type{-1.0});
+        }
+        data.nonzeros.emplace_back(row, row, value_type{5.0});
+        if (row < num_rows - 1) {
+            data.nonzeros.emplace_back(row, row + 1, value_type{-1.0});
+        }
+    }
+
+    std::vector<gko::matrix_data<value_type, index_type>> batch_data(
+        num_batch_items, data);
+    return gko::batch::read<value_type, index_type, MatrixType>(
+        exec, batch_data, std::forward<MatrixArgs>(args)...);
+}
+
+
+template <typename MatrixType, typename... MatrixArgs>
+std::unique_ptr<const MatrixType> generate_diag_dominant_batch_matrix(
+    std::shared_ptr<const gko::Executor> exec, const size_type num_batch_items,
+    const int num_rows, const bool is_hermitian, MatrixArgs&&... args)
+{
+    using value_type = typename MatrixType::value_type;
+    using index_type = typename MatrixType::index_type;
+    using real_type = remove_complex<value_type>;
+    using unbatch_type = typename MatrixType::unbatch_type;
+    using multi_vec = batch::MultiVector<value_type>;
+    using real_vec = batch::MultiVector<real_type>;
+    const int num_cols = num_rows;
+    gko::matrix_data<value_type, index_type> data{
+        gko::dim<2>{static_cast<size_type>(num_rows),
+                    static_cast<size_type>(num_cols)},
+        {}};
+    auto engine = std::default_random_engine(42);
+    auto rand_diag_dist = std::normal_distribution<real_type>(8.0, 1.0);
+    for (int row = 0; row < num_rows; ++row) {
+        std::uniform_int_distribution<index_type> rand_nnz_dist{1, row + 1};
+        const auto k = rand_nnz_dist(engine);
+        if (row > 0) {
+            data.nonzeros.emplace_back(row - 1, row, value_type{-1.0});
+        }
+        data.nonzeros.emplace_back(
+            row, row,
+            std::abs(static_cast<value_type>(
+                detail::get_rand_value<real_type>(rand_diag_dist, engine))));
+        if (row < num_rows - 1) {
+            data.nonzeros.emplace_back(row, k, value_type{-1.0});
+            data.nonzeros.emplace_back(row, row + 1, value_type{-1.0});
+        }
+    }
+
+    if (is_hermitian) {
+        gko::utils::make_hpd(data);
+    }
+    data.ensure_row_major_order();
+
+    auto soa_data =
+        gko::device_matrix_data<value_type, index_type>::create_from_host(
+            exec->get_master(), data);
+    auto row_idxs = gko::array<index_type>::const_view(
+                        exec->get_master(), soa_data.get_num_elems(),
+                        soa_data.get_const_row_idxs())
+                        .copy_to_array();
+    auto col_idxs = gko::array<index_type>::const_view(
+                        exec->get_master(), soa_data.get_num_elems(),
+                        soa_data.get_const_col_idxs())
+                        .copy_to_array();
+
+    std::vector<gko::matrix_data<value_type, index_type>> batch_data;
+    batch_data.reserve(num_batch_items);
+    batch_data.emplace_back(data);
+    auto rand_val_dist = std::normal_distribution<>(-0.5, 0.5);
+    for (size_type b = 1; b < num_batch_items; b++) {
+        auto rand_data = fill_random_matrix_data<value_type, index_type>(
+            num_rows, num_cols, row_idxs, col_idxs, rand_val_dist, engine);
+        gko::utils::make_diag_dominant(rand_data);
+        GKO_ASSERT(rand_data.size == batch_data.at(0).size);
+        GKO_ASSERT(rand_data.nonzeros.size() == data.nonzeros.size());
+        // Copy over the diagonal values
+        for (int i = 0; i < data.nonzeros.size(); ++i) {
+            if (data.nonzeros[i].row == data.nonzeros[i].column) {
+                rand_data.nonzeros[i] = data.nonzeros[i];
+            }
+        }
+        batch_data.emplace_back(rand_data);
+    }
+    return gko::batch::read<value_type, index_type, MatrixType>(
+        exec, batch_data, std::forward<MatrixArgs>(args)...);
+}
+
+
+template <typename MatrixType>
+struct LinearSystem {
+    using value_type = typename MatrixType::value_type;
+    using multi_vec = batch::MultiVector<value_type>;
+    using real_vec = batch::MultiVector<remove_complex<value_type>>;
+
+    std::shared_ptr<const MatrixType> matrix;
+    std::shared_ptr<multi_vec> rhs;
+    std::shared_ptr<real_vec> host_rhs_norm;
+    std::shared_ptr<multi_vec> exact_sol;
+};
+
+
+template <typename MatrixType>
+LinearSystem<MatrixType> generate_batch_linear_system(
+    std::shared_ptr<const MatrixType> input_batch_matrix, const int num_rhs)
+{
+    using value_type = typename MatrixType::value_type;
+    using index_type = typename MatrixType::index_type;
+    using multi_vec = batch::MultiVector<value_type>;
+    using real_vec = batch::MultiVector<remove_complex<value_type>>;
+    LinearSystem<MatrixType> sys;
+    sys.matrix = input_batch_matrix;
+    const auto num_batch_items = sys.matrix->get_num_batch_items();
+    const auto num_rows = sys.matrix->get_common_size()[0];
+    auto exec = sys.matrix->get_executor();
+    sys.exact_sol = multi_vec::create(
+        exec, batch_dim<2>(num_batch_items, gko::dim<2>(num_rows, num_rhs)));
+    sys.exact_sol->fill(value_type{2.0});
+
+    sys.rhs = multi_vec::create_with_config_of(sys.exact_sol);
+    // A * x_{exact} = b
+    sys.matrix->apply(sys.exact_sol, sys.rhs);
+    const gko::batch_dim<2> norm_dim(num_batch_items, gko::dim<2>(1, num_rhs));
+    sys.host_rhs_norm = real_vec::create(exec->get_master(), norm_dim);
+    sys.rhs->compute_norm2(sys.host_rhs_norm.get());
+    return sys;
+}
+
+
+template <typename MatrixType>
+std::unique_ptr<
+    batch::MultiVector<remove_complex<typename MatrixType::value_type>>>
+compute_residual_norms(
+    const MatrixType* mtx,
+    const batch::MultiVector<typename MatrixType::value_type>* b,
+    const batch::MultiVector<typename MatrixType::value_type>* x)
+{
+    using value_type = typename MatrixType::value_type;
+    using multi_vec = batch::MultiVector<value_type>;
+    using real_vec = batch::MultiVector<remove_complex<value_type>>;
+    auto exec = mtx->get_executor();
+    auto num_batch_items = x->get_num_batch_items();
+    auto num_rhs = x->get_common_size()[1];
+    const gko::batch_dim<2> norm_dim(num_batch_items, gko::dim<2>(1, num_rhs));
+
+    auto residual_vec = b->clone();
+    auto res_norm = real_vec::create(exec->get_master(), norm_dim);
+    auto alpha =
+        gko::batch::initialize<multi_vec>(num_batch_items, {-1.0}, exec);
+    auto beta = gko::batch::initialize<multi_vec>(num_batch_items, {1.0}, exec);
+    mtx->apply(alpha, x, beta, residual_vec);
+    residual_vec->compute_norm2(res_norm);
+    return res_norm;
+}
+
+
+template <typename ValueType>
+struct Result {
+    using multi_vec = batch::MultiVector<ValueType>;
+    using real_vec = batch::MultiVector<remove_complex<ValueType>>;
+
+    std::shared_ptr<multi_vec> x;
+    std::shared_ptr<real_vec> host_res_norm;
+};
+
+
+template <typename ValueType>
+struct ResultWithLogData : public Result<ValueType> {
+    std::unique_ptr<
+        gko::batch::log::detail::log_data<remove_complex<ValueType>>>
+        log_data;
+};
+
+
+template <typename MatrixType, typename SolverType>
+Result<typename MatrixType::value_type> solve_linear_system(
+    std::shared_ptr<const Executor> exec, const LinearSystem<MatrixType>& sys,
+    std::shared_ptr<SolverType> solver)
+{
+    using value_type = typename MatrixType::value_type;
+    using real_type = remove_complex<value_type>;
+    using multi_vec = typename Result<value_type>::multi_vec;
+    using real_vec = typename Result<value_type>::real_vec;
+
+    const size_type num_batch_items = sys.matrix->get_num_batch_items();
+    const int num_rows = sys.matrix->get_common_size()[0];
+    const int num_rhs = sys.rhs->get_common_size()[1];
+    const gko::batch_dim<2> vec_size(num_batch_items,
+                                     gko::dim<2>(num_rows, num_rhs));
+    const gko::batch_dim<2> norm_size(num_batch_items, gko::dim<2>(1, num_rhs));
+
+    Result<value_type> result;
+    result.x = multi_vec::create_with_config_of(sys.rhs);
+    result.x->fill(zero<value_type>());
+
+    solver->apply(sys.rhs, result.x);
+    result.host_res_norm =
+        compute_residual_norms(sys.matrix.get(), sys.rhs.get(), result.x.get());
+
+    return std::move(result);
+}
+
+
+template <typename MatrixType, typename SolveLambda, typename Settings>
+ResultWithLogData<typename MatrixType::value_type> solve_linear_system(
+    std::shared_ptr<const Executor> exec, SolveLambda solve_lambda,
+    const Settings settings, const LinearSystem<MatrixType>& sys,
+    std::shared_ptr<batch::BatchLinOpFactory> precond_factory = nullptr)
+{
+    using value_type = typename MatrixType::value_type;
+    using real_type = remove_complex<value_type>;
+    using multi_vec = typename Result<value_type>::multi_vec;
+    using real_vec = typename Result<value_type>::real_vec;
+
+    const size_type num_batch_items = sys.matrix->get_num_batch_items();
+    const int num_rows = sys.matrix->get_common_size()[0];
+    const int num_rhs = sys.rhs->get_common_size()[1];
+    const gko::batch_dim<2> norm_size(num_batch_items, gko::dim<2>(1, num_rhs));
+
+    ResultWithLogData<value_type> result;
+    result.x = multi_vec::create_with_config_of(sys.rhs);
+    result.x->fill(zero<value_type>());
+
+    auto log_data = std::make_unique<batch::log::detail::log_data<real_type>>(
+        exec, num_batch_items);
+
+    std::unique_ptr<gko::batch::BatchLinOp> precond;
+    if (precond_factory) {
+        precond = precond_factory->generate(sys.matrix);
+    } else {
+        precond = nullptr;
+    }
+
+    solve_lambda(settings, precond.get(), sys.matrix.get(), sys.rhs.get(),
+                 result.x.get(), *log_data.get());
+
+
+    result.log_data = std::make_unique<batch::log::detail::log_data<real_type>>(
+        exec->get_master(), num_batch_items);
+    result.log_data->iter_counts = log_data->iter_counts;
+    result.log_data->res_norms = log_data->res_norms;
+
+    result.host_res_norm =
+        compute_residual_norms(sys.matrix.get(), sys.rhs.get(), result.x.get());
+
+    return std::move(result);
+}
+
+
+}  // namespace test
+}  // namespace gko
+
+
+#endif  // GKO_CORE_TEST_UTILS_BATCH_HELPERS_HPP_
diff --git a/core/test/utils/fb_matrix_generator.hpp b/core/test/utils/fb_matrix_generator.hpp
index 1c5c818757b..7c43b0905c1 100644
--- a/core/test/utils/fb_matrix_generator.hpp
+++ b/core/test/utils/fb_matrix_generator.hpp
@@ -129,7 +129,7 @@ std::unique_ptr<MatrixType> generate_random_matrix_with_diag(
  *                generated FBCSR matrix.
  * @param block_size  Block size of output Fbcsr matrix.
  * @param row_diag_dominant  If true, a row-diagonal-dominant Fbcsr matrix is
- *                           generated. Note that in this case, the intput Csr
+ *                           generated. Note that in this case, the input Csr
  *                           matrix must have diagonal entries in all rows.
  * @param rand_engine  Random number engine to use, such as
  * std::default_random_engine.
diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp
index 23ab84cc491..d5370c6ef6a 100644
--- a/core/test/utils/matrix_generator.hpp
+++ b/core/test/utils/matrix_generator.hpp
@@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <vector>
 
 
+#include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/utils.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
@@ -54,6 +55,49 @@ namespace gko {
 namespace test {
 
 
+/**
+ * Fills matrix data for a random matrix given a sparsity pattern
+ *
+ * @tparam ValueType  the type for matrix values
+ * @tparam IndexType  the type for row and column indices
+ * @tparam ValueDistribution  type of value distribution
+ * @tparam Engine  type of random engine
+ *
+ * @param num_rows  number of rows
+ * @param num_cols  number of columns
+ * @param row_idxs  the row indices of the matrix
+ * @param col_idxs  the column indices of the matrix
+ * @param value_dist  distribution of matrix values
+ * @param engine  a random engine
+ *
+ * @return the generated matrix_data with entries according to the given
+ *         dimensions and nonzero count and value distributions.
+ */
+template <typename ValueType, typename IndexType, typename ValueDistribution,
+          typename Engine>
+matrix_data<ValueType, IndexType> fill_random_matrix_data(
+    size_type num_rows, size_type num_cols,
+    const gko::array<IndexType>& row_indices,
+    const gko::array<IndexType>& col_indices, ValueDistribution&& value_dist,
+    Engine&& engine)
+{
+    matrix_data<ValueType, IndexType> data{gko::dim<2>{num_rows, num_cols}, {}};
+    auto host_exec = row_indices.get_executor()->get_master();
+    auto host_row_indices = make_temporary_clone(host_exec, &row_indices);
+    auto host_col_indices = make_temporary_clone(host_exec, &col_indices);
+
+    for (int nnz = 0; nnz < row_indices.get_num_elems(); ++nnz) {
+        data.nonzeros.emplace_back(
+            host_row_indices->get_const_data()[nnz],
+            host_col_indices->get_const_data()[nnz],
+            detail::get_rand_value<ValueType>(value_dist, engine));
+    }
+
+    data.ensure_row_major_order();
+    return data;
+}
+
+
 /**
  * Generates matrix data for a random matrix.
  *
@@ -156,6 +200,49 @@ generate_random_device_matrix_data(gko::size_type num_rows,
 }
 
 
+/**
+ * Fills a random matrix with given sparsity pattern.
+ *
+ * @tparam MatrixType  type of matrix to generate (must implement
+ *                     the interface `ReadableFromMatrixData<>` and provide
+ *                     matching `value_type` and `index_type` type aliases)
+ * @tparam IndexType  the type for row and column indices
+ * @tparam ValueDistribution  type of value distribution
+ * @tparam Engine  type of random engine
+ *
+ * @param num_rows  number of rows
+ * @param num_cols  number of columns
+ * @param row_idxs  the row indices of the matrix
+ * @param col_idxs  the column indices of the matrix
+ * @param value_dist  distribution of matrix values
+ * @param exec  executor where the matrix should be allocated
+ * @param args  additional arguments for the matrix constructor
+ *
+ * @return the unique pointer of MatrixType
+ */
+template <typename MatrixType = matrix::Dense<>,
+          typename IndexType = typename MatrixType::index_type,
+          typename ValueDistribution, typename Engine, typename... MatrixArgs>
+std::unique_ptr<MatrixType> fill_random_matrix(
+    size_type num_rows, size_type num_cols,
+    const gko::array<IndexType>& row_idxs,
+    const gko::array<IndexType>& col_idxs, ValueDistribution&& value_dist,
+    Engine&& engine, std::shared_ptr<const Executor> exec, MatrixArgs&&... args)
+{
+    using value_type = typename MatrixType::value_type;
+    using index_type = IndexType;
+
+    GKO_ASSERT(row_idxs.get_num_elems() == col_idxs.get_num_elems());
+    GKO_ASSERT(row_idxs.get_num_elems() <= (num_rows * num_cols));
+    auto result = MatrixType::create(exec, std::forward<MatrixArgs>(args)...);
+    result->read(fill_random_matrix_data<value_type, index_type>(
+        num_rows, num_cols, row_idxs, col_idxs,
+        std::forward<ValueDistribution>(value_dist),
+        std::forward<Engine>(engine)));
+    return result;
+}
+
+
 /**
  * Generates a random matrix.
  *
@@ -163,6 +250,10 @@ generate_random_device_matrix_data(gko::size_type num_rows,
  *                     the interface `ReadableFromMatrixData<>` and provide
  *                     matching `value_type` and `index_type` type aliases)
  *
+ * @param num_rows  number of rows
+ * @param num_cols  number of columns
+ * @param nonzero_dist  distribution of nonzeros per row
+ * @param value_dist  distribution of matrix values
  * @param exec  executor where the matrix should be allocated
  * @param args  additional arguments for the matrix constructor
  *
@@ -553,7 +644,7 @@ std::unique_ptr<MatrixType> generate_tridiag_matrix(
 /**
  * This computes an inverse of an tridiagonal Toeplitz matrix.
  *
- * The compuation is based on the formula is from
+ * The computation is based on the formula is from
  * https://en.wikipedia.org/wiki/Tridiagonal_matrix#Inversion
  *
  * @param size  the (square) size of the resulting matrix
diff --git a/core/test/utils/matrix_utils_test.cpp b/core/test/utils/matrix_utils_test.cpp
index 31a6072270e..cc5ed70966d 100644
--- a/core/test/utils/matrix_utils_test.cpp
+++ b/core/test/utils/matrix_utils_test.cpp
@@ -355,29 +355,37 @@ TEST(MatrixUtils, ModifyToEnsureAllDiagonalEntries)
     using T = float;
     using Csr = gko::matrix::Csr<T, int>;
     auto exec = gko::ReferenceExecutor::create();
-    auto b = gko::initialize<Csr>(
-        {I<T>{2.0, 0.0, 1.1, 0.0}, I<T>{1.0, 2.4, 0.0, -1.0},
-         I<T>{0.0, -4.0, 2.2, -2.0}, I<T>{0.0, -3.0, 1.5, 1.0}},
-        exec);
+    auto check_all_diag = [](const Csr* csr) {
+        const auto rowptrs = csr->get_const_row_ptrs();
+        const auto colidxs = csr->get_const_col_idxs();
+        const auto ndiag =
+            static_cast<int>(std::min(csr->get_size()[0], csr->get_size()[1]));
+        bool all_diags = true;
+        for (int i = 0; i < ndiag; i++) {
+            bool has_diag = false;
+            for (int j = rowptrs[i]; j < rowptrs[i + 1]; j++) {
+                if (colidxs[j] == i) {
+                    has_diag = true;
+                    break;
+                }
+            }
+            if (!has_diag) {
+                all_diags = false;
+                break;
+            }
+        }
+        return all_diags;
+    };
+    auto b = gko::initialize<Csr>({I<T>{2.0, 0.0, 1.1}, I<T>{1.0, 0.0, 0.0},
+                                   I<T>{0.0, -4.0, 2.2}, I<T>{0.0, -3.0, 1.5}},
+                                  exec);
+    // ensure it misses some diag
+    bool prev_check = check_all_diag(b.get());
 
     gko::utils::ensure_all_diagonal_entries(b.get());
 
-    const auto rowptrs = b->get_const_row_ptrs();
-    const auto colidxs = b->get_const_col_idxs();
-    bool all_diags = true;
-    for (int i = 0; i < 3; i++) {
-        bool has_diag = false;
-        for (int j = rowptrs[i]; j < rowptrs[i + 1]; j++) {
-            if (colidxs[j] == i) {
-                has_diag = true;
-            }
-        }
-        if (!has_diag) {
-            all_diags = false;
-            break;
-        }
-    }
-    ASSERT_TRUE(all_diags);
+    ASSERT_FALSE(prev_check);
+    ASSERT_TRUE(check_all_diag(b.get()));
 }
 
 
diff --git a/core/test/utils/unsort_matrix.hpp b/core/test/utils/unsort_matrix.hpp
index 04ece71d346..e22d86b326e 100644
--- a/core/test/utils/unsort_matrix.hpp
+++ b/core/test/utils/unsort_matrix.hpp
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
+#include <ginkgo/core/base/temporary_clone.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 
 
@@ -55,24 +56,8 @@ namespace test {
 template <typename MtxPtr, typename RandomEngine>
 void unsort_matrix(MtxPtr&& mtx, RandomEngine&& engine)
 {
-    using value_type = gko::detail::pointee<decltype(mtx->get_values())>;
-    using index_type = gko::detail::pointee<decltype(mtx->get_col_idxs())>;
-    auto nnz = mtx->get_num_stored_elements();
-    if (nnz <= 0) {
-        return;
-    }
-
-    const auto exec = mtx->get_executor();
-    const auto master = exec->get_master();
-
-    // If exec is not the master/host, extract the master and perform the
-    // unsorting there, followed by copying it back
-    if (exec != master) {
-        auto h_mtx = mtx->clone(master);
-        unsort_matrix(h_mtx, engine);
-        mtx->copy_from(h_mtx);
-        return;
-    }
+    using value_type = typename gko::detail::pointee<MtxPtr>::value_type;
+    using index_type = typename gko::detail::pointee<MtxPtr>::index_type;
     matrix_data<value_type, index_type> data;
     mtx->write(data);
     auto& nonzeros = data.nonzeros;
diff --git a/core/utils/matrix_utils.hpp b/core/utils/matrix_utils.hpp
index fed92ad73ef..65b610d1a1d 100644
--- a/core/utils/matrix_utils.hpp
+++ b/core/utils/matrix_utils.hpp
@@ -301,9 +301,10 @@ void ensure_all_diagonal_entries(MtxType* mtx)
     using index_type = typename MtxType::index_type;
     matrix_data<value_type, index_type> mdata;
     mtx->write(mdata);
-    const auto nrows = static_cast<index_type>(mtx->get_size()[0]);
-    mdata.nonzeros.reserve(mtx->get_num_stored_elements() + nrows);
-    for (index_type i = 0; i < nrows; i++) {
+    const auto ndiag = static_cast<index_type>(
+        std::min(mtx->get_size()[0], mtx->get_size()[1]));
+    mdata.nonzeros.reserve(mtx->get_num_stored_elements() + ndiag);
+    for (index_type i = 0; i < ndiag; i++) {
         mdata.nonzeros.push_back({i, i, zero<value_type>()});
     }
     mdata.sum_duplicates();
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index bbe7a953dbd..1efa8192aeb 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -1,15 +1,27 @@
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 add_library(ginkgo_cuda $<TARGET_OBJECTS:ginkgo_cuda_device> "")
+include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
+add_instantiation_files(. matrix/csr_kernels.instantiate.cu CSR_INSTANTIATE)
+add_instantiation_files(. matrix/fbcsr_kernels.instantiate.cu FBCSR_INSTANTIATE)
+# we don't split up the dense kernels into distinct compilations
+list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 target_sources(ginkgo_cuda
     PRIVATE
+    base/batch_multi_vector_kernels.cu
+    base/device.cpp
     base/device_matrix_data_kernels.cu
     base/exception.cpp
     base/executor.cpp
     base/index_set_kernels.cpp
+    base/memory.cpp
+    base/nvtx.cpp
     base/scoped_device_id.cpp
+    base/stream.cpp
     base/timer.cpp
     base/version.cpp
     components/prefix_sum_kernels.cu
     distributed/matrix_kernels.cu
+    distributed/partition_helpers_kernels.cu
     distributed/partition_kernels.cu
     distributed/vector_kernels.cu
     factorization/cholesky_kernels.cu
@@ -26,12 +38,14 @@ target_sources(ginkgo_cuda
     factorization/par_ilut_select_kernel.cu
     factorization/par_ilut_spgeam_kernel.cu
     factorization/par_ilut_sweep_kernel.cu
+    matrix/batch_dense_kernels.cu
+    matrix/batch_ell_kernels.cu
     matrix/coo_kernels.cu
-    matrix/csr_kernels.cu
+    ${CSR_INSTANTIATE}
     matrix/dense_kernels.cu
     matrix/diagonal_kernels.cu
     matrix/ell_kernels.cu
-    matrix/fbcsr_kernels.cu
+    ${FBCSR_INSTANTIATE}
     matrix/fft_kernels.cu
     matrix/sellp_kernels.cu
     matrix/sparsity_csr_kernels.cu
@@ -42,6 +56,7 @@ target_sources(ginkgo_cuda
     preconditioner/jacobi_kernels.cu
     preconditioner/jacobi_simple_apply_kernel.cu
     reorder/rcm_kernels.cu
+    solver/batch_bicgstab_kernels.cu
     solver/cb_gmres_kernels.cu
     solver/idr_kernels.cu
     solver/lower_trs_kernels.cu
@@ -104,26 +119,16 @@ target_compile_options(ginkgo_cuda PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${GINKGO_C
 target_compile_options(ginkgo_cuda PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${GINKGO_COMPILER_FLAGS}>)
 ginkgo_compile_features(ginkgo_cuda)
 target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA)
-target_include_directories(ginkgo_cuda
-    SYSTEM PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
 
 # include path for generated headers like jacobi_common.hpp
 target_include_directories(ginkgo_cuda
     PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/..)
-target_link_libraries(ginkgo_cuda PRIVATE ${CUDA_RUNTIME_LIBS} ${CUBLAS} ${CUSPARSE} ${CURAND} ${CUFFT} nvtx::nvtx)
+target_link_libraries(ginkgo_cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cusparse CUDA::curand CUDA::cufft nvtx::nvtx)
 # NVTX3 is header-only and requires dlopen/dlclose in static builds
 target_link_libraries(ginkgo_cuda PUBLIC ginkgo_device ${CMAKE_DL_LIBS})
-target_compile_options(ginkgo_cuda
-        PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${GINKGO_CUDA_ARCH_FLAGS}>")
-# we handle CUDA architecture flags for now, disable CMake handling
-if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
-    set_target_properties(ginkgo_cuda PROPERTIES CUDA_ARCHITECTURES OFF)
-endif()
-list(GET CUDA_RUNTIME_LIBS 0 CUDA_FIRST_LIB)
-get_filename_component(GKO_CUDA_LIBDIR "${CUDA_FIRST_LIB}" DIRECTORY)
 
 ginkgo_default_includes(ginkgo_cuda)
-ginkgo_install_library(ginkgo_cuda "${GKO_CUDA_LIBDIR}")
+ginkgo_install_library(ginkgo_cuda)
 
 if (GINKGO_CHECK_CIRCULAR_DEPS)
     ginkgo_check_headers(ginkgo_cuda GKO_COMPILING_CUDA)
diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu
new file mode 100644
index 00000000000..5c4d1f5bdc5
--- /dev/null
+++ b/cuda/base/batch_multi_vector_kernels.cu
@@ -0,0 +1,85 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/base/batch_multi_vector_kernels.hpp"
+
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/base/config.hpp"
+#include "cuda/base/cublas_bindings.hpp"
+#include "cuda/base/pointer_mode_guard.hpp"
+#include "cuda/base/thrust.cuh"
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/components/reduction.cuh"
+#include "cuda/components/thread_ids.cuh"
+#include "cuda/components/uninitialized_array.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The MultiVector matrix format namespace.
+ *
+ * @ingroup batch_multi_vector
+ */
+namespace batch_multi_vector {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_multi_vector
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/base/batch_struct.hpp b/cuda/base/batch_struct.hpp
new file mode 100644
index 00000000000..14b300c9204
--- /dev/null
+++ b/cuda/base/batch_struct.hpp
@@ -0,0 +1,93 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CUDA_BASE_BATCH_STRUCT_HPP_
+#define GKO_CUDA_BASE_BATCH_STRUCT_HPP_
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "cuda/base/config.hpp"
+#include "cuda/base/types.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+
+
+/** @file batch_struct.hpp
+ *
+ * Helper functions to generate a batch struct from a batch LinOp,
+ * while also shallow-casting to the required CUDA scalar type.
+ *
+ * A specialization is needed for every format of every kind of linear algebra
+ * object. These are intended to be called on the host.
+ */
+
+
+/**
+ * Generates an immutable uniform batch struct from a batch of multi-vectors.
+ */
+template <typename ValueType>
+inline batch::multi_vector::uniform_batch<const cuda_type<ValueType>>
+get_batch_struct(const batch::MultiVector<ValueType>* const op)
+{
+    return {as_cuda_type(op->get_const_values()), op->get_num_batch_items(),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1])};
+}
+
+/**
+ * Generates a uniform batch struct from a batch of multi-vectors.
+ */
+template <typename ValueType>
+inline batch::multi_vector::uniform_batch<cuda_type<ValueType>>
+get_batch_struct(batch::MultiVector<ValueType>* const op)
+{
+    return {as_cuda_type(op->get_values()), op->get_num_batch_items(),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1])};
+}
+
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CUDA_BASE_BATCH_STRUCT_HPP_
diff --git a/cuda/base/curand_bindings.hpp b/cuda/base/curand_bindings.hpp
index 429481ec9b6..4bf12dd9064 100644
--- a/cuda/base/curand_bindings.hpp
+++ b/cuda/base/curand_bindings.hpp
@@ -83,6 +83,12 @@ inline curandGenerator_t rand_generator(int64 seed,
 }
 
 
+inline void destroy(curandGenerator_t gen)
+{
+    GKO_ASSERT_NO_CURAND_ERRORS(curandDestroyGenerator(gen));
+}
+
+
 #define GKO_BIND_CURAND_RANDOM_VECTOR(ValueType, CurandName)                 \
     inline void rand_vector(                                                 \
         curandGenerator_t& gen, int n, remove_complex<ValueType> mean,       \
diff --git a/cuda/base/device.cpp b/cuda/base/device.cpp
new file mode 100644
index 00000000000..32cf6265160
--- /dev/null
+++ b/cuda/base/device.cpp
@@ -0,0 +1,71 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <cuda_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "cuda/base/device.hpp"
+#include "cuda/base/scoped_device_id.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+
+
+void reset_device(int device_id)
+{
+    gko::detail::cuda_scoped_device_id_guard guard{device_id};
+    cudaDeviceReset();
+}
+
+
+void destroy_event(CUevent_st* event)
+{
+    GKO_ASSERT_NO_CUDA_ERRORS(cudaEventDestroy(event));
+}
+
+
+std::string get_device_name(int device_id)
+{
+    cudaDeviceProp prop;
+    GKO_ASSERT_NO_CUDA_ERRORS(cudaGetDeviceProperties(&prop, device_id));
+    return {prop.name};
+}
+
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/base/device.hpp b/cuda/base/device.hpp
index 7bd9390c54e..e363f455300 100644
--- a/cuda/base/device.hpp
+++ b/cuda/base/device.hpp
@@ -50,6 +50,10 @@ void reset_device(int device_id);
 void destroy_event(CUevent_st* event);
 
 
+/** returns cudaDeviceProp.name for the given device */
+std::string get_device_name(int device_id);
+
+
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index e474d9c9f49..f296fb9da86 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -39,18 +39,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <cuda_runtime.h>
-#ifdef GKO_LEGACY_NVTX
-#include <nvToolsExt.h>
-#else
-#include <nvtx3/nvToolsExt.h>
-#endif
 
 
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/device.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/log/profiler_hook.hpp>
+#include <ginkgo/core/base/memory.hpp>
 
 
 #include "cuda/base/config.hpp"
@@ -65,25 +60,43 @@ namespace gko {
 #include "common/cuda_hip/base/executor.hpp.inc"
 
 
+std::unique_ptr<CudaAllocatorBase> cuda_allocator_from_mode(
+    int device_id, allocation_mode mode)
+{
+    switch (mode) {
+    case allocation_mode::device:
+        return std::make_unique<CudaAllocator>();
+    case allocation_mode::unified_global:
+        return std::make_unique<CudaUnifiedAllocator>(device_id,
+                                                      cudaMemAttachGlobal);
+    case allocation_mode::unified_host:
+        return std::make_unique<CudaUnifiedAllocator>(device_id,
+                                                      cudaMemAttachHost);
+    default:
+        GKO_NOT_SUPPORTED(mode);
+    }
+}
+
+
 std::shared_ptr<CudaExecutor> CudaExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
     allocation_mode alloc_mode, cudaStream_t stream)
 {
-    return std::shared_ptr<CudaExecutor>(
-        new CudaExecutor(device_id, std::move(master), device_reset, alloc_mode,
-                         stream),
-        [device_id](CudaExecutor* exec) {
-            auto device_reset = exec->get_device_reset();
-            std::lock_guard<std::mutex> guard(
-                nvidia_device::get_mutex(device_id));
-            delete exec;
-            auto& num_execs = nvidia_device::get_num_execs(device_id);
-            num_execs--;
-            if (!num_execs && device_reset) {
-                detail::cuda_scoped_device_id_guard g(device_id);
-                cudaDeviceReset();
-            }
-        });
+    return create(device_id, master,
+                  cuda_allocator_from_mode(device_id, alloc_mode), stream);
+}
+
+
+std::shared_ptr<CudaExecutor> CudaExecutor::create(
+    int device_id, std::shared_ptr<Executor> master,
+    std::shared_ptr<CudaAllocatorBase> alloc, cudaStream_t stream)
+{
+    if (!alloc->check_environment(device_id, stream)) {
+        throw Error{__FILE__, __LINE__,
+                    "Allocator uses incorrect stream or device ID."};
+    }
+    return std::shared_ptr<CudaExecutor>(new CudaExecutor(
+        device_id, std::move(master), std::move(alloc), stream));
 }
 
 
@@ -123,41 +136,14 @@ void OmpExecutor::raw_copy_to(const CudaExecutor* dest, size_type num_bytes,
 void CudaExecutor::raw_free(void* ptr) const noexcept
 {
     detail::cuda_scoped_device_id_guard g(this->get_device_id());
-    auto error_code = cudaFree(ptr);
-    if (error_code != cudaSuccess) {
-#if GKO_VERBOSE_LEVEL >= 1
-        // Unfortunately, if memory free fails, there's not much we can do
-        std::cerr << "Unrecoverable CUDA error on device "
-                  << this->get_device_id() << " in " << __func__ << ": "
-                  << cudaGetErrorName(error_code) << ": "
-                  << cudaGetErrorString(error_code) << std::endl
-                  << "Exiting program" << std::endl;
-#endif  // GKO_VERBOSE_LEVEL >= 1
-        std::exit(error_code);
-    }
+    alloc_->deallocate(ptr);
 }
 
 
 void* CudaExecutor::raw_alloc(size_type num_bytes) const
 {
-    void* dev_ptr = nullptr;
     detail::cuda_scoped_device_id_guard g(this->get_device_id());
-    int error_code = 0;
-    if (this->alloc_mode_ == allocation_mode::unified_host) {
-        error_code = cudaMallocManaged(&dev_ptr, num_bytes, cudaMemAttachHost);
-    } else if (this->alloc_mode_ == allocation_mode::unified_global) {
-        error_code =
-            cudaMallocManaged(&dev_ptr, num_bytes, cudaMemAttachGlobal);
-    } else if (this->alloc_mode_ == allocation_mode::device) {
-        error_code = cudaMalloc(&dev_ptr, num_bytes);
-    } else {
-        GKO_NOT_SUPPORTED(this->alloc_mode_);
-    }
-    if (error_code != cudaErrorMemoryAllocation) {
-        GKO_ASSERT_NO_CUDA_ERRORS(error_code);
-    }
-    GKO_ENSURE_ALLOCATED(dev_ptr, "cuda", num_bytes);
-    return dev_ptr;
+    return alloc_->allocate(num_bytes);
 }
 
 
@@ -298,98 +284,4 @@ void CudaExecutor::init_handles()
 }
 
 
-cuda_stream::cuda_stream(int device_id) : stream_{}, device_id_(device_id)
-{
-    detail::cuda_scoped_device_id_guard g(device_id_);
-    GKO_ASSERT_NO_CUDA_ERRORS(cudaStreamCreate(&stream_));
-}
-
-
-cuda_stream::~cuda_stream()
-{
-    if (stream_) {
-        detail::cuda_scoped_device_id_guard g(device_id_);
-        cudaStreamDestroy(stream_);
-    }
-}
-
-
-cuda_stream::cuda_stream(cuda_stream&& other)
-    : stream_{std::exchange(other.stream_, nullptr)},
-      device_id_(std::exchange(other.device_id_, -1))
-{}
-
-
-CUstream_st* cuda_stream::get() const { return stream_; }
-
-
-namespace log {
-
-
-// "GKO" in ASCII to avoid collision with other application's categories
-constexpr static uint32 category_magic_offset = 0x676B6FU;
-
-
-void init_nvtx()
-{
-#define NAMED_CATEGORY(_name)                                             \
-    nvtxNameCategory(static_cast<uint32>(profile_event_category::_name) + \
-                         category_magic_offset,                           \
-                     "gko::" #_name)
-    NAMED_CATEGORY(memory);
-    NAMED_CATEGORY(operation);
-    NAMED_CATEGORY(object);
-    NAMED_CATEGORY(linop);
-    NAMED_CATEGORY(factory);
-    NAMED_CATEGORY(solver);
-    NAMED_CATEGORY(criterion);
-    NAMED_CATEGORY(user);
-    NAMED_CATEGORY(internal);
-#undef NAMED_CATEGORY
-}
-
-
-std::function<void(const char*, profile_event_category)> begin_nvtx_fn(
-    uint32_t color_argb)
-{
-    return [color_argb](const char* name, profile_event_category category) {
-        nvtxEventAttributes_t attr{};
-        attr.version = NVTX_VERSION;
-        attr.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
-        attr.category = static_cast<uint32>(category) + category_magic_offset;
-        attr.colorType = NVTX_COLOR_ARGB;
-        attr.color = color_argb;
-        attr.payloadType = NVTX_PAYLOAD_UNKNOWN;
-        attr.messageType = NVTX_MESSAGE_TYPE_ASCII;
-        attr.message.ascii = name;
-        nvtxRangePushEx(&attr);
-    };
-}
-
-
-void end_nvtx(const char* name, profile_event_category) { nvtxRangePop(); }
-
-
-}  // namespace log
-
-
-namespace kernels {
-namespace cuda {
-
-
-void reset_device(int device_id)
-{
-    gko::detail::cuda_scoped_device_id_guard guard{device_id};
-    cudaDeviceReset();
-}
-
-
-void destroy_event(CUevent_st* event)
-{
-    GKO_ASSERT_NO_CUDA_ERRORS(cudaEventDestroy(event));
-}
-
-
-}  // namespace cuda
-}  // namespace kernels
 }  // namespace gko
diff --git a/cuda/base/kernel_config.hpp b/cuda/base/kernel_config.hpp
new file mode 100644
index 00000000000..a4aecea1d55
--- /dev/null
+++ b/cuda/base/kernel_config.hpp
@@ -0,0 +1,88 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CUDA_BASE_KERNEL_CONFIG_HPP_
+#define GKO_CUDA_BASE_KERNEL_CONFIG_HPP_
+
+
+#include <cuda_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+namespace detail {
+
+
+template <typename ValueType>
+class shared_memory_config_guard {
+public:
+    using value_type = ValueType;
+    shared_memory_config_guard() : original_config_{}
+    {
+        GKO_ASSERT_NO_CUDA_ERRORS(
+            cudaDeviceGetSharedMemConfig(&original_config_));
+
+        if (sizeof(value_type) == 4) {
+            GKO_ASSERT_NO_CUDA_ERRORS(
+                cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte));
+        } else if (sizeof(value_type) % 8 == 0) {
+            GKO_ASSERT_NO_CUDA_ERRORS(
+                cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
+        } else {
+            GKO_ASSERT_NO_CUDA_ERRORS(
+                cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeDefault));
+        }
+    }
+
+
+    ~shared_memory_config_guard()
+    {
+        // No need to exit or throw if we cant set the value back.
+        cudaDeviceSetSharedMemConfig(original_config_);
+    }
+
+private:
+    cudaSharedMemConfig original_config_;
+};
+
+
+}  // namespace detail
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CUDA_BASE_KERNEL_CONFIG_HPP_
diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp
new file mode 100644
index 00000000000..b5bfb14ac74
--- /dev/null
+++ b/cuda/base/memory.cpp
@@ -0,0 +1,224 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/memory.hpp>
+
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "cuda/base/scoped_device_id.hpp"
+
+
+namespace gko {
+
+
+#define GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(_operation, _size)       \
+    {                                                                 \
+        auto error_code = _operation;                                 \
+        if (error_code == cudaErrorMemoryAllocation) {                \
+            throw AllocationError(__FILE__, __LINE__, "cuda", _size); \
+        } else {                                                      \
+            GKO_ASSERT_NO_CUDA_ERRORS(error_code);                    \
+        }                                                             \
+    }
+
+
+#if GKO_VERBOSE_LEVEL >= 1
+#define GKO_EXIT_ON_CUDA_ERROR(_operation)                                  \
+    {                                                                       \
+        const auto error_code = _operation;                                 \
+        if (error_code != cudaSuccess) {                                    \
+            int device_id{-1};                                              \
+            cudaGetDevice(&device_id);                                      \
+            std::cerr << "Unrecoverable CUDA error on device " << device_id \
+                      << " in " << __func__ << ":" << __LINE__ << ": "      \
+                      << cudaGetErrorName(error_code) << ": "               \
+                      << cudaGetErrorString(error_code) << std::endl        \
+                      << "Exiting program" << std::endl;                    \
+            std::exit(error_code);                                          \
+        }                                                                   \
+    }
+#else
+#define GKO_EXIT_ON_CUDA_ERROR(_operation)  \
+    {                                       \
+        const auto error_code = _operation; \
+        if (error_code != cudaSuccess) {    \
+            std::exit(error_code);          \
+        }                                   \
+    }
+#endif
+
+
+void* CudaAllocator::allocate(size_type num_bytes)
+{
+    void* ptr{};
+    GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMalloc(&ptr, num_bytes),
+                                         num_bytes);
+    return ptr;
+}
+
+
+void CudaAllocator::deallocate(void* ptr)
+{
+    GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr));
+}
+
+
+#if CUDA_VERSION >= 11020
+
+
+CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} {}
+
+
+void* CudaAsyncAllocator::allocate(size_type num_bytes)
+{
+    void* ptr{};
+    GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(
+        cudaMallocAsync(&ptr, num_bytes, stream_), num_bytes);
+    return ptr;
+}
+
+
+void CudaAsyncAllocator::deallocate(void* ptr)
+{
+    GKO_EXIT_ON_CUDA_ERROR(cudaFreeAsync(ptr, stream_));
+}
+
+
+#else  // Fall back to regular allocation
+
+
+CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream}
+{
+#if GKO_VERBOSE_LEVEL >= 1
+    std::cerr << "This version of CUDA does not support cudaMallocAsync, "
+                 "please use CudaAllocator instead of CudaAsyncAllocator.\n";
+#endif
+}
+
+
+void* CudaAsyncAllocator::allocate(size_type num_bytes)
+{
+    void* ptr{};
+    GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMalloc(&ptr, num_bytes),
+                                         num_bytes);
+    return ptr;
+}
+
+
+void CudaAsyncAllocator::deallocate(void* ptr)
+{
+    GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr));
+}
+
+
+#endif
+
+
+bool CudaAsyncAllocator::check_environment(int device_id,
+                                           CUstream_st* stream) const
+{
+    return stream == stream_;
+}
+
+
+CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id)
+    : CudaUnifiedAllocator{device_id, cudaMemAttachGlobal}
+{}
+
+
+CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id, unsigned int flags)
+    : device_id_{device_id}, flags_{flags}
+{}
+
+
+void* CudaUnifiedAllocator::allocate(size_type num_bytes)
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::cuda_scoped_device_id_guard g(device_id_);
+    void* ptr{};
+    GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(
+        cudaMallocManaged(&ptr, num_bytes, flags_), num_bytes);
+    return ptr;
+}
+
+
+void CudaUnifiedAllocator::deallocate(void* ptr)
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::cuda_scoped_device_id_guard g(device_id_);
+    GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr));
+}
+
+
+bool CudaUnifiedAllocator::check_environment(int device_id,
+                                             CUstream_st* stream) const
+{
+    return device_id == device_id_;
+}
+
+
+CudaHostAllocator::CudaHostAllocator(int device_id) : device_id_{device_id} {}
+
+
+void* CudaHostAllocator::allocate(size_type num_bytes)
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::cuda_scoped_device_id_guard g(device_id_);
+    void* ptr{};
+    GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMallocHost(&ptr, num_bytes),
+                                         num_bytes);
+    return ptr;
+}
+
+
+void CudaHostAllocator::deallocate(void* ptr)
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::cuda_scoped_device_id_guard g(device_id_);
+    GKO_EXIT_ON_CUDA_ERROR(cudaFreeHost(ptr));
+}
+
+
+bool CudaHostAllocator::check_environment(int device_id,
+                                          CUstream_st* stream) const
+{
+    return device_id == device_id_;
+}
+
+
+}  // namespace gko
diff --git a/cuda/base/nvtx.cpp b/cuda/base/nvtx.cpp
new file mode 100644
index 00000000000..3cbc59299b0
--- /dev/null
+++ b/cuda/base/nvtx.cpp
@@ -0,0 +1,98 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <cuda_runtime.h>
+
+
+#include <ginkgo/config.hpp>
+
+
+#ifdef GKO_LEGACY_NVTX
+#include <nvToolsExt.h>
+#else
+#include <nvtx3/nvToolsExt.h>
+#endif
+
+
+#include <ginkgo/core/log/profiler_hook.hpp>
+
+
+namespace gko {
+namespace log {
+
+
+// "GKO" in ASCII to avoid collision with other application's categories
+constexpr static uint32 category_magic_offset = 0x676B6FU;
+
+
+void init_nvtx()
+{
+#define NAMED_CATEGORY(_name)                                             \
+    nvtxNameCategory(static_cast<uint32>(profile_event_category::_name) + \
+                         category_magic_offset,                           \
+                     "gko::" #_name)
+    NAMED_CATEGORY(memory);
+    NAMED_CATEGORY(operation);
+    NAMED_CATEGORY(object);
+    NAMED_CATEGORY(linop);
+    NAMED_CATEGORY(factory);
+    NAMED_CATEGORY(solver);
+    NAMED_CATEGORY(criterion);
+    NAMED_CATEGORY(user);
+    NAMED_CATEGORY(internal);
+#undef NAMED_CATEGORY
+}
+
+
+std::function<void(const char*, profile_event_category)> begin_nvtx_fn(
+    uint32_t color_argb)
+{
+    return [color_argb](const char* name, profile_event_category category) {
+        nvtxEventAttributes_t attr{};
+        attr.version = NVTX_VERSION;
+        attr.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+        attr.category = static_cast<uint32>(category) + category_magic_offset;
+        attr.colorType = NVTX_COLOR_ARGB;
+        attr.color = color_argb;
+        attr.payloadType = NVTX_PAYLOAD_UNKNOWN;
+        attr.messageType = NVTX_MESSAGE_TYPE_ASCII;
+        attr.message.ascii = name;
+        nvtxRangePushEx(&attr);
+    };
+}
+
+
+void end_nvtx(const char* name, profile_event_category) { nvtxRangePop(); }
+
+
+}  // namespace log
+}  // namespace gko
diff --git a/cuda/base/stream.cpp b/cuda/base/stream.cpp
new file mode 100644
index 00000000000..76027bd51e2
--- /dev/null
+++ b/cuda/base/stream.cpp
@@ -0,0 +1,76 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/stream.hpp>
+
+
+#include <cuda_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "cuda/base/scoped_device_id.hpp"
+
+
+namespace gko {
+
+
+cuda_stream::cuda_stream() : stream_{nullptr}, device_id_{} {}
+
+
+cuda_stream::cuda_stream(int device_id) : stream_{}, device_id_(device_id)
+{
+    detail::cuda_scoped_device_id_guard g(device_id_);
+    GKO_ASSERT_NO_CUDA_ERRORS(cudaStreamCreate(&stream_));
+}
+
+
+cuda_stream::~cuda_stream()
+{
+    if (stream_) {
+        detail::cuda_scoped_device_id_guard g(device_id_);
+        cudaStreamDestroy(stream_);
+    }
+}
+
+
+cuda_stream::cuda_stream(cuda_stream&& other)
+    : stream_{std::exchange(other.stream_, nullptr)},
+      device_id_(std::exchange(other.device_id_, 0))
+{}
+
+
+CUstream_st* cuda_stream::get() const { return stream_; }
+
+
+}  // namespace gko
diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh
index 93db80f2c31..db59a47658d 100644
--- a/cuda/components/cooperative_groups.cuh
+++ b/cuda/components/cooperative_groups.cuh
@@ -399,7 +399,7 @@ using cooperative_groups::thread_block_tile;
 // public API:
 // void sync() const
 // unsigned thread_rank() const
-// usigned size() const
+// unsigned size() const
 // T shfl(T, int)
 // T shfl_up(T, unsigned)
 // T shfl_down(T, unsigned)
diff --git a/cuda/components/memory.cuh b/cuda/components/memory.cuh
new file mode 100644
index 00000000000..480e5f94603
--- /dev/null
+++ b/cuda/components/memory.cuh
@@ -0,0 +1,807 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CUDA_COMPONENTS_MEMORY_CUH_
+#define GKO_CUDA_COMPONENTS_MEMORY_CUH_
+
+
+#include <type_traits>
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "cuda/base/types.hpp"
+
+
+// this file is generated by dev_tools/scripts/generate_cuda_memory_ptx.py
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+
+
+/**
+ * Transforms a generic CUDA pointer pointing to shared memory to a
+ * shared memory pointer for use in PTX assembly.
+ * CUDA PTX assembly uses 32bit pointers for shared memory addressing.
+ * The result is undefined for a generic pointer pointing to anything but
+ * shared memory.
+ */
+__device__ __forceinline__ uint32 convert_generic_ptr_to_smem_ptr(void* ptr)
+{
+// see
+// https://github.com/NVIDIA/cutlass/blob/
+//     6fc5008803fe4e81b81a836fcd3a88258f4e5bbf/
+//     include/cutlass/arch/memory_sm75.h#L90
+// for reasoning behind this implementation
+#if (!defined(__clang__) && __CUDACC_VER_MAJOR__ >= 11)
+    return static_cast<uint32>(__cvta_generic_to_shared(ptr));
+#elif (!defined(__clang__) && CUDACC_VER_MAJOR__ == 10 && \
+       __CUDACC_VER_MINOR__ >= 2)
+    return __nvvm_get_smem_pointer(ptr);
+#else
+    uint32 smem_ptr;
+    asm("{{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 "
+        "%0, smem_ptr; }}"
+        : "=r"(smem_ptr)
+        : "l"(ptr));
+    return smem_ptr;
+#endif
+}
+
+
+__device__ __forceinline__ void membar_acq_rel()
+{
+#if __CUDA_ARCH__ < 700
+    asm volatile("membar.gl;" ::: "memory");
+#else
+    asm volatile("fence.acq_rel.gpu;" ::: "memory");
+#endif
+}
+
+
+__device__ __forceinline__ void membar_acq_rel_shared()
+{
+#if __CUDA_ARCH__ < 700
+    asm volatile("membar.cta;" ::: "memory");
+#else
+    asm volatile("fence.acq_rel.cta;" ::: "memory");
+#endif
+}
+
+
+#include "common/cuda_hip/components/memory.hpp.inc"
+
+
+__device__ __forceinline__ int32 load_relaxed_shared(const int32* ptr)
+{
+    int32 result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile.shared.s32 %0, [%1];"
+                 : "=r"(result)
+                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
+                 : "memory");
+#else
+    asm volatile("ld.relaxed.cta.shared.s32 %0, [%1];"
+                 : "=r"(result)
+                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
+                 : "memory");
+#endif
+
+    return result;
+}
+
+
+__device__ __forceinline__ void store_relaxed_shared(int32* ptr, int32 result)
+{
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.shared.s32 [%0], %1;" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "r"(result)
+                 : "memory");
+#else
+    asm volatile("st.relaxed.cta.shared.s32 [%0], %1;" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "r"(result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ int64 load_relaxed_shared(const int64* ptr)
+{
+    int64 result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile.shared.s64 %0, [%1];"
+                 : "=l"(result)
+                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
+                 : "memory");
+#else
+    asm volatile("ld.relaxed.cta.shared.s64 %0, [%1];"
+                 : "=l"(result)
+                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
+                 : "memory");
+#endif
+
+    return result;
+}
+
+
+__device__ __forceinline__ void store_relaxed_shared(int64* ptr, int64 result)
+{
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.shared.s64 [%0], %1;" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "l"(result)
+                 : "memory");
+#else
+    asm volatile("st.relaxed.cta.shared.s64 [%0], %1;" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "l"(result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ float load_relaxed_shared(const float* ptr)
+{
+    float result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile.shared.f32 %0, [%1];"
+                 : "=f"(result)
+                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr)))
+                 : "memory");
+#else
+    asm volatile("ld.relaxed.cta.shared.f32 %0, [%1];"
+                 : "=f"(result)
+                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr)))
+                 : "memory");
+#endif
+
+    return result;
+}
+
+
+__device__ __forceinline__ void store_relaxed_shared(float* ptr, float result)
+{
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "f"(result)
+                 : "memory");
+#else
+    asm volatile("st.relaxed.cta.shared.f32 [%0], %1;" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "f"(result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ double load_relaxed_shared(const double* ptr)
+{
+    double result;
+#if __CUDA_ARCH__ < 700
+    asm volatile(
+        "ld.volatile.shared.f64 %0, [%1];"
+        : "=d"(result)
+        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr)))
+        : "memory");
+#else
+    asm volatile(
+        "ld.relaxed.cta.shared.f64 %0, [%1];"
+        : "=d"(result)
+        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr)))
+        : "memory");
+#endif
+
+    return result;
+}
+
+
+__device__ __forceinline__ void store_relaxed_shared(double* ptr, double result)
+{
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "d"(result)
+                 : "memory");
+#else
+    asm volatile("st.relaxed.cta.shared.f64 [%0], %1;" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "d"(result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ int32 load_acquire_shared(const int32* ptr)
+{
+    int32 result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile.shared.s32 %0, [%1];"
+                 : "=r"(result)
+                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
+                 : "memory");
+#else
+    asm volatile("ld.acquire.cta.shared.s32 %0, [%1];"
+                 : "=r"(result)
+                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int32*>(ptr)))
+                 : "memory");
+#endif
+    membar_acq_rel_shared();
+    return result;
+}
+
+
+__device__ __forceinline__ void store_release_shared(int32* ptr, int32 result)
+{
+    membar_acq_rel_shared();
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.shared.s32 [%0], %1;" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "r"(result)
+                 : "memory");
+#else
+    asm volatile("st.release.cta.shared.s32 [%0], %1;" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "r"(result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ int64 load_acquire_shared(const int64* ptr)
+{
+    int64 result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile.shared.s64 %0, [%1];"
+                 : "=l"(result)
+                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
+                 : "memory");
+#else
+    asm volatile("ld.acquire.cta.shared.s64 %0, [%1];"
+                 : "=l"(result)
+                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<int64*>(ptr)))
+                 : "memory");
+#endif
+    membar_acq_rel_shared();
+    return result;
+}
+
+
+__device__ __forceinline__ void store_release_shared(int64* ptr, int64 result)
+{
+    membar_acq_rel_shared();
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.shared.s64 [%0], %1;" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "l"(result)
+                 : "memory");
+#else
+    asm volatile("st.release.cta.shared.s64 [%0], %1;" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "l"(result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ float load_acquire_shared(const float* ptr)
+{
+    float result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile.shared.f32 %0, [%1];"
+                 : "=f"(result)
+                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr)))
+                 : "memory");
+#else
+    asm volatile("ld.acquire.cta.shared.f32 %0, [%1];"
+                 : "=f"(result)
+                 : "r"(convert_generic_ptr_to_smem_ptr(const_cast<float*>(ptr)))
+                 : "memory");
+#endif
+    membar_acq_rel_shared();
+    return result;
+}
+
+
+__device__ __forceinline__ void store_release_shared(float* ptr, float result)
+{
+    membar_acq_rel_shared();
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.shared.f32 [%0], %1;" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "f"(result)
+                 : "memory");
+#else
+    asm volatile("st.release.cta.shared.f32 [%0], %1;" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "f"(result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ double load_acquire_shared(const double* ptr)
+{
+    double result;
+#if __CUDA_ARCH__ < 700
+    asm volatile(
+        "ld.volatile.shared.f64 %0, [%1];"
+        : "=d"(result)
+        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr)))
+        : "memory");
+#else
+    asm volatile(
+        "ld.acquire.cta.shared.f64 %0, [%1];"
+        : "=d"(result)
+        : "r"(convert_generic_ptr_to_smem_ptr(const_cast<double*>(ptr)))
+        : "memory");
+#endif
+    membar_acq_rel_shared();
+    return result;
+}
+
+
+__device__ __forceinline__ void store_release_shared(double* ptr, double result)
+{
+    membar_acq_rel_shared();
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.shared.f64 [%0], %1;" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "d"(result)
+                 : "memory");
+#else
+    asm volatile("st.release.cta.shared.f64 [%0], %1;" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "d"(result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ int32 load_relaxed(const int32* ptr)
+{
+    int32 result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile.s32 %0, [%1];"
+                 : "=r"(result)
+                 : "l"(const_cast<int32*>(ptr))
+                 : "memory");
+#else
+    asm volatile("ld.relaxed.gpu.s32 %0, [%1];"
+                 : "=r"(result)
+                 : "l"(const_cast<int32*>(ptr))
+                 : "memory");
+#endif
+
+    return result;
+}
+
+
+__device__ __forceinline__ void store_relaxed(int32* ptr, int32 result)
+{
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result)
+                 : "memory");
+#else
+    asm volatile("st.relaxed.gpu.s32 [%0], %1;" ::"l"(ptr), "r"(result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ int64 load_relaxed(const int64* ptr)
+{
+    int64 result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile.s64 %0, [%1];"
+                 : "=l"(result)
+                 : "l"(const_cast<int64*>(ptr))
+                 : "memory");
+#else
+    asm volatile("ld.relaxed.gpu.s64 %0, [%1];"
+                 : "=l"(result)
+                 : "l"(const_cast<int64*>(ptr))
+                 : "memory");
+#endif
+
+    return result;
+}
+
+
+__device__ __forceinline__ void store_relaxed(int64* ptr, int64 result)
+{
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result)
+                 : "memory");
+#else
+    asm volatile("st.relaxed.gpu.s64 [%0], %1;" ::"l"(ptr), "l"(result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ float load_relaxed(const float* ptr)
+{
+    float result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile.f32 %0, [%1];"
+                 : "=f"(result)
+                 : "l"(const_cast<float*>(ptr))
+                 : "memory");
+#else
+    asm volatile("ld.relaxed.gpu.f32 %0, [%1];"
+                 : "=f"(result)
+                 : "l"(const_cast<float*>(ptr))
+                 : "memory");
+#endif
+
+    return result;
+}
+
+
+__device__ __forceinline__ void store_relaxed(float* ptr, float result)
+{
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result)
+                 : "memory");
+#else
+    asm volatile("st.relaxed.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ double load_relaxed(const double* ptr)
+{
+    double result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile.f64 %0, [%1];"
+                 : "=d"(result)
+                 : "l"(const_cast<double*>(ptr))
+                 : "memory");
+#else
+    asm volatile("ld.relaxed.gpu.f64 %0, [%1];"
+                 : "=d"(result)
+                 : "l"(const_cast<double*>(ptr))
+                 : "memory");
+#endif
+
+    return result;
+}
+
+
+__device__ __forceinline__ void store_relaxed(double* ptr, double result)
+{
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result)
+                 : "memory");
+#else
+    asm volatile("st.relaxed.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ int32 load_acquire(const int32* ptr)
+{
+    int32 result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile.s32 %0, [%1];"
+                 : "=r"(result)
+                 : "l"(const_cast<int32*>(ptr))
+                 : "memory");
+#else
+    asm volatile("ld.acquire.gpu.s32 %0, [%1];"
+                 : "=r"(result)
+                 : "l"(const_cast<int32*>(ptr))
+                 : "memory");
+#endif
+    membar_acq_rel();
+    return result;
+}
+
+
+__device__ __forceinline__ void store_release(int32* ptr, int32 result)
+{
+    membar_acq_rel();
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.s32 [%0], %1;" ::"l"(ptr), "r"(result)
+                 : "memory");
+#else
+    asm volatile("st.release.gpu.s32 [%0], %1;" ::"l"(ptr), "r"(result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ int64 load_acquire(const int64* ptr)
+{
+    int64 result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile.s64 %0, [%1];"
+                 : "=l"(result)
+                 : "l"(const_cast<int64*>(ptr))
+                 : "memory");
+#else
+    asm volatile("ld.acquire.gpu.s64 %0, [%1];"
+                 : "=l"(result)
+                 : "l"(const_cast<int64*>(ptr))
+                 : "memory");
+#endif
+    membar_acq_rel();
+    return result;
+}
+
+
+__device__ __forceinline__ void store_release(int64* ptr, int64 result)
+{
+    membar_acq_rel();
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.s64 [%0], %1;" ::"l"(ptr), "l"(result)
+                 : "memory");
+#else
+    asm volatile("st.release.gpu.s64 [%0], %1;" ::"l"(ptr), "l"(result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ float load_acquire(const float* ptr)
+{
+    float result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile.f32 %0, [%1];"
+                 : "=f"(result)
+                 : "l"(const_cast<float*>(ptr))
+                 : "memory");
+#else
+    asm volatile("ld.acquire.gpu.f32 %0, [%1];"
+                 : "=f"(result)
+                 : "l"(const_cast<float*>(ptr))
+                 : "memory");
+#endif
+    membar_acq_rel();
+    return result;
+}
+
+
+__device__ __forceinline__ void store_release(float* ptr, float result)
+{
+    membar_acq_rel();
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.f32 [%0], %1;" ::"l"(ptr), "f"(result)
+                 : "memory");
+#else
+    asm volatile("st.release.gpu.f32 [%0], %1;" ::"l"(ptr), "f"(result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ double load_acquire(const double* ptr)
+{
+    double result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile.f64 %0, [%1];"
+                 : "=d"(result)
+                 : "l"(const_cast<double*>(ptr))
+                 : "memory");
+#else
+    asm volatile("ld.acquire.gpu.f64 %0, [%1];"
+                 : "=d"(result)
+                 : "l"(const_cast<double*>(ptr))
+                 : "memory");
+#endif
+    membar_acq_rel();
+    return result;
+}
+
+
+__device__ __forceinline__ void store_release(double* ptr, double result)
+{
+    membar_acq_rel();
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.f64 [%0], %1;" ::"l"(ptr), "d"(result)
+                 : "memory");
+#else
+    asm volatile("st.release.gpu.f64 [%0], %1;" ::"l"(ptr), "d"(result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ thrust::complex<float> load_relaxed_shared(
+    const thrust::complex<float>* ptr)
+{
+    float real_result;
+    float imag_result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile.shared.v2.f32 {%0, %1}, [%2];"
+                 : "=f"(real_result), "=f"(imag_result)
+                 : "r"(convert_generic_ptr_to_smem_ptr(
+                     const_cast<thrust::complex<float>*>(ptr)))
+                 : "memory");
+#else
+    asm volatile("ld.relaxed.cta.shared.v2.f32 {%0, %1}, [%2];"
+                 : "=f"(real_result), "=f"(imag_result)
+                 : "r"(convert_generic_ptr_to_smem_ptr(
+                     const_cast<thrust::complex<float>*>(ptr)))
+                 : "memory");
+#endif
+    return thrust::complex<float>{real_result, imag_result};
+}
+
+
+__device__ __forceinline__ void store_relaxed_shared(
+    thrust::complex<float>* ptr, thrust::complex<float> result)
+{
+    auto real_result = result.real();
+    auto imag_result = result.imag();
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.shared.v2.f32 [%0], {%1, %2};" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "f"(real_result), "f"(imag_result)
+                 : "memory");
+#else
+    asm volatile("st.relaxed.cta.shared.v2.f32 [%0], {%1, %2};" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "f"(real_result), "f"(imag_result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ thrust::complex<double> load_relaxed_shared(
+    const thrust::complex<double>* ptr)
+{
+    double real_result;
+    double imag_result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile.shared.v2.f64 {%0, %1}, [%2];"
+                 : "=d"(real_result), "=d"(imag_result)
+                 : "r"(convert_generic_ptr_to_smem_ptr(
+                     const_cast<thrust::complex<double>*>(ptr)))
+                 : "memory");
+#else
+    asm volatile("ld.relaxed.cta.shared.v2.f64 {%0, %1}, [%2];"
+                 : "=d"(real_result), "=d"(imag_result)
+                 : "r"(convert_generic_ptr_to_smem_ptr(
+                     const_cast<thrust::complex<double>*>(ptr)))
+                 : "memory");
+#endif
+    return thrust::complex<double>{real_result, imag_result};
+}
+
+
+__device__ __forceinline__ void store_relaxed_shared(
+    thrust::complex<double>* ptr, thrust::complex<double> result)
+{
+    auto real_result = result.real();
+    auto imag_result = result.imag();
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.shared.v2.f64 [%0], {%1, %2};" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "d"(real_result), "d"(imag_result)
+                 : "memory");
+#else
+    asm volatile("st.relaxed.cta.shared.v2.f64 [%0], {%1, %2};" ::"r"(
+                     convert_generic_ptr_to_smem_ptr(ptr)),
+                 "d"(real_result), "d"(imag_result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ thrust::complex<float> load_relaxed(
+    const thrust::complex<float>* ptr)
+{
+    float real_result;
+    float imag_result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile.v2.f32 {%0, %1}, [%2];"
+                 : "=f"(real_result), "=f"(imag_result)
+                 : "l"(const_cast<thrust::complex<float>*>(ptr))
+                 : "memory");
+#else
+    asm volatile("ld.relaxed.gpu.v2.f32 {%0, %1}, [%2];"
+                 : "=f"(real_result), "=f"(imag_result)
+                 : "l"(const_cast<thrust::complex<float>*>(ptr))
+                 : "memory");
+#endif
+    return thrust::complex<float>{real_result, imag_result};
+}
+
+
+__device__ __forceinline__ void store_relaxed(thrust::complex<float>* ptr,
+                                              thrust::complex<float> result)
+{
+    auto real_result = result.real();
+    auto imag_result = result.imag();
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.v2.f32 [%0], {%1, %2};" ::"l"(ptr),
+                 "f"(real_result), "f"(imag_result)
+                 : "memory");
+#else
+    asm volatile("st.relaxed.gpu.v2.f32 [%0], {%1, %2};" ::"l"(ptr),
+                 "f"(real_result), "f"(imag_result)
+                 : "memory");
+#endif
+}
+
+
+__device__ __forceinline__ thrust::complex<double> load_relaxed(
+    const thrust::complex<double>* ptr)
+{
+    double real_result;
+    double imag_result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile.v2.f64 {%0, %1}, [%2];"
+                 : "=d"(real_result), "=d"(imag_result)
+                 : "l"(const_cast<thrust::complex<double>*>(ptr))
+                 : "memory");
+#else
+    asm volatile("ld.relaxed.gpu.v2.f64 {%0, %1}, [%2];"
+                 : "=d"(real_result), "=d"(imag_result)
+                 : "l"(const_cast<thrust::complex<double>*>(ptr))
+                 : "memory");
+#endif
+    return thrust::complex<double>{real_result, imag_result};
+}
+
+
+__device__ __forceinline__ void store_relaxed(thrust::complex<double>* ptr,
+                                              thrust::complex<double> result)
+{
+    auto real_result = result.real();
+    auto imag_result = result.imag();
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile.v2.f64 [%0], {%1, %2};" ::"l"(ptr),
+                 "d"(real_result), "d"(imag_result)
+                 : "memory");
+#else
+    asm volatile("st.relaxed.gpu.v2.f64 [%0], {%1, %2};" ::"l"(ptr),
+                 "d"(real_result), "d"(imag_result)
+                 : "memory");
+#endif
+}
+
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_CUDA_COMPONENTS_MEMORY_CUH_
diff --git a/cuda/components/syncfree.cuh b/cuda/components/syncfree.cuh
index 625f1bd8359..d00064b06b7 100644
--- a/cuda/components/syncfree.cuh
+++ b/cuda/components/syncfree.cuh
@@ -41,7 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/base/config.hpp"
 #include "cuda/components/atomic.cuh"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/volatile.cuh"
+#include "cuda/components/memory.cuh"
 
 
 namespace gko {
diff --git a/cuda/distributed/partition_helpers_kernels.cu b/cuda/distributed/partition_helpers_kernels.cu
new file mode 100644
index 00000000000..62dad1efaf1
--- /dev/null
+++ b/cuda/distributed/partition_helpers_kernels.cu
@@ -0,0 +1,57 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/distributed/partition_helpers_kernels.hpp"
+
+
+#include <thrust/device_ptr.h>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sort.h>
+
+
+#include "cuda/base/thrust.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+namespace partition_helpers {
+
+
+#include "common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc"
+
+
+}  // namespace partition_helpers
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/factorization/lu_kernels.cu b/cuda/factorization/lu_kernels.cu
index 84656eb2510..55567f7fd9e 100644
--- a/cuda/factorization/lu_kernels.cu
+++ b/cuda/factorization/lu_kernels.cu
@@ -37,13 +37,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <memory>
 
 
+#include <thrust/copy.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+
+
 #include <ginkgo/core/matrix/csr.hpp>
 
 
 #include "core/base/allocator.hpp"
 #include "core/matrix/csr_lookup.hpp"
+#include "cuda/base/thrust.cuh"
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
+#include "cuda/components/reduction.cuh"
 #include "cuda/components/syncfree.cuh"
 #include "cuda/components/thread_ids.cuh"
 
diff --git a/cuda/factorization/par_ic_kernels.cu b/cuda/factorization/par_ic_kernels.cu
index b700be483ea..0f54e5b4a98 100644
--- a/cuda/factorization/par_ic_kernels.cu
+++ b/cuda/factorization/par_ic_kernels.cu
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
+#include "cuda/components/memory.cuh"
 #include "cuda/components/thread_ids.cuh"
 
 
diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu
index f2a5f9f4754..66f64e5959b 100644
--- a/cuda/factorization/par_ict_kernels.cu
+++ b/cuda/factorization/par_ict_kernels.cu
@@ -47,6 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/components/intrinsics.cuh"
+#include "cuda/components/memory.cuh"
 #include "cuda/components/merging.cuh"
 #include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu
index 9796ee343fc..3b45c2993f2 100644
--- a/cuda/factorization/par_ilu_kernels.cu
+++ b/cuda/factorization/par_ilu_kernels.cu
@@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
+#include "cuda/components/memory.cuh"
 #include "cuda/components/thread_ids.cuh"
 
 
diff --git a/cuda/factorization/par_ilut_sweep_kernel.cu b/cuda/factorization/par_ilut_sweep_kernel.cu
index c4b292402ac..98cd8c5de48 100644
--- a/cuda/factorization/par_ilut_sweep_kernel.cu
+++ b/cuda/factorization/par_ilut_sweep_kernel.cu
@@ -47,6 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/components/intrinsics.cuh"
+#include "cuda/components/memory.cuh"
 #include "cuda/components/merging.cuh"
 #include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
diff --git a/cuda/get_info.cmake b/cuda/get_info.cmake
index 3d91ea9f23a..eeadaf9725c 100644
--- a/cuda/get_info.cmake
+++ b/cuda/get_info.cmake
@@ -1,16 +1,10 @@
 ginkgo_print_module_header(${detailed_log} "CUDA")
-ginkgo_print_variable(${detailed_log} "GINKGO_CUDA_ARCHITECTURES")
+ginkgo_print_variable(${detailed_log} "CMAKE_CUDA_ARCHITECTURES")
 ginkgo_print_variable(${detailed_log} "GINKGO_CUDA_COMPILER_FLAGS")
-ginkgo_print_variable(${detailed_log} "GINKGO_CUDA_DEFAULT_HOST_COMPILER")
-ginkgo_print_variable(${detailed_log} "GINKGO_CUDA_ARCH_FLAGS")
 ginkgo_print_module_footer(${detailed_log} "CUDA variables:")
 ginkgo_print_variable(${detailed_log} "CMAKE_CUDA_COMPILER")
 ginkgo_print_variable(${detailed_log} "CMAKE_CUDA_COMPILER_VERSION")
 ginkgo_print_flags(${detailed_log} "CMAKE_CUDA_FLAGS")
 ginkgo_print_variable(${detailed_log} "CMAKE_CUDA_HOST_COMPILER")
-ginkgo_print_variable(${detailed_log} "CUDA_INCLUDE_DIRS")
-ginkgo_print_module_footer(${detailed_log} "CUDA Libraries:")
-ginkgo_print_variable(${detailed_log} "CUBLAS")
-ginkgo_print_variable(${detailed_log} "CUDA_RUNTIME_LIBS")
-ginkgo_print_variable(${detailed_log} "CUSPARSE")
+ginkgo_print_variable(${detailed_log} "CUDAToolkit_LIBRARY_DIR")
 ginkgo_print_module_footer(${detailed_log} "")
diff --git a/cuda/log/batch_logger.cuh b/cuda/log/batch_logger.cuh
new file mode 100644
index 00000000000..5591d5fac3b
--- /dev/null
+++ b/cuda/log/batch_logger.cuh
@@ -0,0 +1,54 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CUDA_LOG_BATCH_LOGGER_CUH_
+#define GKO_CUDA_LOG_BATCH_LOGGER_CUH_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+namespace batch_log {
+
+
+#include "common/cuda_hip/log/batch_logger.hpp.inc"
+
+
+}  // namespace batch_log
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_CUDA_LOG_BATCH_LOGGER_CUH_
diff --git a/cuda/matrix/batch_dense_kernels.cu b/cuda/matrix/batch_dense_kernels.cu
new file mode 100644
index 00000000000..c693a3ae861
--- /dev/null
+++ b/cuda/matrix/batch_dense_kernels.cu
@@ -0,0 +1,85 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/batch_dense_kernels.hpp"
+
+
+#include <thrust/functional.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/base/config.hpp"
+#include "cuda/base/thrust.cuh"
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/components/reduction.cuh"
+#include "cuda/components/thread_ids.cuh"
+#include "cuda/components/uninitialized_array.hpp"
+#include "cuda/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The Dense matrix format namespace.
+ *
+ * @ingroup batch_dense
+ */
+namespace batch_dense {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc"
+
+
+// clang-format on
+
+
+}  // namespace batch_dense
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/matrix/batch_ell_kernels.cu b/cuda/matrix/batch_ell_kernels.cu
new file mode 100644
index 00000000000..5cadd7755a2
--- /dev/null
+++ b/cuda/matrix/batch_ell_kernels.cu
@@ -0,0 +1,85 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/batch_ell_kernels.hpp"
+
+
+#include <thrust/functional.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/base/config.hpp"
+#include "cuda/base/thrust.cuh"
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/components/reduction.cuh"
+#include "cuda/components/thread_ids.cuh"
+#include "cuda/components/uninitialized_array.hpp"
+#include "cuda/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The Ell matrix format namespace.
+ * @ref Ell
+ * @ingroup batch_ell
+ */
+namespace batch_ell {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_ell
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/matrix/batch_struct.hpp b/cuda/matrix/batch_struct.hpp
new file mode 100644
index 00000000000..55a30c043e3
--- /dev/null
+++ b/cuda/matrix/batch_struct.hpp
@@ -0,0 +1,131 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_
+#define GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_
+
+
+#include "core/matrix/batch_struct.hpp"
+
+
+#include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "cuda/base/types.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+
+
+/** @file batch_struct.hpp
+ *
+ * Helper functions to generate a batch struct from a batch LinOp,
+ * while also shallow-casting to the required CUDA scalar type.
+ *
+ * A specialization is needed for every format of every kind of linear algebra
+ * object. These are intended to be called on the host.
+ */
+
+
+/**
+ * Generates an immutable uniform batch struct from a batch of dense matrices.
+ */
+template <typename ValueType>
+inline batch::matrix::dense::uniform_batch<const cuda_type<ValueType>>
+get_batch_struct(const batch::matrix::Dense<ValueType>* const op)
+{
+    return {as_cuda_type(op->get_const_values()), op->get_num_batch_items(),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1])};
+}
+
+
+/**
+ * Generates a uniform batch struct from a batch of dense matrices.
+ */
+template <typename ValueType>
+inline batch::matrix::dense::uniform_batch<cuda_type<ValueType>>
+get_batch_struct(batch::matrix::Dense<ValueType>* const op)
+{
+    return {as_cuda_type(op->get_values()), op->get_num_batch_items(),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1])};
+}
+
+
+/**
+ * Generates an immutable uniform batch struct from a batch of ell matrices.
+ */
+template <typename ValueType, typename IndexType>
+inline batch::matrix::ell::uniform_batch<const cuda_type<ValueType>,
+                                         const IndexType>
+get_batch_struct(const batch::matrix::Ell<ValueType, IndexType>* const op)
+{
+    return {as_cuda_type(op->get_const_values()),
+            op->get_const_col_idxs(),
+            op->get_num_batch_items(),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[1]),
+            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
+}
+
+
+/**
+ * Generates a uniform batch struct from a batch of ell matrices.
+ */
+template <typename ValueType, typename IndexType>
+inline batch::matrix::ell::uniform_batch<cuda_type<ValueType>, IndexType>
+get_batch_struct(batch::matrix::Ell<ValueType, IndexType>* const op)
+{
+    return {as_cuda_type(op->get_values()),
+            op->get_col_idxs(),
+            op->get_num_batch_items(),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[1]),
+            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
+}
+
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CUDA_MATRIX_BATCH_STRUCT_HPP_
diff --git a/cuda/matrix/csr_kernels.instantiate.cu b/cuda/matrix/csr_kernels.instantiate.cu
new file mode 100644
index 00000000000..335d42d2ff9
--- /dev/null
+++ b/cuda/matrix/csr_kernels.instantiate.cu
@@ -0,0 +1,109 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "cuda/matrix/csr_kernels.template.cu"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The Compressed sparse row matrix format namespace.
+ *
+ * @ingroup csr
+ */
+namespace csr {
+
+
+// begin
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_SPMV_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
+// end
+
+
+}  // namespace csr
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.template.cu
similarity index 72%
rename from cuda/matrix/csr_kernels.cu
rename to cuda/matrix/csr_kernels.template.cu
index 619ead5bbbb..d5b577a6068 100644
--- a/cuda/matrix/csr_kernels.cu
+++ b/cuda/matrix/csr_kernels.template.cu
@@ -124,7 +124,7 @@ namespace {
 template <int items_per_thread, typename MatrixValueType,
           typename InputValueType, typename OutputValueType, typename IndexType>
 void merge_path_spmv(syn::value_list<int, items_per_thread>,
-                     std::shared_ptr<const CudaExecutor> exec,
+                     std::shared_ptr<const DefaultExecutor> exec,
                      const matrix::Csr<MatrixValueType, IndexType>* a,
                      const matrix::Dense<InputValueType>* b,
                      matrix::Dense<OutputValueType>* c,
@@ -204,7 +204,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_merge_path_spmv, merge_path_spmv);
 
 
 template <typename ValueType, typename IndexType>
-int compute_items_per_thread(std::shared_ptr<const CudaExecutor> exec)
+int compute_items_per_thread(std::shared_ptr<const DefaultExecutor> exec)
 {
     const int version =
         (exec->get_major_version() << 4) + exec->get_minor_version();
@@ -245,7 +245,7 @@ int compute_items_per_thread(std::shared_ptr<const CudaExecutor> exec)
 template <int subwarp_size, typename MatrixValueType, typename InputValueType,
           typename OutputValueType, typename IndexType>
 void classical_spmv(syn::value_list<int, subwarp_size>,
-                    std::shared_ptr<const CudaExecutor> exec,
+                    std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Csr<MatrixValueType, IndexType>* a,
                     const matrix::Dense<InputValueType>* b,
                     matrix::Dense<OutputValueType>* c,
@@ -298,7 +298,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_classical_spmv, classical_spmv);
 
 template <typename MatrixValueType, typename InputValueType,
           typename OutputValueType, typename IndexType>
-void load_balance_spmv(std::shared_ptr<const CudaExecutor> exec,
+void load_balance_spmv(std::shared_ptr<const DefaultExecutor> exec,
                        const matrix::Csr<MatrixValueType, IndexType>* a,
                        const matrix::Dense<InputValueType>* b,
                        matrix::Dense<OutputValueType>* c,
@@ -349,7 +349,7 @@ void load_balance_spmv(std::shared_ptr<const CudaExecutor> exec,
 
 
 template <typename ValueType, typename IndexType>
-bool try_general_sparselib_spmv(std::shared_ptr<const CudaExecutor> exec,
+bool try_general_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
                                 const ValueType* alpha,
                                 const matrix::Csr<ValueType, IndexType>* a,
                                 const matrix::Dense<ValueType>* b,
@@ -441,7 +441,7 @@ template <typename MatrixValueType, typename InputValueType,
           typename = std::enable_if_t<
               !std::is_same<MatrixValueType, InputValueType>::value ||
               !std::is_same<MatrixValueType, OutputValueType>::value>>
-bool try_sparselib_spmv(std::shared_ptr<const CudaExecutor> exec,
+bool try_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
                         const matrix::Csr<MatrixValueType, IndexType>* a,
                         const matrix::Dense<InputValueType>* b,
                         matrix::Dense<OutputValueType>* c,
@@ -453,7 +453,7 @@ bool try_sparselib_spmv(std::shared_ptr<const CudaExecutor> exec,
 }
 
 template <typename ValueType, typename IndexType>
-bool try_sparselib_spmv(std::shared_ptr<const CudaExecutor> exec,
+bool try_sparselib_spmv(std::shared_ptr<const DefaultExecutor> exec,
                         const matrix::Csr<ValueType, IndexType>* a,
                         const matrix::Dense<ValueType>* b,
                         matrix::Dense<ValueType>* c,
@@ -479,7 +479,7 @@ bool try_sparselib_spmv(std::shared_ptr<const CudaExecutor> exec,
 
 template <typename MatrixValueType, typename InputValueType,
           typename OutputValueType, typename IndexType>
-void spmv(std::shared_ptr<const CudaExecutor> exec,
+void spmv(std::shared_ptr<const DefaultExecutor> exec,
           const matrix::Csr<MatrixValueType, IndexType>* a,
           const matrix::Dense<InputValueType>* b,
           matrix::Dense<OutputValueType>* c)
@@ -533,13 +533,10 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_SPMV_KERNEL);
-
 
 template <typename MatrixValueType, typename InputValueType,
           typename OutputValueType, typename IndexType>
-void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
+void advanced_spmv(std::shared_ptr<const DefaultExecutor> exec,
                    const matrix::Dense<MatrixValueType>* alpha,
                    const matrix::Csr<MatrixValueType, IndexType>* a,
                    const matrix::Dense<InputValueType>* b,
@@ -598,12 +595,9 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
-void spgemm(std::shared_ptr<const CudaExecutor> exec,
+void spgemm(std::shared_ptr<const DefaultExecutor> exec,
             const matrix::Csr<ValueType, IndexType>* a,
             const matrix::Csr<ValueType, IndexType>* b,
             matrix::Csr<ValueType, IndexType>* c)
@@ -724,59 +718,9 @@ void spgemm(std::shared_ptr<const CudaExecutor> exec,
 #endif  // CUDA_VERSION >= 11000
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void spgeam(syn::value_list<int, subwarp_size>,
-            std::shared_ptr<const DefaultExecutor> exec, const ValueType* alpha,
-            const IndexType* a_row_ptrs, const IndexType* a_col_idxs,
-            const ValueType* a_vals, const ValueType* beta,
-            const IndexType* b_row_ptrs, const IndexType* b_col_idxs,
-            const ValueType* b_vals, matrix::Csr<ValueType, IndexType>* c)
-{
-    auto m = static_cast<IndexType>(c->get_size()[0]);
-    auto c_row_ptrs = c->get_row_ptrs();
-    // count nnz for alpha * A + beta * B
-    auto subwarps_per_block = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(m, subwarps_per_block);
-    if (num_blocks > 0) {
-        kernel::spgeam_nnz<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs);
-    }
-
-    // build row pointers
-    components::prefix_sum_nonnegative(exec, c_row_ptrs, m + 1);
-
-    // accumulate non-zeros for alpha * A + beta * B
-    matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
-    auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m);
-    c_builder.get_col_idx_array().resize_and_reset(c_nnz);
-    c_builder.get_value_array().resize_and_reset(c_nnz);
-    auto c_col_idxs = c->get_col_idxs();
-    auto c_vals = c->get_values();
-    if (num_blocks > 0) {
-        kernel::spgeam<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                as_device_type(alpha), a_row_ptrs, a_col_idxs,
-                as_device_type(a_vals), as_device_type(beta), b_row_ptrs,
-                b_col_idxs, as_device_type(b_vals), m, c_row_ptrs, c_col_idxs,
-                as_device_type(c_vals));
-    }
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam);
-
-
-}  // namespace
-
 
 template <typename ValueType, typename IndexType>
-void advanced_spgemm(std::shared_ptr<const CudaExecutor> exec,
+void advanced_spgemm(std::shared_ptr<const DefaultExecutor> exec,
                      const matrix::Dense<ValueType>* alpha,
                      const matrix::Csr<ValueType, IndexType>* a,
                      const matrix::Csr<ValueType, IndexType>* b,
@@ -920,64 +864,9 @@ void advanced_spgemm(std::shared_ptr<const CudaExecutor> exec,
 #endif  // CUDA_VERSION >= 11000
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
-void spgeam(std::shared_ptr<const DefaultExecutor> exec,
-            const matrix::Dense<ValueType>* alpha,
-            const matrix::Csr<ValueType, IndexType>* a,
-            const matrix::Dense<ValueType>* beta,
-            const matrix::Csr<ValueType, IndexType>* b,
-            matrix::Csr<ValueType, IndexType>* c)
-{
-    auto total_nnz =
-        a->get_num_stored_elements() + b->get_num_stored_elements();
-    auto nnz_per_row = total_nnz / a->get_size()[0];
-    select_spgeam(
-        spgeam_kernels(),
-        [&](int compiled_subwarp_size) {
-            return compiled_subwarp_size >= nnz_per_row ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec,
-        alpha->get_const_values(), a->get_const_row_ptrs(),
-        a->get_const_col_idxs(), a->get_const_values(),
-        beta->get_const_values(), b->get_const_row_ptrs(),
-        b->get_const_col_idxs(), b->get_const_values(), c);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void fill_in_dense(std::shared_ptr<const CudaExecutor> exec,
-                   const matrix::Csr<ValueType, IndexType>* source,
-                   matrix::Dense<ValueType>* result)
-{
-    const auto num_rows = result->get_size()[0];
-    const auto num_cols = result->get_size()[1];
-    const auto stride = result->get_stride();
-    const auto row_ptrs = source->get_const_row_ptrs();
-    const auto col_idxs = source->get_const_col_idxs();
-    const auto vals = source->get_const_values();
-
-    auto grid_dim = ceildiv(num_rows, default_block_size);
-    if (grid_dim > 0) {
-        kernel::fill_in_dense<<<grid_dim, default_block_size, 0,
-                                exec->get_stream()>>>(
-            num_rows, as_device_type(row_ptrs), as_device_type(col_idxs),
-            as_device_type(vals), stride, as_device_type(result->get_values()));
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void transpose(std::shared_ptr<const CudaExecutor> exec,
+void transpose(std::shared_ptr<const DefaultExecutor> exec,
                const matrix::Csr<ValueType, IndexType>* orig,
                matrix::Csr<ValueType, IndexType>* trans)
 {
@@ -1024,11 +913,9 @@ void transpose(std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
-void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
+void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Csr<ValueType, IndexType>* orig,
                     matrix::Csr<ValueType, IndexType>* trans)
 {
@@ -1083,186 +970,9 @@ void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void inv_symm_permute(std::shared_ptr<const CudaExecutor> exec,
-                      const IndexType* perm,
-                      const matrix::Csr<ValueType, IndexType>* orig,
-                      matrix::Csr<ValueType, IndexType>* permuted)
-{
-    auto num_rows = orig->get_size()[0];
-    auto count_num_blocks = ceildiv(num_rows, default_block_size);
-    if (count_num_blocks > 0) {
-        kernel::inv_row_ptr_permute<<<count_num_blocks, default_block_size, 0,
-                                      exec->get_stream()>>>(
-            num_rows, perm, orig->get_const_row_ptrs(),
-            permuted->get_row_ptrs());
-    }
-    components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(),
-                                       num_rows + 1);
-    auto copy_num_blocks =
-        ceildiv(num_rows, default_block_size / config::warp_size);
-    if (copy_num_blocks > 0) {
-        kernel::inv_symm_permute<config::warp_size>
-            <<<copy_num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                num_rows, perm, orig->get_const_row_ptrs(),
-                orig->get_const_col_idxs(),
-                as_device_type(orig->get_const_values()),
-                permuted->get_row_ptrs(), permuted->get_col_idxs(),
-                as_device_type(permuted->get_values()));
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void row_permute(std::shared_ptr<const CudaExecutor> exec,
-                 const IndexType* perm,
-                 const matrix::Csr<ValueType, IndexType>* orig,
-                 matrix::Csr<ValueType, IndexType>* row_permuted)
-{
-    auto num_rows = orig->get_size()[0];
-    auto count_num_blocks = ceildiv(num_rows, default_block_size);
-    if (count_num_blocks > 0) {
-        kernel::row_ptr_permute<<<count_num_blocks, default_block_size, 0,
-                                  exec->get_stream()>>>(
-            num_rows, perm, orig->get_const_row_ptrs(),
-            row_permuted->get_row_ptrs());
-    }
-    components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(),
-                                       num_rows + 1);
-    auto copy_num_blocks =
-        ceildiv(num_rows, default_block_size / config::warp_size);
-    if (copy_num_blocks > 0) {
-        kernel::row_permute<config::warp_size>
-            <<<copy_num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                num_rows, perm, orig->get_const_row_ptrs(),
-                orig->get_const_col_idxs(),
-                as_device_type(orig->get_const_values()),
-                row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(),
-                as_device_type(row_permuted->get_values()));
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void inverse_row_permute(std::shared_ptr<const CudaExecutor> exec,
-                         const IndexType* perm,
-                         const matrix::Csr<ValueType, IndexType>* orig,
-                         matrix::Csr<ValueType, IndexType>* row_permuted)
-{
-    auto num_rows = orig->get_size()[0];
-    auto count_num_blocks = ceildiv(num_rows, default_block_size);
-    if (count_num_blocks > 0) {
-        kernel::inv_row_ptr_permute<<<count_num_blocks, default_block_size, 0,
-                                      exec->get_stream()>>>(
-            num_rows, perm, orig->get_const_row_ptrs(),
-            row_permuted->get_row_ptrs());
-    }
-    components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(),
-                                       num_rows + 1);
-    auto copy_num_blocks =
-        ceildiv(num_rows, default_block_size / config::warp_size);
-    if (copy_num_blocks > 0) {
-        kernel::inv_row_permute<config::warp_size>
-            <<<copy_num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                num_rows, perm, orig->get_const_row_ptrs(),
-                orig->get_const_col_idxs(),
-                as_device_type(orig->get_const_values()),
-                row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(),
-                as_device_type(row_permuted->get_values()));
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void calculate_nonzeros_per_row_in_span(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
-    const span& col_span, array<IndexType>* row_nnz)
-{
-    const auto num_rows = source->get_size()[0];
-    auto row_ptrs = source->get_const_row_ptrs();
-    auto col_idxs = source->get_const_col_idxs();
-    auto grid_dim = ceildiv(row_span.length(), default_block_size);
-    if (grid_dim > 0) {
-        kernel::calculate_nnz_per_row_in_span<<<grid_dim, default_block_size, 0,
-                                                exec->get_stream()>>>(
-            row_span, col_span, as_device_type(row_ptrs),
-            as_device_type(col_idxs), as_device_type(row_nnz->get_data()));
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
-void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
-                       const matrix::Csr<ValueType, IndexType>* source,
-                       gko::span row_span, gko::span col_span,
-                       matrix::Csr<ValueType, IndexType>* result)
-{
-    auto row_offset = row_span.begin;
-    auto col_offset = col_span.begin;
-    auto num_rows = result->get_size()[0];
-    auto num_cols = result->get_size()[1];
-    auto row_ptrs = source->get_const_row_ptrs();
-    auto grid_dim = ceildiv(num_rows, default_block_size);
-    if (grid_dim > 0) {
-        kernel::compute_submatrix_idxs_and_vals<<<grid_dim, default_block_size,
-                                                  0, exec->get_stream()>>>(
-            num_rows, num_cols, row_offset, col_offset,
-            as_device_type(source->get_const_row_ptrs()),
-            as_device_type(source->get_const_col_idxs()),
-            as_device_type(source->get_const_values()),
-            as_device_type(result->get_const_row_ptrs()),
-            as_device_type(result->get_col_idxs()),
-            as_device_type(result->get_values()));
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void calculate_nonzeros_per_row_in_index_set(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* source,
-    const gko::index_set<IndexType>& row_index_set,
-    const gko::index_set<IndexType>& col_index_set,
-    IndexType* row_nnz) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void compute_submatrix_from_index_set(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* source,
-    const gko::index_set<IndexType>& row_index_set,
-    const gko::index_set<IndexType>& col_index_set,
-    matrix::Csr<ValueType, IndexType>* result) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void sort_by_column_index(std::shared_ptr<const CudaExecutor> exec,
+void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
                           matrix::Csr<ValueType, IndexType>* to_sort)
 {
     if (cusparse::is_supported<ValueType, IndexType>::value) {
@@ -1312,110 +1022,6 @@ void sort_by_column_index(std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
-
-
-template <typename ValueType, typename IndexType>
-void is_sorted_by_column_index(
-    std::shared_ptr<const CudaExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* to_check, bool* is_sorted)
-{
-    *is_sorted = true;
-    auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted);
-    auto gpu_array = array<bool>{exec, cpu_array};
-    auto block_size = default_block_size;
-    auto num_rows = static_cast<IndexType>(to_check->get_size()[0]);
-    auto num_blocks = ceildiv(num_rows, block_size);
-    if (num_blocks > 0) {
-        kernel::
-            check_unsorted<<<num_blocks, block_size, 0, exec->get_stream()>>>(
-                to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(),
-                num_rows, gpu_array.get_data());
-    }
-    cpu_array = gpu_array;
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
-
-
-template <typename ValueType, typename IndexType>
-void extract_diagonal(std::shared_ptr<const CudaExecutor> exec,
-                      const matrix::Csr<ValueType, IndexType>* orig,
-                      matrix::Diagonal<ValueType>* diag)
-{
-    const auto nnz = orig->get_num_stored_elements();
-    const auto diag_size = diag->get_size()[0];
-    const auto num_blocks =
-        ceildiv(config::warp_size * diag_size, default_block_size);
-
-    const auto orig_values = orig->get_const_values();
-    const auto orig_row_ptrs = orig->get_const_row_ptrs();
-    const auto orig_col_idxs = orig->get_const_col_idxs();
-    auto diag_values = diag->get_values();
-
-    if (num_blocks > 0) {
-        kernel::extract_diagonal<<<num_blocks, default_block_size, 0,
-                                   exec->get_stream()>>>(
-            diag_size, nnz, as_device_type(orig_values),
-            as_device_type(orig_row_ptrs), as_device_type(orig_col_idxs),
-            as_device_type(diag_values));
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
-
-
-template <typename ValueType, typename IndexType>
-void check_diagonal_entries_exist(
-    std::shared_ptr<const CudaExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* const mtx, bool& has_all_diags)
-{
-    const size_type num_warps = mtx->get_size()[0];
-    if (num_warps > 0) {
-        const size_type num_blocks =
-            num_warps / (default_block_size / config::warp_size);
-        array<bool> has_diags(exec, {true});
-        kernel::check_diagonal_entries<<<num_blocks, default_block_size, 0,
-                                         exec->get_stream()>>>(
-            static_cast<IndexType>(
-                std::min(mtx->get_size()[0], mtx->get_size()[1])),
-            mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
-            has_diags.get_data());
-        has_all_diags = exec->copy_val_to_host(has_diags.get_const_data());
-    } else {
-        has_all_diags = true;
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
-
-
-template <typename ValueType, typename IndexType>
-void add_scaled_identity(std::shared_ptr<const CudaExecutor> exec,
-                         const matrix::Dense<ValueType>* const alpha,
-                         const matrix::Dense<ValueType>* const beta,
-                         matrix::Csr<ValueType, IndexType>* const mtx)
-{
-    const auto nrows = mtx->get_size()[0];
-    if (nrows == 0) {
-        return;
-    }
-    const auto nthreads = nrows * config::warp_size;
-    const auto nblocks = ceildiv(nthreads, default_block_size);
-    kernel::add_scaled_identity<<<nblocks, default_block_size, 0,
-                                  exec->get_stream()>>>(
-        as_device_type(alpha->get_const_values()),
-        as_device_type(beta->get_const_values()), static_cast<IndexType>(nrows),
-        mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
-        as_device_type(mtx->get_values()));
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
-
 
 }  // namespace csr
 }  // namespace cuda
diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu
index 124a4deda75..7b20236827e 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/cuda/matrix/ell_kernels.cu
@@ -122,10 +122,12 @@ void abstract_spmv(syn::value_list<int, info>,
                    const matrix::Dense<MatrixValueType>* alpha = nullptr,
                    const matrix::Dense<OutputValueType>* beta = nullptr)
 {
+    using arithmetic_type =
+        highest_precision<InputValueType, OutputValueType, MatrixValueType>;
     using a_accessor =
-        gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>;
+        gko::acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>;
     using b_accessor =
-        gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>;
+        gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>;
 
     const auto nrows = a->get_size()[0];
     const auto stride = a->get_stride();
diff --git a/cuda/matrix/fbcsr_kernels.instantiate.cu b/cuda/matrix/fbcsr_kernels.instantiate.cu
new file mode 100644
index 00000000000..73c3fc136ba
--- /dev/null
+++ b/cuda/matrix/fbcsr_kernels.instantiate.cu
@@ -0,0 +1,75 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "cuda/matrix/fbcsr_kernels.template.cu"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+/**
+ * @brief The fixed-size block compressed sparse row matrix format namespace.
+ *
+ * @ingroup fbcsr
+ */
+namespace fbcsr {
+
+
+// begin
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
+// end
+
+
+}  // namespace fbcsr
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.template.cu
similarity index 97%
rename from cuda/matrix/fbcsr_kernels.cu
rename to cuda/matrix/fbcsr_kernels.template.cu
index 8160a0ac5a5..c629b292bfb 100644
--- a/cuda/matrix/fbcsr_kernels.cu
+++ b/cuda/matrix/fbcsr_kernels.template.cu
@@ -180,8 +180,6 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
 void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
@@ -240,9 +238,6 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL);
-
 
 namespace {
 
@@ -305,9 +300,6 @@ void transpose(const std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
 void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
@@ -325,9 +317,6 @@ void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
-
 
 }  // namespace fbcsr
 }  // namespace cuda
diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu
index 73e1fd9cb76..ab367c80b20 100644
--- a/cuda/matrix/sparsity_csr_kernels.cu
+++ b/cuda/matrix/sparsity_csr_kernels.cu
@@ -33,15 +33,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/matrix/sparsity_csr_kernels.hpp"
 
 
+#include <thrust/sort.h>
+
+
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
 #include "accessor/cuda_helper.hpp"
 #include "accessor/reduced_row_major.hpp"
 #include "core/base/mixed_precision_types.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "core/components/format_conversion_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/config.hpp"
+#include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/math.hpp"
+#include "cuda/base/thrust.cuh"
 #include "cuda/base/types.hpp"
 #include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/reduction.cuh"
@@ -61,6 +68,7 @@ namespace sparsity_csr {
 
 
 constexpr int classical_oversubscription = 32;
+constexpr int default_block_size = 512;
 constexpr int spmv_block_size = 128;
 constexpr int warps_in_block = 4;
 
@@ -68,6 +76,7 @@ constexpr int warps_in_block = 4;
 using classical_kernels = syn::value_list<int, 2>;
 
 
+#include "common/cuda_hip/matrix/csr_common.hpp.inc"
 #include "common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc"
 
 
@@ -178,6 +187,62 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
+                          matrix::SparsityCsr<ValueType, IndexType>* to_sort)
+{
+    const auto nnz = static_cast<IndexType>(to_sort->get_num_nonzeros());
+    const auto num_rows = static_cast<IndexType>(to_sort->get_size()[0]);
+    const auto num_cols = static_cast<IndexType>(to_sort->get_size()[1]);
+    const auto row_ptrs = to_sort->get_const_row_ptrs();
+    const auto col_idxs = to_sort->get_col_idxs();
+    if (cusparse::is_supported<ValueType, IndexType>::value) {
+        const auto handle = exec->get_cusparse_handle();
+        auto descr = cusparse::create_mat_descr();
+        array<IndexType> permutation_array(exec, to_sort->get_num_nonzeros());
+        auto permutation = permutation_array.get_data();
+        components::fill_seq_array(exec, permutation,
+                                   to_sort->get_num_nonzeros());
+        size_type buffer_size{};
+        cusparse::csrsort_buffer_size(handle, num_rows, num_cols, nnz, row_ptrs,
+                                      col_idxs, buffer_size);
+        array<char> buffer_array{exec, buffer_size};
+        auto buffer = buffer_array.get_data();
+        cusparse::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs,
+                          col_idxs, permutation, buffer);
+        cusparse::destroy(descr);
+    } else {
+        fallback_sort(exec, to_sort);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
+
+
+template <typename ValueType, typename IndexType>
+void is_sorted_by_column_index(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::SparsityCsr<ValueType, IndexType>* to_check, bool* is_sorted)
+{
+    *is_sorted = true;
+    auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted);
+    auto gpu_array = array<bool>{exec, cpu_array};
+    const auto num_rows = static_cast<IndexType>(to_check->get_size()[0]);
+    auto num_blocks = ceildiv(num_rows, default_block_size);
+    if (num_blocks > 0) {
+        kernel::check_unsorted<<<num_blocks, default_block_size, 0,
+                                 exec->get_stream()>>>(
+            to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(),
+            num_rows, gpu_array.get_data());
+    }
+    cpu_array = gpu_array;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX);
+
+
 }  // namespace sparsity_csr
 }  // namespace cuda
 }  // namespace kernels
diff --git a/cuda/preconditioner/batch_preconditioners.cuh b/cuda/preconditioner/batch_preconditioners.cuh
new file mode 100644
index 00000000000..d26639d9b62
--- /dev/null
+++ b/cuda/preconditioner/batch_preconditioners.cuh
@@ -0,0 +1,60 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CUDA_PRECONDITIONER_BATCH_PRECONDITIONERS_CUH_
+#define GKO_CUDA_PRECONDITIONER_BATCH_PRECONDITIONERS_CUH_
+
+
+#include <ginkgo/core/matrix/batch_identity.hpp>
+
+
+#include "core/matrix/batch_struct.hpp"
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/components/reduction.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+namespace batch_preconditioner {
+
+
+#include "common/cuda_hip/preconditioner/batch_identity.hpp.inc"
+
+
+}  // namespace batch_preconditioner
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CUDA_PRECONDITIONER_BATCH_PRECONDITIONERS_CUH_
diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu
new file mode 100644
index 00000000000..1d80f206c1b
--- /dev/null
+++ b/cuda/solver/batch_bicgstab_kernels.cu
@@ -0,0 +1,297 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/solver/batch_bicgstab_kernels.hpp"
+
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/solver/batch_dispatch.hpp"
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/base/config.hpp"
+#include "cuda/base/kernel_config.hpp"
+#include "cuda/base/thrust.cuh"
+#include "cuda/base/types.hpp"
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/components/reduction.cuh"
+#include "cuda/components/thread_ids.cuh"
+#include "cuda/components/uninitialized_array.hpp"
+#include "cuda/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+
+
+// NOTE: this default block size is not used for the main solver kernel.
+constexpr int default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+
+/**
+ * @brief The batch Bicgstab solver namespace.
+ *
+ * @ingroup batch_bicgstab
+ */
+namespace batch_bicgstab {
+
+
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
+#include "common/cuda_hip/components/uninitialized_array.hpp.inc"
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc"
+
+
+template <typename StopType, typename PrecType, typename LogType,
+          typename BatchMatrixType, typename ValueType>
+int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
+                              const int num_rows)
+{
+    int num_warps = std::max(num_rows / 4, 2);
+    constexpr int warp_sz = static_cast<int>(config::warp_size);
+    const int min_block_size = 2 * warp_sz;
+    const int device_max_threads =
+        ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz;
+    cudaFuncAttributes funcattr;
+    cudaFuncGetAttributes(&funcattr,
+                          apply_kernel<StopType, 9, true, PrecType, LogType,
+                                       BatchMatrixType, ValueType>);
+    const int num_regs_used = funcattr.numRegs;
+    int max_regs_blk = 0;
+    cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
+                           exec->get_device_id());
+    const int max_threads_regs =
+        ((max_regs_blk / static_cast<int>(num_regs_used)) / warp_sz) * warp_sz;
+    int max_threads = std::min(max_threads_regs, device_max_threads);
+    max_threads = max_threads <= 1024 ? max_threads : 1024;
+    return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size);
+}
+
+
+template <typename StopType, typename PrecType, typename LogType,
+          typename BatchMatrixType, typename ValueType>
+int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
+{
+    int shmem_per_sm = 0;
+    cudaDeviceGetAttribute(&shmem_per_sm,
+                           cudaDevAttrMaxSharedMemoryPerMultiprocessor,
+                           exec->get_device_id());
+    GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute(
+        apply_kernel<StopType, 9, true, PrecType, LogType, BatchMatrixType,
+                     ValueType>,
+        cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/));
+    cudaFuncAttributes funcattr;
+    cudaFuncGetAttributes(&funcattr,
+                          apply_kernel<StopType, 9, true, PrecType, LogType,
+                                       BatchMatrixType, ValueType>);
+    return funcattr.maxDynamicSharedSizeBytes;
+}
+
+
+template <typename T>
+using settings = gko::kernels::batch_bicgstab::settings<T>;
+
+
+template <typename CuValueType>
+class kernel_caller {
+public:
+    using value_type = CuValueType;
+
+    kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
+                  const settings<remove_complex<value_type>> settings)
+        : exec_{std::move(exec)}, settings_{settings}
+    {}
+
+    template <typename StopType, const int n_shared,
+              const bool prec_shared_bool, typename PrecType, typename LogType,
+              typename BatchMatrixType>
+    void launch_apply_kernel(
+        const gko::kernels::batch_bicgstab::storage_config& sconf,
+        LogType& logger, PrecType& prec, const BatchMatrixType& mat,
+        const value_type* const __restrict__ b_values,
+        value_type* const __restrict__ x_values,
+        value_type* const __restrict__ workspace_data, const int& block_size,
+        const size_t& shared_size) const
+    {
+        apply_kernel<StopType, n_shared, prec_shared_bool>
+            <<<mat.num_batch_items, block_size, shared_size,
+               exec_->get_stream()>>>(sconf, settings_.max_iterations,
+                                      settings_.residual_tol, logger, prec, mat,
+                                      b_values, x_values, workspace_data);
+    }
+
+
+    template <typename BatchMatrixType, typename PrecType, typename StopType,
+              typename LogType>
+    void call_kernel(
+        LogType logger, const BatchMatrixType& mat, PrecType prec,
+        const gko::batch::multi_vector::uniform_batch<const value_type>& b,
+        const gko::batch::multi_vector::uniform_batch<value_type>& x) const
+    {
+        using real_type = gko::remove_complex<value_type>;
+        const size_type num_batch_items = mat.num_batch_items;
+        constexpr int align_multiple = 8;
+        const int padded_num_rows =
+            ceildiv(mat.num_rows, align_multiple) * align_multiple;
+        auto shem_guard =
+            gko::kernels::cuda::detail::shared_memory_config_guard<
+                value_type>();
+        const int shmem_per_blk =
+            get_max_dynamic_shared_memory<StopType, PrecType, LogType,
+                                          BatchMatrixType, value_type>(exec_);
+        const int block_size =
+            get_num_threads_per_block<StopType, PrecType, LogType,
+                                      BatchMatrixType, value_type>(
+                exec_, mat.num_rows);
+        GKO_ASSERT(block_size >= 2 * config::warp_size);
+
+        const size_t prec_size =
+            PrecType::dynamic_work_size(padded_num_rows,
+                                        mat.get_single_item_num_nnz()) *
+            sizeof(value_type);
+        const auto sconf =
+            gko::kernels::batch_bicgstab::compute_shared_storage<PrecType,
+                                                                 value_type>(
+                shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(),
+                b.num_rhs);
+        const size_t shared_size =
+            sconf.n_shared * padded_num_rows * sizeof(value_type) +
+            (sconf.prec_shared ? prec_size : 0);
+        auto workspace = gko::array<value_type>(
+            exec_,
+            sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type));
+        assert(sconf.gmem_stride_bytes % sizeof(value_type) == 0);
+
+        value_type* const workspace_data = workspace.get_data();
+
+        // Template parameters launch_apply_kernel<StopType, n_shared,
+        // prec_shared>
+        if (sconf.prec_shared) {
+            launch_apply_kernel<StopType, 9, true>(
+                sconf, logger, prec, mat, b.values, x.values, workspace_data,
+                block_size, shared_size);
+        } else {
+            switch (sconf.n_shared) {
+            case 0:
+                launch_apply_kernel<StopType, 0, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 1:
+                launch_apply_kernel<StopType, 1, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 2:
+                launch_apply_kernel<StopType, 2, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 3:
+                launch_apply_kernel<StopType, 3, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 4:
+                launch_apply_kernel<StopType, 4, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 5:
+                launch_apply_kernel<StopType, 5, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 6:
+                launch_apply_kernel<StopType, 6, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 7:
+                launch_apply_kernel<StopType, 7, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 8:
+                launch_apply_kernel<StopType, 8, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 9:
+                launch_apply_kernel<StopType, 9, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            default:
+                GKO_NOT_IMPLEMENTED;
+            }
+        }
+    }
+
+private:
+    std::shared_ptr<const DefaultExecutor> exec_;
+    const settings<remove_complex<value_type>> settings_;
+};
+
+
+template <typename ValueType>
+void apply(std::shared_ptr<const DefaultExecutor> exec,
+           const settings<remove_complex<ValueType>>& settings,
+           const batch::BatchLinOp* const mat,
+           const batch::BatchLinOp* const precon,
+           const batch::MultiVector<ValueType>* const b,
+           batch::MultiVector<ValueType>* const x,
+           batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
+{
+    using cu_value_type = cuda_type<ValueType>;
+    auto dispatcher = batch::solver::create_dispatcher<ValueType>(
+        kernel_caller<cu_value_type>(exec, settings), settings, mat, precon);
+    dispatcher.apply(b, x, logdata);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL);
+
+
+}  // namespace batch_bicgstab
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh
index a8b134cebf2..6dbd65968d0 100644
--- a/cuda/solver/common_trs_kernels.cuh
+++ b/cuda/solver/common_trs_kernels.cuh
@@ -55,9 +55,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/base/pointer_mode_guard.hpp"
 #include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
+#include "cuda/components/memory.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
-#include "cuda/components/volatile.cuh"
 
 
 namespace gko {
@@ -120,12 +120,13 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
         const auto rows = matrix->get_size()[0];
         // workaround suggested by NVIDIA engineers: for some reason
         // cusparse needs non-nullptr input vectors even for analysis
+        // also make sure they are aligned by 16 bytes
         auto descr_b = cusparse::create_dnmat(
             dim<2>{matrix->get_size()[0], num_rhs}, matrix->get_size()[1],
-            reinterpret_cast<ValueType*>(0xDEAD));
+            reinterpret_cast<ValueType*>(0xDEAD0));
         auto descr_c = cusparse::create_dnmat(
             dim<2>{matrix->get_size()[0], num_rhs}, matrix->get_size()[1],
-            reinterpret_cast<ValueType*>(0xDEAF));
+            reinterpret_cast<ValueType*>(0xDEAF0));
 
         auto work_size = cusparse::spsm_buffer_size(
             handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
@@ -198,7 +199,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
 };
 
 
-#elif (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020))
+#else
 
 template <typename ValueType, typename IndexType>
 struct CudaSolveStruct : gko::solver::SolveStruct {
@@ -425,30 +426,31 @@ __global__ void sptrsv_naive_caching_kernel(
                                              : dependency * nrhs + rhs;
         const bool shmem_possible =
             (dependency_gid / default_block_size) == self_shmem_id;
+        ValueType val{};
         if (shmem_possible) {
             const auto dependency_shid = dependency_gid % default_block_size;
-            x_p = &x_s[dependency_shid];
-        }
-
-        ValueType x = *x_p;
-        while (is_nan(x)) {
-            x = load(x_p, 0);
+            while (is_nan(val = load_relaxed_shared(x_s + dependency_shid))) {
+            }
+        } else {
+            while (
+                is_nan(val = load_relaxed(x + dependency * x_stride + rhs))) {
+            }
         }
 
-        sum += x * vals[i];
+        sum += val * vals[i];
     }
 
     // The first entry past the triangular part will be the diagonal
     const auto diag = unit_diag ? one<ValueType>() : vals[i];
     const auto r = (b[row * b_stride + rhs] - sum) / diag;
 
-    store(x_s, self_shid, r);
-    x[row * x_stride + rhs] = r;
+    store_relaxed_shared(x_s + self_shid, r);
+    store_relaxed(x + row * x_stride + rhs, r);
 
-    // This check to ensure no infinte loops happen.
+    // This check to ensure no infinite loops happen.
     if (is_nan(r)) {
-        store(x_s, self_shid, zero<ValueType>());
-        x[row * x_stride + rhs] = zero<ValueType>();
+        store_relaxed_shared(x_s + self_shid, zero<ValueType>());
+        store_relaxed(x + row * x_stride + rhs, zero<ValueType>());
         *nan_produced = true;
     }
 }
@@ -487,12 +489,12 @@ __global__ void sptrsv_naive_legacy_kernel(
     auto j = row_begin;
     auto col = colidxs[j];
     while (j != row_end) {
-        auto x_val = load(x, col * x_stride + rhs);
+        auto x_val = load_relaxed(x + col * x_stride + rhs);
         while (!is_nan(x_val)) {
             sum += vals[j] * x_val;
             j += row_step;
             col = colidxs[j];
-            x_val = load(x, col * x_stride + rhs);
+            x_val = load_relaxed(x + col * x_stride + rhs);
         }
         // to avoid the kernel hanging on matrices without diagonal,
         // we bail out if we are past the triangle, even if it's not
@@ -502,12 +504,12 @@ __global__ void sptrsv_naive_legacy_kernel(
             // assert(row == col);
             auto diag = unit_diag ? one<ValueType>() : vals[j];
             const auto r = (b[row * b_stride + rhs] - sum) / diag;
-            store(x, row * x_stride + rhs, r);
+            store_relaxed(x + row * x_stride + rhs, r);
             // after we encountered the diagonal, we are done
             // this also skips entries outside the triangle
             j = row_end;
             if (is_nan(r)) {
-                store(x, row * x_stride + rhs, zero<ValueType>());
+                store_relaxed(x + row * x_stride + rhs, zero<ValueType>());
                 *nan_produced = true;
             }
         }
diff --git a/cuda/solver/idr_kernels.cu b/cuda/solver/idr_kernels.cu
index 10e8a7b2fc3..7bfe56987f4 100644
--- a/cuda/solver/idr_kernels.cu
+++ b/cuda/solver/idr_kernels.cu
@@ -104,6 +104,7 @@ void initialize_subspace_vectors(std::shared_ptr<const DefaultExecutor> exec,
             gen,
             subspace_vectors->get_size()[0] * subspace_vectors->get_stride(),
             0.0, 1.0, subspace_vectors->get_values());
+        curand::destroy(gen);
     }
 }
 
diff --git a/cuda/components/volatile.cuh b/cuda/stop/batch_criteria.cuh
similarity index 88%
rename from cuda/components/volatile.cuh
rename to cuda/stop/batch_criteria.cuh
index 96cb869c57e..d804ee8100e 100644
--- a/cuda/components/volatile.cuh
+++ b/cuda/stop/batch_criteria.cuh
@@ -30,29 +30,25 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#ifndef GKO_CUDA_COMPONENTS_VOLATILE_CUH_
-#define GKO_CUDA_COMPONENTS_VOLATILE_CUH_
-
-
-#include <type_traits>
+#ifndef GKO_CUDA_STOP_BATCH_CRITERIA_CUH_
+#define GKO_CUDA_STOP_BATCH_CRITERIA_CUH_
 
 
 #include <ginkgo/core/base/math.hpp>
 
 
-#include "cuda/base/types.hpp"
-
-
 namespace gko {
 namespace kernels {
 namespace cuda {
+namespace batch_stop {
 
 
-#include "common/cuda_hip/components/volatile.hpp.inc"
+#include "common/cuda_hip/stop/batch_criteria.hpp.inc"
 
 
+}  // namespace batch_stop
 }  // namespace cuda
 }  // namespace kernels
 }  // namespace gko
 
-#endif  // GKO_CUDA_COMPONENTS_VOLATILE_CUH_
+#endif  // GKO_CUDA_STOP_BATCH_CRITERIA_CUH_
diff --git a/cuda/stop/residual_norm_kernels.cu b/cuda/stop/residual_norm_kernels.cu
index 39280b19f69..96e0667d06e 100644
--- a/cuda/stop/residual_norm_kernels.cu
+++ b/cuda/stop/residual_norm_kernels.cu
@@ -67,7 +67,7 @@ __global__ __launch_bounds__(default_block_size) void residual_norm_kernel(
 {
     const auto tidx = thread::get_thread_id_flat();
     if (tidx < num_cols) {
-        if (tau[tidx] < rel_residual_goal * orig_tau[tidx]) {
+        if (tau[tidx] <= rel_residual_goal * orig_tau[tidx]) {
             stop_status[tidx].converge(stoppingId, setFinalized);
             device_storage[1] = true;
         }
@@ -148,7 +148,7 @@ __launch_bounds__(default_block_size) void implicit_residual_norm_kernel(
 {
     const auto tidx = thread::get_thread_id_flat();
     if (tidx < num_cols) {
-        if (sqrt(abs(tau[tidx])) < rel_residual_goal * orig_tau[tidx]) {
+        if (sqrt(abs(tau[tidx])) <= rel_residual_goal * orig_tau[tidx]) {
             stop_status[tidx].converge(stoppingId, setFinalized);
             device_storage[1] = true;
         }
diff --git a/cuda/test/base/CMakeLists.txt b/cuda/test/base/CMakeLists.txt
index 9be3caf9faa..d4260c6e934 100644
--- a/cuda/test/base/CMakeLists.txt
+++ b/cuda/test/base/CMakeLists.txt
@@ -1,13 +1,13 @@
-ginkgo_create_cuda_test(array)
+ginkgo_create_test(array RESOURCE_TYPE cudagpu)
 ginkgo_create_cuda_test(cuda_executor)
-ginkgo_create_test(index_set)
-ginkgo_create_test(cuda_executor_reset ADDITIONAL_LIBRARIES Threads::Threads)
+ginkgo_create_test(index_set RESOURCE_TYPE cudagpu)
 if(GINKGO_HAVE_HWLOC)
     find_package(NUMA REQUIRED)
     ginkgo_create_cuda_test(cuda_executor_topology ADDITIONAL_LIBRARIES NUMA::NUMA)
 endif()
 ginkgo_create_cuda_test(exception_helpers)
 ginkgo_create_cuda_test(kernel_launch)
-ginkgo_create_cuda_test(lin_op)
+ginkgo_create_test(lin_op RESOURCE_TYPE cudagpu)
 ginkgo_create_cuda_test(math)
+ginkgo_create_test(memory RESOURCE_TYPE cudagpu)
 ginkgo_create_cuda_test(scoped_device_id)
diff --git a/cuda/test/base/array.cu b/cuda/test/base/array.cpp
similarity index 100%
rename from cuda/test/base/array.cu
rename to cuda/test/base/array.cpp
diff --git a/cuda/test/base/cuda_executor.cu b/cuda/test/base/cuda_executor.cu
index 5f489ac22f0..83cfd1827ad 100644
--- a/cuda/test/base/cuda_executor.cu
+++ b/cuda/test/base/cuda_executor.cu
@@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/stream.hpp>
 
 #include "common/cuda_hip/base/executor.hpp.inc"
 #include "cuda/base/scoped_device_id.hpp"
@@ -92,7 +93,7 @@ protected:
           stream(0),
           other_stream(gko::CudaExecutor::get_num_devices() - 1),
 #endif
-          omp(gko::OmpExecutor::create()),
+          ref(gko::ReferenceExecutor::create()),
           cuda(nullptr),
           cuda2(nullptr),
           cuda3(nullptr)
@@ -103,18 +104,19 @@ protected:
         ASSERT_GT(gko::CudaExecutor::get_num_devices(), 0);
 #ifdef GKO_TEST_NONDEFAULT_STREAM
         cuda = gko::CudaExecutor::create(
-            0, omp, false, gko::default_cuda_alloc_mode, stream.get());
+            0, ref, std::make_shared<gko::CudaAllocator>(), stream.get());
         cuda2 = gko::CudaExecutor::create(
-            gko::CudaExecutor::get_num_devices() - 1, omp, false,
-            gko::default_cuda_alloc_mode, other_stream.get());
+            gko::CudaExecutor::get_num_devices() - 1, ref,
+            std::make_shared<gko::CudaAllocator>(), other_stream.get());
         cuda3 = gko::CudaExecutor::create(
-            0, omp, false, gko::allocation_mode::unified_global, stream.get());
+            0, ref, std::make_shared<gko::CudaUnifiedAllocator>(0),
+            stream.get());
 #else
-        cuda = gko::CudaExecutor::create(0, omp);
+        cuda = gko::CudaExecutor::create(0, ref);
         cuda2 = gko::CudaExecutor::create(
-            gko::CudaExecutor::get_num_devices() - 1, omp);
-        cuda3 = gko::CudaExecutor::create(0, omp, false,
-                                          gko::allocation_mode::unified_global);
+            gko::CudaExecutor::get_num_devices() - 1, ref);
+        cuda3 = gko::CudaExecutor::create(
+            0, ref, std::make_shared<gko::CudaUnifiedAllocator>(0));
 #endif
     }
 
@@ -130,7 +132,7 @@ protected:
     gko::cuda_stream stream;
     gko::cuda_stream other_stream;
 #endif
-    std::shared_ptr<gko::Executor> omp;
+    std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<gko::CudaExecutor> cuda;
     std::shared_ptr<gko::CudaExecutor> cuda2;
     std::shared_ptr<gko::CudaExecutor> cuda3;
@@ -139,8 +141,8 @@ protected:
 
 TEST_F(CudaExecutor, CanInstantiateTwoExecutorsOnOneDevice)
 {
-    auto cuda = gko::CudaExecutor::create(0, omp);
-    auto cuda2 = gko::CudaExecutor::create(0, omp);
+    auto cuda = gko::CudaExecutor::create(0, ref);
+    auto cuda2 = gko::CudaExecutor::create(0, ref);
 
     // We want automatic deinitialization to not create any error
 }
@@ -195,7 +197,7 @@ TEST_F(CudaExecutor, CopiesDataToCuda)
     int orig[] = {3, 8};
     auto* copy = cuda->alloc<int>(2);
 
-    cuda->copy_from(omp, 2, orig, copy);
+    cuda->copy_from(ref, 2, orig, copy);
 
     check_data<<<1, 1, 0, cuda->get_stream()>>>(copy);
     ASSERT_NO_THROW(cuda->synchronize());
@@ -216,7 +218,7 @@ TEST_F(CudaExecutor, CanAllocateOnUnifiedMemory)
     int orig[] = {3, 8};
     auto* copy = cuda3->alloc<int>(2);
 
-    cuda3->copy_from(omp, 2, orig, copy);
+    cuda3->copy_from(ref, 2, orig, copy);
 
     check_data<<<1, 1, 0, cuda3->get_stream()>>>(copy);
     ASSERT_NO_THROW(cuda3->synchronize());
@@ -238,7 +240,7 @@ TEST_F(CudaExecutor, CopiesDataFromCuda)
     auto orig = cuda->alloc<int>(2);
     init_data<<<1, 1, 0, cuda->get_stream()>>>(orig);
 
-    omp->copy_from(cuda, 2, orig, copy);
+    ref->copy_from(cuda, 2, orig, copy);
 
     EXPECT_EQ(3, copy[0]);
     ASSERT_EQ(8, copy[1]);
@@ -291,7 +293,7 @@ TEST_F(CudaExecutor, CopiesDataFromCudaToCuda)
     cuda2->run(ExampleOperation(value));
     ASSERT_EQ(value, cuda2->get_device_id());
     // Put the results on OpenMP and run CPU side assertions
-    omp->copy_from(cuda2, 2, copy_cuda2, copy);
+    ref->copy_from(cuda2, 2, copy_cuda2, copy);
     EXPECT_EQ(3, copy[0]);
     ASSERT_EQ(8, copy[1]);
     cuda2->free(copy_cuda2);
diff --git a/cuda/test/base/cuda_executor_topology.cu b/cuda/test/base/cuda_executor_topology.cu
index a0ee6826ded..3b91cc7941a 100644
--- a/cuda/test/base/cuda_executor_topology.cu
+++ b/cuda/test/base/cuda_executor_topology.cu
@@ -60,15 +60,15 @@ namespace {
 class CudaExecutor : public ::testing::Test {
 protected:
     CudaExecutor()
-        : omp(gko::OmpExecutor::create()), cuda(nullptr), cuda2(nullptr)
+        : ref(gko::ReferenceExecutor::create()), cuda(nullptr), cuda2(nullptr)
     {}
 
     void SetUp()
     {
         ASSERT_GT(gko::CudaExecutor::get_num_devices(), 0);
-        cuda = gko::CudaExecutor::create(0, omp);
+        cuda = gko::CudaExecutor::create(0, ref);
         cuda2 = gko::CudaExecutor::create(
-            gko::CudaExecutor::get_num_devices() - 1, omp);
+            gko::CudaExecutor::get_num_devices() - 1, ref);
     }
 
     void TearDown()
@@ -79,7 +79,7 @@ protected:
         }
     }
 
-    std::shared_ptr<gko::Executor> omp;
+    std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::CudaExecutor> cuda;
     std::shared_ptr<const gko::CudaExecutor> cuda2;
 };
@@ -102,7 +102,7 @@ inline int get_core_os_id(int log_id)
 
 TEST_F(CudaExecutor, CanBindToSinglePu)
 {
-    cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+    cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create());
 
     const int bind_pu = 1;
     gko::machine_topology::get_instance()->bind_to_pu(bind_pu);
@@ -114,7 +114,7 @@ TEST_F(CudaExecutor, CanBindToSinglePu)
 
 TEST_F(CudaExecutor, CanBindToPus)
 {
-    cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+    cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create());
 
     std::vector<int> bind_pus = {1, 3};
     gko::machine_topology::get_instance()->bind_to_pus(bind_pus);
@@ -126,7 +126,7 @@ TEST_F(CudaExecutor, CanBindToPus)
 
 TEST_F(CudaExecutor, CanBindToCores)
 {
-    cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+    cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create());
 
     std::vector<int> bind_cores = {1, 3};
     gko::machine_topology::get_instance()->bind_to_cores(bind_cores);
@@ -138,7 +138,7 @@ TEST_F(CudaExecutor, CanBindToCores)
 
 TEST_F(CudaExecutor, ClosestCpusIsPopulated)
 {
-    cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+    cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create());
     auto close_cpus = cuda->get_closest_pus();
     if (close_cpus.size() == 0) {
         GTEST_SKIP();
@@ -150,7 +150,7 @@ TEST_F(CudaExecutor, ClosestCpusIsPopulated)
 
 TEST_F(CudaExecutor, KnowsItsNuma)
 {
-    cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+    cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create());
     auto numa0 = cuda->get_closest_numa();
     auto close_cpus = cuda->get_closest_pus();
     if (close_cpus.size() == 0) {
diff --git a/cuda/test/base/lin_op.cu b/cuda/test/base/lin_op.cpp
similarity index 97%
rename from cuda/test/base/lin_op.cu
rename to cuda/test/base/lin_op.cpp
index 77139c96232..f2ee75e39c6 100644
--- a/cuda/test/base/lin_op.cu
+++ b/cuda/test/base/lin_op.cpp
@@ -144,6 +144,10 @@ class FactoryParameter : public ::testing::Test {
     FactoryParameter() {}
 
 public:
+    // FACTORY_PARAMETER macro needs self, which is usually available in
+    // enable_parameters_type. To reduce complexity, we add self here.
+    GKO_ENABLE_SELF(FactoryParameter);
+
     std::vector<int> GKO_FACTORY_PARAMETER_VECTOR(vector_parameter, 10, 11);
     int GKO_FACTORY_PARAMETER_SCALAR(scalar_parameter, -4);
 };
diff --git a/cuda/test/base/memory.cpp b/cuda/test/base/memory.cpp
new file mode 100644
index 00000000000..a329817f4af
--- /dev/null
+++ b/cuda/test/base/memory.cpp
@@ -0,0 +1,126 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/memory.hpp>
+
+
+#include <memory>
+#include <type_traits>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "cuda/test/utils.hpp"
+
+
+namespace {
+
+
+class Memory : public CudaTestFixture {
+protected:
+    Memory()
+        : host_exec_with_pinned{gko::OmpExecutor::create(
+              std::make_shared<gko::CudaHostAllocator>(0))},
+          host_exec_with_unified{gko::OmpExecutor::create(
+              std::make_shared<gko::CudaUnifiedAllocator>(0))},
+          exec_with_normal{gko::CudaExecutor::create(
+              0, ref, std::make_shared<gko::CudaAllocator>(),
+              exec->get_stream())},
+          exec_with_async{gko::CudaExecutor::create(
+              0, host_exec_with_pinned,
+              std::make_shared<gko::CudaAsyncAllocator>(exec->get_stream()),
+              exec->get_stream())},
+          exec_with_unified{gko::CudaExecutor::create(
+              0, host_exec_with_unified,
+              std::make_shared<gko::CudaUnifiedAllocator>(0),
+              exec->get_stream())}
+    {}
+
+    std::shared_ptr<gko::OmpExecutor> host_exec_with_pinned;
+    std::shared_ptr<gko::OmpExecutor> host_exec_with_unified;
+    std::shared_ptr<gko::CudaExecutor> exec_with_normal;
+    std::shared_ptr<gko::CudaExecutor> exec_with_async;
+    std::shared_ptr<gko::CudaExecutor> exec_with_unified;
+};
+
+
+TEST_F(Memory, DeviceAllocationWorks)
+{
+    gko::array<int> data{exec_with_normal, {1, 2}};
+
+    GKO_ASSERT_ARRAY_EQ(data, I<int>({1, 2}));
+}
+
+
+TEST_F(Memory, AsyncDeviceAllocationWorks)
+{
+    gko::array<int> data{exec_with_async, {1, 2}};
+
+    GKO_ASSERT_ARRAY_EQ(data, I<int>({1, 2}));
+}
+
+
+TEST_F(Memory, UnifiedDeviceAllocationWorks)
+{
+    gko::array<int> data{exec_with_unified, {1, 2}};
+    exec->synchronize();
+
+    ASSERT_EQ(data.get_const_data()[0], 1);
+    ASSERT_EQ(data.get_const_data()[1], 2);
+}
+
+
+TEST_F(Memory, HostUnifiedAllocationWorks)
+{
+    gko::array<int> data{host_exec_with_unified, {1, 2}};
+
+    ASSERT_EQ(data.get_const_data()[0], 1);
+    ASSERT_EQ(data.get_const_data()[1], 2);
+}
+
+
+TEST_F(Memory, HostPinnedAllocationWorks)
+{
+    gko::array<int> data{host_exec_with_pinned, {1, 2}};
+
+    ASSERT_EQ(data.get_const_data()[0], 1);
+    ASSERT_EQ(data.get_const_data()[1], 2);
+}
+
+
+}  // namespace
diff --git a/cuda/test/reorder/CMakeLists.txt b/cuda/test/reorder/CMakeLists.txt
index 108e3b57dd5..79deba957b3 100644
--- a/cuda/test/reorder/CMakeLists.txt
+++ b/cuda/test/reorder/CMakeLists.txt
@@ -1 +1 @@
-ginkgo_create_test(rcm_kernels)
\ No newline at end of file
+ginkgo_create_test(rcm_kernels RESOURCE_TYPE cudagpu)
diff --git a/cuda/test/reorder/rcm_kernels.cpp b/cuda/test/reorder/rcm_kernels.cpp
index 169f26e208e..828c2f8f5bb 100644
--- a/cuda/test/reorder/rcm_kernels.cpp
+++ b/cuda/test/reorder/rcm_kernels.cpp
@@ -49,34 +49,39 @@ class Rcm : public CudaTestFixture {
     using i_type = int;
     using CsrMtx = gko::matrix::Csr<v_type, i_type>;
     using reorder_type = gko::reorder::Rcm<v_type, i_type>;
+    using new_reorder_type = gko::experimental::reorder::Rcm<i_type>;
     using perm_type = gko::matrix::Permutation<i_type>;
 
 
     Rcm()
-        :  // clang-format off
-          p_mtx(gko::initialize<CsrMtx>({{1.0, 2.0, 0.0, -1.3, 2.1},
+        : p_mtx(gko::initialize<CsrMtx>({{1.0, 2.0, 0.0, -1.3, 2.1},
                                          {2.0, 5.0, 1.5, 0.0, 0.0},
                                          {0.0, 1.5, 1.5, 1.1, 0.0},
                                          {-1.3, 0.0, 1.1, 2.0, 0.0},
                                          {2.1, 0.0, 0.0, 0.0, 1.0}},
-                                        exec)),
-          // clang-format on
-          rcm_factory(reorder_type::build().on(exec)),
-          reorder_op(rcm_factory->generate(p_mtx))
+                                        exec))
     {}
 
-    std::unique_ptr<reorder_type::Factory> rcm_factory;
     std::shared_ptr<CsrMtx> p_mtx;
-    std::unique_ptr<reorder_type> reorder_op;
 };
 
 
-TEST_F(Rcm, IsExecutedOnCpuExecutor)
+TEST_F(Rcm, IsEquivalentToRef)
 {
-    // This only executes successfully if computed on cpu executor.
-    auto p = reorder_op->get_permutation();
+    auto reorder_op = reorder_type::build().on(ref)->generate(p_mtx);
+    auto dreorder_op = reorder_type::build().on(exec)->generate(p_mtx);
 
-    ASSERT_TRUE(true);
+    GKO_ASSERT_ARRAY_EQ(dreorder_op->get_permutation_array(),
+                        reorder_op->get_permutation_array());
+}
+
+
+TEST_F(Rcm, IsEquivalentToRefNewInterface)
+{
+    auto reorder_op = new_reorder_type::build().on(ref)->generate(p_mtx);
+    auto dreorder_op = new_reorder_type::build().on(exec)->generate(p_mtx);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(dreorder_op, reorder_op);
 }
 
 
diff --git a/cuda/test/solver/CMakeLists.txt b/cuda/test/solver/CMakeLists.txt
index 0220d94c8d9..f8cd67c1241 100644
--- a/cuda/test/solver/CMakeLists.txt
+++ b/cuda/test/solver/CMakeLists.txt
@@ -1,2 +1,2 @@
-ginkgo_create_test(lower_trs_kernels ADDITIONAL_INCLUDES ${CUDA_INCLUDE_DIRS})
-ginkgo_create_test(upper_trs_kernels ADDITIONAL_INCLUDES ${CUDA_INCLUDE_DIRS})
+ginkgo_create_cuda_test(lower_trs_kernels)
+ginkgo_create_cuda_test(upper_trs_kernels)
diff --git a/cuda/test/solver/lower_trs_kernels.cpp b/cuda/test/solver/lower_trs_kernels.cu
similarity index 100%
rename from cuda/test/solver/lower_trs_kernels.cpp
rename to cuda/test/solver/lower_trs_kernels.cu
diff --git a/cuda/test/solver/upper_trs_kernels.cpp b/cuda/test/solver/upper_trs_kernels.cu
similarity index 100%
rename from cuda/test/solver/upper_trs_kernels.cpp
rename to cuda/test/solver/upper_trs_kernels.cu
diff --git a/cuda/test/utils.hpp b/cuda/test/utils.hpp
index 814405ba0d9..d13e364d66a 100644
--- a/cuda/test/utils.hpp
+++ b/cuda/test/utils.hpp
@@ -38,33 +38,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/stream.hpp>
 
 
+#include "core/test/gtest/resources.hpp"
 #include "cuda/base/device.hpp"
 
 
 namespace {
 
 
-class CudaEnvironment : public ::testing::Environment {
-public:
-    void TearDown() override { gko::kernels::cuda::reset_device(0); }
-};
-
-testing::Environment* cuda_env =
-    testing::AddGlobalTestEnvironment(new CudaEnvironment);
-
-
 class CudaTestFixture : public ::testing::Test {
 protected:
     CudaTestFixture()
         : ref(gko::ReferenceExecutor::create()),
 #ifdef GKO_TEST_NONDEFAULT_STREAM
-          exec(gko::CudaExecutor::create(
-              0, ref, false, gko::default_cuda_alloc_mode, stream.get()))
-#else
-          exec(gko::CudaExecutor::create(0, ref))
+          stream(ResourceEnvironment::cuda_device_id),
 #endif
+          exec(gko::CudaExecutor::create(
+              ResourceEnvironment::cuda_device_id, ref,
+              std::make_shared<gko::CudaAllocator>(), stream.get())),
+          guard(exec->get_scoped_device_id_guard())
     {}
 
     void TearDown()
@@ -75,11 +69,10 @@ class CudaTestFixture : public ::testing::Test {
         }
     }
 
-#ifdef GKO_TEST_NONDEFAULT_STREAM
     gko::cuda_stream stream;
-#endif
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<gko::CudaExecutor> exec;
+    gko::scoped_device_id_guard guard;
 };
 
 
diff --git a/cuda/test/utils/CMakeLists.txt b/cuda/test/utils/CMakeLists.txt
index 06dffda5da0..28f5770856f 100644
--- a/cuda/test/utils/CMakeLists.txt
+++ b/cuda/test/utils/CMakeLists.txt
@@ -1 +1 @@
-ginkgo_create_test(assertions_test)
+ginkgo_create_cuda_test(assertions_test)
diff --git a/cuda/test/utils/assertions_test.cpp b/cuda/test/utils/assertions_test.cu
similarity index 100%
rename from cuda/test/utils/assertions_test.cpp
rename to cuda/test/utils/assertions_test.cu
diff --git a/dev_tools/oneapi/convert_source.sh b/dev_tools/oneapi/convert_source.sh
index f6983dd500a..7aabca6f17d 100755
--- a/dev_tools/oneapi/convert_source.sh
+++ b/dev_tools/oneapi/convert_source.sh
@@ -3,7 +3,7 @@
 # convert_source.sh converts cuda (and c++ code) to dpcpp code with ginkgo design.
 
 # Usage:
-# EnvironementSet ./dev_tools/oneapi/convert_source.sh <the_file_in_cuda>
+# EnvironmentSet ./dev_tools/oneapi/convert_source.sh <the_file_in_cuda>
 # <the_file_in_cuda> can be .hpp/.cpp/.cu/.cuh
 
 # the following are parameters set by environment variables
@@ -18,7 +18,7 @@
 #   ROOT_BUILD_DIR: the complete path for build folder. The default is "${ROOT_DIR}/${BUILD_DIR}"
 #   GTEST_HEADER_DIR: the gtest header folder. The default is "${ROOT_BUILD_DIR}/_deps/googletest-src/googletest/include"
 #   CLANG_FORMAT: the clang-format exec. The default is "clang-format"
-#   VERBOSE: if it is set as 1, script will ouput the path information
+#   VERBOSE: if it is set as 1, script will output the path information
 CURRENT_DIR="$( pwd )"
 cd "$( dirname "${BASH_SOURCE[0]}" )"
 SCRIPT_DIR="$( pwd )"
@@ -33,7 +33,7 @@ GTEST_HEADER_DIR="${GTEST_HEADER_DIR:="${ROOT_BUILD_DIR}/_deps/googletest-src/go
 CLANG_FORMAT=${CLANG_FORMAT:="clang-format"}
 if [[ "${VERBOSE}" == 1 ]]; then
     echo "#####################"
-    echo "# Enviroment Setting:"
+    echo "# Environment Setting:"
     echo "CURRENT_DIR ${CURRENT_DIR}"
     echo "SCRIPT_DIR ${SCRIPT_DIR}"
     echo "ROOT_DIR ${ROOT_DIR}"
@@ -262,7 +262,7 @@ rm "${OUTPUT_FOLDER}/${OUTPUT_FILE}"
 rm "${OUTPUT_FOLDER}/${OUTPUT_FILE}.dp.cpp"
 
 # Call DPCT
-echo "# Call DPCT on the previosly generated file."
+echo "# Call DPCT on the previously generated file."
 echo "############################################"
 dpct --extra-arg="-std=c++14" --extra-arg="-I ${ROOT_DIR}" --extra-arg="-I ${ROOT_DIR}/include" --extra-arg="-I ${ROOT_BUILD_DIR}/include" --extra-arg="-I ${ROOT_DIR}/dev_tools/oneapi" --extra-arg="-I ${GTEST_HEADER_DIR}" --cuda-include-path="${CUDA_HEADER_DIR}" --format-range=none ${OUTPUT_FILE} --suppress-warnings=1049 --out-root=${OUTPUT_FOLDER}
 echo "############################################"
diff --git a/dev_tools/scripts/config b/dev_tools/scripts/config
index 03b160e3656..79e6a227530 100644
--- a/dev_tools/scripts/config
+++ b/dev_tools/scripts/config
@@ -51,6 +51,7 @@
 - "common/unified/.*.cpp"
     - PathIgnore: "2"
     - PathPrefix: "core"
+    - CoreSuffix: "\.template"
 - "core/test/base/(extended_float|iterator_factory)"
     - RemoveTest: "true"
 - "core/test/base/allocator"
@@ -96,3 +97,4 @@
 - ".*"
     - PathPrefix: "core"
     - PathIgnore: "1"
+    - CoreSuffix: "\.template"
diff --git a/dev_tools/scripts/format_header.sh b/dev_tools/scripts/format_header.sh
index a501b6f97d2..2437a03d623 100755
--- a/dev_tools/scripts/format_header.sh
+++ b/dev_tools/scripts/format_header.sh
@@ -266,7 +266,7 @@ while IFS='' read -r line || [ -n "$line" ]; do
             echo "${line}" >> "${CONTENT}"
             SKIP="false"
             if [[ "${line}" =~ $START_BLOCK_REX ]]; then
-                # keep everythin in #if block and /* block
+                # keep everything in #if block and /* block
                 IN_BLOCK=$((IN_BLOCK+1))
                 if [ -z "${ALARM}" ]; then
                     ALARM="set"
@@ -291,13 +291,13 @@ if [ "${ALARM}" = "true" ]; then
     echo "Warning $1: sorting is probably incorrect"
 fi
 
-# Wrtie license
+# Write license
 echo "/*${GINKGO_LICENSE_BEACON}" > "$1"
 cat LICENSE >> "$1"
 echo "${GINKGO_LICENSE_BEACON}*/" >> "$1"
 echo "" >> "$1"
 
-# Wrtie the definition of header according to path
+# Write the definition of header according to path
 if [ -n "${IFNDEF}" ] && [ -n "${DEFINE}" ]; then
     IFNDEF="#ifndef ${HEADER_DEF}"
     DEFINE="#define ${HEADER_DEF}"
diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py
new file mode 100755
index 00000000000..1a4987be847
--- /dev/null
+++ b/dev_tools/scripts/generate_cuda_memory_ptx.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+import dataclasses
+
+
+@dataclasses.dataclass
+class space:
+    ptx_space_suffix: str
+    ptx_scope_suffix: str
+    fn_suffix: str
+    ptr_expr: str
+    ptr_constraint: str
+
+
+@dataclasses.dataclass
+class ordering:
+    ptx_load_suffix: str
+    fn_load_suffix: str
+    ptx_store_suffix: str
+    fn_store_suffix: str
+    is_relaxed: bool
+
+
+@dataclasses.dataclass
+class type_desc:
+    ptx_type_suffix: str
+    val_constraint: str
+    name: str
+
+
+memory_spaces = [
+    space(ptx_space_suffix=".shared", ptx_scope_suffix=".cta", fn_suffix="_shared",
+          ptr_expr="convert_generic_ptr_to_smem_ptr({ptr})", ptr_constraint="r"),
+    space(ptx_space_suffix="", ptx_scope_suffix=".gpu", fn_suffix="", ptr_expr="{ptr}", ptr_constraint="l")]
+memory_orderings = [
+    ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed",
+             ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True),
+    ordering(ptx_load_suffix=".acquire", fn_load_suffix="_acquire",
+             ptx_store_suffix=".release", fn_store_suffix="_release", is_relaxed=False)
+]
+types = [type_desc(ptx_type_suffix=".s32", val_constraint="r", name="int32"),
+         type_desc(ptx_type_suffix=".s64", val_constraint="l", name="int64"),
+         type_desc(ptx_type_suffix=".f32", val_constraint="f", name="float"),
+         type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")]
+# header
+print("""/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CUDA_COMPONENTS_MEMORY_CUH_
+#define GKO_CUDA_COMPONENTS_MEMORY_CUH_
+
+
+#include <type_traits>
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "cuda/base/types.hpp"
+
+
+// this file is generated by dev_tools/scripts/generate_cuda_memory_ptx.py
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+
+
+/**
+ * Transforms a generic CUDA pointer pointing to shared memory to a
+ * shared memory pointer for use in PTX assembly.
+ * CUDA PTX assembly uses 32bit pointers for shared memory addressing.
+ * The result is undefined for a generic pointer pointing to anything but
+ * shared memory.
+ */
+__device__ __forceinline__ uint32 convert_generic_ptr_to_smem_ptr(void* ptr)
+{
+// see
+// https://github.com/NVIDIA/cutlass/blob/
+//     6fc5008803fe4e81b81a836fcd3a88258f4e5bbf/
+//     include/cutlass/arch/memory_sm75.h#L90
+// for reasoning behind this implementation
+#if (!defined(__clang__) && __CUDACC_VER_MAJOR__ >= 11)
+    return static_cast<uint32>(__cvta_generic_to_shared(ptr));
+#elif (!defined(__clang__) && CUDACC_VER_MAJOR__ == 10 && \
+       __CUDACC_VER_MINOR__ >= 2)
+    return __nvvm_get_smem_pointer(ptr);
+#else
+    uint32 smem_ptr;
+    asm("{{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 "
+        "%0, smem_ptr; }}"
+        : "=r"(smem_ptr)
+        : "l"(ptr));
+    return smem_ptr;
+#endif
+}
+
+
+__device__ __forceinline__ void membar_acq_rel()
+{
+#if __CUDA_ARCH__ < 700
+    asm volatile("membar.gl;" ::: "memory");
+#else
+    asm volatile("fence.acq_rel.gpu;" ::: "memory");
+#endif
+}
+
+
+__device__ __forceinline__ void membar_acq_rel_shared()
+{
+#if __CUDA_ARCH__ < 700
+    asm volatile("membar.cta;" ::: "memory");
+#else
+    asm volatile("fence.acq_rel.cta;" ::: "memory");
+#endif
+}
+
+
+#include "common/cuda_hip/components/memory.hpp.inc"
+""")
+
+# relaxed
+for s in memory_spaces:
+    for o in memory_orderings:
+        for t in types:
+            membar_expression = "" if o.is_relaxed else f"membar_acq_rel{s.fn_suffix}();"
+            const_ptr_expr = s.ptr_expr.format(
+                ptr=f"const_cast<{t.name}*>(ptr)")
+            mut_ptr_expr = s.ptr_expr.format(ptr="ptr")
+            print(f"""
+__device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr)
+{{
+    {t.name} result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];"
+                 : "={t.val_constraint}"(result)
+                 : "{s.ptr_constraint}"({const_ptr_expr})
+                 : "memory");
+#else
+    asm volatile("ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} %0, [%1];"
+                 : "={t.val_constraint}"(result)
+                 : "{s.ptr_constraint}"({const_ptr_expr})
+                 : "memory");
+#endif
+    {membar_expression}
+    return result;
+}}
+
+
+__device__ __forceinline__ void store{o.fn_store_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} result)
+{{
+    {membar_expression}
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;"
+                 :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result)
+                 : "memory");
+#else
+    asm volatile("st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_type_suffix} [%0], %1;"
+                 :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(result)
+                 : "memory");
+#endif
+}}
+""")
+
+# vectorized relaxed loads for thrust::complex
+types = [type_desc(ptx_type_suffix=".f32", val_constraint="f", name="float"),
+         type_desc(ptx_type_suffix=".f64", val_constraint="d", name="double")]
+for s in memory_spaces:
+    for t in types:
+        const_ptr_expr = s.ptr_expr.format(
+            ptr=f"const_cast<thrust::complex<{t.name}>*>(ptr)")
+        mut_ptr_expr = s.ptr_expr.format(ptr="ptr")
+        print(f"""
+__device__ __forceinline__ thrust::complex<{t.name}> load_relaxed{s.fn_suffix}(const thrust::complex<{t.name}>* ptr)
+{{
+    {t.name} real_result;
+    {t.name} imag_result;
+#if __CUDA_ARCH__ < 700
+    asm volatile("ld.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];"
+                 : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result)
+                 : "{s.ptr_constraint}"({const_ptr_expr})
+                 : "memory");
+#else
+    asm volatile("ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} {{%0, %1}}, [%2];"
+                 : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result)
+                 : "{s.ptr_constraint}"({const_ptr_expr})
+                 : "memory");
+#endif
+    return thrust::complex<{t.name}>{{real_result, imag_result}};
+}}
+
+
+__device__ __forceinline__ void store_relaxed{s.fn_suffix}(thrust::complex<{t.name}>* ptr, thrust::complex<{t.name}> result)
+{{
+    auto real_result = result.real();
+    auto imag_result = result.imag();
+#if __CUDA_ARCH__ < 700
+    asm volatile("st.volatile{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};"
+                 :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result)
+                 : "memory");
+#else
+    asm volatile("st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_type_suffix} [%0], {{%1, %2}};"
+                 :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result)
+                 : "memory");
+#endif
+}}
+""")
+
+print("""
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_CUDA_COMPONENTS_MEMORY_CUH_
+""")
diff --git a/dev_tools/scripts/regroup b/dev_tools/scripts/regroup
index 85eade99289..e35bd37efee 100644
--- a/dev_tools/scripts/regroup
+++ b/dev_tools/scripts/regroup
@@ -1,6 +1,6 @@
 IncludeBlocks: Regroup
 IncludeCategories:
-  - Regex: '^<(rapidjson|gflags|gtest|papi).*'
+  - Regex: '^<(nlohmann|gflags|gtest|papi).*'
     Priority: 3
   - Regex: '^<(omp|cu|hip|thrust|CL/|cooperative|oneapi|mpi|nvToolsExt).*'
     Priority: 2
diff --git a/devices/cuda/executor.cpp b/devices/cuda/executor.cpp
index d066d272f81..3789274c3f3 100644
--- a/devices/cuda/executor.cpp
+++ b/devices/cuda/executor.cpp
@@ -64,31 +64,4 @@ bool CudaExecutor::verify_memory_to(const HipExecutor* dest_exec) const
 }
 
 
-void CudaExecutor::increase_num_execs(unsigned device_id)
-{
-#ifdef GKO_COMPILING_CUDA_DEVICE
-    // increase the Cuda Device count only when ginkgo build cuda
-    std::lock_guard<std::mutex> guard(nvidia_device::get_mutex(device_id));
-    nvidia_device::get_num_execs(device_id)++;
-#endif  // GKO_COMPILING_CUDA_DEVICE
-}
-
-
-void CudaExecutor::decrease_num_execs(unsigned device_id)
-{
-#ifdef GKO_COMPILING_CUDA_DEVICE
-    // increase the Cuda Device count only when ginkgo build cuda
-    std::lock_guard<std::mutex> guard(nvidia_device::get_mutex(device_id));
-    nvidia_device::get_num_execs(device_id)--;
-#endif  // GKO_COMPILING_CUDA_DEVICE
-}
-
-
-unsigned CudaExecutor::get_num_execs(unsigned device_id)
-{
-    std::lock_guard<std::mutex> guard(nvidia_device::get_mutex(device_id));
-    return nvidia_device::get_num_execs(device_id);
-}
-
-
 }  // namespace gko
diff --git a/devices/hip/executor.cpp b/devices/hip/executor.cpp
index 60efb4c53a3..b044074c19e 100644
--- a/devices/hip/executor.cpp
+++ b/devices/hip/executor.cpp
@@ -61,38 +61,4 @@ bool HipExecutor::verify_memory_to(const CudaExecutor* dest_exec) const
 }
 
 
-#if (GINKGO_HIP_PLATFORM_NVCC == 1)
-using hip_device_class = nvidia_device;
-#else
-using hip_device_class = amd_device;
-#endif
-
-
-void HipExecutor::increase_num_execs(int device_id)
-{
-#ifdef GKO_COMPILING_HIP_DEVICE
-    // increase the HIP Device count only when ginkgo build hip
-    std::lock_guard<std::mutex> guard(hip_device_class::get_mutex(device_id));
-    hip_device_class::get_num_execs(device_id)++;
-#endif  // GKO_COMPILING_HIP_DEVICE
-}
-
-
-void HipExecutor::decrease_num_execs(int device_id)
-{
-#ifdef GKO_COMPILING_HIP_DEVICE
-    // increase the HIP Device count only when ginkgo build hip
-    std::lock_guard<std::mutex> guard(hip_device_class::get_mutex(device_id));
-    hip_device_class::get_num_execs(device_id)--;
-#endif  // GKO_COMPILING_HIP_DEVICE
-}
-
-
-int HipExecutor::get_num_execs(int device_id)
-{
-    std::lock_guard<std::mutex> guard(hip_device_class::get_mutex(device_id));
-    return hip_device_class::get_num_execs(device_id);
-}
-
-
 }  // namespace gko
diff --git a/devices/omp/executor.cpp b/devices/omp/executor.cpp
index 352216f7633..f8e700bc2d5 100644
--- a/devices/omp/executor.cpp
+++ b/devices/omp/executor.cpp
@@ -55,7 +55,10 @@ void OmpExecutor::populate_exec_info(const machine_topology* mach_topo)
 }
 
 
-void OmpExecutor::raw_free(void* ptr) const noexcept { std::free(ptr); }
+void OmpExecutor::raw_free(void* ptr) const noexcept
+{
+    return alloc_->deallocate(ptr);
+}
 
 
 std::shared_ptr<Executor> OmpExecutor::get_master() noexcept
@@ -72,7 +75,7 @@ std::shared_ptr<const Executor> OmpExecutor::get_master() const noexcept
 
 void* OmpExecutor::raw_alloc(size_type num_bytes) const
 {
-    return GKO_ENSURE_ALLOCATED(std::malloc(num_bytes), "OMP", num_bytes);
+    return alloc_->allocate(num_bytes);
 }
 
 
diff --git a/devices/reference/dummy.cpp b/devices/reference/dummy.cpp
index 210666655f7..6ab5dde07f3 100644
--- a/devices/reference/dummy.cpp
+++ b/devices/reference/dummy.cpp
@@ -31,4 +31,4 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
 // Remove this file once there is at least one source file in
-// ginkgo_referece_device
+// ginkgo_reference_device
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 8b975bb6544..8965b42add4 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 include(helpers.cmake)
 find_package(Doxygen REQUIRED)
 find_package(Perl REQUIRED)
diff --git a/doc/examples/examples.hpp.in b/doc/examples/examples.hpp.in
index 5e685e2aa7b..7234a3ca8aa 100644
--- a/doc/examples/examples.hpp.in
+++ b/doc/examples/examples.hpp.in
@@ -64,7 +64,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  * <ol>
  *   <li> <code>-DGINKGO_BUILD_CUDA=ON</code> option for NVIDIA GPUs.
  *   <li> <code>-DGINKGO_BUILD_HIP=ON</code> option for AMD or NVIDIA GPUs.
- *   <li> <code>-DGINKGO_BUILD_DPCPP=ON</code> option for Intel GPUs (and
+ *   <li> <code>-DGINKGO_BUILD_SYCL=ON</code> option for Intel GPUs (and
  *        possibly any other platform).
  * </ol>
  *
@@ -212,7 +212,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *   <tr valign="top">
  *       <td>@ref heat_equation</td>
  *       <td> Solving a 2D heat equation and showing matrix assembly, vector
- *            initalization and solver setup in a more complex setting with
+ *            initialization and solver setup in a more complex setting with
  *            output visualization.
  *       </td></tr>
  *
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 31b5e0543ba..7499bca97a5 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -1,11 +1,15 @@
-find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}")
-set(GINKGO_MKL_ROOT "${MKL_ROOT}" PARENT_SCOPE)
-find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}")
-set(GINKGO_DPL_ROOT "${DPL_ROOT}" PARENT_SCOPE)
+find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}" "$ENV{MKL_ROOT}")
+find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}" "$ENV{DPLROOT}")
+# use the parameter from cmake 
+set(GINKGO_MKL_ROOT "${MKL_DIR}" PARENT_SCOPE)
+set(GINKGO_DPL_ROOT "${oneDPL_DIR}" PARENT_SCOPE)
 
+include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
+add_instantiation_files(${PROJECT_SOURCE_DIR}/common/unified matrix/dense_kernels.instantiate.cpp DENSE_INSTANTIATE)
 add_library(ginkgo_dpcpp $<TARGET_OBJECTS:ginkgo_dpcpp_device> "")
 target_sources(ginkgo_dpcpp
     PRIVATE
+    base/batch_multi_vector_kernels.dp.cpp
     base/device_matrix_data_kernels.dp.cpp
     base/executor.dp.cpp
     base/helper.dp.cpp
@@ -15,6 +19,7 @@ target_sources(ginkgo_dpcpp
     base/version.dp.cpp
     components/prefix_sum_kernels.dp.cpp
     distributed/matrix_kernels.dp.cpp
+    distributed/partition_helpers_kernels.dp.cpp
     distributed/partition_kernels.dp.cpp
     distributed/vector_kernels.dp.cpp
     factorization/cholesky_kernels.dp.cpp
@@ -31,6 +36,8 @@ target_sources(ginkgo_dpcpp
     factorization/par_ilut_select_kernel.dp.cpp
     factorization/par_ilut_spgeam_kernel.dp.cpp
     factorization/par_ilut_sweep_kernel.dp.cpp
+    matrix/batch_dense_kernels.dp.cpp
+    matrix/batch_ell_kernels.dp.cpp
     matrix/coo_kernels.dp.cpp
     matrix/csr_kernels.dp.cpp
     matrix/fbcsr_kernels.dp.cpp
@@ -47,6 +54,7 @@ target_sources(ginkgo_dpcpp
     preconditioner/jacobi_kernels.dp.cpp
     preconditioner/jacobi_simple_apply_kernel.dp.cpp
     reorder/rcm_kernels.dp.cpp
+    solver/batch_bicgstab_kernels.dp.cpp
     solver/cb_gmres_kernels.dp.cpp
     solver/idr_kernels.dp.cpp
     solver/lower_trs_kernels.dp.cpp
@@ -55,6 +63,7 @@ target_sources(ginkgo_dpcpp
     stop/criterion_kernels.dp.cpp
     stop/residual_norm_kernels.dp.cpp
     ${GKO_UNIFIED_COMMON_SOURCES}
+    ${DENSE_INSTANTIATE}
     )
 
 # TODO: adjust it when dpcpp jacobi supports more block size
@@ -84,6 +93,10 @@ target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP _ONEDPL_COMP
 
 set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE)
 target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}")
+# all file in target ginkgo_dpcpp are necessarily compiled with sycl, so we can ignore the warning.
+# If we would like to use SOURCES, please use the new files copied from GKO_UNIFIED_COMMON_SOURCES.
+# Otherwise, the source's properties will be changed by add_sycl_to_target
+gko_add_sycl_to_target(TARGET ginkgo_dpcpp)
 target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_COMPILER_FLAGS}")
 # Note: add MKL as PRIVATE not PUBLIC (MKL example shows) to avoid propagating
 # find_package(MKL) everywhere when linking ginkgo (see the MKL example
diff --git a/dpcpp/base/batch_multi_vector_kernels.dp.cpp b/dpcpp/base/batch_multi_vector_kernels.dp.cpp
new file mode 100644
index 00000000000..51665d26ff9
--- /dev/null
+++ b/dpcpp/base/batch_multi_vector_kernels.dp.cpp
@@ -0,0 +1,400 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/base/batch_multi_vector_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <CL/sycl.hpp>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+/**
+ * @brief The MultiVector matrix format namespace.
+ * @ref MultiVector
+ * @ingroup batch_multi_vector
+ */
+namespace batch_multi_vector {
+
+
+#include "dpcpp/base/batch_multi_vector_kernels.hpp.inc"
+
+
+template <typename ValueType>
+void scale(std::shared_ptr<const DefaultExecutor> exec,
+           const batch::MultiVector<ValueType>* const alpha,
+           batch::MultiVector<ValueType>* const x)
+{
+    const auto alpha_ub = get_batch_struct(alpha);
+    const auto x_ub = get_batch_struct(x);
+
+    const int num_rows = x->get_common_size()[0];
+    constexpr int max_subgroup_size = config::warp_size;
+    const auto num_batches = x_ub.num_batch_items;
+    auto device = exec->get_queue()->get_device();
+    long max_group_size =
+        device.get_info<sycl::info::device::max_work_group_size>();
+    int group_size =
+        std::min(ceildiv(num_rows, max_subgroup_size) * max_subgroup_size,
+                 max_group_size);
+
+    const dim3 block(group_size);
+    const dim3 grid(num_batches);
+
+    // Launch a kernel that has nbatches blocks, each block has max group size
+    if (alpha->get_common_size()[1] == 1) {
+        exec->get_queue()->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
+                    auto group = item_ct1.get_group();
+                    auto group_id = group.get_group_linear_id();
+                    const auto alpha_b =
+                        batch::extract_batch_item(alpha_ub, group_id);
+                    const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                    scale_kernel(alpha_b, x_b, item_ct1,
+                                 [](int col) { return 0; });
+                });
+        });
+    } else {
+        exec->get_queue()->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
+                    auto group = item_ct1.get_group();
+                    auto group_id = group.get_group_linear_id();
+                    const auto alpha_b =
+                        batch::extract_batch_item(alpha_ub, group_id);
+                    const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                    scale_kernel(alpha_b, x_b, item_ct1,
+                                 [](int col) { return col; });
+                });
+        });
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL);
+
+
+template <typename ValueType>
+void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
+                const batch::MultiVector<ValueType>* const alpha,
+                const batch::MultiVector<ValueType>* const x,
+                batch::MultiVector<ValueType>* const y)
+{
+    constexpr int max_subgroup_size = config::warp_size;
+    const int num_rows = x->get_common_size()[0];
+    const int num_cols = x->get_common_size()[1];
+    const auto num_batches = x->get_num_batch_items();
+    auto device = exec->get_queue()->get_device();
+    long max_group_size =
+        device.get_info<sycl::info::device::max_work_group_size>();
+    int group_size =
+        std::min(ceildiv(num_rows, max_subgroup_size) * max_subgroup_size,
+                 max_group_size);
+
+    const dim3 block(group_size);
+    const dim3 grid(num_batches);
+    const auto alpha_ub = get_batch_struct(alpha);
+    const auto x_ub = get_batch_struct(x);
+    const auto y_ub = get_batch_struct(y);
+    if (alpha->get_common_size()[1] == 1) {
+        exec->get_queue()->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
+                    auto group = item_ct1.get_group();
+                    auto group_id = group.get_group_linear_id();
+                    const auto alpha_b =
+                        batch::extract_batch_item(alpha_ub, group_id);
+                    const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                    const auto y_b = batch::extract_batch_item(y_ub, group_id);
+                    add_scaled_kernel(alpha_b, x_b, y_b, item_ct1,
+                                      [](auto col) { return 0; });
+                });
+        });
+    } else {
+        exec->get_queue()->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
+                    auto group = item_ct1.get_group();
+                    auto group_id = group.get_group_linear_id();
+                    const auto alpha_b =
+                        batch::extract_batch_item(alpha_ub, group_id);
+                    const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                    const auto y_b = batch::extract_batch_item(y_ub, group_id);
+                    add_scaled_kernel(alpha_b, x_b, y_b, item_ct1,
+                                      [](auto col) { return col; });
+                });
+        });
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL);
+
+
+template <typename ValueType>
+void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
+                 const batch::MultiVector<ValueType>* const x,
+                 const batch::MultiVector<ValueType>* const y,
+                 batch::MultiVector<ValueType>* const result)
+{
+    const auto x_ub = get_batch_struct(x);
+    const auto y_ub = get_batch_struct(y);
+    const auto res_ub = get_batch_struct(result);
+
+    constexpr int max_subgroup_size = config::warp_size;
+    const auto num_batches = x_ub.num_batch_items;
+    const int num_rows = x_ub.num_rows;
+    auto device = exec->get_queue()->get_device();
+
+    long max_group_size =
+        device.get_info<sycl::info::device::max_work_group_size>();
+    int group_size =
+        std::min(ceildiv(num_rows, max_subgroup_size) * max_subgroup_size,
+                 max_group_size);
+
+    const dim3 block(group_size);
+    const dim3 grid(num_batches);
+    if (x->get_common_size()[1] == 1) {
+        exec->get_queue()->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl_nd_range(grid, block),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(max_subgroup_size)]] {
+                        auto group = item_ct1.get_group();
+                        auto group_id = group.get_group_linear_id();
+                        const auto x_b =
+                            batch::extract_batch_item(x_ub, group_id);
+                        const auto y_b =
+                            batch::extract_batch_item(y_ub, group_id);
+                        const auto res_b =
+                            batch::extract_batch_item(res_ub, group_id);
+                        single_rhs_compute_conj_dot_sg(
+                            x_b.num_rows, x_b.values, y_b.values,
+                            res_b.values[0], item_ct1);
+                    });
+        });
+    } else {
+        // TODO: Remove reqd_sub_group size and use sycl::reduce_over_group
+        exec->get_queue()->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl_nd_range(grid, block),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(max_subgroup_size)]] {
+                        auto group = item_ct1.get_group();
+                        auto group_id = group.get_group_linear_id();
+                        const auto x_b =
+                            batch::extract_batch_item(x_ub, group_id);
+                        const auto y_b =
+                            batch::extract_batch_item(y_ub, group_id);
+                        const auto res_b =
+                            batch::extract_batch_item(res_ub, group_id);
+                        compute_gen_dot_product_kernel(
+                            x_b, y_b, res_b, item_ct1,
+                            [](auto val) { return val; });
+                    });
+        });
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL);
+
+
+template <typename ValueType>
+void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
+                      const batch::MultiVector<ValueType>* const x,
+                      const batch::MultiVector<ValueType>* const y,
+                      batch::MultiVector<ValueType>* const result)
+{
+    const auto x_ub = get_batch_struct(x);
+    const auto y_ub = get_batch_struct(y);
+    const auto res_ub = get_batch_struct(result);
+
+    constexpr int max_subgroup_size = config::warp_size;
+    const int num_rows = x->get_common_size()[0];
+    const auto num_batches = x_ub.num_batch_items;
+    auto device = exec->get_queue()->get_device();
+    long max_group_size =
+        device.get_info<sycl::info::device::max_work_group_size>();
+    int group_size =
+        std::min(ceildiv(num_rows, max_subgroup_size) * max_subgroup_size,
+                 max_group_size);
+
+    const dim3 block(group_size);
+    const dim3 grid(num_batches);
+
+    exec->get_queue()->submit([&](sycl::handler& cgh) {
+        cgh.parallel_for(
+            sycl_nd_range(grid, block),
+            [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(max_subgroup_size)]] {
+                    auto group = item_ct1.get_group();
+                    auto group_id = group.get_group_linear_id();
+                    const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                    const auto y_b = batch::extract_batch_item(y_ub, group_id);
+                    const auto res_b =
+                        batch::extract_batch_item(res_ub, group_id);
+                    compute_gen_dot_product_kernel(
+                        x_b, y_b, res_b, item_ct1,
+                        [](auto val) { return conj(val); });
+                });
+    });
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL);
+
+
+template <typename ValueType>
+void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
+                   const batch::MultiVector<ValueType>* const x,
+                   batch::MultiVector<remove_complex<ValueType>>* const result)
+{
+    const auto x_ub = get_batch_struct(x);
+    const auto res_ub = get_batch_struct(result);
+
+    const auto num_batches = x_ub.num_batch_items;
+    const int num_rows = x->get_common_size()[0];
+    auto device = exec->get_queue()->get_device();
+
+    constexpr int max_subgroup_size = config::warp_size;
+    long max_group_size =
+        device.get_info<sycl::info::device::max_work_group_size>();
+    int group_size =
+        std::min(ceildiv(num_rows, max_subgroup_size) * max_subgroup_size,
+                 max_group_size);
+
+    const dim3 block(group_size);
+    const dim3 grid(num_batches);
+    if (x->get_common_size()[1] == 1) {
+        exec->get_queue()->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl_nd_range(grid, block),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(max_subgroup_size)]] {
+                        auto group = item_ct1.get_group();
+                        auto group_id = group.get_group_linear_id();
+                        const auto x_b =
+                            batch::extract_batch_item(x_ub, group_id);
+                        const auto res_b =
+                            batch::extract_batch_item(res_ub, group_id);
+                        single_rhs_compute_norm2_sg(x_b.num_rows, x_b.values,
+                                                    res_b.values[0], item_ct1);
+                    });
+        });
+    } else {
+        exec->get_queue()->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl_nd_range(grid, block),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(max_subgroup_size)]] {
+                        auto group = item_ct1.get_group();
+                        auto group_id = group.get_group_linear_id();
+                        const auto x_b =
+                            batch::extract_batch_item(x_ub, group_id);
+                        const auto res_b =
+                            batch::extract_batch_item(res_ub, group_id);
+                        compute_norm2_kernel(x_b, res_b, item_ct1);
+                    });
+        });
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL);
+
+
+template <typename ValueType>
+void copy(std::shared_ptr<const DefaultExecutor> exec,
+          const batch::MultiVector<ValueType>* x,
+          batch::MultiVector<ValueType>* result)
+{
+    const auto x_ub = get_batch_struct(x);
+    const auto result_ub = get_batch_struct(result);
+
+    const auto num_batches = x_ub.num_batch_items;
+    const int num_rows = x->get_common_size()[0];
+    auto device = exec->get_queue()->get_device();
+    constexpr int max_subgroup_size = config::warp_size;
+    long max_group_size =
+        device.get_info<sycl::info::device::max_work_group_size>();
+    int group_size =
+        std::min(ceildiv(num_rows, max_subgroup_size) * max_subgroup_size,
+                 max_group_size);
+
+    const dim3 block(group_size);
+    const dim3 grid(num_batches);
+
+    exec->get_queue()->submit([&](sycl::handler& cgh) {
+        cgh.parallel_for(
+            sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
+                auto group = item_ct1.get_group();
+                auto group_id = group.get_group_linear_id();
+                const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                const auto result_b =
+                    batch::extract_batch_item(result_ub, group_id);
+                copy_kernel(x_b, result_b, item_ct1);
+            });
+    });
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
+
+
+}  // namespace batch_multi_vector
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp.inc b/dpcpp/base/batch_multi_vector_kernels.hpp.inc
new file mode 100644
index 00000000000..be9d02aa88d
--- /dev/null
+++ b/dpcpp/base/batch_multi_vector_kernels.hpp.inc
@@ -0,0 +1,258 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+template <typename ValueType, typename Mapping>
+__dpct_inline__ void scale_kernel(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+    const gko::batch::multi_vector::batch_item<ValueType>& x,
+    sycl::nd_item<3>& item_ct1, Mapping map)
+{
+    const int max_li = x.num_rows * x.num_rhs;
+    for (int li = item_ct1.get_local_linear_id(); li < max_li;
+         li += item_ct1.get_local_range().size()) {
+        const int row = li / x.num_rhs;
+        const int col = li % x.num_rhs;
+
+        x.values[row * x.stride + col] =
+            alpha.values[map(col)] * x.values[row * x.stride + col];
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__dpct_inline__ void add_scaled_kernel(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<ValueType>& y,
+    sycl::nd_item<3>& item_ct1, Mapping map)
+{
+    const int max_li = x.num_rows * x.num_rhs;
+    for (int li = item_ct1.get_local_id(2); li < max_li;
+         li += item_ct1.get_local_range(2)) {
+        const int row = li / x.num_rhs;
+        const int col = li % x.num_rhs;
+
+        y.values[row * y.stride + col] +=
+            alpha.values[map(col)] * x.values[row * x.stride + col];
+    }
+}
+
+
+template <typename ValueType>
+__dpct_inline__ void single_rhs_compute_conj_dot(
+    const int num_rows, const ValueType* const __restrict__ x,
+    const ValueType* const __restrict__ y, ValueType& result,
+    sycl::nd_item<3> item_ct1)
+{
+    const auto group = item_ct1.get_group();
+    const auto group_size = item_ct1.get_local_range().size();
+    const auto tid = item_ct1.get_local_linear_id();
+
+    ValueType val = zero<ValueType>();
+
+    for (int r = tid; r < num_rows; r += group_size) {
+        val += conj(x[r]) * y[r];
+    }
+    result = sycl::reduce_over_group(group, val, sycl::plus<>());
+}
+
+
+template <int tile_size = config::warp_size, typename ValueType>
+__dpct_inline__ void single_rhs_compute_conj_dot_sg(
+    const int num_rows, const ValueType* const __restrict__ x,
+    const ValueType* const __restrict__ y, ValueType& result,
+    sycl::nd_item<3> item_ct1)
+{
+    auto subg =
+        group::tiled_partition<tile_size>(group::this_thread_block(item_ct1));
+    const auto subgroup = static_cast<sycl::sub_group>(subg);
+    const int subgroup_id = subgroup.get_group_id();
+    const int subgroup_size = subgroup.get_local_range().size();
+    const auto subgroup_tid = subgroup.get_local_id();
+
+    ValueType val = zero<ValueType>();
+
+    for (int r = subgroup_tid; r < num_rows; r += subgroup_size) {
+        val += conj(x[r]) * y[r];
+    }
+
+    val = ::gko::kernels::dpcpp::reduce(
+        subg, val, [](ValueType a, ValueType b) { return a + b; });
+
+    if (subgroup_tid == 0) {
+        result = val;
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__dpct_inline__ void compute_gen_dot_product_kernel(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<const ValueType>& y,
+    const gko::batch::multi_vector::batch_item<ValueType>& result,
+    sycl::nd_item<3>& item_ct1, Mapping conj_map)
+{
+    constexpr auto tile_size = config::warp_size;
+    auto subg =
+        group::tiled_partition<tile_size>(group::this_thread_block(item_ct1));
+    const auto subgroup = static_cast<sycl::sub_group>(subg);
+    const int subgroup_id = subgroup.get_group_id();
+    const int subgroup_size = subgroup.get_local_range().size();
+    const int num_subgroups = subgroup.get_group_range().size();
+
+    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
+         rhs_index += num_subgroups) {
+        ValueType val = zero<ValueType>();
+
+        for (int r = subgroup.get_local_id(); r < x.num_rows;
+             r += subgroup_size) {
+            val += conj_map(x.values[r * x.stride + rhs_index]) *
+                   y.values[r * y.stride + rhs_index];
+        }
+
+        val = ::gko::kernels::dpcpp::reduce(
+            subg, val, [](ValueType a, ValueType b) { return a + b; });
+
+        if (subgroup.get_local_id() == 0) {
+            result.values[rhs_index] = val;
+        }
+    }
+}
+
+
+template <int tile_size = config::warp_size, typename ValueType>
+__dpct_inline__ void single_rhs_compute_norm2_sg(
+    const int num_rows, const ValueType* const __restrict__ x,
+    gko::remove_complex<ValueType>& result, sycl::nd_item<3> item_ct1)
+{
+    auto subg =
+        group::tiled_partition<tile_size>(group::this_thread_block(item_ct1));
+    const auto subgroup = static_cast<sycl::sub_group>(subg);
+    const int subgroup_id = subgroup.get_group_id();
+    const int subgroup_size = subgroup.get_local_range().size();
+
+    using real_type = typename gko::remove_complex<ValueType>;
+    real_type val = zero<real_type>();
+
+    for (int r = subgroup.get_local_id(); r < num_rows; r += subgroup_size) {
+        val += squared_norm(x[r]);
+    }
+
+    val = ::gko::kernels::dpcpp::reduce(
+        subg, val, [](real_type a, real_type b) { return a + b; });
+
+    if (subgroup.get_local_id() == 0) {
+        result = sqrt(val);
+    }
+}
+
+
+template <typename ValueType>
+__dpct_inline__ void single_rhs_compute_norm2(
+    const int num_rows, const ValueType* const __restrict__ x,
+    gko::remove_complex<ValueType>& result, sycl::nd_item<3> item_ct1)
+{
+    const auto group = item_ct1.get_group();
+    const auto group_size = item_ct1.get_local_range().size();
+    const auto tid = item_ct1.get_local_linear_id();
+
+    using real_type = typename gko::remove_complex<ValueType>;
+    real_type val = zero<real_type>();
+
+    for (int r = tid; r < num_rows; r += group_size) {
+        val += squared_norm(x[r]);
+    }
+
+    val = sycl::reduce_over_group(group, val, sycl::plus<>());
+
+    result = sqrt(val);
+}
+
+
+template <typename ValueType>
+__dpct_inline__ void compute_norm2_kernel(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
+        result,
+    sycl::nd_item<3>& item_ct1)
+{
+    constexpr auto tile_size = config::warp_size;
+    auto subg =
+        group::tiled_partition<tile_size>(group::this_thread_block(item_ct1));
+    const auto subgroup = static_cast<sycl::sub_group>(subg);
+    const int subgroup_id = subgroup.get_group_id();
+    const int subgroup_size = subgroup.get_local_range().size();
+    const int num_subgroups = subgroup.get_group_range().size();
+
+    using real_type = typename gko::remove_complex<ValueType>;
+    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
+         rhs_index += num_subgroups) {
+        real_type val = zero<real_type>();
+
+        for (int r = subgroup.get_local_id(); r < x.num_rows;
+             r += subgroup_size)
+            val += squared_norm(x.values[r * x.stride + rhs_index]);
+
+        val = ::gko::kernels::dpcpp::reduce(
+            subg, val, [](real_type a, real_type b) { return a + b; });
+
+        if (subgroup.get_local_id() == 0) {
+            result.values[rhs_index] = sqrt(val);
+        }
+    }
+}
+
+
+template <typename ValueType>
+__dpct_inline__ void copy_kernel(const int num_rows, const ValueType* in,
+                                 ValueType* out, sycl::nd_item<3>& item_ct1)
+{
+    for (int iz = item_ct1.get_local_linear_id(); iz < num_rows;
+         iz += item_ct1.get_local_range().size()) {
+        out[iz] = in[iz];
+    }
+}
+
+
+template <typename ValueType>
+__dpct_inline__ void copy_kernel(
+    const gko::batch::multi_vector::batch_item<const ValueType>& in,
+    const gko::batch::multi_vector::batch_item<ValueType>& out,
+    sycl::nd_item<3>& item_ct1)
+{
+    for (int iz = item_ct1.get_local_linear_id(); iz < in.num_rows * in.num_rhs;
+         iz += item_ct1.get_local_range().size()) {
+        const int i = iz / in.num_rhs;
+        const int j = iz % in.num_rhs;
+        out.values[i * out.stride + j] = in.values[i * in.stride + j];
+    }
+}
diff --git a/dpcpp/base/batch_struct.hpp b/dpcpp/base/batch_struct.hpp
new file mode 100644
index 00000000000..dc8301ecb2e
--- /dev/null
+++ b/dpcpp/base/batch_struct.hpp
@@ -0,0 +1,93 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_DPCPP_BASE_BATCH_STRUCT_HPP_
+#define GKO_DPCPP_BASE_BATCH_STRUCT_HPP_
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+
+
+/** @file batch_struct.hpp
+ *
+ * Helper functions to generate a batch struct from a batch LinOp,
+ * while also shallow-casting to the required DPCPP scalar type.
+ *
+ * A specialization is needed for every format of every kind of linear algebra
+ * object. These are intended to be called on the host.
+ */
+
+
+/**
+ * Generates an immutable uniform batch struct from a batch of multi-vectors.
+ */
+template <typename ValueType>
+inline batch::multi_vector::uniform_batch<const ValueType> get_batch_struct(
+    const batch::MultiVector<ValueType>* const op)
+{
+    return {op->get_const_values(), op->get_num_batch_items(),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1])};
+}
+
+
+/**
+ * Generates a uniform batch struct from a batch of multi-vectors.
+ */
+template <typename ValueType>
+inline batch::multi_vector::uniform_batch<ValueType> get_batch_struct(
+    batch::MultiVector<ValueType>* const op)
+{
+    return {op->get_values(), op->get_num_batch_items(),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1])};
+}
+
+
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_DPCPP_BASE_BATCH_STRUCT_HPP_
diff --git a/dpcpp/base/device.hpp b/dpcpp/base/device.hpp
index 6047fbed615..658ccbe18f4 100644
--- a/dpcpp/base/device.hpp
+++ b/dpcpp/base/device.hpp
@@ -46,6 +46,9 @@ namespace dpcpp {
 void destroy_event(sycl::event* event);
 
 
+std::string get_device_name(int device_id);
+
+
 }  // namespace dpcpp
 }  // namespace kernels
 }  // namespace gko
diff --git a/dpcpp/base/device_matrix_data_kernels.dp.cpp b/dpcpp/base/device_matrix_data_kernels.dp.cpp
index 9d387ce7ecf..f8185d884c1 100644
--- a/dpcpp/base/device_matrix_data_kernels.dp.cpp
+++ b/dpcpp/base/device_matrix_data_kernels.dp.cpp
@@ -33,7 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // force-top: on
 // oneDPL needs to be first to avoid issues with libstdc++ TBB impl
 #include <oneapi/dpl/algorithm>
-#include <oneapi/dpl/execution>
 // force-top: off
 
 
@@ -43,6 +42,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "dpcpp/base/onedpl.hpp"
+
+
 namespace gko {
 namespace kernels {
 namespace dpcpp {
@@ -56,8 +58,7 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
 {
     using nonzero_type = matrix_data_entry<ValueType, IndexType>;
     auto size = values.get_num_elems();
-    auto policy =
-        oneapi::dpl::execution::make_device_policy(*exec->get_queue());
+    auto policy = onedpl_policy(exec);
     auto nnz = std::count_if(
         policy, values.get_const_data(), values.get_const_data() + size,
         [](ValueType val) { return is_nonzero<ValueType>(val); });
@@ -96,8 +97,7 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec, size_type,
     if (size == 0) {
         return;
     }
-    auto policy =
-        oneapi::dpl::execution::make_device_policy(*exec->get_queue());
+    auto policy = onedpl_policy(exec);
     auto in_loc_it = oneapi::dpl::make_zip_iterator(row_idxs.get_const_data(),
                                                     col_idxs.get_const_data());
     auto adj_in_loc_it =
@@ -136,8 +136,7 @@ template <typename ValueType, typename IndexType>
 void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
                     device_matrix_data<ValueType, IndexType>& data)
 {
-    auto policy =
-        oneapi::dpl::execution::make_device_policy(*exec->get_queue());
+    auto policy = onedpl_policy(exec);
     auto input_it = oneapi::dpl::make_zip_iterator(
         data.get_row_idxs(), data.get_col_idxs(), data.get_values());
     std::sort(policy, input_it, input_it + data.get_num_elems(),
diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp
index c2015c8664c..6d6bbbe0388 100644
--- a/dpcpp/base/executor.dp.cpp
+++ b/dpcpp/base/executor.dp.cpp
@@ -103,7 +103,7 @@ void DpcppExecutor::populate_exec_info(const machine_topology* mach_topo)
 
 void DpcppExecutor::raw_free(void* ptr) const noexcept
 {
-    // the free function may syncronize excution or not, which depends on
+    // the free function may synchronize execution or not, which depends on
     // implementation or backend, so it is not guaranteed.
     // TODO: maybe a light wait implementation?
     try {
@@ -323,6 +323,17 @@ namespace dpcpp {
 void destroy_event(sycl::event* event) { delete event; }
 
 
+std::string get_device_name(int device_id)
+{
+    auto devices = ::gko::detail::get_devices("gpu");
+    if (devices.empty()) {
+        return "CPU";
+    }
+
+    return devices[device_id].get_info<sycl::info::device::name>();
+}
+
+
 }  // namespace dpcpp
 }  // namespace kernels
 }  // namespace gko
diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp
index 714f5a0d37a..b38b6c1ef8b 100644
--- a/dpcpp/base/helper.hpp
+++ b/dpcpp/base/helper.hpp
@@ -203,7 +203,7 @@ bool validate(sycl::queue* queue, unsigned workgroup_size,
  * get_first_cfg will return the first valid config by validate function from
  * given config array.
  *
- * @tparam IterArr  the iteratable array type
+ * @tparam IterArr  the iterable array type
  * @tparam Validate  the validate function type
  *
  * @param arr  the config array
diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp
index 6cae0c72dcb..1cf7c1f774a 100644
--- a/dpcpp/base/kernel_launch_reduction.dp.hpp
+++ b/dpcpp/base/kernel_launch_reduction.dp.hpp
@@ -194,8 +194,8 @@ void run_kernel_reduction_impl(std::shared_ptr<const DpcppExecutor> exec,
     } else {
         queue->submit([&](sycl::handler& cgh) {
             generic_kernel_reduction_1d<DeviceConfig>(
-                cgh, static_cast<int64>(size), num_workgroups, fn, op, finalize,
-                identity, result, args...);
+                cgh, static_cast<int64>(size), 1, fn, op, finalize, identity,
+                result, args...);
         });
     }
 }
@@ -240,9 +240,9 @@ void run_kernel_reduction_impl(std::shared_ptr<const DpcppExecutor> exec,
         });
     } else {
         queue->submit([&](sycl::handler& cgh) {
-            generic_kernel_reduction_2d<DeviceConfig>(
-                cgh, rows, cols, num_workgroups, fn, op, finalize, identity,
-                result, args...);
+            generic_kernel_reduction_2d<DeviceConfig>(cgh, rows, cols, 1, fn,
+                                                      op, finalize, identity,
+                                                      result, args...);
         });
     }
 }
diff --git a/hip/test/base/hip_executor_reset.cpp b/dpcpp/base/onedpl.hpp
similarity index 64%
rename from hip/test/base/hip_executor_reset.cpp
rename to dpcpp/base/onedpl.hpp
index 39e3252e053..4af31d3e115 100644
--- a/hip/test/base/hip_executor_reset.cpp
+++ b/dpcpp/base/onedpl.hpp
@@ -30,58 +30,32 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include <ginkgo/core/base/executor.hpp>
-
-
-#include <thread>
-
-
-#include <gtest/gtest.h>
-
-
-namespace {
+#ifndef GKO_DPCPP_BASE_ONEDPL_HPP_
+#define GKO_DPCPP_BASE_ONEDPL_HPP_
 
 
-#define GTEST_ASSERT_NO_EXIT(statement) \
-    ASSERT_EXIT({ {statement} exit(0); }, ::testing::ExitedWithCode(0), "")
+// force-top: on
+#include <oneapi/dpl/execution>
+// force-top: off
 
 
-TEST(DeviceReset, HipCuda)
-{
-    GTEST_ASSERT_NO_EXIT({
-        auto ref = gko::ReferenceExecutor::create();
-        auto hip = gko::HipExecutor::create(0, ref, true);
-        auto cuda = gko::CudaExecutor::create(0, ref, true);
-    });
-}
+#include <ginkgo/core/base/executor.hpp>
 
 
-TEST(DeviceReset, CudaHip)
-{
-    GTEST_ASSERT_NO_EXIT({
-        auto ref = gko::ReferenceExecutor::create();
-        auto cuda = gko::CudaExecutor::create(0, ref, true);
-        auto hip = gko::HipExecutor::create(0, ref, true);
-    });
-}
+namespace gko {
+namespace kernels {
+namespace dpcpp {
 
 
-void func()
+inline auto onedpl_policy(std::shared_ptr<const DpcppExecutor> exec)
 {
-    auto ref = gko::ReferenceExecutor::create();
-    auto exec = gko::HipExecutor::create(0, ref, true);
+    return oneapi::dpl::execution::make_device_policy(*exec->get_queue());
 }
 
 
-TEST(DeviceReset, HipHip)
-{
-    GTEST_ASSERT_NO_EXIT({
-        std::thread t1(func);
-        std::thread t2(func);
-        t1.join();
-        t2.join();
-    });
-}
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
 
 
-}  // namespace
+#endif  // GKO_DPCPP_BASE_ONEDPL_HPP_
diff --git a/dpcpp/components/segment_scan.dp.hpp b/dpcpp/components/segment_scan.dp.hpp
index ba0d9577fe3..b73ae12e9b3 100644
--- a/dpcpp/components/segment_scan.dp.hpp
+++ b/dpcpp/components/segment_scan.dp.hpp
@@ -50,7 +50,7 @@ namespace dpcpp {
 /**
  * @internal
  *
- * Compute a segement scan using add operation (+) of a subgroup_size. Each
+ * Compute a segment scan using add operation (+) of a subgroup_size. Each
  * segment performs suffix sum. Works on the source array and returns whether
  * the thread is the first element of its segment with same `ind`.
  */
diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp
index 2792e2307e4..e689e9f14ba 100644
--- a/dpcpp/components/thread_ids.dp.hpp
+++ b/dpcpp/components/thread_ids.dp.hpp
@@ -238,7 +238,7 @@ __dpct_inline__ size_type get_thread_id(sycl::nd_item<3> item_ct1)
  *
  * Returns the global ID of the thread in the given index type.
  * This function assumes one-dimensional thread and block indexing in cuda
- * sense. It uses the third position infomation to get the information.
+ * sense. It uses the third position information to get the information.
  *
  * @return the global ID of the thread in the given index type.
  *
@@ -258,7 +258,7 @@ __dpct_inline__ IndexType get_thread_id_flat(sycl::nd_item<3> item_ct1)
  *
  * Returns the total number of threads in the given index type.
  * This function assumes one-dimensional thread and block indexing in cuda
- * sense. It uses the third position infomation to get the information.
+ * sense. It uses the third position information to get the information.
  *
  * @return the total number of threads in the given index type.
  *
diff --git a/dpcpp/distributed/partition_helpers_kernels.dp.cpp b/dpcpp/distributed/partition_helpers_kernels.dp.cpp
new file mode 100644
index 00000000000..8b0171cd349
--- /dev/null
+++ b/dpcpp/distributed/partition_helpers_kernels.dp.cpp
@@ -0,0 +1,119 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+// force-top: on
+#include <oneapi/dpl/algorithm>
+#include <oneapi/dpl/execution>
+#include <oneapi/dpl/iterator>
+// force-top: off
+
+
+#include "core/distributed/partition_helpers_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+namespace partition_helpers {
+
+struct stride {
+#if ONEDPL_VERSION_MAJOR >= 2022 && ONEDPL_VERSION_MINOR >= 1
+    template <typename Index>
+    Index operator()(const Index& i) const
+    {
+        return i * 2;
+    }
+#else
+    // Some older version require [] while some require (), so I added both
+    template <typename Index>
+    Index operator[](const Index& i) const
+    {
+        return i * 2;
+    }
+
+    template <typename Index>
+    Index operator()(const Index& i) const
+    {
+        return i * 2;
+    }
+#endif
+};
+
+template <typename GlobalIndexType>
+void sort_by_range_start(
+    std::shared_ptr<const DefaultExecutor> exec,
+    array<GlobalIndexType>& range_start_ends,
+    array<experimental::distributed::comm_index_type>& part_ids)
+{
+    auto policy =
+        oneapi::dpl::execution::make_device_policy(*exec->get_queue());
+    auto num_ranges = range_start_ends.get_num_elems() / 2;
+
+    auto start_it = oneapi::dpl::make_permutation_iterator(
+        range_start_ends.get_data(), stride{});
+    auto end_it = oneapi::dpl::make_permutation_iterator(
+        range_start_ends.get_data() + 1, stride{});
+
+    // older versions of oneDPL have a bug when sorting permutation iterators
+#if ONEDPL_VERSION_MAJOR >= 2022 && ONEDPL_VERSION_MINOR >= 1
+    auto zip_it =
+        oneapi::dpl::make_zip_iterator(start_it, end_it, part_ids.get_data());
+    std::stable_sort(policy, zip_it, zip_it + num_ranges, [](auto a, auto b) {
+        return std::get<0>(a) < std::get<0>(b);
+    });
+#else
+    array<GlobalIndexType> starts(exec, num_ranges);
+    array<GlobalIndexType> ends(exec, num_ranges);
+
+    std::copy(policy, start_it, start_it + num_ranges, starts.get_data());
+    std::copy(policy, end_it, end_it + num_ranges, ends.get_data());
+
+    auto zip_it = oneapi::dpl::make_zip_iterator(
+        starts.get_data(), ends.get_data(), part_ids.get_data());
+    std::stable_sort(policy, zip_it, zip_it + num_ranges, [](auto a, auto b) {
+        return std::get<0>(a) < std::get<0>(b);
+    });
+
+    std::copy(policy, starts.get_data(), starts.get_data() + num_ranges,
+              start_it);
+    std::copy(policy, ends.get_data(), ends.get_data() + num_ranges, end_it);
+#endif
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
+    GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START);
+
+
+}  // namespace partition_helpers
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/distributed/partition_kernels.dp.cpp b/dpcpp/distributed/partition_kernels.dp.cpp
index 7d9210894e2..04b7ff215ed 100644
--- a/dpcpp/distributed/partition_kernels.dp.cpp
+++ b/dpcpp/distributed/partition_kernels.dp.cpp
@@ -30,17 +30,86 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
+// force-top: on
+#include <oneapi/dpl/algorithm>
+#include <oneapi/dpl/iterator>
+// force-top: off
+
+
 #include "core/distributed/partition_kernels.hpp"
 
 
+#include "common/unified/base/kernel_launch.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "dpcpp/base/onedpl.hpp"
+
+
 namespace gko {
 namespace kernels {
 namespace dpcpp {
 namespace partition {
+namespace kernel {
+
+
+template <typename LocalIndexType, typename GlobalIndexType>
+void setup_sizes_ids_permutation(
+    std::shared_ptr<const DefaultExecutor> exec, size_type num_ranges,
+    comm_index_type num_parts, const GlobalIndexType* range_offsets,
+    const comm_index_type* range_parts, Array<LocalIndexType>& range_sizes,
+    Array<comm_index_type>& part_ids, Array<GlobalIndexType>& permutation)
+{
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto num_ranges, auto num_parts,
+                      auto range_offsets, auto range_parts, auto range_sizes,
+                      auto part_ids, auto permutation) {
+            if (i == 0) {
+                // set sentinel value at the end
+                part_ids[num_ranges] = num_parts;
+            }
+            range_sizes[i] = range_offsets[i + 1] - range_offsets[i];
+            part_ids[i] = range_parts[i];
+            permutation[i] = static_cast<GlobalIndexType>(i);
+        },
+        num_ranges, num_ranges, num_parts, range_offsets, range_parts,
+        range_sizes, part_ids, permutation);
+}
+
+
+template <typename LocalIndexType, typename GlobalIndexType>
+void compute_part_sizes_and_starting_indices(
+    std::shared_ptr<const DefaultExecutor> exec, size_type num_ranges,
+    const Array<LocalIndexType>& range_sizes,
+    const Array<comm_index_type>& part_ids,
+    const Array<GlobalIndexType>& permutation, LocalIndexType* starting_indices,
+    LocalIndexType* part_sizes)
+{
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto grouped_starting_indices,
+                      auto grouped_part_ids, auto orig_idxs,
+                      auto starting_indices, auto part_sizes) {
+            auto prev_part = i > 0 ? grouped_part_ids[i - 1]
+                                   : invalid_index<comm_index_type>();
+            auto cur_part = grouped_part_ids[i];
+            auto next_part =
+                grouped_part_ids[i + 1];  // last element has to be num_parts
+            if (cur_part != next_part) {
+                part_sizes[cur_part] = grouped_starting_indices[i];
+            }
+            // write result shifted by one entry to get exclusive prefix sum
+            starting_indices[orig_idxs[i]] =
+                prev_part == cur_part ? grouped_starting_indices[i - 1]
+                                      : LocalIndexType{};
+        },
+        num_ranges, range_sizes, part_ids, permutation, starting_indices,
+        part_sizes);
+}
+
+
+}  // namespace kernel
 
 
-// TODO: wait until https://github.com/oneapi-src/oneDPL/pull/388 is release to
-// implement it similar to cuda/hip
 template <typename LocalIndexType, typename GlobalIndexType>
 void build_starting_indices(std::shared_ptr<const DefaultExecutor> exec,
                             const GlobalIndexType* range_offsets,
@@ -48,7 +117,46 @@ void build_starting_indices(std::shared_ptr<const DefaultExecutor> exec,
                             size_type num_ranges, comm_index_type num_parts,
                             comm_index_type& num_empty_parts,
                             LocalIndexType* starting_indices,
-                            LocalIndexType* part_sizes) GKO_NOT_IMPLEMENTED;
+                            LocalIndexType* part_sizes)
+{
+    if (num_ranges > 0) {
+        auto policy = onedpl_policy(exec);
+
+        Array<LocalIndexType> range_sizes{exec, num_ranges};
+        // num_parts sentinel at the end
+        Array<comm_index_type> tmp_part_ids{exec, num_ranges + 1};
+        Array<GlobalIndexType> permutation{exec, num_ranges};
+        // set part_sizes to 0 in case of empty parts
+        components::fill_array(exec, part_sizes, num_parts,
+                               zero<LocalIndexType>());
+
+        kernel::setup_sizes_ids_permutation(
+            exec, num_ranges, num_parts, range_offsets, range_parts,
+            range_sizes, tmp_part_ids, permutation);
+
+        auto tmp_part_id_ptr = tmp_part_ids.get_data();
+        auto range_sizes_ptr = range_sizes.get_data();
+        auto sort_it = oneapi::dpl::make_zip_iterator(
+            tmp_part_id_ptr, range_sizes_ptr, permutation.get_data());
+        // group range_sizes by part ID
+        oneapi::dpl::stable_sort(policy, sort_it, sort_it + num_ranges,
+                                 [](const auto t_a, const auto t_b) {
+                                     return std::get<0>(t_a) < std::get<0>(t_b);
+                                 });
+        // compute inclusive prefix sum for each part
+        oneapi::dpl::inclusive_scan_by_segment(
+            policy, tmp_part_id_ptr, tmp_part_id_ptr + num_ranges,
+            range_sizes_ptr, range_sizes_ptr);
+        // write back the results
+        kernel::compute_part_sizes_and_starting_indices(
+            exec, num_ranges, range_sizes, tmp_part_ids, permutation,
+            starting_indices, part_sizes);
+        num_empty_parts =
+            oneapi::dpl::count(policy, part_sizes, part_sizes + num_parts, 0);
+    } else {
+        num_empty_parts = num_parts;
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_PARTITION_BUILD_STARTING_INDICES);
diff --git a/dpcpp/factorization/lu_kernels.dp.cpp b/dpcpp/factorization/lu_kernels.dp.cpp
index caa555d6203..d3bd83a7658 100644
--- a/dpcpp/factorization/lu_kernels.dp.cpp
+++ b/dpcpp/factorization/lu_kernels.dp.cpp
@@ -76,6 +76,29 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
 
 
+template <typename IndexType>
+void symbolic_factorize_simple(std::shared_ptr<const DefaultExecutor> exec,
+                               const IndexType* row_ptrs,
+                               const IndexType* col_idxs,
+                               const IndexType* lookup_offsets,
+                               const int64* lookup_descs,
+                               const int32* lookup_storage,
+                               matrix::Csr<float, IndexType>* factors,
+                               IndexType* out_row_nnz) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE);
+
+
+template <typename IndexType>
+void symbolic_factorize_simple_finalize(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<float, IndexType>* factors,
+    IndexType* out_col_idxs) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
+    GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE);
+
+
 }  // namespace lu_factorization
 }  // namespace dpcpp
 }  // namespace kernels
diff --git a/dpcpp/get_info.cmake b/dpcpp/get_info.cmake
index 36918a3a8c6..ee9c0398f3e 100644
--- a/dpcpp/get_info.cmake
+++ b/dpcpp/get_info.cmake
@@ -3,6 +3,5 @@ ginkgo_print_module_footer(${detailed_log} "DPCPP variables:")
 ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_FLAGS")
 ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_SINGLE_MODE")
 ginkgo_print_module_footer(${detailed_log} "DPCPP environment variables:")
-ginkgo_print_env_variable(${detailed_log} "SYCL_DEVICE_TYPE")
-ginkgo_print_env_variable(${detailed_log} "SYCL_BE")
+ginkgo_print_env_variable(${detailed_log} "SYCL_DEVICE_FILTER")
 ginkgo_print_module_footer(${detailed_log} "")
diff --git a/dpcpp/log/batch_logger.hpp b/dpcpp/log/batch_logger.hpp
new file mode 100644
index 00000000000..ef5337e9939
--- /dev/null
+++ b/dpcpp/log/batch_logger.hpp
@@ -0,0 +1,83 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_DPCPP_LOG_BATCH_LOGGER_HPP_
+#define GKO_DPCPP_LOG_BATCH_LOGGER_HPP_
+
+
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+namespace batch_log {
+
+
+/**
+ * @see reference/log/batch_logger.hpp
+ */
+template <typename RealType>
+class SimpleFinalLogger final {
+public:
+    using real_type = remove_complex<RealType>;
+
+    SimpleFinalLogger(real_type* const batch_residuals, int* const batch_iters)
+        : final_residuals_{batch_residuals}, final_iters_{batch_iters}
+    {}
+
+    __dpct_inline__ void log_iteration(const size_type batch_idx,
+                                       const int iter, const real_type res_norm)
+    {
+        final_iters_[batch_idx] = iter;
+        final_residuals_[batch_idx] = res_norm;
+    }
+
+private:
+    real_type* const final_residuals_;
+    int* const final_iters_;
+};
+
+
+}  // namespace batch_log
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_DPCPP_LOG_BATCH_LOGGER_HPP_
diff --git a/dpcpp/matrix/batch_dense_kernels.dp.cpp b/dpcpp/matrix/batch_dense_kernels.dp.cpp
new file mode 100644
index 00000000000..d1320e79968
--- /dev/null
+++ b/dpcpp/matrix/batch_dense_kernels.dp.cpp
@@ -0,0 +1,178 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/batch_dense_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <CL/sycl.hpp>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+/**
+ * @brief The Dense matrix format namespace.
+ *
+ * @ingroup batch_dense
+ */
+namespace batch_dense {
+
+
+#include "dpcpp/matrix/batch_dense_kernels.hpp.inc"
+
+
+template <typename ValueType>
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const batch::matrix::Dense<ValueType>* mat,
+                  const batch::MultiVector<ValueType>* b,
+                  batch::MultiVector<ValueType>* x)
+{
+    const size_type num_rows = mat->get_common_size()[0];
+    const size_type num_cols = mat->get_common_size()[1];
+
+    const auto num_batch_items = mat->get_num_batch_items();
+    auto device = exec->get_queue()->get_device();
+    auto group_size =
+        device.get_info<sycl::info::device::max_work_group_size>();
+
+    const dim3 block(group_size);
+    const dim3 grid(num_batch_items);
+    const auto x_ub = get_batch_struct(x);
+    const auto b_ub = get_batch_struct(b);
+    const auto mat_ub = get_batch_struct(mat);
+    if (b_ub.num_rhs > 1) {
+        GKO_NOT_IMPLEMENTED;
+    }
+
+    // Launch a kernel that has nbatches blocks, each block has max group size
+    exec->get_queue()->submit([&](sycl::handler& cgh) {
+        cgh.parallel_for(
+            sycl_nd_range(grid, block),
+            [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(config::warp_size)]] {
+                    auto group = item_ct1.get_group();
+                    auto group_id = group.get_group_linear_id();
+                    const auto mat_b =
+                        batch::matrix::extract_batch_item(mat_ub, group_id);
+                    const auto b_b = batch::extract_batch_item(b_ub, group_id);
+                    const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                    simple_apply_kernel(mat_b, b_b.values, x_b.values,
+                                        item_ct1);
+                });
+    });
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL);
+
+
+template <typename ValueType>
+void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
+                    const batch::MultiVector<ValueType>* alpha,
+                    const batch::matrix::Dense<ValueType>* mat,
+                    const batch::MultiVector<ValueType>* b,
+                    const batch::MultiVector<ValueType>* beta,
+                    batch::MultiVector<ValueType>* x)
+{
+    const auto mat_ub = get_batch_struct(mat);
+    const auto b_ub = get_batch_struct(b);
+    const auto x_ub = get_batch_struct(x);
+    const auto alpha_ub = get_batch_struct(alpha);
+    const auto beta_ub = get_batch_struct(beta);
+
+    if (b_ub.num_rhs > 1) {
+        GKO_NOT_IMPLEMENTED;
+    }
+
+    const auto num_batch_items = mat_ub.num_batch_items;
+    auto device = exec->get_queue()->get_device();
+    auto group_size =
+        device.get_info<sycl::info::device::max_work_group_size>();
+
+    const dim3 block(group_size);
+    const dim3 grid(num_batch_items);
+
+    // Launch a kernel that has nbatches blocks, each block has max group size
+    exec->get_queue()->submit([&](sycl::handler& cgh) {
+        cgh.parallel_for(
+            sycl_nd_range(grid, block),
+            [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(config::warp_size)]] {
+                    auto group = item_ct1.get_group();
+                    auto group_id = group.get_group_linear_id();
+                    const auto mat_b =
+                        batch::matrix::extract_batch_item(mat_ub, group_id);
+                    const auto b_b = batch::extract_batch_item(b_ub, group_id);
+                    const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                    const auto alpha_b =
+                        batch::extract_batch_item(alpha_ub, group_id);
+                    const auto beta_b =
+                        batch::extract_batch_item(beta_ub, group_id);
+                    advanced_apply_kernel(alpha_b.values[0], mat_b, b_b.values,
+                                          beta_b.values[0], x_b.values,
+                                          item_ct1);
+                });
+    });
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
+
+
+}  // namespace batch_dense
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/matrix/batch_dense_kernels.hpp.inc b/dpcpp/matrix/batch_dense_kernels.hpp.inc
new file mode 100644
index 00000000000..ba232ea02e4
--- /dev/null
+++ b/dpcpp/matrix/batch_dense_kernels.hpp.inc
@@ -0,0 +1,94 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+template <typename ValueType>
+__dpct_inline__ void simple_apply_kernel(
+    const gko::batch::matrix::dense::batch_item<const ValueType>& mat,
+    const ValueType* b, ValueType* x, sycl::nd_item<3>& item_ct1)
+{
+    constexpr auto tile_size = config::warp_size;
+    auto subg =
+        group::tiled_partition<tile_size>(group::this_thread_block(item_ct1));
+    const auto subgroup = static_cast<sycl::sub_group>(subg);
+    const int subgroup_id = subgroup.get_group_id();
+    const int subgroup_size = subgroup.get_local_range().size();
+    const int num_subgroups = subgroup.get_group_range().size();
+
+    for (int row = subgroup_id; row < mat.num_rows; row += num_subgroups) {
+        ValueType temp = zero<ValueType>();
+        for (int j = subgroup.get_local_id(); j < mat.num_cols;
+             j += subgroup_size) {
+            const ValueType val = mat.values[row * mat.stride + j];
+            temp += val * b[j];
+        }
+
+        temp = ::gko::kernels::dpcpp::reduce(
+            subg, temp, [](ValueType a, ValueType b) { return a + b; });
+
+        if (subgroup.get_local_id() == 0) {
+            x[row] = temp;
+        }
+    }
+}
+
+
+template <typename ValueType>
+__dpct_inline__ void advanced_apply_kernel(
+    const ValueType alpha,
+    const gko::batch::matrix::dense::batch_item<const ValueType>& mat,
+    const ValueType* b, const ValueType beta, ValueType* x,
+    sycl::nd_item<3>& item_ct1)
+{
+    constexpr auto tile_size = config::warp_size;
+    auto subg =
+        group::tiled_partition<tile_size>(group::this_thread_block(item_ct1));
+    const auto subgroup = static_cast<sycl::sub_group>(subg);
+    const int subgroup_id = subgroup.get_group_id();
+    const int subgroup_size = subgroup.get_local_range().size();
+    const int num_subgroup = subgroup.get_group_range().size();
+
+    for (int row = subgroup_id; row < mat.num_rows; row += num_subgroup) {
+        ValueType temp = zero<ValueType>();
+        for (int j = subgroup.get_local_id(); j < mat.num_cols;
+             j += subgroup_size) {
+            const ValueType val = mat.values[row * mat.stride + j];
+            temp += alpha * val * b[j];
+        }
+
+        temp = ::gko::kernels::dpcpp::reduce(
+            subg, temp, [](ValueType a, ValueType b) { return a + b; });
+
+        if (subgroup.get_local_id() == 0) {
+            x[row] = temp + beta * x[row];
+        }
+    }
+}
diff --git a/dpcpp/matrix/batch_ell_kernels.dp.cpp b/dpcpp/matrix/batch_ell_kernels.dp.cpp
new file mode 100644
index 00000000000..f565f69f270
--- /dev/null
+++ b/dpcpp/matrix/batch_ell_kernels.dp.cpp
@@ -0,0 +1,176 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/batch_ell_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <CL/sycl.hpp>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+/**
+ * @brief The Ell matrix format namespace.
+ * @ref Ell
+ * @ingroup batch_ell
+ */
+namespace batch_ell {
+
+
+#include "dpcpp/matrix/batch_ell_kernels.hpp.inc"
+
+
+template <typename ValueType, typename IndexType>
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const batch::matrix::Ell<ValueType, IndexType>* mat,
+                  const batch::MultiVector<ValueType>* b,
+                  batch::MultiVector<ValueType>* x)
+{
+    const size_type num_rows = mat->get_common_size()[0];
+    const size_type num_cols = mat->get_common_size()[1];
+
+    const auto num_batch_items = mat->get_num_batch_items();
+    auto device = exec->get_queue()->get_device();
+    // TODO: use runtime selection of group size based on num_rows.
+    auto group_size =
+        device.get_info<sycl::info::device::max_work_group_size>();
+
+    const dim3 block(group_size);
+    const dim3 grid(num_batch_items);
+    const auto x_ub = get_batch_struct(x);
+    const auto b_ub = get_batch_struct(b);
+    const auto mat_ub = get_batch_struct(mat);
+    if (b_ub.num_rhs > 1) {
+        GKO_NOT_IMPLEMENTED;
+    }
+
+    // Launch a kernel that has nbatches blocks, each block has max group size
+    exec->get_queue()->submit([&](sycl::handler& cgh) {
+        cgh.parallel_for(
+            sycl_nd_range(grid, block),
+            [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(config::warp_size)]] {
+                    auto group = item_ct1.get_group();
+                    auto group_id = group.get_group_linear_id();
+                    const auto mat_b =
+                        batch::matrix::extract_batch_item(mat_ub, group_id);
+                    const auto b_b = batch::extract_batch_item(b_ub, group_id);
+                    const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                    simple_apply_kernel(mat_b, b_b.values, x_b.values,
+                                        item_ct1);
+                });
+    });
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+    GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
+                    const batch::MultiVector<ValueType>* alpha,
+                    const batch::matrix::Ell<ValueType, IndexType>* mat,
+                    const batch::MultiVector<ValueType>* b,
+                    const batch::MultiVector<ValueType>* beta,
+                    batch::MultiVector<ValueType>* x)
+{
+    const auto mat_ub = get_batch_struct(mat);
+    const auto b_ub = get_batch_struct(b);
+    const auto x_ub = get_batch_struct(x);
+    const auto alpha_ub = get_batch_struct(alpha);
+    const auto beta_ub = get_batch_struct(beta);
+
+    if (b_ub.num_rhs > 1) {
+        GKO_NOT_IMPLEMENTED;
+    }
+
+    const auto num_batch_items = mat_ub.num_batch_items;
+    auto device = exec->get_queue()->get_device();
+    // TODO: use runtime selection of group size based on num_rows.
+    auto group_size =
+        device.get_info<sycl::info::device::max_work_group_size>();
+
+    const dim3 block(group_size);
+    const dim3 grid(num_batch_items);
+
+    // Launch a kernel that has nbatches blocks, each block has max group size
+    exec->get_queue()->submit([&](sycl::handler& cgh) {
+        cgh.parallel_for(
+            sycl_nd_range(grid, block),
+            [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(config::warp_size)]] {
+                    auto group = item_ct1.get_group();
+                    auto group_id = group.get_group_linear_id();
+                    const auto mat_b =
+                        batch::matrix::extract_batch_item(mat_ub, group_id);
+                    const auto b_b = batch::extract_batch_item(b_ub, group_id);
+                    const auto x_b = batch::extract_batch_item(x_ub, group_id);
+                    const auto alpha_b =
+                        batch::extract_batch_item(alpha_ub, group_id);
+                    const auto beta_b =
+                        batch::extract_batch_item(beta_ub, group_id);
+                    advanced_apply_kernel(alpha_b.values[0], mat_b, b_b.values,
+                                          beta_b.values[0], x_b.values,
+                                          item_ct1);
+                });
+    });
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+    GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
+
+
+}  // namespace batch_ell
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/matrix/batch_ell_kernels.hpp.inc b/dpcpp/matrix/batch_ell_kernels.hpp.inc
new file mode 100644
index 00000000000..8c54d48db7d
--- /dev/null
+++ b/dpcpp/matrix/batch_ell_kernels.hpp.inc
@@ -0,0 +1,74 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+template <typename ValueType, typename IndexType>
+__dpct_inline__ void simple_apply_kernel(
+    const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& mat,
+    const ValueType* b, ValueType* x, sycl::nd_item<3>& item_ct1)
+{
+    for (int tidx = item_ct1.get_local_linear_id(); tidx < mat.num_rows;
+         tidx += item_ct1.get_local_range().size()) {
+        auto temp = zero<ValueType>();
+        for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) {
+            const auto col_idx = mat.col_idxs[tidx + idx * mat.stride];
+            if (col_idx == invalid_index<IndexType>()) {
+                break;
+            } else {
+                temp += mat.values[tidx + idx * mat.stride] * b[col_idx];
+            }
+        }
+        x[tidx] = temp;
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+__dpct_inline__ void advanced_apply_kernel(
+    const ValueType alpha,
+    const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& mat,
+    const ValueType* b, const ValueType beta, ValueType* x,
+    sycl::nd_item<3>& item_ct1)
+{
+    for (int tidx = item_ct1.get_local_linear_id(); tidx < mat.num_rows;
+         tidx += item_ct1.get_local_range().size()) {
+        auto temp = zero<ValueType>();
+        for (size_type idx = 0; idx < mat.num_stored_elems_per_row; idx++) {
+            const auto col_idx = mat.col_idxs[tidx + idx * mat.stride];
+            if (col_idx == invalid_index<IndexType>()) {
+                break;
+            } else {
+                temp += mat.values[tidx + idx * mat.stride] * b[col_idx];
+            }
+        }
+        x[tidx] = alpha * temp + beta * x[tidx];
+    }
+}
diff --git a/dpcpp/matrix/batch_struct.hpp b/dpcpp/matrix/batch_struct.hpp
new file mode 100644
index 00000000000..7f36378d8e1
--- /dev/null
+++ b/dpcpp/matrix/batch_struct.hpp
@@ -0,0 +1,129 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_DPCPP_MATRIX_BATCH_STRUCT_HPP_
+#define GKO_DPCPP_MATRIX_BATCH_STRUCT_HPP_
+
+
+#include "core/matrix/batch_struct.hpp"
+
+
+#include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+
+
+/** @file batch_struct.hpp
+ *
+ * Helper functions to generate a batch struct from a batch LinOp,
+ * while also shallow-casting to the required DPCPP scalar type.
+ *
+ * A specialization is needed for every format of every kind of linear algebra
+ * object. These are intended to be called on the host.
+ */
+
+
+/**
+ * Generates an immutable uniform batch struct from a batch of dense matrices.
+ */
+template <typename ValueType>
+inline batch::matrix::dense::uniform_batch<const ValueType> get_batch_struct(
+    const batch::matrix::Dense<ValueType>* const op)
+{
+    return {op->get_const_values(), op->get_num_batch_items(),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1])};
+}
+
+
+/**
+ * Generates a uniform batch struct from a batch of dense matrices.
+ */
+template <typename ValueType>
+inline batch::matrix::dense::uniform_batch<ValueType> get_batch_struct(
+    batch::matrix::Dense<ValueType>* const op)
+{
+    return {op->get_values(), op->get_num_batch_items(),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1])};
+}
+
+
+/**
+ * Generates an immutable uniform batch struct from a batch of ell matrices.
+ */
+template <typename ValueType, typename IndexType>
+inline batch::matrix::ell::uniform_batch<const ValueType, const IndexType>
+get_batch_struct(const batch::matrix::Ell<ValueType, IndexType>* const op)
+{
+    return {op->get_const_values(),
+            op->get_const_col_idxs(),
+            op->get_num_batch_items(),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[1]),
+            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
+}
+
+
+/**
+ * Generates a uniform batch struct from a batch of ell matrices.
+ */
+template <typename ValueType, typename IndexType>
+inline batch::matrix::ell::uniform_batch<ValueType, IndexType> get_batch_struct(
+    batch::matrix::Ell<ValueType, IndexType>* const op)
+{
+    return {op->get_values(),
+            op->get_col_idxs(),
+            op->get_num_batch_items(),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[1]),
+            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
+}
+
+
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_DPCPP_MATRIX_BATCH_STRUCT_HPP_
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index 11309b67b9b..cadb9fe46e6 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -871,6 +871,76 @@ void extract_diagonal(size_type diag_size, size_type nnz,
 GKO_ENABLE_DEFAULT_HOST(extract_diagonal, extract_diagonal);
 
 
+template <typename IndexType>
+void check_diagonal_entries(const IndexType num_min_rows_cols,
+                            const IndexType* const __restrict__ row_ptrs,
+                            const IndexType* const __restrict__ col_idxs,
+                            bool* const __restrict__ has_all_diags,
+                            sycl::nd_item<3> item_ct1)
+{
+    constexpr int subgroup_size = config::warp_size;
+    auto tile_grp = group::tiled_partition<subgroup_size>(
+        group::this_thread_block(item_ct1));
+    const auto row =
+        thread::get_subwarp_id_flat<subgroup_size, IndexType>(item_ct1);
+    if (row < num_min_rows_cols) {
+        const auto tid_in_warp = tile_grp.thread_rank();
+        const auto row_start = row_ptrs[row];
+        const auto num_nz = row_ptrs[row + 1] - row_start;
+        bool row_has_diag_local{false};
+        for (IndexType iz = tid_in_warp; iz < num_nz; iz += subgroup_size) {
+            if (col_idxs[iz + row_start] == row) {
+                row_has_diag_local = true;
+                break;
+            }
+        }
+        auto row_has_diag = static_cast<bool>(tile_grp.any(row_has_diag_local));
+        if (!row_has_diag) {
+            if (tile_grp.thread_rank() == 0) {
+                *has_all_diags = false;
+            }
+        }
+    }
+}
+
+GKO_ENABLE_DEFAULT_HOST(check_diagonal_entries, check_diagonal_entries);
+
+
+template <typename ValueType, typename IndexType>
+void add_scaled_identity(const ValueType* const __restrict__ alpha,
+                         const ValueType* const __restrict__ beta,
+                         const IndexType num_rows,
+                         const IndexType* const __restrict__ row_ptrs,
+                         const IndexType* const __restrict__ col_idxs,
+                         ValueType* const __restrict__ values,
+                         sycl::nd_item<3> item_ct1)
+{
+    constexpr int subgroup_size = config::warp_size;
+    auto tile_grp = group::tiled_partition<subgroup_size>(
+        group::this_thread_block(item_ct1));
+    const auto row =
+        thread::get_subwarp_id_flat<subgroup_size, IndexType>(item_ct1);
+    if (row < num_rows) {
+        const auto tid_in_warp = tile_grp.thread_rank();
+        const auto row_start = row_ptrs[row];
+        const auto num_nz = row_ptrs[row + 1] - row_start;
+        const auto beta_val = beta[0];
+        const auto alpha_val = alpha[0];
+        for (IndexType iz = tid_in_warp; iz < num_nz; iz += subgroup_size) {
+            if (beta_val != one<ValueType>()) {
+                values[iz + row_start] *= beta_val;
+            }
+            if (col_idxs[iz + row_start] == row &&
+                alpha_val != zero<ValueType>()) {
+                values[iz + row_start] += alpha_val;
+            }
+        }
+    }
+}
+
+GKO_ENABLE_DEFAULT_HOST(add_scaled_identity, add_scaled_identity);
+
+
 }  // namespace kernel
 
 
@@ -912,7 +982,8 @@ void inv_row_ptr_permute_kernel(size_type num_rows,
 GKO_ENABLE_DEFAULT_HOST(inv_row_ptr_permute_kernel, inv_row_ptr_permute_kernel);
 
 
-template <int subgroup_size, typename ValueType, typename IndexType>
+template <int subgroup_size = config::warp_size, typename ValueType,
+          typename IndexType>
 void row_permute_kernel(size_type num_rows,
                         const IndexType* __restrict__ permutation,
                         const IndexType* __restrict__ in_row_ptrs,
@@ -927,38 +998,23 @@ void row_permute_kernel(size_type num_rows,
     if (tid >= num_rows) {
         return;
     }
-    auto lane = item_ct1.get_local_id(2) % subgroup_size;
-    auto in_row = permutation[tid];
-    auto out_row = tid;
-    auto in_begin = in_row_ptrs[in_row];
-    auto in_size = in_row_ptrs[in_row + 1] - in_begin;
-    auto out_begin = out_row_ptrs[out_row];
+    const auto lane = item_ct1.get_local_id(2) % subgroup_size;
+    const auto in_row = permutation[tid];
+    const auto out_row = tid;
+    const auto in_begin = in_row_ptrs[in_row];
+    const auto in_size = in_row_ptrs[in_row + 1] - in_begin;
+    const auto out_begin = out_row_ptrs[out_row];
     for (IndexType i = lane; i < in_size; i += subgroup_size) {
         out_cols[out_begin + i] = in_cols[in_begin + i];
         out_vals[out_begin + i] = in_vals[in_begin + i];
     }
 }
 
-template <int subgroup_size, typename ValueType, typename IndexType>
-void row_permute_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory,
-                        sycl::queue* queue, size_type num_rows,
-                        const IndexType* permutation,
-                        const IndexType* in_row_ptrs, const IndexType* in_cols,
-                        const ValueType* in_vals, const IndexType* out_row_ptrs,
-                        IndexType* out_cols, ValueType* out_vals)
-{
-    queue->submit([&](sycl::handler& cgh) {
-        cgh.parallel_for(
-            sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
-                row_permute_kernel<subgroup_size>(
-                    num_rows, permutation, in_row_ptrs, in_cols, in_vals,
-                    out_row_ptrs, out_cols, out_vals, item_ct1);
-            });
-    });
-}
+GKO_ENABLE_DEFAULT_HOST(row_permute_kernel, row_permute_kernel);
 
 
-template <int subgroup_size, typename ValueType, typename IndexType>
+template <int subgroup_size = config::warp_size, typename ValueType,
+          typename IndexType>
 void inv_row_permute_kernel(size_type num_rows,
                             const IndexType* __restrict__ permutation,
                             const IndexType* __restrict__ in_row_ptrs,
@@ -973,39 +1029,23 @@ void inv_row_permute_kernel(size_type num_rows,
     if (tid >= num_rows) {
         return;
     }
-    auto lane = item_ct1.get_local_id(2) % subgroup_size;
-    auto in_row = tid;
-    auto out_row = permutation[tid];
-    auto in_begin = in_row_ptrs[in_row];
-    auto in_size = in_row_ptrs[in_row + 1] - in_begin;
-    auto out_begin = out_row_ptrs[out_row];
+    const auto lane = item_ct1.get_local_id(2) % subgroup_size;
+    const auto in_row = tid;
+    const auto out_row = permutation[tid];
+    const auto in_begin = in_row_ptrs[in_row];
+    const auto in_size = in_row_ptrs[in_row + 1] - in_begin;
+    const auto out_begin = out_row_ptrs[out_row];
     for (IndexType i = lane; i < in_size; i += subgroup_size) {
         out_cols[out_begin + i] = in_cols[in_begin + i];
         out_vals[out_begin + i] = in_vals[in_begin + i];
     }
 }
 
-template <int subgroup_size, typename ValueType, typename IndexType>
-void inv_row_permute_kernel(dim3 grid, dim3 block,
-                            size_type dynamic_shared_memory, sycl::queue* queue,
-                            size_type num_rows, const IndexType* permutation,
-                            const IndexType* in_row_ptrs,
-                            const IndexType* in_cols, const ValueType* in_vals,
-                            const IndexType* out_row_ptrs, IndexType* out_cols,
-                            ValueType* out_vals)
-{
-    queue->submit([&](sycl::handler& cgh) {
-        cgh.parallel_for(
-            sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
-                inv_row_permute_kernel<subgroup_size>(
-                    num_rows, permutation, in_row_ptrs, in_cols, in_vals,
-                    out_row_ptrs, out_cols, out_vals, item_ct1);
-            });
-    });
-}
+GKO_ENABLE_DEFAULT_HOST(inv_row_permute_kernel, inv_row_permute_kernel);
 
 
-template <int subgroup_size, typename ValueType, typename IndexType>
+template <int subgroup_size = config::warp_size, typename ValueType,
+          typename IndexType>
 void inv_symm_permute_kernel(size_type num_rows,
                              const IndexType* __restrict__ permutation,
                              const IndexType* __restrict__ in_row_ptrs,
@@ -1015,6 +1055,38 @@ void inv_symm_permute_kernel(size_type num_rows,
                              IndexType* __restrict__ out_cols,
                              ValueType* __restrict__ out_vals,
                              sycl::nd_item<3> item_ct1)
+{
+    auto tid = thread::get_subwarp_id_flat<subgroup_size>(item_ct1);
+    if (tid >= num_rows) {
+        return;
+    }
+    const auto lane = item_ct1.get_local_id(2) % subgroup_size;
+    const auto in_row = tid;
+    const auto out_row = permutation[tid];
+    const auto in_begin = in_row_ptrs[in_row];
+    const auto in_size = in_row_ptrs[in_row + 1] - in_begin;
+    const auto out_begin = out_row_ptrs[out_row];
+    for (IndexType i = lane; i < in_size; i += subgroup_size) {
+        out_cols[out_begin + i] = permutation[in_cols[in_begin + i]];
+        out_vals[out_begin + i] = in_vals[in_begin + i];
+    }
+}
+
+GKO_ENABLE_DEFAULT_HOST(inv_symm_permute_kernel, inv_symm_permute_kernel);
+
+
+template <int subgroup_size = config::warp_size, typename ValueType,
+          typename IndexType>
+void inv_nonsymm_permute_kernel(size_type num_rows,
+                                const IndexType* __restrict__ row_permutation,
+                                const IndexType* __restrict__ col_permutation,
+                                const IndexType* __restrict__ in_row_ptrs,
+                                const IndexType* __restrict__ in_cols,
+                                const ValueType* __restrict__ in_vals,
+                                const IndexType* __restrict__ out_row_ptrs,
+                                IndexType* __restrict__ out_cols,
+                                ValueType* __restrict__ out_vals,
+                                sycl::nd_item<3> item_ct1)
 {
     auto tid = thread::get_subwarp_id_flat<subgroup_size>(item_ct1);
     if (tid >= num_rows) {
@@ -1022,36 +1094,155 @@ void inv_symm_permute_kernel(size_type num_rows,
     }
     auto lane = item_ct1.get_local_id(2) % subgroup_size;
     auto in_row = tid;
-    auto out_row = permutation[tid];
+    auto out_row = row_permutation[tid];
     auto in_begin = in_row_ptrs[in_row];
     auto in_size = in_row_ptrs[in_row + 1] - in_begin;
     auto out_begin = out_row_ptrs[out_row];
     for (IndexType i = lane; i < in_size; i += subgroup_size) {
-        out_cols[out_begin + i] = permutation[in_cols[in_begin + i]];
+        out_cols[out_begin + i] = col_permutation[in_cols[in_begin + i]];
         out_vals[out_begin + i] = in_vals[in_begin + i];
     }
 }
 
-template <int subgroup_size, typename ValueType, typename IndexType>
-void inv_symm_permute_kernel(dim3 grid, dim3 block,
-                             size_type dynamic_shared_memory,
-                             sycl::queue* queue, size_type num_rows,
-                             const IndexType* permutation,
-                             const IndexType* in_row_ptrs,
-                             const IndexType* in_cols, const ValueType* in_vals,
-                             const IndexType* out_row_ptrs, IndexType* out_cols,
-                             ValueType* out_vals)
+GKO_ENABLE_DEFAULT_HOST(inv_nonsymm_permute_kernel, inv_nonsymm_permute_kernel);
+
+
+template <int subgroup_size = config::warp_size, typename ValueType,
+          typename IndexType>
+void row_scale_permute_kernel(size_type num_rows,
+                              const ValueType* __restrict__ scale,
+                              const IndexType* __restrict__ permutation,
+                              const IndexType* __restrict__ in_row_ptrs,
+                              const IndexType* __restrict__ in_cols,
+                              const ValueType* __restrict__ in_vals,
+                              const IndexType* __restrict__ out_row_ptrs,
+                              IndexType* __restrict__ out_cols,
+                              ValueType* __restrict__ out_vals,
+                              sycl::nd_item<3> item_ct1)
 {
-    queue->submit([&](sycl::handler& cgh) {
-        cgh.parallel_for(
-            sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
-                inv_symm_permute_kernel<subgroup_size>(
-                    num_rows, permutation, in_row_ptrs, in_cols, in_vals,
-                    out_row_ptrs, out_cols, out_vals, item_ct1);
-            });
-    });
+    auto tid = thread::get_subwarp_id_flat<subgroup_size>(item_ct1);
+    if (tid >= num_rows) {
+        return;
+    }
+    const auto lane = item_ct1.get_local_id(2) % subgroup_size;
+    const auto in_row = permutation[tid];
+    const auto out_row = tid;
+    const auto in_begin = in_row_ptrs[in_row];
+    const auto in_size = in_row_ptrs[in_row + 1] - in_begin;
+    const auto out_begin = out_row_ptrs[out_row];
+    for (IndexType i = lane; i < in_size; i += subgroup_size) {
+        out_cols[out_begin + i] = in_cols[in_begin + i];
+        out_vals[out_begin + i] = in_vals[in_begin + i] * scale[in_row];
+    }
+}
+
+GKO_ENABLE_DEFAULT_HOST(row_scale_permute_kernel, row_scale_permute_kernel);
+
+
+template <int subgroup_size = config::warp_size, typename ValueType,
+          typename IndexType>
+void inv_row_scale_permute_kernel(size_type num_rows,
+                                  const ValueType* __restrict__ scale,
+                                  const IndexType* __restrict__ permutation,
+                                  const IndexType* __restrict__ in_row_ptrs,
+                                  const IndexType* __restrict__ in_cols,
+                                  const ValueType* __restrict__ in_vals,
+                                  const IndexType* __restrict__ out_row_ptrs,
+                                  IndexType* __restrict__ out_cols,
+                                  ValueType* __restrict__ out_vals,
+                                  sycl::nd_item<3> item_ct1)
+{
+    auto tid = thread::get_subwarp_id_flat<subgroup_size>(item_ct1);
+    if (tid >= num_rows) {
+        return;
+    }
+    const auto lane = item_ct1.get_local_id(2) % subgroup_size;
+    const auto in_row = tid;
+    const auto out_row = permutation[tid];
+    const auto in_begin = in_row_ptrs[in_row];
+    const auto in_size = in_row_ptrs[in_row + 1] - in_begin;
+    const auto out_begin = out_row_ptrs[out_row];
+    for (IndexType i = lane; i < in_size; i += subgroup_size) {
+        out_cols[out_begin + i] = in_cols[in_begin + i];
+        out_vals[out_begin + i] = in_vals[in_begin + i] / scale[out_row];
+    }
+}
+
+GKO_ENABLE_DEFAULT_HOST(inv_row_scale_permute_kernel,
+                        inv_row_scale_permute_kernel);
+
+
+template <int subgroup_size = config::warp_size, typename ValueType,
+          typename IndexType>
+void inv_symm_scale_permute_kernel(size_type num_rows,
+                                   const ValueType* __restrict__ scale,
+                                   const IndexType* __restrict__ permutation,
+                                   const IndexType* __restrict__ in_row_ptrs,
+                                   const IndexType* __restrict__ in_cols,
+                                   const ValueType* __restrict__ in_vals,
+                                   const IndexType* __restrict__ out_row_ptrs,
+                                   IndexType* __restrict__ out_cols,
+                                   ValueType* __restrict__ out_vals,
+                                   sycl::nd_item<3> item_ct1)
+{
+    auto tid = thread::get_subwarp_id_flat<subgroup_size>(item_ct1);
+    if (tid >= num_rows) {
+        return;
+    }
+    const auto lane = item_ct1.get_local_id(2) % subgroup_size;
+    const auto in_row = tid;
+    const auto out_row = permutation[tid];
+    const auto in_begin = in_row_ptrs[in_row];
+    const auto in_size = in_row_ptrs[in_row + 1] - in_begin;
+    const auto out_begin = out_row_ptrs[out_row];
+    for (IndexType i = lane; i < in_size; i += subgroup_size) {
+        const auto out_col = permutation[in_cols[in_begin + i]];
+        out_cols[out_begin + i] = out_col;
+        out_vals[out_begin + i] =
+            in_vals[in_begin + i] / (scale[out_row] * scale[out_col]);
+    }
+}
+
+GKO_ENABLE_DEFAULT_HOST(inv_symm_scale_permute_kernel,
+                        inv_symm_scale_permute_kernel);
+
+
+template <int subgroup_size = config::warp_size, typename ValueType,
+          typename IndexType>
+void inv_nonsymm_scale_permute_kernel(
+    size_type num_rows, const ValueType* __restrict__ row_scale,
+    const IndexType* __restrict__ row_permutation,
+    const ValueType* __restrict__ col_scale,
+    const IndexType* __restrict__ col_permutation,
+    const IndexType* __restrict__ in_row_ptrs,
+    const IndexType* __restrict__ in_cols,
+    const ValueType* __restrict__ in_vals,
+    const IndexType* __restrict__ out_row_ptrs,
+    IndexType* __restrict__ out_cols, ValueType* __restrict__ out_vals,
+    sycl::nd_item<3> item_ct1)
+{
+    auto tid = thread::get_subwarp_id_flat<subgroup_size>(item_ct1);
+    if (tid >= num_rows) {
+        return;
+    }
+    const auto lane = item_ct1.get_local_id(2) % subgroup_size;
+    const auto in_row = tid;
+    const auto out_row = row_permutation[tid];
+    const auto in_begin = in_row_ptrs[in_row];
+    const auto in_size = in_row_ptrs[in_row + 1] - in_begin;
+    const auto out_begin = out_row_ptrs[out_row];
+    for (IndexType i = lane; i < in_size; i += subgroup_size) {
+        const auto out_col = col_permutation[in_cols[in_begin + i]];
+        out_cols[out_begin + i] = out_col;
+        out_vals[out_begin + i] =
+            in_vals[in_begin + i] / (row_scale[out_row] * col_scale[out_col]);
+    }
 }
 
+GKO_ENABLE_DEFAULT_HOST(inv_nonsymm_scale_permute_kernel,
+                        inv_nonsymm_scale_permute_kernel);
+
+
 namespace host_kernel {
 
 
@@ -2185,7 +2376,7 @@ void inv_symm_permute(std::shared_ptr<const DpcppExecutor> exec,
                                        num_rows + 1);
     auto copy_num_blocks =
         ceildiv(num_rows, default_block_size / config::warp_size);
-    inv_symm_permute_kernel<config::warp_size>(
+    inv_symm_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
         orig->get_const_values(), permuted->get_row_ptrs(),
@@ -2196,6 +2387,33 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void inv_nonsymm_permute(std::shared_ptr<const DpcppExecutor> exec,
+                         const IndexType* row_perm, const IndexType* col_perm,
+                         const matrix::Csr<ValueType, IndexType>* orig,
+                         matrix::Csr<ValueType, IndexType>* permuted)
+{
+    auto num_rows = orig->get_size()[0];
+    auto count_num_blocks = ceildiv(num_rows, default_block_size);
+    inv_row_ptr_permute_kernel(
+        count_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
+        row_perm, orig->get_const_row_ptrs(), permuted->get_row_ptrs());
+    components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(),
+                                       num_rows + 1);
+    auto copy_num_blocks =
+        ceildiv(num_rows, default_block_size / config::warp_size);
+    inv_nonsymm_permute_kernel(
+        copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
+        row_perm, col_perm, orig->get_const_row_ptrs(),
+        orig->get_const_col_idxs(), orig->get_const_values(),
+        permuted->get_row_ptrs(), permuted->get_col_idxs(),
+        permuted->get_values());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void row_permute(std::shared_ptr<const DpcppExecutor> exec,
                  const IndexType* perm,
@@ -2211,7 +2429,7 @@ void row_permute(std::shared_ptr<const DpcppExecutor> exec,
                                        num_rows + 1);
     auto copy_num_blocks =
         ceildiv(num_rows, default_block_size / config::warp_size);
-    row_permute_kernel<config::warp_size>(
+    row_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
         orig->get_const_values(), row_permuted->get_row_ptrs(),
@@ -2223,10 +2441,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void inverse_row_permute(std::shared_ptr<const DpcppExecutor> exec,
-                         const IndexType* perm,
-                         const matrix::Csr<ValueType, IndexType>* orig,
-                         matrix::Csr<ValueType, IndexType>* row_permuted)
+void inv_row_permute(std::shared_ptr<const DpcppExecutor> exec,
+                     const IndexType* perm,
+                     const matrix::Csr<ValueType, IndexType>* orig,
+                     matrix::Csr<ValueType, IndexType>* row_permuted)
 {
     auto num_rows = orig->get_size()[0];
     auto count_num_blocks = ceildiv(num_rows, default_block_size);
@@ -2237,7 +2455,7 @@ void inverse_row_permute(std::shared_ptr<const DpcppExecutor> exec,
                                        num_rows + 1);
     auto copy_num_blocks =
         ceildiv(num_rows, default_block_size / config::warp_size);
-    inv_row_permute_kernel<config::warp_size>(
+    inv_row_permute_kernel(
         copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
         perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
         orig->get_const_values(), row_permuted->get_row_ptrs(),
@@ -2245,7 +2463,115 @@ void inverse_row_permute(std::shared_ptr<const DpcppExecutor> exec,
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL);
+    GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inv_symm_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
+                            const ValueType* scale, const IndexType* perm,
+                            const matrix::Csr<ValueType, IndexType>* orig,
+                            matrix::Csr<ValueType, IndexType>* permuted)
+{
+    auto num_rows = orig->get_size()[0];
+    auto count_num_blocks = ceildiv(num_rows, default_block_size);
+    inv_row_ptr_permute_kernel(
+        count_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
+        perm, orig->get_const_row_ptrs(), permuted->get_row_ptrs());
+    components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(),
+                                       num_rows + 1);
+    auto copy_num_blocks =
+        ceildiv(num_rows, default_block_size / config::warp_size);
+    inv_symm_scale_permute_kernel(
+        copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
+        scale, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
+        orig->get_const_values(), permuted->get_row_ptrs(),
+        permuted->get_col_idxs(), permuted->get_values());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inv_nonsymm_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
+                               const ValueType* row_scale,
+                               const IndexType* row_perm,
+                               const ValueType* col_scale,
+                               const IndexType* col_perm,
+                               const matrix::Csr<ValueType, IndexType>* orig,
+                               matrix::Csr<ValueType, IndexType>* permuted)
+{
+    auto num_rows = orig->get_size()[0];
+    auto count_num_blocks = ceildiv(num_rows, default_block_size);
+    inv_row_ptr_permute_kernel(
+        count_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
+        row_perm, orig->get_const_row_ptrs(), permuted->get_row_ptrs());
+    components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(),
+                                       num_rows + 1);
+    auto copy_num_blocks =
+        ceildiv(num_rows, default_block_size / config::warp_size);
+    inv_nonsymm_scale_permute_kernel(
+        copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
+        row_scale, row_perm, col_scale, col_perm, orig->get_const_row_ptrs(),
+        orig->get_const_col_idxs(), orig->get_const_values(),
+        permuted->get_row_ptrs(), permuted->get_col_idxs(),
+        permuted->get_values());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void row_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
+                       const ValueType* scale, const IndexType* perm,
+                       const matrix::Csr<ValueType, IndexType>* orig,
+                       matrix::Csr<ValueType, IndexType>* row_permuted)
+{
+    auto num_rows = orig->get_size()[0];
+    auto count_num_blocks = ceildiv(num_rows, default_block_size);
+    row_ptr_permute_kernel(
+        count_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
+        perm, orig->get_const_row_ptrs(), row_permuted->get_row_ptrs());
+    components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(),
+                                       num_rows + 1);
+    auto copy_num_blocks =
+        ceildiv(num_rows, default_block_size / config::warp_size);
+    row_scale_permute_kernel(
+        copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
+        scale, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
+        orig->get_const_values(), row_permuted->get_row_ptrs(),
+        row_permuted->get_col_idxs(), row_permuted->get_values());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inv_row_scale_permute(std::shared_ptr<const DpcppExecutor> exec,
+                           const ValueType* scale, const IndexType* perm,
+                           const matrix::Csr<ValueType, IndexType>* orig,
+                           matrix::Csr<ValueType, IndexType>* row_permuted)
+{
+    auto num_rows = orig->get_size()[0];
+    auto count_num_blocks = ceildiv(num_rows, default_block_size);
+    inv_row_ptr_permute_kernel(
+        count_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
+        perm, orig->get_const_row_ptrs(), row_permuted->get_row_ptrs());
+    components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(),
+                                       num_rows + 1);
+    auto copy_num_blocks =
+        ceildiv(num_rows, default_block_size / config::warp_size);
+    inv_row_scale_permute_kernel(
+        copy_num_blocks, default_block_size, 0, exec->get_queue(), num_rows,
+        scale, perm, orig->get_const_row_ptrs(), orig->get_const_col_idxs(),
+        orig->get_const_values(), row_permuted->get_row_ptrs(),
+        row_permuted->get_col_idxs(), row_permuted->get_values());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -2364,8 +2690,23 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
 template <typename ValueType, typename IndexType>
 void check_diagonal_entries_exist(
     std::shared_ptr<const DpcppExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* const mtx,
-    bool& has_all_diags) GKO_NOT_IMPLEMENTED;
+    const matrix::Csr<ValueType, IndexType>* const mtx, bool& has_all_diags)
+{
+    const auto num_diag = static_cast<IndexType>(
+        std::min(mtx->get_size()[0], mtx->get_size()[1]));
+    if (num_diag > 0) {
+        const IndexType num_blocks =
+            ceildiv(num_diag, default_block_size / config::warp_size);
+        array<bool> has_diags(exec, {true});
+        kernel::check_diagonal_entries(
+            num_blocks, default_block_size, 0, exec->get_queue(), num_diag,
+            mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
+            has_diags.get_data());
+        has_all_diags = exec->copy_val_to_host(has_diags.get_const_data());
+    } else {
+        has_all_diags = true;
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
@@ -2376,7 +2717,19 @@ void add_scaled_identity(std::shared_ptr<const DpcppExecutor> exec,
                          const matrix::Dense<ValueType>* const alpha,
                          const matrix::Dense<ValueType>* const beta,
                          matrix::Csr<ValueType, IndexType>* const mtx)
-    GKO_NOT_IMPLEMENTED;
+{
+    const auto nrows = mtx->get_size()[0];
+    if (nrows == 0) {
+        return;
+    }
+    const auto nthreads = nrows * config::warp_size;
+    const auto nblocks = ceildiv(nthreads, default_block_size);
+    kernel::add_scaled_identity(
+        nblocks, default_block_size, 0, exec->get_queue(),
+        alpha->get_const_values(), beta->get_const_values(),
+        static_cast<IndexType>(nrows), mtx->get_const_row_ptrs(),
+        mtx->get_const_col_idxs(), mtx->get_values());
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp
index 65fad771140..4817b9a5991 100644
--- a/dpcpp/matrix/ell_kernels.dp.cpp
+++ b/dpcpp/matrix/ell_kernels.dp.cpp
@@ -120,16 +120,17 @@ void spmv_kernel(
     const size_type stride, const size_type num_stored_elements_per_row,
     acc::range<b_accessor> b, OutputValueType* __restrict__ c,
     const size_type c_stride, Closure op, sycl::nd_item<3> item_ct1,
-    uninitialized_array<OutputValueType,
+    uninitialized_array<typename a_accessor::arithmetic_type,
                         default_block_size / num_thread_per_worker>& storage)
 {
+    using arithmetic_type = typename a_accessor::arithmetic_type;
     const auto tidx = thread::get_thread_id_flat(item_ct1);
     const decltype(tidx) column_id = item_ct1.get_group(1);
     if (num_thread_per_worker == 1) {
         // Specialize the num_thread_per_worker = 1. It doesn't need the shared
         // memory, __syncthreads, and atomic_add
         if (tidx < num_rows) {
-            auto temp = zero<OutputValueType>();
+            auto temp = zero<arithmetic_type>();
             for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) {
                 const auto ind = tidx + idx * stride;
                 const auto col_idx = col[ind];
@@ -150,11 +151,11 @@ void spmv_kernel(
         const auto step_size = num_worker_per_row * num_thread_per_worker;
 
         if (runnable && idx_in_worker == 0) {
-            storage[item_ct1.get_local_id(2)] = 0;
+            storage[item_ct1.get_local_id(2)] = zero<arithmetic_type>();
         }
 
         item_ct1.barrier(sycl::access::fence_space::local_space);
-        auto temp = zero<OutputValueType>();
+        auto temp = zero<arithmetic_type>();
         if (runnable) {
             for (size_type idx =
                      worker_id * num_thread_per_worker + idx_in_worker;
@@ -193,13 +194,15 @@ void spmv(
     const size_type stride, const size_type num_stored_elements_per_row,
     acc::range<b_accessor> b, OutputValueType* __restrict__ c,
     const size_type c_stride, sycl::nd_item<3> item_ct1,
-    uninitialized_array<OutputValueType,
+    uninitialized_array<typename a_accessor::arithmetic_type,
                         default_block_size / num_thread_per_worker>& storage)
 {
     spmv_kernel<num_thread_per_worker, atomic>(
         num_rows, num_worker_per_row, val, col, stride,
         num_stored_elements_per_row, b, c, c_stride,
-        [](const OutputValueType& x, const OutputValueType& y) { return x; },
+        [](const auto& x, const OutputValueType& y) {
+            return static_cast<OutputValueType>(x);
+        },
         item_ct1, storage);
 }
 
@@ -214,7 +217,7 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory,
 {
     queue->submit([&](sycl::handler& cgh) {
         sycl::accessor<
-            uninitialized_array<OutputValueType,
+            uninitialized_array<typename a_accessor::arithmetic_type,
                                 default_block_size / num_thread_per_worker>,
             0, sycl::access_mode::read_write, sycl::access::target::local>
             storage_acc_ct1(cgh);
@@ -239,10 +242,11 @@ void spmv(
     const size_type num_stored_elements_per_row, acc::range<b_accessor> b,
     const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c,
     const size_type c_stride, sycl::nd_item<3> item_ct1,
-    uninitialized_array<OutputValueType,
+    uninitialized_array<typename a_accessor::arithmetic_type,
                         default_block_size / num_thread_per_worker>& storage)
 {
-    const OutputValueType alpha_val = alpha(0);
+    using arithmetic_type = typename a_accessor::arithmetic_type;
+    const auto alpha_val = alpha(0);
     const OutputValueType beta_val = beta[0];
     if (atomic) {
         // Because the atomic operation changes the values of c during
@@ -253,17 +257,17 @@ void spmv(
         spmv_kernel<num_thread_per_worker, atomic>(
             num_rows, num_worker_per_row, val, col, stride,
             num_stored_elements_per_row, b, c, c_stride,
-            [&alpha_val](const OutputValueType& x, const OutputValueType& y) {
-                return alpha_val * x;
+            [&alpha_val](const auto& x, const OutputValueType& y) {
+                return static_cast<OutputValueType>(alpha_val * x);
             },
             item_ct1, storage);
     } else {
         spmv_kernel<num_thread_per_worker, atomic>(
             num_rows, num_worker_per_row, val, col, stride,
             num_stored_elements_per_row, b, c, c_stride,
-            [&alpha_val, &beta_val](const OutputValueType& x,
-                                    const OutputValueType& y) {
-                return alpha_val * x + beta_val * y;
+            [&alpha_val, &beta_val](const auto& x, const OutputValueType& y) {
+                return static_cast<OutputValueType>(
+                    alpha_val * x + static_cast<arithmetic_type>(beta_val * y));
             },
             item_ct1, storage);
     }
@@ -281,7 +285,7 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory,
 {
     queue->submit([&](sycl::handler& cgh) {
         sycl::accessor<
-            uninitialized_array<OutputValueType,
+            uninitialized_array<typename a_accessor::arithmetic_type,
                                 default_block_size / num_thread_per_worker>,
             0, sycl::access_mode::read_write, sycl::access::target::local>
             storage_acc_ct1(cgh);
@@ -316,10 +320,12 @@ void abstract_spmv(syn::value_list<int, info>,
                    const matrix::Dense<MatrixValueType>* alpha = nullptr,
                    const matrix::Dense<OutputValueType>* beta = nullptr)
 {
+    using arithmetic_type =
+        highest_precision<InputValueType, OutputValueType, MatrixValueType>;
     using a_accessor =
-        gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>;
+        gko::acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>;
     using b_accessor =
-        gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>;
+        gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>;
 
     const auto nrows = a->get_size()[0];
     const auto stride = a->get_stride();
diff --git a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
index 2cebac00c5f..1acc16d7026 100644
--- a/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
+++ b/dpcpp/matrix/sparsity_csr_kernels.dp.cpp
@@ -303,7 +303,23 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void sort_by_column_index(std::shared_ptr<const DpcppExecutor> exec,
                           matrix::SparsityCsr<ValueType, IndexType>* to_sort)
-    GKO_NOT_IMPLEMENTED;
+{
+    const auto num_rows = to_sort->get_size()[0];
+    const auto row_ptrs = to_sort->get_const_row_ptrs();
+    const auto cols = to_sort->get_col_idxs();
+    auto queue = exec->get_queue();
+    // build sorted postorder node list for each row
+    queue->submit([&](sycl::handler& cgh) {
+        cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx_id) {
+            const auto row = idx_id[0];
+            const auto row_begin = row_ptrs[row];
+            const auto row_end = row_ptrs[row + 1];
+            // heap-sort the elements
+            std::make_heap(cols + row_begin, cols + row_end);
+            std::sort_heap(cols + row_begin, cols + row_end);
+        });
+    });
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
@@ -312,8 +328,32 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename ValueType, typename IndexType>
 void is_sorted_by_column_index(
     std::shared_ptr<const DpcppExecutor> exec,
-    const matrix::SparsityCsr<ValueType, IndexType>* to_check,
-    bool* is_sorted) GKO_NOT_IMPLEMENTED;
+    const matrix::SparsityCsr<ValueType, IndexType>* to_check, bool* is_sorted)
+{
+    *is_sorted = true;
+    auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted);
+    auto gpu_array = array<bool>{exec, cpu_array};
+    const auto num_rows = to_check->get_size()[0];
+    const auto row_ptrs = to_check->get_const_row_ptrs();
+    const auto cols = to_check->get_const_col_idxs();
+    auto is_sorted_device = gpu_array.get_data();
+    exec->get_queue()->submit([&](sycl::handler& cgh) {
+        cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) {
+            const auto row = static_cast<size_type>(idx[0]);
+            const auto begin = row_ptrs[row];
+            const auto end = row_ptrs[row + 1];
+            if (*is_sorted_device) {
+                for (auto i = begin; i < end - 1; i++) {
+                    if (cols[i] > cols[i + 1]) {
+                        *is_sorted_device = false;
+                        break;
+                    }
+                }
+            }
+        });
+    });
+    cpu_array = gpu_array;
+};
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX);
diff --git a/dpcpp/multigrid/pgm_kernels.dp.cpp b/dpcpp/multigrid/pgm_kernels.dp.cpp
index 15bd22180c0..b404b1c10ab 100644
--- a/dpcpp/multigrid/pgm_kernels.dp.cpp
+++ b/dpcpp/multigrid/pgm_kernels.dp.cpp
@@ -33,7 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // force-top: on
 // oneDPL needs to be first to avoid issues with libstdc++ TBB impl
 #include <oneapi/dpl/algorithm>
-#include <oneapi/dpl/execution>
 // force-top: off
 
 
@@ -48,6 +47,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/multigrid/pgm.hpp>
 
 
+#include "dpcpp/base/onedpl.hpp"
+
+
 namespace gko {
 namespace kernels {
 namespace dpcpp {
@@ -63,8 +65,7 @@ template <typename IndexType>
 void sort_agg(std::shared_ptr<const DefaultExecutor> exec, IndexType num,
               IndexType* row_idxs, IndexType* col_idxs)
 {
-    auto policy =
-        oneapi::dpl::execution::make_device_policy(*exec->get_queue());
+    auto policy = onedpl_policy(exec);
     auto it = oneapi::dpl::make_zip_iterator(row_idxs, col_idxs);
     std::sort(policy, it, it + num, [](auto a, auto b) {
         return std::tie(std::get<0>(a), std::get<1>(a)) <
@@ -79,12 +80,12 @@ template <typename ValueType, typename IndexType>
 void sort_row_major(std::shared_ptr<const DefaultExecutor> exec, size_type nnz,
                     IndexType* row_idxs, IndexType* col_idxs, ValueType* vals)
 {
-    auto policy =
-        oneapi::dpl::execution::make_device_policy(*exec->get_queue());
+    auto policy = onedpl_policy(exec);
     auto it = oneapi::dpl::make_zip_iterator(row_idxs, col_idxs, vals);
-    // Because reduce_by_segment is not determinstic, so we do not need
+    // Because reduce_by_segment is not deterministic, so we do not need
+    // stable_sort
+    // TODO: If we have deterministic reduce_by_segment, it should be
     // stable_sort
-    // TODO: If we have determinstic reduce_by_segment, it should be stable_sort
     std::sort(policy, it, it + nnz, [](auto a, auto b) {
         return std::tie(std::get<0>(a), std::get<1>(a)) <
                std::tie(std::get<0>(b), std::get<1>(b));
diff --git a/dpcpp/preconditioner/batch_identity.hpp.inc b/dpcpp/preconditioner/batch_identity.hpp.inc
new file mode 100644
index 00000000000..792886f845d
--- /dev/null
+++ b/dpcpp/preconditioner/batch_identity.hpp.inc
@@ -0,0 +1,59 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+/**
+ * @see reference/preconditioner/batch_identity.hpp
+ */
+template <typename ValueType>
+class Identity final {
+public:
+    using value_type = ValueType;
+
+    static constexpr int work_size = 0;
+
+    static int dynamic_work_size(int, int) { return 0; }
+
+    template <typename batch_item_type>
+    void generate(size_type, const batch_item_type&, ValueType*,
+                  sycl::nd_item<3> item_ct1)
+    {}
+
+    __dpct_inline__ void apply(const int num_rows, const ValueType* const r,
+                               ValueType* const z,
+                               sycl::nd_item<3> item_ct1) const
+    {
+        for (int li = item_ct1.get_local_linear_id(); li < num_rows;
+             li += item_ct1.get_local_range().size()) {
+            z[li] = r[li];
+        }
+    }
+};
diff --git a/dpcpp/preconditioner/batch_preconditioners.hpp b/dpcpp/preconditioner/batch_preconditioners.hpp
new file mode 100644
index 00000000000..f2b6b1d034f
--- /dev/null
+++ b/dpcpp/preconditioner/batch_preconditioners.hpp
@@ -0,0 +1,58 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_DPCPP_PRECONDITIONER_BATCH_PRECONDITIONERS_HPP_
+#define GKO_DPCPP_PRECONDITIONER_BATCH_PRECONDITIONERS_HPP_
+
+
+#include <ginkgo/core/matrix/batch_identity.hpp>
+
+
+#include "core/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+namespace batch_preconditioner {
+
+
+#include "dpcpp/preconditioner/batch_identity.hpp.inc"
+
+
+}  // namespace batch_preconditioner
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_DPCPP_PRECONDITIONER_BATCH_PRECONDITIONERS_HPP_
diff --git a/dpcpp/solver/batch_bicgstab_kernels.dp.cpp b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
new file mode 100644
index 00000000000..9e353734f36
--- /dev/null
+++ b/dpcpp/solver/batch_bicgstab_kernels.dp.cpp
@@ -0,0 +1,297 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/solver/batch_bicgstab_kernels.hpp"
+
+
+#include <CL/sycl.hpp>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+#include <ginkgo/core/solver/batch_bicgstab.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/solver/batch_dispatch.hpp"
+#include "dpcpp/base/batch_struct.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/base/helper.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/intrinsics.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+/**
+ * @brief The batch Bicgstab solver namespace.
+ *
+ * @ingroup batch_bicgstab
+ */
+namespace batch_bicgstab {
+
+
+#include "dpcpp/base/batch_multi_vector_kernels.hpp.inc"
+#include "dpcpp/matrix/batch_dense_kernels.hpp.inc"
+#include "dpcpp/matrix/batch_ell_kernels.hpp.inc"
+#include "dpcpp/solver/batch_bicgstab_kernels.hpp.inc"
+
+
+template <typename T>
+using settings = gko::kernels::batch_bicgstab::settings<T>;
+
+
+__dpct_inline__ int get_group_size(int value,
+                                   int subgroup_size = config::warp_size)
+{
+    int num_sg = ceildiv(value, subgroup_size);
+    return num_sg * subgroup_size;
+}
+
+
+template <typename ValueType>
+class KernelCaller {
+public:
+    KernelCaller(std::shared_ptr<const DefaultExecutor> exec,
+                 const settings<remove_complex<ValueType>> settings)
+        : exec_{std::move(exec)}, settings_{settings}
+    {}
+
+    template <typename StopType, const int subgroup_size,
+              const int n_shared_total, typename PrecType, typename LogType,
+              typename BatchMatrixType>
+    __dpct_inline__ void launch_apply_kernel(
+        const gko::kernels::batch_bicgstab::storage_config& sconf,
+        LogType& logger, PrecType& prec, const BatchMatrixType mat,
+        const ValueType* const __restrict__ b_values,
+        ValueType* const __restrict__ x_values,
+        ValueType* const __restrict__ workspace, const int& group_size,
+        const int& shared_size) const
+    {
+        auto num_rows = mat.num_rows;
+
+        const dim3 block(group_size);
+        const dim3 grid(mat.num_batch_items);
+
+        auto max_iters = settings_.max_iterations;
+        auto res_tol = settings_.residual_tol;
+
+        exec_->get_queue()->submit([&](sycl::handler& cgh) {
+            sycl::accessor<ValueType, 1, sycl::access_mode::read_write,
+                           sycl::access::target::local>
+                slm_values(sycl::range<1>(shared_size), cgh);
+
+            cgh.parallel_for(
+                sycl_nd_range(grid, block),
+                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(
+                    subgroup_size)]] [[intel::kernel_args_restrict]] {
+                    auto batch_id = item_ct1.get_group_linear_id();
+                    const auto mat_global_entry =
+                        gko::batch::matrix::extract_batch_item(mat, batch_id);
+                    const ValueType* const b_global_entry =
+                        gko::batch::multi_vector::batch_item_ptr(
+                            b_values, 1, num_rows, batch_id);
+                    ValueType* const x_global_entry =
+                        gko::batch::multi_vector::batch_item_ptr(
+                            x_values, 1, num_rows, batch_id);
+                    apply_kernel<StopType, n_shared_total>(
+                        sconf, max_iters, res_tol, logger, prec,
+                        mat_global_entry, b_global_entry, x_global_entry,
+                        num_rows, mat.get_single_item_num_nnz(),
+                        static_cast<ValueType*>(slm_values.get_pointer()),
+                        item_ct1, workspace);
+                });
+        });
+    }
+
+    template <typename BatchMatrixType, typename PrecType, typename StopType,
+              typename LogType>
+    void call_kernel(
+        LogType logger, const BatchMatrixType& mat, PrecType prec,
+        const gko::batch::multi_vector::uniform_batch<const ValueType>& b,
+        const gko::batch::multi_vector::uniform_batch<ValueType>& x) const
+    {
+        using real_type = gko::remove_complex<ValueType>;
+        const size_type num_batch_items = mat.num_batch_items;
+        const auto num_rows = mat.num_rows;
+        const auto num_rhs = b.num_rhs;
+        GKO_ASSERT(num_rhs == 1);
+
+        auto device = exec_->get_queue()->get_device();
+        auto max_group_size =
+            device.get_info<sycl::info::device::max_work_group_size>();
+        int group_size =
+            device.get_info<sycl::info::device::max_work_group_size>();
+        if (group_size > num_rows) {
+            group_size = get_group_size(num_rows);
+        };
+        group_size = std::min(
+            std::max(group_size, static_cast<int>(2 * config::warp_size)),
+            static_cast<int>(max_group_size));
+
+        // reserve 5 for intermediate rho-s, norms,
+        // alpha, omega, temp and for reduce_over_group
+        // If the value available is negative, then set it to 0
+        const int static_var_mem =
+            (group_size + 5) * sizeof(ValueType) + 2 * sizeof(real_type);
+        int shmem_per_blk = std::max(
+            static_cast<int>(
+                device.get_info<sycl::info::device::local_mem_size>()) -
+                static_var_mem,
+            0);
+        const int padded_num_rows = num_rows;
+        const size_type prec_size = PrecType::dynamic_work_size(
+            padded_num_rows, mat.get_single_item_num_nnz());
+        const auto sconf =
+            gko::kernels::batch_bicgstab::compute_shared_storage<PrecType,
+                                                                 ValueType>(
+                shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(),
+                b.num_rhs);
+        const size_t shared_size = sconf.n_shared * padded_num_rows +
+                                   (sconf.prec_shared ? prec_size : 0);
+        auto workspace = gko::array<ValueType>(
+            exec_,
+            sconf.gmem_stride_bytes * num_batch_items / sizeof(ValueType));
+        GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(ValueType) == 0);
+
+        ValueType* const workspace_data = workspace.get_data();
+        int n_shared_total = sconf.n_shared + int(sconf.prec_shared);
+
+        // template
+        // launch_apply_kernel<StopType, subgroup_size, n_shared_total,
+        // sg_kernel_all>
+        if (num_rows <= 32 && n_shared_total == 10) {
+            launch_apply_kernel<StopType, 32, 10>(
+                sconf, logger, prec, mat, b.values, x.values, workspace_data,
+                group_size, shared_size);
+        } else if (num_rows <= 256 && n_shared_total == 10) {
+            launch_apply_kernel<StopType, 32, 10>(
+                sconf, logger, prec, mat, b.values, x.values, workspace_data,
+                group_size, shared_size);
+        } else {
+            switch (n_shared_total) {
+            case 0:
+                launch_apply_kernel<StopType, 32, 0>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, group_size, shared_size);
+                break;
+            case 1:
+                launch_apply_kernel<StopType, 32, 1>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, group_size, shared_size);
+                break;
+            case 2:
+                launch_apply_kernel<StopType, 32, 2>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, group_size, shared_size);
+                break;
+            case 3:
+                launch_apply_kernel<StopType, 32, 3>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, group_size, shared_size);
+                break;
+            case 4:
+                launch_apply_kernel<StopType, 32, 4>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, group_size, shared_size);
+                break;
+            case 5:
+                launch_apply_kernel<StopType, 32, 5>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, group_size, shared_size);
+                break;
+            case 6:
+                launch_apply_kernel<StopType, 32, 6>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, group_size, shared_size);
+                break;
+            case 7:
+                launch_apply_kernel<StopType, 32, 7>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, group_size, shared_size);
+                break;
+            case 8:
+                launch_apply_kernel<StopType, 32, 8>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, group_size, shared_size);
+                break;
+            case 9:
+                launch_apply_kernel<StopType, 32, 9>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, group_size, shared_size);
+                break;
+            case 10:
+                launch_apply_kernel<StopType, 32, 10>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, group_size, shared_size);
+                break;
+            default:
+                GKO_NOT_IMPLEMENTED;
+            }
+        }
+    }
+
+private:
+    std::shared_ptr<const DefaultExecutor> exec_;
+    const settings<remove_complex<ValueType>> settings_;
+};
+
+
+template <typename ValueType>
+void apply(std::shared_ptr<const DefaultExecutor> exec,
+           const settings<remove_complex<ValueType>>& settings,
+           const batch::BatchLinOp* const mat,
+           const batch::BatchLinOp* const precond,
+           const batch::MultiVector<ValueType>* const b,
+           batch::MultiVector<ValueType>* const x,
+           batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
+{
+    auto dispatcher = batch::solver::create_dispatcher<ValueType>(
+        KernelCaller<ValueType>(exec, settings), settings, mat, precond);
+    dispatcher.apply(b, x, logdata);
+}
+
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL);
+
+
+}  // namespace batch_bicgstab
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/solver/batch_bicgstab_kernels.hpp.inc b/dpcpp/solver/batch_bicgstab_kernels.hpp.inc
new file mode 100644
index 00000000000..03f8ea31165
--- /dev/null
+++ b/dpcpp/solver/batch_bicgstab_kernels.hpp.inc
@@ -0,0 +1,413 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+template <typename BatchMatrixType_entry, typename ValueType>
+__dpct_inline__ void initialize(
+    const int num_rows, const BatchMatrixType_entry& mat_global_entry,
+    const ValueType* const b_global_entry,
+    const ValueType* const x_global_entry, ValueType& rho_old, ValueType& omega,
+    ValueType& alpha, ValueType* const x_shared_entry,
+    ValueType* const r_shared_entry, ValueType* const r_hat_shared_entry,
+    ValueType* const p_shared_entry, ValueType* const v_shared_entry,
+    ValueType* const p_hat_shared_entry,
+    typename gko::remove_complex<ValueType>& rhs_norm,
+    typename gko::remove_complex<ValueType>& res_norm,
+    sycl::nd_item<3> item_ct1)
+{
+    auto sg = item_ct1.get_sub_group();
+    const auto sg_id = sg.get_group_id();
+    const auto tid = item_ct1.get_local_linear_id();
+    const auto group_size = item_ct1.get_local_range().size();
+    const auto group = item_ct1.get_group();
+
+    rho_old = one<ValueType>();
+    omega = one<ValueType>();
+    alpha = one<ValueType>();
+
+    // copy x from global to shared memory
+    // r = b
+    for (int iz = tid; iz < num_rows; iz += group_size) {
+        x_shared_entry[iz] = x_global_entry[iz];
+        r_shared_entry[iz] = b_global_entry[iz];
+    }
+    item_ct1.barrier(sycl::access::fence_space::global_and_local);
+
+    // r = b - A*x
+    advanced_apply_kernel(static_cast<ValueType>(-1.0), mat_global_entry,
+                          x_shared_entry, static_cast<ValueType>(1.0),
+                          r_shared_entry, item_ct1);
+    item_ct1.barrier(sycl::access::fence_space::global_and_local);
+
+    if (sg_id == 0) {
+        single_rhs_compute_norm2_sg(num_rows, r_shared_entry, res_norm,
+                                    item_ct1);
+    } else if (sg_id == 1) {
+        single_rhs_compute_norm2_sg(num_rows, b_global_entry, rhs_norm,
+                                    item_ct1);
+    }
+    item_ct1.barrier(sycl::access::fence_space::global_and_local);
+
+
+    for (int iz = tid; iz < num_rows; iz += group_size) {
+        r_hat_shared_entry[iz] = r_shared_entry[iz];
+        p_shared_entry[iz] = zero<ValueType>();
+        p_hat_shared_entry[iz] = zero<ValueType>();
+        v_shared_entry[iz] = zero<ValueType>();
+    }
+}
+
+
+template <typename ValueType>
+__dpct_inline__ void update_p(const int num_rows, const ValueType& rho_new,
+                              const ValueType& rho_old, const ValueType& alpha,
+                              const ValueType& omega,
+                              const ValueType* const r_shared_entry,
+                              const ValueType* const v_shared_entry,
+                              ValueType* const p_shared_entry,
+                              sycl::nd_item<3> item_ct1)
+{
+    const ValueType beta = (rho_new / rho_old) * (alpha / omega);
+    for (int r = item_ct1.get_local_linear_id(); r < num_rows;
+         r += item_ct1.get_local_range().size()) {
+        p_shared_entry[r] =
+            r_shared_entry[r] +
+            beta * (p_shared_entry[r] - omega * v_shared_entry[r]);
+    }
+}
+
+
+template <typename ValueType>
+__dpct_inline__ void compute_alpha(const int num_rows, const ValueType& rho_new,
+                                   const ValueType* const r_hat_shared_entry,
+                                   const ValueType* const v_shared_entry,
+                                   ValueType& alpha, sycl::nd_item<3> item_ct1)
+{
+    auto sg = item_ct1.get_sub_group();
+    const auto sg_id = sg.get_group_id();
+    const auto tid = item_ct1.get_local_linear_id();
+    if (sg_id == 0) {
+        single_rhs_compute_conj_dot_sg(num_rows, r_hat_shared_entry,
+                                       v_shared_entry, alpha, item_ct1);
+    }
+    item_ct1.barrier(sycl::access::fence_space::global_and_local);
+    if (tid == 0) {
+        alpha = rho_new / alpha;
+    }
+    item_ct1.barrier(sycl::access::fence_space::global_and_local);
+}
+
+
+template <typename ValueType>
+__dpct_inline__ void update_s(const int num_rows,
+                              const ValueType* const r_shared_entry,
+                              const ValueType& alpha,
+                              const ValueType* const v_shared_entry,
+                              ValueType* const s_shared_entry,
+                              sycl::nd_item<3> item_ct1)
+{
+    for (int r = item_ct1.get_local_linear_id(); r < num_rows;
+         r += item_ct1.get_local_range().size()) {
+        s_shared_entry[r] = r_shared_entry[r] - alpha * v_shared_entry[r];
+    }
+}
+
+
+template <typename ValueType>
+__dpct_inline__ void compute_omega(const int num_rows,
+                                   const ValueType* const t_shared_entry,
+                                   const ValueType* const s_shared_entry,
+                                   ValueType& temp, ValueType& omega,
+                                   sycl::nd_item<3> item_ct1)
+{
+    auto sg = item_ct1.get_sub_group();
+    const auto sg_id = sg.get_group_id();
+    const auto tid = item_ct1.get_local_linear_id();
+    if (sg_id == 0) {
+        single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry, s_shared_entry,
+                                       omega, item_ct1);
+    } else if (sg_id == 1) {
+        single_rhs_compute_conj_dot_sg(num_rows, t_shared_entry, t_shared_entry,
+                                       temp, item_ct1);
+    }
+    item_ct1.barrier(sycl::access::fence_space::global_and_local);
+    if (tid == 0) {
+        omega /= temp;
+    }
+    item_ct1.barrier(sycl::access::fence_space::global_and_local);
+}
+
+
+template <typename ValueType>
+__dpct_inline__ void update_x_and_r(
+    const int num_rows, const ValueType* const p_hat_shared_entry,
+    const ValueType* const s_hat_shared_entry, const ValueType& alpha,
+    const ValueType& omega, const ValueType* const s_shared_entry,
+    const ValueType* const t_shared_entry, ValueType* const x_shared_entry,
+    ValueType* const r_shared_entry, sycl::nd_item<3> item_ct1)
+{
+    for (int r = item_ct1.get_local_linear_id(); r < num_rows;
+         r += item_ct1.get_local_range().size()) {
+        x_shared_entry[r] = x_shared_entry[r] + alpha * p_hat_shared_entry[r] +
+                            omega * s_hat_shared_entry[r];
+        r_shared_entry[r] = s_shared_entry[r] - omega * t_shared_entry[r];
+    }
+}
+
+
+template <typename ValueType>
+__dpct_inline__ void update_x_middle(const int num_rows, const ValueType& alpha,
+                                     const ValueType* const p_hat_shared_entry,
+                                     ValueType* const x_shared_entry,
+                                     sycl::nd_item<3> item_ct1)
+{
+    for (int r = item_ct1.get_local_linear_id(); r < num_rows;
+         r += item_ct1.get_local_range().size()) {
+        x_shared_entry[r] = x_shared_entry[r] + alpha * p_hat_shared_entry[r];
+    }
+}
+
+
+template <typename StopType, const int n_shared_total, typename PrecType,
+          typename LogType, typename BatchMatrixType, typename ValueType>
+void apply_kernel(const gko::kernels::batch_bicgstab::storage_config sconf,
+                  const int max_iter, const gko::remove_complex<ValueType> tol,
+                  LogType logger, PrecType prec_shared,
+                  const BatchMatrixType mat_global_entry,
+                  const ValueType* const __restrict__ b_global_entry,
+                  ValueType* const __restrict__ x_global_entry,
+                  const size_type num_rows, const size_type nnz,
+                  ValueType* const __restrict__ slm_values,
+                  sycl::nd_item<3> item_ct1,
+                  ValueType* const __restrict__ workspace = nullptr)
+{
+    using real_type = typename gko::remove_complex<ValueType>;
+
+    const auto sg = item_ct1.get_sub_group();
+    const int sg_id = sg.get_group_id();
+    const int tid = item_ct1.get_local_linear_id();
+    auto group = item_ct1.get_group();
+    const int group_size = item_ct1.get_local_range().size();
+
+    const auto batch_id = item_ct1.get_group_linear_id();
+
+    ValueType* rho_old_sh;
+    ValueType* rho_new_sh;
+    ValueType* alpha_sh;
+    ValueType* omega_sh;
+    ValueType* temp_sh;
+    real_type* norms_rhs_sh;
+    real_type* norms_res_sh;
+
+    using tile_value_t = ValueType[5];
+    tile_value_t& values =
+        *sycl::ext::oneapi::group_local_memory_for_overwrite<tile_value_t>(
+            group);
+    using tile_real_t = real_type[2];
+    tile_real_t& reals =
+        *sycl::ext::oneapi::group_local_memory_for_overwrite<tile_real_t>(
+            group);
+    rho_old_sh = &values[0];
+    rho_new_sh = &values[1];
+    alpha_sh = &values[2];
+    omega_sh = &values[3];
+    temp_sh = &values[4];
+    norms_rhs_sh = &reals[0];
+    norms_res_sh = &reals[1];
+    const int gmem_offset =
+        batch_id * sconf.gmem_stride_bytes / sizeof(ValueType);
+    ValueType* p_hat_sh;
+    ValueType* s_hat_sh;
+    ValueType* s_sh;
+    ValueType* p_sh;
+    ValueType* r_sh;
+    ValueType* r_hat_sh;
+    ValueType* v_sh;
+    ValueType* t_sh;
+    ValueType* x_sh;
+    ValueType* prec_work_sh;
+
+    if constexpr (n_shared_total >= 1) {
+        p_hat_sh = slm_values;
+    } else {
+        p_hat_sh = workspace + gmem_offset;
+    }
+    if constexpr (n_shared_total == 1) {
+        s_hat_sh = workspace + gmem_offset;
+    } else {
+        s_hat_sh = p_hat_sh + sconf.padded_vec_len;
+    }
+    if constexpr (n_shared_total == 2) {
+        v_sh = workspace + gmem_offset;
+    } else {
+        v_sh = s_hat_sh + sconf.padded_vec_len;
+    }
+    if constexpr (n_shared_total == 3) {
+        t_sh = workspace + gmem_offset;
+    } else {
+        t_sh = v_sh + sconf.padded_vec_len;
+    }
+    if constexpr (n_shared_total == 4) {
+        p_sh = workspace + gmem_offset;
+    } else {
+        p_sh = t_sh + sconf.padded_vec_len;
+    }
+    if constexpr (n_shared_total == 5) {
+        s_sh = workspace + gmem_offset;
+    } else {
+        s_sh = p_sh + sconf.padded_vec_len;
+    }
+    if constexpr (n_shared_total == 6) {
+        r_sh = workspace + gmem_offset;
+    } else {
+        r_sh = s_sh + sconf.padded_vec_len;
+    }
+    if constexpr (n_shared_total == 7) {
+        r_hat_sh = workspace + gmem_offset;
+    } else {
+        r_hat_sh = r_sh + sconf.padded_vec_len;
+    }
+    if constexpr (n_shared_total == 8) {
+        x_sh = workspace + gmem_offset;
+    } else {
+        x_sh = r_hat_sh + sconf.padded_vec_len;
+    }
+    if constexpr (n_shared_total == 9) {
+        prec_work_sh = workspace + gmem_offset;
+    } else {
+        prec_work_sh = x_sh + sconf.padded_vec_len;
+    }
+
+    // generate preconditioner
+    prec_shared.generate(batch_id, mat_global_entry, prec_work_sh, item_ct1);
+
+    // initialization
+    // rho_old = 1, omega = 1, alpha = 1
+    // compute b norms
+    // copy x from global to shared memory
+    // r = b - A*x
+    // compute residual norms
+    // r_hat = r
+    // p = 0
+    // p_hat = 0
+    // v = 0
+    initialize(num_rows, mat_global_entry, b_global_entry, x_global_entry,
+               rho_old_sh[0], omega_sh[0], alpha_sh[0], x_sh, r_sh, r_hat_sh,
+               p_sh, p_hat_sh, v_sh, norms_rhs_sh[0], norms_res_sh[0],
+               item_ct1);
+    item_ct1.barrier(sycl::access::fence_space::global_and_local);
+
+    // stopping criterion object
+    StopType stop(tol, norms_rhs_sh);
+
+    int iter = 0;
+    for (; iter < max_iter; iter++) {
+        if (stop.check_converged(norms_res_sh)) {
+            logger.log_iteration(batch_id, iter, norms_res_sh[0]);
+            break;
+        }
+
+        // rho_new =  < r_hat , r > = (r_hat)' * (r)
+        if (sg_id == 0) {
+            single_rhs_compute_conj_dot_sg(num_rows, r_hat_sh, r_sh,
+                                           rho_new_sh[0], item_ct1);
+        }
+        item_ct1.barrier(sycl::access::fence_space::global_and_local);
+
+        // beta = (rho_new / rho_old)*(alpha / omega)
+        // p = r + beta*(p - omega * v)
+        update_p(num_rows, rho_new_sh[0], rho_old_sh[0], alpha_sh[0],
+                 omega_sh[0], r_sh, v_sh, p_sh, item_ct1);
+        item_ct1.barrier(sycl::access::fence_space::global_and_local);
+
+        // p_hat = precond * p
+        prec_shared.apply(num_rows, p_sh, p_hat_sh, item_ct1);
+        item_ct1.barrier(sycl::access::fence_space::global_and_local);
+
+        // v = A * p_hat
+        simple_apply_kernel(mat_global_entry, p_hat_sh, v_sh, item_ct1);
+        item_ct1.barrier(sycl::access::fence_space::global_and_local);
+
+        // alpha = rho_new / < r_hat , v>
+        compute_alpha(num_rows, rho_new_sh[0], r_hat_sh, v_sh, alpha_sh[0],
+                      item_ct1);
+        item_ct1.barrier(sycl::access::fence_space::global_and_local);
+
+        // s = r - alpha*v
+        update_s(num_rows, r_sh, alpha_sh[0], v_sh, s_sh, item_ct1);
+        item_ct1.barrier(sycl::access::fence_space::global_and_local);
+
+        // an estimate of residual norms
+        if (sg_id == 0) {
+            single_rhs_compute_norm2_sg(num_rows, s_sh, norms_res_sh[0],
+                                        item_ct1);
+        }
+        item_ct1.barrier(sycl::access::fence_space::global_and_local);
+
+        if (stop.check_converged(norms_res_sh)) {
+            update_x_middle(num_rows, alpha_sh[0], p_hat_sh, x_sh, item_ct1);
+            logger.log_iteration(batch_id, iter, norms_res_sh[0]);
+            break;
+        }
+
+        // s_hat = precond * s
+        prec_shared.apply(num_rows, s_sh, s_hat_sh, item_ct1);
+        item_ct1.barrier(sycl::access::fence_space::global_and_local);
+
+        // t = A * s_hat
+        simple_apply_kernel(mat_global_entry, s_hat_sh, t_sh, item_ct1);
+        item_ct1.barrier(sycl::access::fence_space::global_and_local);
+
+        // omega = <t,s> / <t,t>
+        compute_omega(num_rows, t_sh, s_sh, temp_sh[0], omega_sh[0], item_ct1);
+        item_ct1.barrier(sycl::access::fence_space::global_and_local);
+
+        // x = x + alpha*p_hat + omega *s_hat
+        // r = s - omega * t
+        update_x_and_r(num_rows, p_hat_sh, s_hat_sh, alpha_sh[0], omega_sh[0],
+                       s_sh, t_sh, x_sh, r_sh, item_ct1);
+        item_ct1.barrier(sycl::access::fence_space::global_and_local);
+
+        if (sg_id == 0)
+            single_rhs_compute_norm2_sg(num_rows, r_sh, norms_res_sh[0],
+                                        item_ct1);
+        if (tid == group_size - 1) {
+            rho_old_sh[0] = rho_new_sh[0];
+        }
+        item_ct1.barrier(sycl::access::fence_space::global_and_local);
+    }
+
+    logger.log_iteration(batch_id, iter, norms_res_sh[0]);
+
+    // copy x back to global memory
+    copy_kernel(num_rows, x_sh, x_global_entry, item_ct1);
+    item_ct1.barrier(sycl::access::fence_space::global_and_local);
+}
diff --git a/dpcpp/stop/batch_criteria.hpp b/dpcpp/stop/batch_criteria.hpp
new file mode 100644
index 00000000000..f1e51ebd1ae
--- /dev/null
+++ b/dpcpp/stop/batch_criteria.hpp
@@ -0,0 +1,105 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_DPCPP_STOP_BATCH_CRITERIA_HPP_
+#define GKO_DPCPP_STOP_BATCH_CRITERIA_HPP_
+
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/stop/batch_stop_enum.hpp>
+
+
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+namespace batch_stop {
+
+
+/**
+ * @see reference/stop/batch_criteria.hpp
+ */
+template <typename ValueType>
+class SimpleRelResidual {
+public:
+    using real_type = remove_complex<ValueType>;
+
+    SimpleRelResidual(const real_type rel_res_tol,
+                      const real_type* const rhs_b_norms)
+        : rel_tol_{rel_res_tol}, rhs_norms_{rhs_b_norms}
+    {}
+
+    __dpct_inline__ bool check_converged(
+        const real_type* const residual_norms) const
+    {
+        return residual_norms[0] <= (rel_tol_ * rhs_norms_[0]);
+    }
+
+private:
+    const real_type rel_tol_;
+    const real_type* const rhs_norms_;
+};
+
+
+/**
+ * @see reference/stop/batch_criteria.hpp
+ */
+template <typename ValueType>
+class SimpleAbsResidual {
+public:
+    using real_type = remove_complex<ValueType>;
+
+    SimpleAbsResidual(const real_type tol, const real_type*) : abs_tol_{tol} {}
+
+    __dpct_inline__ bool check_converged(
+        const real_type* const residual_norms) const
+    {
+        return (residual_norms[0] <= abs_tol_);
+    }
+
+private:
+    const real_type abs_tol_;
+};
+
+
+}  // namespace batch_stop
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_DPCPP_STOP_BATCH_CRITERIA_HPP_
diff --git a/dpcpp/stop/residual_norm_kernels.dp.cpp b/dpcpp/stop/residual_norm_kernels.dp.cpp
index 6d47c5bdcb2..fd0d5b00455 100644
--- a/dpcpp/stop/residual_norm_kernels.dp.cpp
+++ b/dpcpp/stop/residual_norm_kernels.dp.cpp
@@ -82,7 +82,7 @@ void residual_norm(std::shared_ptr<const DpcppExecutor> exec,
         cgh.parallel_for(
             sycl::range<1>{tau->get_size()[1]}, [=](sycl::id<1> idx_id) {
                 const auto tidx = idx_id[0];
-                if (tau_val[tidx] < rel_residual_goal * orig_tau_val[tidx]) {
+                if (tau_val[tidx] <= rel_residual_goal * orig_tau_val[tidx]) {
                     stop_status_val[tidx].converge(stoppingId, setFinalized);
                     device_storage_val[1] = true;
                 }
@@ -138,7 +138,7 @@ void implicit_residual_norm(
         cgh.parallel_for(
             sycl::range<1>{tau->get_size()[1]}, [=](sycl::id<1> idx_id) {
                 const auto tidx = idx_id[0];
-                if (std::sqrt(std::abs(tau_val[tidx])) <
+                if (std::sqrt(std::abs(tau_val[tidx])) <=
                     rel_residual_goal * orig_tau_val[tidx]) {
                     stop_status_val[tidx].converge(stoppingId, setFinalized);
                     device_storage_val[1] = true;
diff --git a/dpcpp/test/matrix/CMakeLists.txt b/dpcpp/test/matrix/CMakeLists.txt
index 88ab52e9c3f..7ada04882da 100644
--- a/dpcpp/test/matrix/CMakeLists.txt
+++ b/dpcpp/test/matrix/CMakeLists.txt
@@ -1 +1 @@
-ginkgo_create_test(fbcsr_kernels)
+ginkgo_create_dpcpp_test(fbcsr_kernels)
diff --git a/dpcpp/test/matrix/fbcsr_kernels.cpp b/dpcpp/test/matrix/fbcsr_kernels.dp.cpp
similarity index 100%
rename from dpcpp/test/matrix/fbcsr_kernels.cpp
rename to dpcpp/test/matrix/fbcsr_kernels.dp.cpp
diff --git a/dpcpp/test/preconditioner/CMakeLists.txt b/dpcpp/test/preconditioner/CMakeLists.txt
index a0ca5a2e38a..c606e12ac3e 100644
--- a/dpcpp/test/preconditioner/CMakeLists.txt
+++ b/dpcpp/test/preconditioner/CMakeLists.txt
@@ -1 +1 @@
-ginkgo_create_test(jacobi_kernels)
+ginkgo_create_dpcpp_test(jacobi_kernels)
diff --git a/dpcpp/test/preconditioner/jacobi_kernels.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp
similarity index 100%
rename from dpcpp/test/preconditioner/jacobi_kernels.cpp
rename to dpcpp/test/preconditioner/jacobi_kernels.dp.cpp
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 41ed77d9002..33e3bab735a 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -59,7 +59,7 @@ find_package(Kokkos QUIET)
 if(Kokkos_FOUND)
     if(GINKGO_WITH_CCACHE)
         message(WARNING "The CMAKE_CXX_COMPILER_LAUNCHER is set due to "
-            "GINKGO_WITH_CCACHE=ON which is known to casue issues with CUDA enabled "
+            "GINKGO_WITH_CCACHE=ON which is known to cause issues with CUDA enabled "
             "Kokkos (https://github.com/kokkos/kokkos/issues/4821) including compilation "
             "failures. This can be prevented by setting GINKGO_WITH_CCACHE=OFF.")
     endif()
diff --git a/examples/adaptiveprecision-blockjacobi/CMakeLists.txt b/examples/adaptiveprecision-blockjacobi/CMakeLists.txt
index b121e201c77..324400e9cb4 100644
--- a/examples/adaptiveprecision-blockjacobi/CMakeLists.txt
+++ b/examples/adaptiveprecision-blockjacobi/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(adaptiveprecision-blockjacobi)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(adaptiveprecision-blockjacobi adaptiveprecision-blockjacobi.cpp)
diff --git a/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp b/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp
index b300292e9a3..b673024c6fe 100644
--- a/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp
+++ b/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp
@@ -68,13 +68,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -111,18 +110,14 @@ int main(int argc, char* argv[])
     const RealValueType reduction_factor = 1e-7;
     auto solver_gen =
         cg::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(10000u).on(exec),
-                gko::stop::ResidualNorm<ValueType>::build()
-                    .with_reduction_factor(reduction_factor)
-                    .on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(10000u),
+                           gko::stop::ResidualNorm<ValueType>::build()
+                               .with_reduction_factor(reduction_factor))
             // Add preconditioner, these 2 lines are the only
             // difference from the simple solver example
-            .with_preconditioner(bj::build()
-                                     .with_max_block_size(16u)
-                                     .with_storage_optimization(
-                                         gko::precision_reduction::autodetect())
-                                     .on(exec))
+            .with_preconditioner(
+                bj::build().with_max_block_size(16u).with_storage_optimization(
+                    gko::precision_reduction::autodetect()))
             .on(exec);
     // Create solver
     std::shared_ptr<const gko::log::Convergence<ValueType>> logger =
diff --git a/examples/build-setup.sh b/examples/build-setup.sh
index f7a14a0d0a6..a0c947e433b 100644
--- a/examples/build-setup.sh
+++ b/examples/build-setup.sh
@@ -3,7 +3,7 @@
 # copy libraries
 LIBRARY_NAMES="ginkgo ginkgo_reference ginkgo_omp ginkgo_cuda ginkgo_hip ginkgo_dpcpp ginkgo_device"
 SUFFIXES=".so .dylib .dll d.so d.dylib d.dll"
-VERSION="1.6.0"
+VERSION="1.7.0"
 for name in ${LIBRARY_NAMES}; do
     for suffix in ${SUFFIXES}; do
         cp ${BUILD_DIR}/lib/lib${name}${suffix}.${VERSION} \
diff --git a/examples/cb-gmres/CMakeLists.txt b/examples/cb-gmres/CMakeLists.txt
index 97321c8ccbc..826100b8bd2 100644
--- a/examples/cb-gmres/CMakeLists.txt
+++ b/examples/cb-gmres/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(cb-gmres)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(cb-gmres cb-gmres.cpp)
diff --git a/examples/cb-gmres/cb-gmres.cpp b/examples/cb-gmres/cb-gmres.cpp
index c0235f75e55..915035fd642 100644
--- a/examples/cb-gmres/cb-gmres.cpp
+++ b/examples/cb-gmres/cb-gmres.cpp
@@ -108,13 +108,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -155,12 +154,10 @@ int main(int argc, char* argv[])
     // storage type
     auto solver_gen_keep =
         cb_gmres::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(1000u).on(exec),
-                gko::stop::ResidualNorm<ValueType>::build()
-                    .with_baseline(gko::stop::mode::rhs_norm)
-                    .with_reduction_factor(reduction_factor)
-                    .on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1000u),
+                           gko::stop::ResidualNorm<ValueType>::build()
+                               .with_baseline(gko::stop::mode::rhs_norm)
+                               .with_reduction_factor(reduction_factor))
             .with_krylov_dim(100u)
             .with_storage_precision(
                 gko::solver::cb_gmres::storage_precision::keep)
@@ -168,12 +165,10 @@ int main(int argc, char* argv[])
 
     auto solver_gen_reduce =
         cb_gmres::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(1000u).on(exec),
-                gko::stop::ResidualNorm<ValueType>::build()
-                    .with_baseline(gko::stop::mode::rhs_norm)
-                    .with_reduction_factor(reduction_factor)
-                    .on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1000u),
+                           gko::stop::ResidualNorm<ValueType>::build()
+                               .with_baseline(gko::stop::mode::rhs_norm)
+                               .with_reduction_factor(reduction_factor))
             .with_krylov_dim(100u)
             .with_storage_precision(
                 gko::solver::cb_gmres::storage_precision::reduce1)
diff --git a/examples/custom-logger/CMakeLists.txt b/examples/custom-logger/CMakeLists.txt
index 1d0c8bcf9ad..8278d3e72ba 100644
--- a/examples/custom-logger/CMakeLists.txt
+++ b/examples/custom-logger/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(custom-logger)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(custom-logger custom-logger.cpp)
diff --git a/examples/custom-logger/custom-logger.cpp b/examples/custom-logger/custom-logger.cpp
index c2270cadb0d..e44303b81a2 100644
--- a/examples/custom-logger/custom-logger.cpp
+++ b/examples/custom-logger/custom-logger.cpp
@@ -249,13 +249,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -291,11 +290,9 @@ int main(int argc, char* argv[])
     // object needs to be built on.
     auto solver_gen =
         cg::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(20u).on(exec),
-                gko::stop::ResidualNorm<ValueType>::build()
-                    .with_reduction_factor(reduction_factor)
-                    .on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(20u),
+                           gko::stop::ResidualNorm<ValueType>::build()
+                               .with_reduction_factor(reduction_factor))
             .on(exec);
 
     // Instantiate a ResidualLogger logger.
diff --git a/examples/custom-matrix-format/CMakeLists.txt b/examples/custom-matrix-format/CMakeLists.txt
index c357572edea..b5182fb6bbc 100644
--- a/examples/custom-matrix-format/CMakeLists.txt
+++ b/examples/custom-matrix-format/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.18)
 project(custom-matrix-format CXX CUDA)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
     find_package(OpenMP 3.0 REQUIRED)
 endif()
 
@@ -18,13 +18,5 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 add_executable(custom-matrix-format custom-matrix-format.cpp stencil_kernel.cu)
 target_link_libraries(custom-matrix-format Ginkgo::ginkgo OpenMP::OpenMP_CXX)
 
-# inherit CUDA architecture flags from Ginkgo
-target_compile_options(custom-matrix-format
-    PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${GINKGO_CUDA_ARCH_FLAGS}>")
-# we handle CUDA architecture flags for now, disable CMake handling
-if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
-    set_target_properties(custom-matrix-format PROPERTIES CUDA_ARCHITECTURES OFF)
-endif()
-
 # workaround for clang-cuda/g++ interaction
 set_target_properties(custom-matrix-format PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/examples/custom-matrix-format/custom-matrix-format.cpp b/examples/custom-matrix-format/custom-matrix-format.cpp
index af08dbdf226..bcaa126cdaa 100644
--- a/examples/custom-matrix-format/custom-matrix-format.cpp
+++ b/examples/custom-matrix-format/custom-matrix-format.cpp
@@ -255,13 +255,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -292,12 +291,10 @@ int main(int argc, char* argv[])
     const RealValueType reduction_factor{1e-7};
     // Generate solver and solve the system
     cg::build()
-        .with_criteria(gko::stop::Iteration::build()
-                           .with_max_iters(discretization_points)
-                           .on(exec),
-                       gko::stop::ResidualNorm<ValueType>::build()
-                           .with_reduction_factor(reduction_factor)
-                           .on(exec))
+        .with_criteria(
+            gko::stop::Iteration::build().with_max_iters(discretization_points),
+            gko::stop::ResidualNorm<ValueType>::build().with_reduction_factor(
+                reduction_factor))
         .on(exec)
         // notice how our custom StencilMatrix can be used in the same way as
         // any built-in type
diff --git a/examples/custom-stopping-criterion/CMakeLists.txt b/examples/custom-stopping-criterion/CMakeLists.txt
index 79b7b9aaab5..b429fba7c59 100644
--- a/examples/custom-stopping-criterion/CMakeLists.txt
+++ b/examples/custom-stopping-criterion/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(custom-stopping-criterion)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
     set(THREADS_PREFER_PTHREAD_FLAG ON)
     find_package(Threads REQUIRED)
 endif()
diff --git a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp
index 9389f86cc45..e4c7d88785c 100644
--- a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp
+++ b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp
@@ -109,13 +109,12 @@ void run_solver(volatile bool* stop_iteration_process,
     auto x = gko::read<vec>(std::ifstream("data/x0.mtx"), exec);
 
     // Create solver factory and solve system
-    auto solver = bicg::build()
-                      .with_criteria(ByInteraction::build()
-                                         .with_stop_iteration_process(
-                                             stop_iteration_process)
-                                         .on(exec))
-                      .on(exec)
-                      ->generate(A);
+    auto solver =
+        bicg::build()
+            .with_criteria(ByInteraction::build().with_stop_iteration_process(
+                stop_iteration_process))
+            .on(exec)
+            ->generate(A);
     solver->add_logger(gko::log::Stream<ValueType>::create(
         gko::log::Logger::iteration_complete_mask, std::cout, true));
     solver->apply(b, x);
@@ -158,13 +157,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -176,7 +174,7 @@ int main(int argc, char* argv[])
     // executor where Ginkgo will perform the computation
     const auto exec = exec_map.at(executor_string)();  // throws if not valid
 
-    // Declare a user controled boolean for the iteration process
+    // Declare a user controlled boolean for the iteration process
     volatile bool stop_iteration_process{};
 
     // Create a new a thread to launch the solver
diff --git a/examples/distributed-solver/distributed-solver.cpp b/examples/distributed-solver/distributed-solver.cpp
index 865a44b0643..7eafba783ee 100644
--- a/examples/distributed-solver/distributed-solver.cpp
+++ b/examples/distributed-solver/distributed-solver.cpp
@@ -51,9 +51,9 @@ int main(int argc, char* argv[])
     // done with the following helper construct that uses RAII to automate the
     // initialization and finalization.
     const gko::experimental::mpi::environment env(argc, argv);
-    // @sect3{Type Definitiions}
+    // @sect3{Type Definitions}
     // Define the needed types. In a parallel program we need to differentiate
-    // beweeen global and local indices, thus we have two index types.
+    // between global and local indices, thus we have two index types.
     using GlobalIndexType = gko::int64;
     using LocalIndexType = gko::int32;
     // The underlying value type.
@@ -119,15 +119,14 @@ int main(int argc, char* argv[])
                  int device_id = gko::experimental::mpi::map_rank_to_device_id(
                      comm, gko::CudaExecutor::get_num_devices());
                  return gko::CudaExecutor::create(
-                     device_id, gko::ReferenceExecutor::create(), false,
-                     gko::allocation_mode::device);
+                     device_id, gko::ReferenceExecutor::create());
              }},
             {"hip",
              [](MPI_Comm comm) {
                  int device_id = gko::experimental::mpi::map_rank_to_device_id(
                      comm, gko::HipExecutor::get_num_devices());
                  return gko::HipExecutor::create(
-                     device_id, gko::ReferenceExecutor::create(), true);
+                     device_id, gko::ReferenceExecutor::create());
              }},
             {"dpcpp", [](MPI_Comm comm) {
                  int device_id = 0;
@@ -222,19 +221,15 @@ int main(int argc, char* argv[])
     const gko::remove_complex<ValueType> reduction_factor{1e-8};
     std::shared_ptr<const gko::log::Convergence<ValueType>> logger =
         gko::log::Convergence<ValueType>::create();
-    auto Ainv =
-        solver::build()
-            .with_preconditioner(schwarz::build()
-                                     .with_local_solver_factory(local_solver)
-                                     .on(exec))
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(num_iters).on(
-                    exec),
-                gko::stop::ResidualNorm<ValueType>::build()
-                    .with_reduction_factor(reduction_factor)
-                    .on(exec))
-            .on(exec)
-            ->generate(A);
+    auto Ainv = solver::build()
+                    .with_preconditioner(
+                        schwarz::build().with_local_solver(local_solver))
+                    .with_criteria(
+                        gko::stop::Iteration::build().with_max_iters(num_iters),
+                        gko::stop::ResidualNorm<ValueType>::build()
+                            .with_reduction_factor(reduction_factor))
+                    .on(exec)
+                    ->generate(A);
     // Add logger to the generated solver to log the iteration count and
     // residual norm
     Ainv->add_logger(logger);
diff --git a/examples/external-lib-interfacing/CMakeLists.txt b/examples/external-lib-interfacing/CMakeLists.txt
index 4501ace4088..56d7b92ea0f 100644
--- a/examples/external-lib-interfacing/CMakeLists.txt
+++ b/examples/external-lib-interfacing/CMakeLists.txt
@@ -1,7 +1,7 @@
 if(GINKGO_BUILD_EXTLIB_EXAMPLE)
     # This is just an example of the CMakeLists.txt file that can be used after the
     # correct version of deal.ii has been installed.
-    cmake_minimum_required(VERSION 3.9)
+    cmake_minimum_required(VERSION 3.16)
     project(DEAL_II_EXAMPLE LANGUAGES CXX)
 
     find_package(MPI 3.1 COMPONENTS CXX REQUIRED)
diff --git a/examples/external-lib-interfacing/external-lib-interfacing.cpp b/examples/external-lib-interfacing/external-lib-interfacing.cpp
index 08b35923b30..04824cb9578 100644
--- a/examples/external-lib-interfacing/external-lib-interfacing.cpp
+++ b/examples/external-lib-interfacing/external-lib-interfacing.cpp
@@ -880,11 +880,9 @@ void AdvectionProblem<dim>::solve()
     auto solver_gen =
         bicgstab::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(1000).on(exec),
-                gko::stop::ResidualNorm<>::build()
-                    .with_reduction_factor(1e-12)
-                    .on(exec))
-            .with_preconditioner(bj::build().on(exec))
+                gko::stop::Iteration::build().with_max_iters(1000),
+                gko::stop::ResidualNorm<>::build().with_reduction_factor(1e-12))
+            .with_preconditioner(bj::build())
             .on(exec);
     auto solver = solver_gen->generate(gko::give(A));
 
@@ -1324,7 +1322,7 @@ void GradientEstimation::estimate_cell(
 // <code>set_thread_limit</code>, the default value from the Intel Threading
 // Building Blocks (TBB) library is used. If the call to
 // <code>set_thread_limit</code> is omitted, the number of threads will be
-// chosen by TBB indepently of DEAL_II_NUM_THREADS.
+// chosen by TBB independently of DEAL_II_NUM_THREADS.
 int main()
 {
     try {
diff --git a/examples/ginkgo-overhead/CMakeLists.txt b/examples/ginkgo-overhead/CMakeLists.txt
index 5afbc22c731..350b58312fc 100644
--- a/examples/ginkgo-overhead/CMakeLists.txt
+++ b/examples/ginkgo-overhead/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(ginkgo-overhead)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(ginkgo-overhead ginkgo-overhead.cpp)
diff --git a/examples/ginkgo-overhead/ginkgo-overhead.cpp b/examples/ginkgo-overhead/ginkgo-overhead.cpp
index 5bd90ba0bad..f3f308c495f 100644
--- a/examples/ginkgo-overhead/ginkgo-overhead.cpp
+++ b/examples/ginkgo-overhead/ginkgo-overhead.cpp
@@ -72,8 +72,7 @@ int main(int argc, char* argv[])
     auto cg_factory =
         cg::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(num_iters).on(
-                    exec))
+                gko::stop::Iteration::build().with_max_iters(num_iters))
             .on(exec);
     auto A = gko::initialize<mtx>({1.0}, exec);
     auto b = gko::initialize<vec>({std::nan("")}, exec);
diff --git a/examples/ginkgo-ranges/CMakeLists.txt b/examples/ginkgo-ranges/CMakeLists.txt
index de86438d62b..734a4567376 100644
--- a/examples/ginkgo-ranges/CMakeLists.txt
+++ b/examples/ginkgo-ranges/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(ginkgo-ranges)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 add_executable(ginkgo-ranges ginkgo-ranges.cpp)
 target_link_libraries(ginkgo-ranges Ginkgo::ginkgo)
diff --git a/examples/heat-equation/CMakeLists.txt b/examples/heat-equation/CMakeLists.txt
index 3b0cfc57cb0..89dfb9e513b 100644
--- a/examples/heat-equation/CMakeLists.txt
+++ b/examples/heat-equation/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(heat-equation)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 find_package(OpenCV REQUIRED)
 
diff --git a/examples/heat-equation/heat-equation.cpp b/examples/heat-equation/heat-equation.cpp
index eae87f7e64f..8e69931b250 100644
--- a/examples/heat-equation/heat-equation.cpp
+++ b/examples/heat-equation/heat-equation.cpp
@@ -192,11 +192,10 @@ int main(int argc, char* argv[])
     // stopping at 1e-10 relative accuracy
     auto solver =
         gko::solver::Cg<>::build()
-            .with_preconditioner(gko::preconditioner::Ic<>::build().on(exec))
+            .with_preconditioner(gko::preconditioner::Ic<>::build())
             .with_criteria(gko::stop::ResidualNorm<>::build()
                                .with_baseline(gko::stop::mode::rhs_norm)
-                               .with_reduction_factor(1e-10)
-                               .on(exec))
+                               .with_reduction_factor(1e-10))
             .on(exec)
             ->generate(stencil_matrix);
     // time stamp of the last output frame (initialized to a sentinel value)
diff --git a/examples/ilu-preconditioned-solver/CMakeLists.txt b/examples/ilu-preconditioned-solver/CMakeLists.txt
index 85daf54923a..0d1d215860e 100644
--- a/examples/ilu-preconditioned-solver/CMakeLists.txt
+++ b/examples/ilu-preconditioned-solver/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(ilu-preconditioned-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(ilu-preconditioned-solver ilu-preconditioned-solver.cpp)
diff --git a/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp b/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp
index aa32e0e879a..acebd9d96ff 100644
--- a/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp
+++ b/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp
@@ -68,13 +68,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -115,11 +114,9 @@ int main(int argc, char* argv[])
     const RealValueType reduction_factor{1e-7};
     auto ilu_gmres_factory =
         gmres::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(1000u).on(exec),
-                gko::stop::ResidualNorm<ValueType>::build()
-                    .with_reduction_factor(reduction_factor)
-                    .on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1000u),
+                           gko::stop::ResidualNorm<ValueType>::build()
+                               .with_reduction_factor(reduction_factor))
             .with_generated_preconditioner(ilu_preconditioner)
             .on(exec);
 
diff --git a/examples/inverse-iteration/CMakeLists.txt b/examples/inverse-iteration/CMakeLists.txt
index fa1d17e55c4..c73da656587 100644
--- a/examples/inverse-iteration/CMakeLists.txt
+++ b/examples/inverse-iteration/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(inverse-iteration)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(inverse-iteration inverse-iteration.cpp)
diff --git a/examples/inverse-iteration/inverse-iteration.cpp b/examples/inverse-iteration/inverse-iteration.cpp
index 5d8270f1ca1..2b584e0ca4f 100644
--- a/examples/inverse-iteration/inverse-iteration.cpp
+++ b/examples/inverse-iteration/inverse-iteration.cpp
@@ -72,13 +72,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -119,12 +118,10 @@ int main(int argc, char* argv[])
     // Generate solver operator  (A - zI)^-1
     auto solver =
         solver_type::build()
-            .with_criteria(gko::stop::Iteration::build()
-                               .with_max_iters(system_max_iterations)
-                               .on(exec),
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(
+                               system_max_iterations),
                            gko::stop::ResidualNorm<precision>::build()
-                               .with_reduction_factor(system_residual_goal)
-                               .on(exec))
+                               .with_reduction_factor(system_residual_goal))
             .on(exec)
             ->generate(system_matrix);
 
diff --git a/examples/ir-ilu-preconditioned-solver/CMakeLists.txt b/examples/ir-ilu-preconditioned-solver/CMakeLists.txt
index c1424429636..3a05cb56a81 100644
--- a/examples/ir-ilu-preconditioned-solver/CMakeLists.txt
+++ b/examples/ir-ilu-preconditioned-solver/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(ir-ilu-preconditioned-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(ir-ilu-preconditioned-solver ir-ilu-preconditioned-solver.cpp)
diff --git a/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp b/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp
index e676e15cc6d..be7e8261f2c 100644
--- a/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp
+++ b/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp
@@ -71,13 +71,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -120,18 +119,16 @@ int main(int argc, char* argv[])
     auto trisolve_factory =
         ir::build()
             .with_solver(bj_factory)
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(sweeps).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(sweeps))
             .on(exec);
 
     // Generate an ILU preconditioner factory by setting lower and upper
     // triangular solver - in this case the previously defined iterative
     // refinement method.
-    auto ilu_pre_factory =
-        gko::preconditioner::Ilu<ir, ir>::build()
-            .with_l_solver_factory(gko::clone(trisolve_factory))
-            .with_u_solver_factory(gko::clone(trisolve_factory))
-            .on(exec);
+    auto ilu_pre_factory = gko::preconditioner::Ilu<ir, ir>::build()
+                               .with_l_solver(gko::clone(trisolve_factory))
+                               .with_u_solver(gko::clone(trisolve_factory))
+                               .on(exec);
 
     // Use incomplete factors to generate ILU preconditioner
     auto ilu_preconditioner = gko::share(ilu_pre_factory->generate(par_ilu));
diff --git a/examples/iterative-refinement/CMakeLists.txt b/examples/iterative-refinement/CMakeLists.txt
index 39a2651a90d..f8c06ddcafa 100644
--- a/examples/iterative-refinement/CMakeLists.txt
+++ b/examples/iterative-refinement/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(iterative-refinement)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(iterative-refinement iterative-refinement.cpp)
diff --git a/examples/iterative-refinement/iterative-refinement.cpp b/examples/iterative-refinement/iterative-refinement.cpp
index cbd2156be60..711d43049a1 100644
--- a/examples/iterative-refinement/iterative-refinement.cpp
+++ b/examples/iterative-refinement/iterative-refinement.cpp
@@ -68,13 +68,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -114,19 +113,13 @@ int main(int argc, char* argv[])
     RealValueType inner_reduction_factor{1e-2};
     auto solver_gen =
         ir::build()
-            .with_solver(
-                cg::build()
-                    .with_criteria(
-                        gko::stop::ResidualNorm<ValueType>::build()
-                            .with_reduction_factor(inner_reduction_factor)
-                            .on(exec))
-                    .on(exec))
+            .with_solver(cg::build().with_criteria(
+                gko::stop::ResidualNorm<ValueType>::build()
+                    .with_reduction_factor(inner_reduction_factor)))
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(max_iters).on(
-                    exec),
+                gko::stop::Iteration::build().with_max_iters(max_iters),
                 gko::stop::ResidualNorm<ValueType>::build()
-                    .with_reduction_factor(outer_reduction_factor)
-                    .on(exec))
+                    .with_reduction_factor(outer_reduction_factor))
             .on(exec);
     // Create solver
     auto solver = solver_gen->generate(A);
diff --git a/examples/kokkos_assembly/CMakeLists.txt b/examples/kokkos_assembly/CMakeLists.txt
index e6f214e68e2..9e229c29f58 100644
--- a/examples/kokkos_assembly/CMakeLists.txt
+++ b/examples/kokkos_assembly/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.13)
+cmake_minimum_required(VERSION 3.16)
 project(kokkos-assembly CXX)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if(NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 find_package(Kokkos REQUIRED)
diff --git a/examples/kokkos_assembly/kokkos_assembly.cpp b/examples/kokkos_assembly/kokkos_assembly.cpp
index ba579199ee3..88ff261b759 100644
--- a/examples/kokkos_assembly/kokkos_assembly.cpp
+++ b/examples/kokkos_assembly/kokkos_assembly.cpp
@@ -208,13 +208,11 @@ int main(int argc, char* argv[])
     const RealValueType reduction_factor{1e-7};
     // Generate solver and solve the system
     cg::build()
-        .with_criteria(gko::stop::Iteration::build()
-                           .with_max_iters(discretization_points)
-                           .on(exec),
-                       gko::stop::ResidualNorm<ValueType>::build()
-                           .with_reduction_factor(reduction_factor)
-                           .on(exec))
-        .with_preconditioner(bj::build().on(exec))
+        .with_criteria(
+            gko::stop::Iteration::build().with_max_iters(discretization_points),
+            gko::stop::ResidualNorm<ValueType>::build().with_reduction_factor(
+                reduction_factor))
+        .with_preconditioner(bj::build())
         .on(exec)
         ->generate(A)
         ->apply(rhs, u);
diff --git a/examples/minimal-cuda-solver/CMakeLists.txt b/examples/minimal-cuda-solver/CMakeLists.txt
index 52aa56b60fc..2d81e558eec 100644
--- a/examples/minimal-cuda-solver/CMakeLists.txt
+++ b/examples/minimal-cuda-solver/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(minimal-cuda-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(minimal-cuda-solver minimal-cuda-solver.cpp)
diff --git a/examples/minimal-cuda-solver/minimal-cuda-solver.cpp b/examples/minimal-cuda-solver/minimal-cuda-solver.cpp
index 7182bc9ad8c..ccbdaadfc41 100644
--- a/examples/minimal-cuda-solver/minimal-cuda-solver.cpp
+++ b/examples/minimal-cuda-solver/minimal-cuda-solver.cpp
@@ -36,7 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 int main()
 {
     // Instantiate a CUDA executor
-    auto gpu = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true);
+    auto gpu = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
     // Read data
     auto A = gko::read<gko::matrix::Csr<>>(std::cin, gpu);
     auto b = gko::read<gko::matrix::Dense<>>(std::cin, gpu);
@@ -44,12 +44,10 @@ int main()
     // Create the solver
     auto solver =
         gko::solver::Cg<>::build()
-            .with_preconditioner(gko::preconditioner::Jacobi<>::build().on(gpu))
+            .with_preconditioner(gko::preconditioner::Jacobi<>::build())
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(20u).on(gpu),
-                gko::stop::ResidualNorm<>::build()
-                    .with_reduction_factor(1e-15)
-                    .on(gpu))
+                gko::stop::Iteration::build().with_max_iters(20u),
+                gko::stop::ResidualNorm<>::build().with_reduction_factor(1e-15))
             .on(gpu);
     // Solve system
     solver->generate(give(A))->apply(b, x);
diff --git a/examples/mixed-multigrid-preconditioned-solver/CMakeLists.txt b/examples/mixed-multigrid-preconditioned-solver/CMakeLists.txt
index 54384f544b7..a66a8410bfb 100644
--- a/examples/mixed-multigrid-preconditioned-solver/CMakeLists.txt
+++ b/examples/mixed-multigrid-preconditioned-solver/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(mixed-multigrid-preconditioned-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(mixed-multigrid-preconditioned-solver mixed-multigrid-preconditioned-solver.cpp)
diff --git a/examples/mixed-multigrid-preconditioned-solver/doc/results.dox b/examples/mixed-multigrid-preconditioned-solver/doc/results.dox
index af922a27ebc..dccd3ccad93 100644
--- a/examples/mixed-multigrid-preconditioned-solver/doc/results.dox
+++ b/examples/mixed-multigrid-preconditioned-solver/doc/results.dox
@@ -14,7 +14,7 @@ Final residual norm sqrt(r^T r):
 CG iteration count:     39
 CG generation time [ms]: 2.04293
 CG execution time [ms]: 22.3874
-CG execution time per iteraion[ms]: 0.574036
+CG execution time per iteration[ms]: 0.574036
 
 @endcode
 
diff --git a/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp b/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp
index 6f1600d2805..3834fa7f33f 100644
--- a/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp
+++ b/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp
@@ -71,13 +71,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -140,15 +139,13 @@ int main(int argc, char* argv[])
         ir::build()
             .with_solver(inner_solver_gen)
             .with_relaxation_factor(static_cast<ValueType>(0.9))
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(1u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
             .on(exec));
     auto smoother_gen_f = gko::share(
         ir_f::build()
             .with_solver(inner_solver_gen_f)
             .with_relaxation_factor(static_cast<MixedType>(0.9))
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(1u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
             .on(exec));
     // Create MultigridLevel factory
     auto mg_level_gen =
@@ -160,15 +157,13 @@ int main(int argc, char* argv[])
         ir::build()
             .with_solver(inner_solver_gen)
             .with_relaxation_factor(static_cast<ValueType>(0.9))
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(4u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(4u))
             .on(exec));
     auto coarsest_gen_f = gko::share(
         ir_f::build()
             .with_solver(inner_solver_gen_f)
             .with_relaxation_factor(static_cast<MixedType>(0.9))
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(4u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(4u))
             .on(exec));
     // Create multigrid factory
     std::shared_ptr<gko::LinOpFactory> multigrid_gen;
@@ -193,8 +188,7 @@ int main(int argc, char* argv[])
                 .with_coarsest_solver(coarsest_gen_f)
                 .with_default_initial_guess(
                     gko::solver::initial_guess_mode::zero)
-                .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(1u).on(exec))
+                .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
                 .on(exec);
     } else {
         multigrid_gen =
@@ -207,8 +201,7 @@ int main(int argc, char* argv[])
                 .with_coarsest_solver(coarsest_gen)
                 .with_default_initial_guess(
                     gko::solver::initial_guess_mode::zero)
-                .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(1u).on(exec))
+                .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
                 .on(exec);
     }
     // Create solver factory
@@ -251,7 +244,7 @@ int main(int argc, char* argv[])
               << static_cast<double>(gen_time.count()) / 1000000.0 << std::endl;
     std::cout << "CG execution time [ms]: "
               << static_cast<double>(time.count()) / 1000000.0 << std::endl;
-    std::cout << "CG execution time per iteraion[ms]: "
+    std::cout << "CG execution time per iteration[ms]: "
               << static_cast<double>(time.count()) / 1000000.0 /
                      logger->get_num_iterations()
               << std::endl;
diff --git a/examples/mixed-multigrid-solver/CMakeLists.txt b/examples/mixed-multigrid-solver/CMakeLists.txt
index e4ee334e38f..af73c94c334 100644
--- a/examples/mixed-multigrid-solver/CMakeLists.txt
+++ b/examples/mixed-multigrid-solver/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(mixed-multigrid-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(mixed-multigrid-solver mixed-multigrid-solver.cpp)
diff --git a/examples/mixed-multigrid-solver/doc/results.dox b/examples/mixed-multigrid-solver/doc/results.dox
index 7cbaa772d18..045fe343743 100644
--- a/examples/mixed-multigrid-solver/doc/results.dox
+++ b/examples/mixed-multigrid-solver/doc/results.dox
@@ -14,7 +14,7 @@ Final residual norm sqrt(r^T r):
 Multigrid iteration count:     9
 Multigrid generation time [ms]: 3.35361
 Multigrid execution time [ms]: 10.048
-Multigrid execution time per iteraion[ms]: 1.11644
+Multigrid execution time per iteration[ms]: 1.11644
 
 @endcode
 
diff --git a/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp b/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp
index d3f45cda916..33684198c83 100644
--- a/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp
+++ b/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp
@@ -69,13 +69,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -126,17 +125,15 @@ int main(int argc, char* argv[])
     // Create smoother factory (ir with bj)
     auto smoother_gen = gko::share(
         ir::build()
-            .with_solver(bj::build().with_max_block_size(1u).on(exec))
+            .with_solver(bj::build().with_max_block_size(1u))
             .with_relaxation_factor(static_cast<ValueType>(0.9))
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(1u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
             .on(exec));
     auto smoother_gen2 = gko::share(
         ir2::build()
-            .with_solver(bj2::build().with_max_block_size(1u).on(exec))
+            .with_solver(bj2::build().with_max_block_size(1u))
             .with_relaxation_factor(static_cast<MixedType>(0.9))
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(1u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
             .on(exec));
     // Create RestrictProlong factory
     auto mg_level_gen =
@@ -146,17 +143,15 @@ int main(int argc, char* argv[])
     // Create CoarsesSolver factory
     auto coarsest_solver_gen = gko::share(
         ir::build()
-            .with_solver(bj::build().with_max_block_size(1u).on(exec))
+            .with_solver(bj::build().with_max_block_size(1u))
             .with_relaxation_factor(static_cast<ValueType>(0.9))
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(4u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(4u))
             .on(exec));
     auto coarsest_solver_gen2 = gko::share(
         ir2::build()
-            .with_solver(bj2::build().with_max_block_size(1u).on(exec))
+            .with_solver(bj2::build().with_max_block_size(1u))
             .with_relaxation_factor(static_cast<MixedType>(0.9))
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(4u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(4u))
             .on(exec));
     // Create multigrid factory
     std::shared_ptr<gko::LinOpFactory> multigrid_gen;
@@ -233,7 +228,7 @@ int main(int argc, char* argv[])
               << static_cast<double>(gen_time.count()) / 1000000.0 << std::endl;
     std::cout << "Multigrid execution time [ms]: "
               << static_cast<double>(time.count()) / 1000000.0 << std::endl;
-    std::cout << "Multigrid execution time per iteraion[ms]: "
+    std::cout << "Multigrid execution time per iteration[ms]: "
               << static_cast<double>(time.count()) / 1000000.0 /
                      logger->get_num_iterations()
               << std::endl;
diff --git a/examples/mixed-precision-ir/CMakeLists.txt b/examples/mixed-precision-ir/CMakeLists.txt
index a0a46c0fd6e..156ede4fe13 100644
--- a/examples/mixed-precision-ir/CMakeLists.txt
+++ b/examples/mixed-precision-ir/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(mixed-precision-ir)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(mixed-precision-ir mixed-precision-ir.cpp)
diff --git a/examples/mixed-precision-ir/mixed-precision-ir.cpp b/examples/mixed-precision-ir/mixed-precision-ir.cpp
index 3510a2163e1..0083ca15162 100644
--- a/examples/mixed-precision-ir/mixed-precision-ir.cpp
+++ b/examples/mixed-precision-ir/mixed-precision-ir.cpp
@@ -76,13 +76,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -125,12 +124,10 @@ int main(int argc, char* argv[])
     // Create inner solver
     auto inner_solver =
         cg::build()
-            .with_criteria(gko::stop::ResidualNorm<SolverType>::build()
-                               .with_reduction_factor(inner_reduction_factor)
-                               .on(exec),
-                           gko::stop::Iteration::build()
-                               .with_max_iters(max_inner_iters)
-                               .on(exec))
+            .with_criteria(
+                gko::stop::ResidualNorm<SolverType>::build()
+                    .with_reduction_factor(inner_reduction_factor),
+                gko::stop::Iteration::build().with_max_iters(max_inner_iters))
             .on(exec)
             ->generate(give(solver_A));
 
diff --git a/examples/mixed-spmv/CMakeLists.txt b/examples/mixed-spmv/CMakeLists.txt
index ad8e31aad3e..2e2ed9bb074 100644
--- a/examples/mixed-spmv/CMakeLists.txt
+++ b/examples/mixed-spmv/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(mixed-spmv)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(mixed-spmv mixed-spmv.cpp)
diff --git a/examples/mixed-spmv/mixed-spmv.cpp b/examples/mixed-spmv/mixed-spmv.cpp
index 78461de39ef..6b327c1c708 100644
--- a/examples/mixed-spmv/mixed-spmv.cpp
+++ b/examples/mixed-spmv/mixed-spmv.cpp
@@ -170,13 +170,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/multigrid-preconditioned-solver-customized/CMakeLists.txt b/examples/multigrid-preconditioned-solver-customized/CMakeLists.txt
index 4d2b0822d08..99ba03167f5 100644
--- a/examples/multigrid-preconditioned-solver-customized/CMakeLists.txt
+++ b/examples/multigrid-preconditioned-solver-customized/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(multigrid-preconditioned-solver-customized)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(multigrid-preconditioned-solver-customized multigrid-preconditioned-solver-customized.cpp)
diff --git a/examples/multigrid-preconditioned-solver-customized/doc/results.dox b/examples/multigrid-preconditioned-solver-customized/doc/results.dox
index c7ba90d2fbb..2135f715934 100644
--- a/examples/multigrid-preconditioned-solver-customized/doc/results.dox
+++ b/examples/multigrid-preconditioned-solver-customized/doc/results.dox
@@ -14,7 +14,7 @@ Final residual norm sqrt(r^T r):
 CG iteration count:     12
 CG generation time [ms]: 1.41642
 CG execution time [ms]: 6.59244
-CG execution time per iteraion[ms]: 0.54937
+CG execution time per iteration[ms]: 0.54937
 
 @endcode
 
diff --git a/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp b/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp
index 6f75ca29630..d63dedf486b 100644
--- a/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp
+++ b/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp
@@ -64,13 +64,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -131,8 +130,7 @@ int main(int argc, char* argv[])
     // iterative refinement with two iterations and an Ic solver.
     auto ic_gen = gko::share(
         ic::build()
-            .with_factorization_factory(
-                gko::factorization::Ic<ValueType, int>::build().on(exec))
+            .with_factorization(gko::factorization::Ic<ValueType, int>::build())
             .on(exec));
     auto smoother_gen = gko::share(
         gko::solver::build_smoother(ic_gen, 2u, static_cast<ValueType>(0.9)));
@@ -160,8 +158,7 @@ int main(int argc, char* argv[])
             .with_mg_level(mg_level_gen)
             .with_coarsest_solver(coarsest_gen)
             .with_default_initial_guess(gko::solver::initial_guess_mode::zero)
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(1u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
             .on(exec);
     // Create solver factory
     auto solver_gen = cg::build()
@@ -203,7 +200,7 @@ int main(int argc, char* argv[])
               << static_cast<double>(gen_time.count()) / 1000000.0 << std::endl;
     std::cout << "CG execution time [ms]: "
               << static_cast<double>(time.count()) / 1000000.0 << std::endl;
-    std::cout << "CG execution time per iteraion[ms]: "
+    std::cout << "CG execution time per iteration[ms]: "
               << static_cast<double>(time.count()) / 1000000.0 /
                      logger->get_num_iterations()
               << std::endl;
diff --git a/examples/multigrid-preconditioned-solver/CMakeLists.txt b/examples/multigrid-preconditioned-solver/CMakeLists.txt
index af7c296b631..75c56b80062 100644
--- a/examples/multigrid-preconditioned-solver/CMakeLists.txt
+++ b/examples/multigrid-preconditioned-solver/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(multigrid-preconditioned-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(multigrid-preconditioned-solver multigrid-preconditioned-solver.cpp)
diff --git a/examples/multigrid-preconditioned-solver/doc/results.dox b/examples/multigrid-preconditioned-solver/doc/results.dox
index af922a27ebc..dccd3ccad93 100644
--- a/examples/multigrid-preconditioned-solver/doc/results.dox
+++ b/examples/multigrid-preconditioned-solver/doc/results.dox
@@ -14,7 +14,7 @@ Final residual norm sqrt(r^T r):
 CG iteration count:     39
 CG generation time [ms]: 2.04293
 CG execution time [ms]: 22.3874
-CG execution time per iteraion[ms]: 0.574036
+CG execution time per iteration[ms]: 0.574036
 
 @endcode
 
diff --git a/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp b/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp
index 7f47d039072..0bb51e6fee9 100644
--- a/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp
+++ b/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp
@@ -62,13 +62,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -109,19 +108,16 @@ int main(int argc, char* argv[])
     std::shared_ptr<gko::LinOpFactory> multigrid_gen;
     multigrid_gen =
         mg::build()
-            .with_mg_level(pgm::build().with_deterministic(true).on(exec))
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(1u).on(exec))
+            .with_mg_level(pgm::build().with_deterministic(true))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
             .on(exec);
     const gko::remove_complex<ValueType> tolerance = 1e-8;
     auto solver_gen =
         cg::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(100u).on(exec),
-                gko::stop::ResidualNorm<ValueType>::build()
-                    .with_baseline(gko::stop::mode::absolute)
-                    .with_reduction_factor(tolerance)
-                    .on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(100u),
+                           gko::stop::ResidualNorm<ValueType>::build()
+                               .with_baseline(gko::stop::mode::absolute)
+                               .with_reduction_factor(tolerance))
             .with_preconditioner(multigrid_gen)
             .on(exec);
     // Create solver
@@ -162,7 +158,7 @@ int main(int argc, char* argv[])
               << static_cast<double>(gen_time.count()) / 1000000.0 << std::endl;
     std::cout << "CG execution time [ms]: "
               << static_cast<double>(time.count()) / 1000000.0 << std::endl;
-    std::cout << "CG execution time per iteraion[ms]: "
+    std::cout << "CG execution time per iteration[ms]: "
               << static_cast<double>(time.count()) / 1000000.0 /
                      logger->get_num_iterations()
               << std::endl;
diff --git a/examples/nine-pt-stencil-solver/CMakeLists.txt b/examples/nine-pt-stencil-solver/CMakeLists.txt
index d2384129d47..511bb334d7c 100644
--- a/examples/nine-pt-stencil-solver/CMakeLists.txt
+++ b/examples/nine-pt-stencil-solver/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(nine-pt-stencil-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(nine-pt-stencil-solver nine-pt-stencil-solver.cpp)
diff --git a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp
index 51fdf97d4a4..be3cc958baf 100644
--- a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp
+++ b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp
@@ -230,13 +230,12 @@ void solve_system(const std::string& executor_string,
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -283,12 +282,10 @@ void solve_system(const std::string& executor_string,
     // Generate solver
     auto solver_gen =
         cg::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(dp_2).on(exec),
-                gko::stop::ResidualNorm<ValueType>::build()
-                    .with_reduction_factor(reduction_factor)
-                    .on(exec))
-            .with_preconditioner(bj::build().on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(dp_2),
+                           gko::stop::ResidualNorm<ValueType>::build()
+                               .with_reduction_factor(reduction_factor))
+            .with_preconditioner(bj::build())
             .on(exec);
     auto solver = solver_gen->generate(gko::give(matrix));
 
diff --git a/examples/papi-logging/CMakeLists.txt b/examples/papi-logging/CMakeLists.txt
index ac2560f499d..3695e12b814 100644
--- a/examples/papi-logging/CMakeLists.txt
+++ b/examples/papi-logging/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(papi-logging)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 if (NOT GINKGO_HAVE_PAPI_SDE)
diff --git a/examples/papi-logging/papi-logging.cpp b/examples/papi-logging/papi-logging.cpp
index 0d81ef65909..0b26e56dd80 100644
--- a/examples/papi-logging/papi-logging.cpp
+++ b/examples/papi-logging/papi-logging.cpp
@@ -151,13 +151,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -178,11 +177,9 @@ int main(int argc, char* argv[])
     const RealValueType reduction_factor{1e-7};
     auto solver_gen =
         cg::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(20u).on(exec),
-                gko::stop::ResidualNorm<ValueType>::build()
-                    .with_reduction_factor(reduction_factor)
-                    .on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(20u),
+                           gko::stop::ResidualNorm<ValueType>::build()
+                               .with_reduction_factor(reduction_factor))
             .on(exec);
     auto solver = solver_gen->generate(A);
 
diff --git a/examples/par-ilu-convergence/CMakeLists.txt b/examples/par-ilu-convergence/CMakeLists.txt
index bee08841173..8679ccdf526 100644
--- a/examples/par-ilu-convergence/CMakeLists.txt
+++ b/examples/par-ilu-convergence/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(par-ilu-convergence)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(par-ilu-convergence par-ilu-convergence.cpp)
diff --git a/examples/par-ilu-convergence/par-ilu-convergence.cpp b/examples/par-ilu-convergence/par-ilu-convergence.cpp
index 93e32422a7e..cc0440baa05 100644
--- a/examples/par-ilu-convergence/par-ilu-convergence.cpp
+++ b/examples/par-ilu-convergence/par-ilu-convergence.cpp
@@ -133,52 +133,36 @@ int main(int argc, char* argv[])
                                                                  exec);
     }));
 
-    std::shared_ptr<gko::LinOpFactory> factory;
-    std::function<void(int)> set_iterations;
-    if (precond == "parilu") {
-        factory =
-            gko::factorization::ParIlu<ValueType, IndexType>::build().on(exec);
-        set_iterations = [&](int it) {
-            gko::as<gko::factorization::ParIlu<ValueType, IndexType>::Factory>(
-                factory)
-                ->get_parameters()
-                .iterations = it;
-        };
-    } else if (precond == "paric") {
-        factory =
-            gko::factorization::ParIc<ValueType, IndexType>::build().on(exec);
-        set_iterations = [&](int it) {
-            gko::as<gko::factorization::ParIc<ValueType, IndexType>::Factory>(
-                factory)
-                ->get_parameters()
-                .iterations = it;
-        };
-    } else if (precond == "parilut") {
-        factory = gko::factorization::ParIlut<ValueType, IndexType>::build()
-                      .with_fill_in_limit(limit)
-                      .on(exec);
-        set_iterations = [&](int it) {
-            gko::as<gko::factorization::ParIlut<ValueType, IndexType>::Factory>(
-                factory)
-                ->get_parameters()
-                .iterations = it;
-        };
-    } else if (precond == "parict") {
-        factory = gko::factorization::ParIct<ValueType, IndexType>::build()
-                      .with_fill_in_limit(limit)
-                      .on(exec);
-        set_iterations = [&](int it) {
-            gko::as<gko::factorization::ParIct<ValueType, IndexType>::Factory>(
-                factory)
-                ->get_parameters()
-                .iterations = it;
-        };
-    }
+    auto factory_generator =
+        [&](gko::size_type iteration) -> std::shared_ptr<gko::LinOpFactory> {
+        if (precond == "parilu") {
+            return gko::factorization::ParIlu<ValueType, IndexType>::build()
+                .with_iterations(iteration)
+                .on(exec);
+        } else if (precond == "paric") {
+            return gko::factorization::ParIc<ValueType, IndexType>::build()
+                .with_iterations(iteration)
+                .on(exec);
+        } else if (precond == "parilut") {
+            return gko::factorization::ParIlut<ValueType, IndexType>::build()
+                .with_fill_in_limit(limit)
+                .with_iterations(iteration)
+                .on(exec);
+        } else if (precond == "parict") {
+            return gko::factorization::ParIct<ValueType, IndexType>::build()
+                .with_fill_in_limit(limit)
+                .with_iterations(iteration)
+                .on(exec);
+        } else {
+            GKO_NOT_IMPLEMENTED;
+        }
+    };
+
     auto one = gko::initialize<gko::matrix::Dense<ValueType>>({1.0}, exec);
     auto minus_one =
         gko::initialize<gko::matrix::Dense<ValueType>>({-1.0}, exec);
     for (int it = 1; it <= max_iterations; ++it) {
-        set_iterations(it);
+        auto factory = factory_generator(it);
         std::cout << it << ';';
         std::vector<long> times;
         std::vector<double> residuals;
diff --git a/examples/performance-debugging/CMakeLists.txt b/examples/performance-debugging/CMakeLists.txt
index 4f095e4d1c6..7f6317a491f 100644
--- a/examples/performance-debugging/CMakeLists.txt
+++ b/examples/performance-debugging/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(performance-debugging)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(performance-debugging performance-debugging.cpp)
diff --git a/examples/performance-debugging/performance-debugging.cpp b/examples/performance-debugging/performance-debugging.cpp
index f357a8d4619..c8f741114d2 100644
--- a/examples/performance-debugging/performance-debugging.cpp
+++ b/examples/performance-debugging/performance-debugging.cpp
@@ -371,13 +371,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -417,10 +416,8 @@ int main(int argc, char* argv[])
         solver::build()
             .with_criteria(
                 gko::stop::ResidualNorm<ValueType>::build()
-                    .with_reduction_factor(reduction_factor)
-                    .on(exec),
-                gko::stop::Iteration::build().with_max_iters(max_iters).on(
-                    exec))
+                    .with_reduction_factor(reduction_factor),
+                gko::stop::Iteration::build().with_max_iters(max_iters))
             .with_preconditioner(preconditioner::create(exec))
             .on(exec);
 
diff --git a/examples/poisson-solver/CMakeLists.txt b/examples/poisson-solver/CMakeLists.txt
index 64e0633ee75..83791b5cfda 100644
--- a/examples/poisson-solver/CMakeLists.txt
+++ b/examples/poisson-solver/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(poisson-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(poisson-solver poisson-solver.cpp)
diff --git a/examples/poisson-solver/poisson-solver.cpp b/examples/poisson-solver/poisson-solver.cpp
index 7602600a514..eba163fb281 100644
--- a/examples/poisson-solver/poisson-solver.cpp
+++ b/examples/poisson-solver/poisson-solver.cpp
@@ -144,13 +144,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -185,13 +184,11 @@ int main(int argc, char* argv[])
     const gko::remove_complex<ValueType> reduction_factor = 1e-7;
     // Generate solver and solve the system
     cg::build()
-        .with_criteria(gko::stop::Iteration::build()
-                           .with_max_iters(discretization_points)
-                           .on(exec),
-                       gko::stop::ResidualNorm<ValueType>::build()
-                           .with_reduction_factor(reduction_factor)
-                           .on(exec))
-        .with_preconditioner(bj::build().on(exec))
+        .with_criteria(
+            gko::stop::Iteration::build().with_max_iters(discretization_points),
+            gko::stop::ResidualNorm<ValueType>::build().with_reduction_factor(
+                reduction_factor))
+        .with_preconditioner(bj::build())
         .on(exec)
         ->generate(clone(exec, matrix))  // copy the matrix to the executor
         ->apply(rhs, u);
diff --git a/examples/preconditioned-solver/CMakeLists.txt b/examples/preconditioned-solver/CMakeLists.txt
index b046686243d..b8d9bb8fc9f 100644
--- a/examples/preconditioned-solver/CMakeLists.txt
+++ b/examples/preconditioned-solver/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(preconditioned-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 add_executable(preconditioned-solver preconditioned-solver.cpp)
 target_link_libraries(preconditioned-solver Ginkgo::ginkgo)
diff --git a/examples/preconditioned-solver/preconditioned-solver.cpp b/examples/preconditioned-solver/preconditioned-solver.cpp
index 37963f205cc..cb3d34be8bc 100644
--- a/examples/preconditioned-solver/preconditioned-solver.cpp
+++ b/examples/preconditioned-solver/preconditioned-solver.cpp
@@ -69,13 +69,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -96,14 +95,12 @@ int main(int argc, char* argv[])
     // Create solver factory
     auto solver_gen =
         cg::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(20u).on(exec),
-                gko::stop::ResidualNorm<ValueType>::build()
-                    .with_reduction_factor(reduction_factor)
-                    .on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(20u),
+                           gko::stop::ResidualNorm<ValueType>::build()
+                               .with_reduction_factor(reduction_factor))
             // Add preconditioner, these 2 lines are the only
             // difference from the simple solver example
-            .with_preconditioner(bj::build().with_max_block_size(8u).on(exec))
+            .with_preconditioner(bj::build().with_max_block_size(8u))
             .on(exec);
     // Create solver
     auto solver = solver_gen->generate(A);
diff --git a/examples/preconditioner-export/CMakeLists.txt b/examples/preconditioner-export/CMakeLists.txt
index 1d2156b9d5a..83a20952d51 100644
--- a/examples/preconditioner-export/CMakeLists.txt
+++ b/examples/preconditioner-export/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(preconditioner-export)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(preconditioner-export preconditioner-export.cpp)
diff --git a/examples/preconditioner-export/preconditioner-export.cpp b/examples/preconditioner-export/preconditioner-export.cpp
index 81aeece1cb1..f504a4ac991 100644
--- a/examples/preconditioner-export/preconditioner-export.cpp
+++ b/examples/preconditioner-export/preconditioner-export.cpp
@@ -131,19 +131,21 @@ int main(int argc, char* argv[])
     // handle different preconditioners
     if (precond == "jacobi") {
         // jacobi: max_block_size, accuracy, storage_optimization
-        auto factory = gko::preconditioner::Jacobi<>::build().on(exec);
+        auto factory_parameter = gko::preconditioner::Jacobi<>::build();
         if (argc >= 5) {
-            factory->get_parameters().max_block_size = std::stoi(argv[4]);
+            factory_parameter.with_max_block_size(
+                static_cast<gko::uint32>(std::stoi(argv[4])));
         }
         if (argc >= 6) {
-            factory->get_parameters().accuracy = std::stod(argv[5]);
+            factory_parameter.with_accuracy(std::stod(argv[5]));
         }
         if (argc >= 7) {
-            factory->get_parameters().storage_optimization =
+            factory_parameter.with_storage_optimization(
                 std::string{argv[6]} == "auto"
                     ? gko::precision_reduction::autodetect()
-                    : gko::precision_reduction(0, std::stoi(argv[6]));
+                    : gko::precision_reduction(0, std::stoi(argv[6])));
         }
+        auto factory = factory_parameter.on(exec);
         auto jacobi = try_generate([&] { return factory->generate(mtx); });
         output(jacobi, matrix + ".jacobi" + output_suffix);
     } else if (precond == "ilu") {
@@ -157,10 +159,12 @@ int main(int argc, char* argv[])
                matrix + ".ilu-u");
     } else if (precond == "parilu") {
         // parilu: iterations
-        auto factory = gko::factorization::ParIlu<>::build().on(exec);
+        auto factory_parameter = gko::factorization::ParIlu<>::build();
         if (argc >= 5) {
-            factory->get_parameters().iterations = std::stoi(argv[4]);
+            factory_parameter.with_iterations(
+                static_cast<gko::size_type>(std::stoi(argv[4])));
         }
+        auto factory = factory_parameter.on(exec);
         auto ilu = gko::as<gko::Composition<>>(
             try_generate([&] { return factory->generate(mtx); }));
         output(gko::as<gko::matrix::Csr<>>(ilu->get_operators()[0]),
@@ -169,13 +173,15 @@ int main(int argc, char* argv[])
                matrix + ".parilu" + output_suffix + "-u");
     } else if (precond == "parilut") {
         // parilut: iterations, fill-in limit
-        auto factory = gko::factorization::ParIlut<>::build().on(exec);
+        auto factory_parameter = gko::factorization::ParIlut<>::build();
         if (argc >= 5) {
-            factory->get_parameters().iterations = std::stoi(argv[4]);
+            factory_parameter.with_iterations(
+                static_cast<gko::size_type>(std::stoi(argv[4])));
         }
         if (argc >= 6) {
-            factory->get_parameters().fill_in_limit = std::stod(argv[5]);
+            factory_parameter.with_fill_in_limit(std::stod(argv[5]));
         }
+        auto factory = factory_parameter.on(exec);
         auto ilut = gko::as<gko::Composition<>>(
             try_generate([&] { return factory->generate(mtx); }));
         output(gko::as<gko::matrix::Csr<>>(ilut->get_operators()[0]),
@@ -193,13 +199,11 @@ int main(int argc, char* argv[])
         auto factory =
             gko::preconditioner::Ilu<gko::preconditioner::LowerIsai<>,
                                      gko::preconditioner::UpperIsai<>>::build()
-                .with_factorization_factory(fact_factory)
-                .with_l_solver_factory(gko::preconditioner::LowerIsai<>::build()
-                                           .with_sparsity_power(sparsity_power)
-                                           .on(exec))
-                .with_u_solver_factory(gko::preconditioner::UpperIsai<>::build()
-                                           .with_sparsity_power(sparsity_power)
-                                           .on(exec))
+                .with_factorization(fact_factory)
+                .with_l_solver(gko::preconditioner::LowerIsai<>::build()
+                                   .with_sparsity_power(sparsity_power))
+                .with_u_solver(gko::preconditioner::UpperIsai<>::build()
+                                   .with_sparsity_power(sparsity_power))
                 .on(exec);
         auto ilu_isai = try_generate([&] { return factory->generate(mtx); });
         output(ilu_isai->get_l_solver()->get_approximate_inverse(),
@@ -208,25 +212,24 @@ int main(int argc, char* argv[])
                matrix + ".ilu-isai" + output_suffix + "-u");
     } else if (precond == "parilu-isai") {
         // parilu-isai: iterations, sparsity power
-        auto fact_factory =
-            gko::share(gko::factorization::ParIlu<>::build().on(exec));
+        auto fact_parameter = gko::factorization::ParIlu<>::build();
         int sparsity_power = 1;
         if (argc >= 5) {
-            fact_factory->get_parameters().iterations = std::stoi(argv[4]);
+            fact_parameter.with_iterations(
+                static_cast<gko::size_type>(std::stoi(argv[4])));
         }
         if (argc >= 6) {
             sparsity_power = std::stoi(argv[5]);
         }
+        auto fact_factory = gko::share(fact_parameter.on(exec));
         auto factory =
             gko::preconditioner::Ilu<gko::preconditioner::LowerIsai<>,
                                      gko::preconditioner::UpperIsai<>>::build()
-                .with_factorization_factory(fact_factory)
-                .with_l_solver_factory(gko::preconditioner::LowerIsai<>::build()
-                                           .with_sparsity_power(sparsity_power)
-                                           .on(exec))
-                .with_u_solver_factory(gko::preconditioner::UpperIsai<>::build()
-                                           .with_sparsity_power(sparsity_power)
-                                           .on(exec))
+                .with_factorization(fact_factory)
+                .with_l_solver(gko::preconditioner::LowerIsai<>::build()
+                                   .with_sparsity_power(sparsity_power))
+                .with_u_solver(gko::preconditioner::UpperIsai<>::build()
+                                   .with_sparsity_power(sparsity_power))
                 .on(exec);
         auto ilu_isai = try_generate([&] { return factory->generate(mtx); });
         output(ilu_isai->get_l_solver()->get_approximate_inverse(),
@@ -235,28 +238,27 @@ int main(int argc, char* argv[])
                matrix + ".parilu-isai" + output_suffix + "-u");
     } else if (precond == "parilut-isai") {
         // parilut-isai: iterations, fill-in limit, sparsity power
-        auto fact_factory =
-            gko::share(gko::factorization::ParIlut<>::build().on(exec));
+        auto fact_parameter = gko::factorization::ParIlut<>::build();
         int sparsity_power = 1;
         if (argc >= 5) {
-            fact_factory->get_parameters().iterations = std::stoi(argv[4]);
+            fact_parameter.with_iterations(
+                static_cast<gko::size_type>(std::stoi(argv[4])));
         }
         if (argc >= 6) {
-            fact_factory->get_parameters().fill_in_limit = std::stod(argv[5]);
+            fact_parameter.with_fill_in_limit(std::stod(argv[5]));
         }
         if (argc >= 7) {
             sparsity_power = std::stoi(argv[6]);
         }
+        auto fact_factory = gko::share(fact_parameter.on(exec));
         auto factory =
             gko::preconditioner::Ilu<gko::preconditioner::LowerIsai<>,
                                      gko::preconditioner::UpperIsai<>>::build()
-                .with_factorization_factory(fact_factory)
-                .with_l_solver_factory(gko::preconditioner::LowerIsai<>::build()
-                                           .with_sparsity_power(sparsity_power)
-                                           .on(exec))
-                .with_u_solver_factory(gko::preconditioner::UpperIsai<>::build()
-                                           .with_sparsity_power(sparsity_power)
-                                           .on(exec))
+                .with_factorization(fact_factory)
+                .with_l_solver(gko::preconditioner::LowerIsai<>::build()
+                                   .with_sparsity_power(sparsity_power))
+                .with_u_solver(gko::preconditioner::UpperIsai<>::build()
+                                   .with_sparsity_power(sparsity_power))
                 .on(exec);
         auto ilu_isai = try_generate([&] { return factory->generate(mtx); });
         output(ilu_isai->get_l_solver()->get_approximate_inverse(),
diff --git a/examples/schroedinger-splitting/CMakeLists.txt b/examples/schroedinger-splitting/CMakeLists.txt
index b7bdece35e8..555fb59b554 100644
--- a/examples/schroedinger-splitting/CMakeLists.txt
+++ b/examples/schroedinger-splitting/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(schroedinger-splitting)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 find_package(OpenCV REQUIRED)
 
diff --git a/examples/simple-solver-logging/CMakeLists.txt b/examples/simple-solver-logging/CMakeLists.txt
index 4092445848a..2272413f52a 100644
--- a/examples/simple-solver-logging/CMakeLists.txt
+++ b/examples/simple-solver-logging/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(simple-solver-logging)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(simple-solver-logging simple-solver-logging.cpp)
diff --git a/examples/simple-solver-logging/simple-solver-logging.cpp b/examples/simple-solver-logging/simple-solver-logging.cpp
index 6aa85462605..2ef47524612 100644
--- a/examples/simple-solver-logging/simple-solver-logging.cpp
+++ b/examples/simple-solver-logging/simple-solver-logging.cpp
@@ -85,13 +85,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -137,9 +136,8 @@ int main(int argc, char* argv[])
     // Generate solver
     auto solver_gen =
         cg::build()
-            .with_criteria(
-                residual_criterion,
-                gko::stop::Iteration::build().with_max_iters(20u).on(exec))
+            .with_criteria(residual_criterion,
+                           gko::stop::Iteration::build().with_max_iters(20u))
             .on(exec);
     auto solver = solver_gen->generate(A);
 
diff --git a/examples/simple-solver/CMakeLists.txt b/examples/simple-solver/CMakeLists.txt
index f505e19729e..d2a30ac084f 100644
--- a/examples/simple-solver/CMakeLists.txt
+++ b/examples/simple-solver/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(simple-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(simple-solver simple-solver.cpp)
diff --git a/examples/simple-solver/simple-solver.cpp b/examples/simple-solver/simple-solver.cpp
index 8f665f98496..d80c0633ab8 100644
--- a/examples/simple-solver/simple-solver.cpp
+++ b/examples/simple-solver/simple-solver.cpp
@@ -89,13 +89,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -131,11 +130,9 @@ int main(int argc, char* argv[])
     const RealValueType reduction_factor{1e-7};
     auto solver_gen =
         cg::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(20u).on(exec),
-                gko::stop::ResidualNorm<ValueType>::build()
-                    .with_reduction_factor(reduction_factor)
-                    .on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(20u),
+                           gko::stop::ResidualNorm<ValueType>::build()
+                               .with_reduction_factor(reduction_factor))
             .on(exec);
     // Generate the solver from the matrix. The solver factory built in the
     // previous step takes a "matrix"(a gko::LinOp to be more general) as an
diff --git a/examples/three-pt-stencil-solver/CMakeLists.txt b/examples/three-pt-stencil-solver/CMakeLists.txt
index d2941b12976..164c9e08302 100644
--- a/examples/three-pt-stencil-solver/CMakeLists.txt
+++ b/examples/three-pt-stencil-solver/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(three-pt-stencil-solver)
 
 # We only need to find Ginkgo if we build this example stand-alone
 if (NOT GINKGO_BUILD_EXAMPLES)
-    find_package(Ginkgo 1.6.0 REQUIRED)
+    find_package(Ginkgo 1.7.0 REQUIRED)
 endif()
 
 add_executable(three-pt-stencil-solver three-pt-stencil-solver.cpp)
diff --git a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp
index 6bf3cc21a8a..f4af38882b0 100644
--- a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp
+++ b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp
@@ -165,13 +165,12 @@ void solve_system(const std::string& executor_string,
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
@@ -217,13 +216,11 @@ void solve_system(const std::string& executor_string,
     // Generate solver
     auto solver_gen =
         cg::build()
-            .with_criteria(gko::stop::Iteration::build()
-                               .with_max_iters(gko::size_type(dp))
-                               .on(exec),
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(
+                               gko::size_type(dp)),
                            gko::stop::ResidualNorm<ValueType>::build()
-                               .with_reduction_factor(reduction_factor)
-                               .on(exec))
-            .with_preconditioner(bj::build().on(exec))
+                               .with_reduction_factor(reduction_factor))
+            .with_preconditioner(bj::build())
             .on(exec);
     auto solver = solver_gen->generate(gko::give(matrix));
 
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 6c6fc235f45..cb193920edc 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -1,13 +1,24 @@
+include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
+add_instantiation_files(. matrix/csr_kernels.instantiate.hip.cpp CSR_INSTANTIATE)
+add_instantiation_files(. matrix/fbcsr_kernels.instantiate.hip.cpp FBCSR_INSTANTIATE)
+# we don't split up the dense kernels into distinct compilations
+list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 set(GINKGO_HIP_SOURCES
+    base/batch_multi_vector_kernels.hip.cpp
+    base/device.hip.cpp
     base/device_matrix_data_kernels.hip.cpp
     base/exception.hip.cpp
     base/executor.hip.cpp
     base/index_set_kernels.hip.cpp
+    base/memory.hip.cpp
+    base/roctx.hip.cpp
     base/scoped_device_id.hip.cpp
+    base/stream.hip.cpp
     base/timer.hip.cpp
     base/version.hip.cpp
     components/prefix_sum_kernels.hip.cpp
     distributed/matrix_kernels.hip.cpp
+    distributed/partition_helpers_kernels.hip.cpp
     distributed/partition_kernels.hip.cpp
     distributed/vector_kernels.hip.cpp
     factorization/cholesky_kernels.hip.cpp
@@ -24,12 +35,14 @@ set(GINKGO_HIP_SOURCES
     factorization/par_ilut_select_kernel.hip.cpp
     factorization/par_ilut_spgeam_kernel.hip.cpp
     factorization/par_ilut_sweep_kernel.hip.cpp
+    matrix/batch_dense_kernels.hip.cpp
+    matrix/batch_ell_kernels.hip.cpp
     matrix/coo_kernels.hip.cpp
-    matrix/csr_kernels.hip.cpp
+    ${CSR_INSTANTIATE}
     matrix/dense_kernels.hip.cpp
     matrix/diagonal_kernels.hip.cpp
     matrix/ell_kernels.hip.cpp
-    matrix/fbcsr_kernels.hip.cpp
+    ${FBCSR_INSTANTIATE}
     matrix/sellp_kernels.hip.cpp
     matrix/sparsity_csr_kernels.hip.cpp
     multigrid/pgm_kernels.hip.cpp
@@ -39,6 +52,7 @@ set(GINKGO_HIP_SOURCES
     preconditioner/jacobi_kernels.hip.cpp
     preconditioner/jacobi_simple_apply_kernel.hip.cpp
     reorder/rcm_kernels.hip.cpp
+    solver/batch_bicgstab_kernels.hip.cpp
     solver/cb_gmres_kernels.hip.cpp
     solver/idr_kernels.hip.cpp
     solver/lower_trs_kernels.hip.cpp
@@ -132,7 +146,7 @@ if(GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}")
     endif()
     target_link_libraries(ginkgo_hip PUBLIC ${HIP_LIBAMDHIP64_LIBRARIES})
 elseif(GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}")
-    find_package(CUDA 9.2 REQUIRED)
+    find_package(CUDA 10.1 REQUIRED)
     target_link_libraries(ginkgo_hip PUBLIC ${CUDA_LIBRARIES})
 endif()
 
diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp
new file mode 100644
index 00000000000..f59d873840c
--- /dev/null
+++ b/hip/base/batch_multi_vector_kernels.hip.cpp
@@ -0,0 +1,87 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/base/batch_multi_vector_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/hipblas_bindings.hip.hpp"
+#include "hip/base/pointer_mode_guard.hip.hpp"
+#include "hip/base/thrust.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The MultiVector matrix format namespace.
+ *
+ * @ingroup batch_multi_vector
+ */
+namespace batch_multi_vector {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_multi_vector
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/base/batch_struct.hip.hpp b/hip/base/batch_struct.hip.hpp
new file mode 100644
index 00000000000..5747e202fb7
--- /dev/null
+++ b/hip/base/batch_struct.hip.hpp
@@ -0,0 +1,93 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_BASE_BATCH_STRUCT_HIP_HPP_
+#define GKO_HIP_BASE_BATCH_STRUCT_HIP_HPP_
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/types.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+/** @file batch_struct.hpp
+ *
+ * Helper functions to generate a batch struct from a batch LinOp,
+ * while also shallow-casting to the required Hip scalar type.
+ *
+ * A specialization is needed for every format of every kind of linear algebra
+ * object. These are intended to be called on the host.
+ */
+
+
+/**
+ * Generates an immutable uniform batch struct from a batch of multi-vectors.
+ */
+template <typename ValueType>
+inline batch::multi_vector::uniform_batch<const hip_type<ValueType>>
+get_batch_struct(const batch::MultiVector<ValueType>* const op)
+{
+    return {as_hip_type(op->get_const_values()), op->get_num_batch_items(),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1])};
+}
+
+/**
+ * Generates a uniform batch struct from a batch of multi-vectors.
+ */
+template <typename ValueType>
+inline batch::multi_vector::uniform_batch<hip_type<ValueType>> get_batch_struct(
+    batch::MultiVector<ValueType>* const op)
+{
+    return {as_hip_type(op->get_values()), op->get_num_batch_items(),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1])};
+}
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_BASE_BATCH_STRUCT_HIP_HPP_
diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp
new file mode 100644
index 00000000000..d539fa69b43
--- /dev/null
+++ b/hip/base/device.hip.cpp
@@ -0,0 +1,75 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/device.hpp>
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/stream.hpp>
+
+
+#include "hip/base/scoped_device_id.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+void reset_device(int device_id)
+{
+    gko::detail::hip_scoped_device_id_guard guard{device_id};
+    hipDeviceReset();
+}
+
+
+void destroy_event(GKO_HIP_EVENT_STRUCT* event)
+{
+    GKO_ASSERT_NO_HIP_ERRORS(hipEventDestroy(event));
+}
+
+
+std::string get_device_name(int device_id)
+{
+    hipDeviceProp_t prop;
+    GKO_ASSERT_NO_HIP_ERRORS(hipGetDeviceProperties(&prop, device_id));
+    return {prop.name};
+}
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/base/device.hpp b/hip/base/device.hpp
index dcc8c3ba0f1..fceffe4a503 100644
--- a/hip/base/device.hpp
+++ b/hip/base/device.hpp
@@ -49,6 +49,10 @@ void reset_device(int device_id);
 void destroy_event(GKO_HIP_EVENT_STRUCT* event);
 
 
+/** returns hipDeviceProp.name for the given device */
+std::string get_device_name(int device_id);
+
+
 }  // namespace hip
 }  // namespace kernels
 }  // namespace gko
diff --git a/hip/base/exception.hip.cpp b/hip/base/exception.hip.cpp
index 19a2b3739ac..7a182963f74 100644
--- a/hip/base/exception.hip.cpp
+++ b/hip/base/exception.hip.cpp
@@ -37,9 +37,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <hip/hip_runtime.h>
+#if HIP_VERSION >= 50200000
+#include <hipblas/hipblas.h>
+#include <hiprand/hiprand.h>
+#include <hipsparse/hipsparse.h>
+#else
 #include <hipblas.h>
 #include <hiprand.h>
 #include <hipsparse.h>
+#endif
 
 
 #include <ginkgo/core/base/types.hpp>
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index cd8a485c19d..8d175c0e424 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -37,15 +37,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <hip/hip_runtime.h>
-#if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX
-#include <roctx.h>
-#endif
 
 
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/device.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/log/profiler_hook.hpp>
 
 
 #include "hip/base/config.hip.hpp"
@@ -60,32 +56,43 @@ namespace gko {
 #include "common/cuda_hip/base/executor.hpp.inc"
 
 
-#if (GINKGO_HIP_PLATFORM_NVCC == 1)
-using hip_device_class = nvidia_device;
-#else
-using hip_device_class = amd_device;
-#endif
+std::unique_ptr<HipAllocatorBase> hip_allocator_from_mode(int device_id,
+                                                          allocation_mode mode)
+{
+    switch (mode) {
+    case allocation_mode::device:
+        return std::make_unique<HipAllocator>();
+    case allocation_mode::unified_global:
+        return std::make_unique<HipUnifiedAllocator>(device_id,
+                                                     hipMemAttachGlobal);
+    case allocation_mode::unified_host:
+        return std::make_unique<HipUnifiedAllocator>(device_id,
+                                                     hipMemAttachHost);
+    default:
+        GKO_NOT_SUPPORTED(mode);
+    }
+}
 
 
 std::shared_ptr<HipExecutor> HipExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
     allocation_mode alloc_mode, hipStream_t stream)
 {
-    return std::shared_ptr<HipExecutor>(
-        new HipExecutor(device_id, std::move(master), device_reset, alloc_mode,
-                        stream),
-        [device_id](HipExecutor* exec) {
-            auto device_reset = exec->get_device_reset();
-            std::lock_guard<std::mutex> guard(
-                hip_device_class::get_mutex(device_id));
-            delete exec;
-            auto& num_execs = hip_device_class::get_num_execs(device_id);
-            num_execs--;
-            if (!num_execs && device_reset) {
-                detail::hip_scoped_device_id_guard g(device_id);
-                hipDeviceReset();
-            }
-        });
+    return create(device_id, std::move(master),
+                  hip_allocator_from_mode(device_id, alloc_mode), stream);
+}
+
+
+std::shared_ptr<HipExecutor> HipExecutor::create(
+    int device_id, std::shared_ptr<Executor> master,
+    std::shared_ptr<HipAllocatorBase> alloc, hipStream_t stream)
+{
+    if (!alloc->check_environment(device_id, stream)) {
+        throw Error{__FILE__, __LINE__,
+                    "Allocator uses incorrect stream or device ID."};
+    }
+    return std::shared_ptr<HipExecutor>(new HipExecutor(
+        device_id, std::move(master), std::move(alloc), stream));
 }
 
 
@@ -125,42 +132,14 @@ void OmpExecutor::raw_copy_to(const HipExecutor* dest, size_type num_bytes,
 void HipExecutor::raw_free(void* ptr) const noexcept
 {
     detail::hip_scoped_device_id_guard g(this->get_device_id());
-    auto error_code = hipFree(ptr);
-    if (error_code != hipSuccess) {
-#if GKO_VERBOSE_LEVEL >= 1
-        // Unfortunately, if memory free fails, there's not much we can do
-        std::cerr << "Unrecoverable HIP error on device "
-                  << this->get_device_id() << " in " << __func__ << ": "
-                  << hipGetErrorName(error_code) << ": "
-                  << hipGetErrorString(error_code) << std::endl
-                  << "Exiting program" << std::endl;
-#endif  // GKO_VERBOSE_LEVEL >= 1
-        std::exit(error_code);
-    }
+    alloc_->deallocate(ptr);
 }
 
 
 void* HipExecutor::raw_alloc(size_type num_bytes) const
 {
-    void* dev_ptr = nullptr;
     detail::hip_scoped_device_id_guard g(this->get_device_id());
-    int error_code = 0;
-    if (this->alloc_mode_ == allocation_mode::device) {
-        error_code = hipMalloc(&dev_ptr, num_bytes);
-#if !(GKO_HIP_PLATFORM_HCC == 1)
-    } else if (this->alloc_mode_ == allocation_mode::unified_global) {
-        error_code = hipMallocManaged(&dev_ptr, num_bytes, hipMemAttachGlobal);
-    } else if (this->alloc_mode_ == allocation_mode::unified_host) {
-        error_code = hipMallocManaged(&dev_ptr, num_bytes, hipMemAttachHost);
-#endif
-    } else {
-        GKO_NOT_SUPPORTED(this->alloc_mode_);
-    }
-    if (error_code != hipErrorMemoryAllocation) {
-        GKO_ASSERT_NO_HIP_ERRORS(error_code);
-    }
-    GKO_ENSURE_ALLOCATED(dev_ptr, "hip", num_bytes);
-    return dev_ptr;
+    return alloc_->allocate(num_bytes);
 }
 
 
@@ -309,73 +288,4 @@ void HipExecutor::init_handles()
 }
 
 
-hip_stream::hip_stream(int device_id) : stream_{}, device_id_(device_id)
-{
-    detail::hip_scoped_device_id_guard g(device_id_);
-    GKO_ASSERT_NO_HIP_ERRORS(hipStreamCreate(&stream_));
-}
-
-
-hip_stream::~hip_stream()
-{
-    if (stream_) {
-        detail::hip_scoped_device_id_guard g(device_id_);
-        hipStreamDestroy(stream_);
-    }
-}
-
-
-hip_stream::hip_stream(hip_stream&& other)
-    : stream_{std::exchange(other.stream_, nullptr)},
-      device_id_{std::exchange(other.device_id_, -1)}
-{}
-
-
-GKO_HIP_STREAM_STRUCT* hip_stream::get() const { return stream_; }
-
-
-namespace log {
-
-
-#if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX
-
-void begin_roctx(const char* name, profile_event_category)
-{
-    roctxRangePush(name);
-}
-
-void end_roctx(const char*, profile_event_category) { roctxRangePop(); }
-
-#else
-
-void begin_roctx(const char* name, profile_event_category)
-    GKO_NOT_COMPILED(roctx);
-
-void end_roctx(const char*, profile_event_category) GKO_NOT_COMPILED(roctx);
-
-#endif
-
-
-}  // namespace log
-
-
-namespace kernels {
-namespace hip {
-
-
-void reset_device(int device_id)
-{
-    gko::detail::hip_scoped_device_id_guard guard{device_id};
-    hipDeviceReset();
-}
-
-
-void destroy_event(GKO_HIP_EVENT_STRUCT* event)
-{
-    GKO_ASSERT_NO_HIP_ERRORS(hipEventDestroy(event));
-}
-
-
-}  // namespace hip
-}  // namespace kernels
 }  // namespace gko
diff --git a/hip/base/hipblas_bindings.hip.hpp b/hip/base/hipblas_bindings.hip.hpp
index 2ff73c81e34..63751aa725a 100644
--- a/hip/base/hipblas_bindings.hip.hpp
+++ b/hip/base/hipblas_bindings.hip.hpp
@@ -34,7 +34,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_HIP_BASE_HIPBLAS_BINDINGS_HIP_HPP_
 
 
+#include <hip/hip_runtime.h>
+#if HIP_VERSION >= 50200000
+#include <hipblas/hipblas.h>
+#else
 #include <hipblas.h>
+#endif
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp
index 900433af339..dfef3bb84b4 100644
--- a/hip/base/hiprand_bindings.hip.hpp
+++ b/hip/base/hiprand_bindings.hip.hpp
@@ -34,7 +34,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_HIP_BASE_HIPRAND_BINDINGS_HIP_HPP_
 
 
+#include <hip/hip_runtime.h>
+#if HIP_VERSION >= 50200000
+#include <hiprand/hiprand.h>
+#else
 #include <hiprand.h>
+#endif
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
@@ -82,6 +87,11 @@ inline hiprandGenerator_t rand_generator(int64 seed,
     return gen;
 }
 
+inline void destroy(hiprandGenerator_t gen)
+{
+    GKO_ASSERT_NO_HIPRAND_ERRORS(hiprandDestroyGenerator(gen));
+}
+
 
 #define GKO_BIND_HIPRAND_RANDOM_VECTOR(ValueType, HiprandName)               \
     inline void rand_vector(                                                 \
diff --git a/hip/base/hipsparse_bindings.hip.hpp b/hip/base/hipsparse_bindings.hip.hpp
index 90378d3c711..322467dc2b3 100644
--- a/hip/base/hipsparse_bindings.hip.hpp
+++ b/hip/base/hipsparse_bindings.hip.hpp
@@ -34,7 +34,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_HIP_BASE_HIPSPARSE_BINDINGS_HIP_HPP_
 
 
+#include <hip/hip_runtime.h>
+#if HIP_VERSION >= 50200000
+#include <hipsparse/hipsparse.h>
+#else
 #include <hipsparse.h>
+#endif
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/hip/base/hipsparse_block_bindings.hip.hpp b/hip/base/hipsparse_block_bindings.hip.hpp
index bc6c28394eb..49ef1e86c7d 100644
--- a/hip/base/hipsparse_block_bindings.hip.hpp
+++ b/hip/base/hipsparse_block_bindings.hip.hpp
@@ -34,7 +34,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_HIP_BASE_HIPSPARSE_BLOCK_BINDINGS_HIP_HPP_
 
 
+#include <hip/hip_runtime.h>
+#if HIP_VERSION >= 50200000
+#include <hipsparse/hipsparse.h>
+#else
 #include <hipsparse.h>
+#endif
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp
new file mode 100644
index 00000000000..be795bb3397
--- /dev/null
+++ b/hip/base/memory.hip.cpp
@@ -0,0 +1,222 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/memory.hpp>
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "hip/base/scoped_device_id.hip.hpp"
+
+
+namespace gko {
+
+
+#define GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(_operation, _size)       \
+    {                                                                \
+        auto error_code = _operation;                                \
+        if (error_code == hipErrorMemoryAllocation) {                \
+            throw AllocationError(__FILE__, __LINE__, "hip", _size); \
+        } else {                                                     \
+            GKO_ASSERT_NO_HIP_ERRORS(error_code);                    \
+        }                                                            \
+    }
+
+
+#if GKO_VERBOSE_LEVEL >= 1
+#define GKO_EXIT_ON_HIP_ERROR(_operation)                                  \
+    {                                                                      \
+        const auto error_code = _operation;                                \
+        if (error_code != hipSuccess) {                                    \
+            int device_id{-1};                                             \
+            hipGetDevice(&device_id);                                      \
+            std::cerr << "Unrecoverable HIP error on device " << device_id \
+                      << " in " << __func__ << ": "                        \
+                      << hipGetErrorName(error_code) << ": "               \
+                      << hipGetErrorString(error_code) << std::endl        \
+                      << "Exiting program" << std::endl;                   \
+            std::exit(error_code);                                         \
+        }                                                                  \
+    }
+#else
+#define GKO_EXIT_ON_HIP_ERROR(_operation)   \
+    {                                       \
+        const auto error_code = _operation; \
+        if (error_code != hipSuccess) {     \
+            std::exit(error_code);          \
+        }                                   \
+    }
+#endif
+
+
+void* HipAllocator::allocate(size_type num_bytes)
+{
+    void* dev_ptr{};
+    GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(hipMalloc(&dev_ptr, num_bytes),
+                                        num_bytes);
+    return dev_ptr;
+}
+
+
+void HipAllocator::deallocate(void* dev_ptr)
+{
+    GKO_EXIT_ON_HIP_ERROR(hipFree(dev_ptr));
+}
+
+
+#if HIP_VERSION >= 50200000
+
+
+HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream} {}
+
+
+void* HipAsyncAllocator::allocate(size_type num_bytes)
+{
+    void* ptr{};
+    GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(
+        hipMallocAsync(&ptr, num_bytes, stream_), num_bytes);
+    return ptr;
+}
+
+
+void HipAsyncAllocator::deallocate(void* ptr)
+{
+    GKO_EXIT_ON_HIP_ERROR(hipFreeAsync(ptr, stream_));
+}
+
+
+#else  // Fall back to regular allocation
+
+
+HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream}
+{
+#if GKO_VERBOSE_LEVEL >= 1
+    std::cerr << "This version of HIP does not support hipMallocAsync, "
+                 "please use HipAllocator instead of HipAsyncAllocator.\n";
+#endif
+}
+
+
+void* HipAsyncAllocator::allocate(size_type num_bytes)
+{
+    void* ptr{};
+    GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(hipMalloc(&ptr, num_bytes), num_bytes);
+    return ptr;
+}
+
+
+void HipAsyncAllocator::deallocate(void* ptr)
+{
+    GKO_EXIT_ON_HIP_ERROR(hipFree(ptr));
+}
+
+
+#endif
+
+
+bool HipAsyncAllocator::check_environment(int device_id,
+                                          hipStream_t stream) const
+{
+    return stream == stream_;
+}
+
+
+HipUnifiedAllocator::HipUnifiedAllocator(int device_id)
+    : HipUnifiedAllocator{device_id, hipMemAttachGlobal}
+{}
+
+
+HipUnifiedAllocator::HipUnifiedAllocator(int device_id, unsigned int flags)
+    : device_id_{device_id}, flags_{flags}
+{}
+
+
+void* HipUnifiedAllocator::allocate(size_type num_bytes)
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::hip_scoped_device_id_guard g(device_id_);
+    void* ptr{};
+    GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(
+        hipMallocManaged(&ptr, num_bytes, flags_), num_bytes);
+    return ptr;
+}
+
+
+void HipUnifiedAllocator::deallocate(void* ptr)
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::hip_scoped_device_id_guard g(device_id_);
+    GKO_EXIT_ON_HIP_ERROR(hipFree(ptr));
+}
+
+
+bool HipUnifiedAllocator::check_environment(int device_id,
+                                            hipStream_t stream) const
+{
+    return device_id == device_id_;
+}
+
+
+HipHostAllocator::HipHostAllocator(int device_id) : device_id_{device_id} {}
+
+
+void* HipHostAllocator::allocate(size_type num_bytes)
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::hip_scoped_device_id_guard g(device_id_);
+    void* ptr{};
+    GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(hipHostMalloc(&ptr, num_bytes),
+                                        num_bytes);
+    return ptr;
+}
+
+
+void HipHostAllocator::deallocate(void* ptr)
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::hip_scoped_device_id_guard g(device_id_);
+    GKO_EXIT_ON_HIP_ERROR(hipHostFree(ptr));
+}
+
+
+bool HipHostAllocator::check_environment(int device_id,
+                                         hipStream_t stream) const
+{
+    return device_id == device_id_;
+}
+
+
+}  // namespace gko
diff --git a/hip/base/pointer_mode_guard.hip.hpp b/hip/base/pointer_mode_guard.hip.hpp
index 681839ec9e2..11fa5afeb9e 100644
--- a/hip/base/pointer_mode_guard.hip.hpp
+++ b/hip/base/pointer_mode_guard.hip.hpp
@@ -38,8 +38,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <hip/hip_runtime.h>
+#if HIP_VERSION >= 50200000
+#include <hipblas/hipblas.h>
+#include <hipsparse/hipsparse.h>
+#else
 #include <hipblas.h>
 #include <hipsparse.h>
+#endif
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/hip/base/roctx.hip.cpp b/hip/base/roctx.hip.cpp
new file mode 100644
index 00000000000..6e2d93b3a06
--- /dev/null
+++ b/hip/base/roctx.hip.cpp
@@ -0,0 +1,78 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/config.hpp>
+
+
+#if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX
+#if HIP_VERSION >= 50200000
+#include <roctracer/roctx.h>
+#else
+#include <roctx.h>
+#endif
+#endif
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/log/profiler_hook.hpp>
+
+
+namespace gko {
+namespace log {
+
+
+#if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX
+
+void begin_roctx(const char* name, profile_event_category)
+{
+    roctxRangePush(name);
+}
+
+
+void end_roctx(const char*, profile_event_category) { roctxRangePop(); }
+
+#else
+
+void begin_roctx(const char* name, profile_event_category)
+    GKO_NOT_COMPILED(roctx);
+
+
+void end_roctx(const char*, profile_event_category) GKO_NOT_COMPILED(roctx);
+
+#endif
+
+
+}  // namespace log
+}  // namespace gko
diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp
new file mode 100644
index 00000000000..dc2d99b8b17
--- /dev/null
+++ b/hip/base/stream.hip.cpp
@@ -0,0 +1,78 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/stream.hpp>
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/base/device.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "hip/base/scoped_device_id.hip.hpp"
+
+
+namespace gko {
+
+
+hip_stream::hip_stream() : stream_{}, device_id_{} {}
+
+
+hip_stream::hip_stream(int device_id) : stream_{}, device_id_(device_id)
+{
+    detail::hip_scoped_device_id_guard g(device_id_);
+    GKO_ASSERT_NO_HIP_ERRORS(hipStreamCreate(&stream_));
+}
+
+
+hip_stream::~hip_stream()
+{
+    if (stream_) {
+        detail::hip_scoped_device_id_guard g(device_id_);
+        hipStreamDestroy(stream_);
+    }
+}
+
+
+hip_stream::hip_stream(hip_stream&& other)
+    : stream_{std::exchange(other.stream_, nullptr)},
+      device_id_{std::exchange(other.device_id_, 0)}
+{}
+
+
+GKO_HIP_STREAM_STRUCT* hip_stream::get() const { return stream_; }
+
+
+}  // namespace gko
diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp
index 93ae3646a4c..c886378ec80 100644
--- a/hip/base/types.hip.hpp
+++ b/hip/base/types.hip.hpp
@@ -43,7 +43,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <hip/hip_complex.h>
 #include <hip/hip_fp16.h>
 #include <hip/hip_runtime.h>
+#if HIP_VERSION >= 50200000
+#include <hipblas/hipblas.h>
+#else
 #include <hipblas.h>
+#endif
 #include <thrust/complex.h>
 
 
diff --git a/hip/components/memory.hip.hpp b/hip/components/memory.hip.hpp
new file mode 100644
index 00000000000..485f67343e0
--- /dev/null
+++ b/hip/components/memory.hip.hpp
@@ -0,0 +1,120 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_COMPONENTS_MEMORY_HIP_HPP_
+#define GKO_HIP_COMPONENTS_MEMORY_HIP_HPP_
+
+
+#include <type_traits>
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "hip/base/types.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+#include "common/cuda_hip/components/memory.hpp.inc"
+
+
+template <typename ValueType>
+__device__ __forceinline__ ValueType load_relaxed(const ValueType* ptr)
+{
+    return load(ptr, 0);
+}
+
+
+template <typename ValueType>
+__device__ __forceinline__ ValueType load_acquire(const ValueType* ptr)
+{
+    auto result = load(ptr, 0);
+    __threadfence();
+    return result;
+}
+
+template <typename ValueType>
+__device__ __forceinline__ void store_relaxed(ValueType* ptr, ValueType value)
+{
+    store(ptr, 0, value);
+}
+
+
+template <typename ValueType>
+__device__ __forceinline__ void store_release(ValueType* ptr, ValueType value)
+{
+    __threadfence();
+    store(ptr, 0, value);
+}
+
+
+template <typename ValueType>
+__device__ __forceinline__ ValueType load_relaxed_shared(const ValueType* ptr)
+{
+    return load(ptr, 0);
+}
+
+
+template <typename ValueType>
+__device__ __forceinline__ ValueType load_acquire_shared(const ValueType* ptr)
+{
+    auto result = load(ptr, 0);
+    __threadfence();
+    return result;
+}
+
+template <typename ValueType>
+__device__ __forceinline__ void store_relaxed_shared(ValueType* ptr,
+                                                     ValueType value)
+{
+    store(ptr, 0, value);
+}
+
+
+template <typename ValueType>
+__device__ __forceinline__ void store_release_shared(ValueType* ptr,
+                                                     ValueType value)
+{
+    __threadfence();
+    store(ptr, 0, value);
+}
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_HIP_COMPONENTS_MEMORY_HIP_HPP_
diff --git a/hip/components/syncfree.hip.hpp b/hip/components/syncfree.hip.hpp
index 232ff059585..528a9200d08 100644
--- a/hip/components/syncfree.hip.hpp
+++ b/hip/components/syncfree.hip.hpp
@@ -41,7 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "hip/base/config.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/volatile.hip.hpp"
+#include "hip/components/memory.hip.hpp"
 
 
 namespace gko {
diff --git a/hip/distributed/partition_helpers_kernels.hip.cpp b/hip/distributed/partition_helpers_kernels.hip.cpp
new file mode 100644
index 00000000000..d4769141676
--- /dev/null
+++ b/hip/distributed/partition_helpers_kernels.hip.cpp
@@ -0,0 +1,57 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/distributed/partition_helpers_kernels.hpp"
+
+
+#include <thrust/device_ptr.h>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sort.h>
+
+
+#include "hip/base/thrust.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace partition_helpers {
+
+
+#include "common/cuda_hip/distributed/partition_helpers_kernels.hpp.inc"
+
+
+}  // namespace partition_helpers
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/factorization/lu_kernels.hip.cpp b/hip/factorization/lu_kernels.hip.cpp
index 507e57bd430..7c9707967c4 100644
--- a/hip/factorization/lu_kernels.hip.cpp
+++ b/hip/factorization/lu_kernels.hip.cpp
@@ -37,13 +37,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <memory>
 
 
+#include <thrust/copy.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+
+
 #include <ginkgo/core/matrix/csr.hpp>
 
 
 #include "core/base/allocator.hpp"
 #include "core/matrix/csr_lookup.hpp"
+#include "hip/base/thrust.hip.hpp"
 #include "hip/base/types.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
 #include "hip/components/syncfree.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
diff --git a/hip/factorization/par_ic_kernels.hip.cpp b/hip/factorization/par_ic_kernels.hip.cpp
index c8209f2c9dd..deb7d2b83f8 100644
--- a/hip/factorization/par_ic_kernels.hip.cpp
+++ b/hip/factorization/par_ic_kernels.hip.cpp
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "hip/base/math.hip.hpp"
 #include "hip/base/types.hip.hpp"
+#include "hip/components/memory.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp
index fa914f4d33c..24857fe6807 100644
--- a/hip/factorization/par_ict_kernels.hip.cpp
+++ b/hip/factorization/par_ict_kernels.hip.cpp
@@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
+#include "hip/components/memory.hip.hpp"
 #include "hip/components/merging.hip.hpp"
 #include "hip/components/prefix_sum.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp
index 42e5fd55425..b283e00b8fd 100644
--- a/hip/factorization/par_ilu_kernels.hip.cpp
+++ b/hip/factorization/par_ilu_kernels.hip.cpp
@@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "hip/base/math.hip.hpp"
 #include "hip/base/types.hip.hpp"
+#include "hip/components/memory.hip.hpp"
 #include "hip/components/thread_ids.hip.hpp"
 
 
diff --git a/hip/factorization/par_ilut_sweep_kernel.hip.cpp b/hip/factorization/par_ilut_sweep_kernel.hip.cpp
index 6e8ed1d8822..f566aa5a159 100644
--- a/hip/factorization/par_ilut_sweep_kernel.hip.cpp
+++ b/hip/factorization/par_ilut_sweep_kernel.hip.cpp
@@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/components/intrinsics.hip.hpp"
+#include "hip/components/memory.hip.hpp"
 #include "hip/components/merging.hip.hpp"
 #include "hip/components/prefix_sum.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
diff --git a/hip/log/batch_logger.hip.hpp b/hip/log/batch_logger.hip.hpp
new file mode 100644
index 00000000000..1685a1fca6d
--- /dev/null
+++ b/hip/log/batch_logger.hip.hpp
@@ -0,0 +1,54 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_
+#define GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace batch_log {
+
+#include "common/cuda_hip/log/batch_logger.hpp.inc"
+
+
+}  // namespace batch_log
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_LOG_BATCH_LOGGER_HIP_HPP_
diff --git a/hip/matrix/batch_dense_kernels.hip.cpp b/hip/matrix/batch_dense_kernels.hip.cpp
new file mode 100644
index 00000000000..3361feeb8b8
--- /dev/null
+++ b/hip/matrix/batch_dense_kernels.hip.cpp
@@ -0,0 +1,85 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/batch_dense_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+#include <thrust/functional.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/thrust.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
+#include "hip/matrix/batch_struct.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Dense matrix format namespace.
+ *
+ * @ingroup batch_dense
+ */
+namespace batch_dense {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/matrix/batch_dense_kernel_launcher.hpp.inc"
+
+
+// clang-format on
+
+
+}  // namespace batch_dense
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/matrix/batch_ell_kernels.hip.cpp b/hip/matrix/batch_ell_kernels.hip.cpp
new file mode 100644
index 00000000000..96e7cdb298e
--- /dev/null
+++ b/hip/matrix/batch_ell_kernels.hip.cpp
@@ -0,0 +1,86 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/batch_ell_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+#include <thrust/functional.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/thrust.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
+#include "hip/matrix/batch_struct.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Ell matrix format namespace.
+ * @ref Ell
+ * @ingroup batch_ell
+ */
+namespace batch_ell {
+
+
+constexpr auto default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+// clang-format off
+
+// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
+
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+
+
+#include "common/cuda_hip/matrix/batch_ell_kernel_launcher.hpp.inc"
+
+// clang-format on
+
+
+}  // namespace batch_ell
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/matrix/batch_struct.hip.hpp b/hip/matrix/batch_struct.hip.hpp
new file mode 100644
index 00000000000..ba75b1b634e
--- /dev/null
+++ b/hip/matrix/batch_struct.hip.hpp
@@ -0,0 +1,131 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_
+#define GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_
+
+
+#include "core/matrix/batch_struct.hpp"
+
+
+#include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "hip/base/types.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+/** @file batch_struct.hpp
+ *
+ * Helper functions to generate a batch struct from a batch LinOp,
+ * while also shallow-casting to the required HIP scalar type.
+ *
+ * A specialization is needed for every format of every kind of linear algebra
+ * object. These are intended to be called on the host.
+ */
+
+
+/**
+ * Generates an immutable uniform batch struct from a batch of dense matrices.
+ */
+template <typename ValueType>
+inline batch::matrix::dense::uniform_batch<const hip_type<ValueType>>
+get_batch_struct(const batch::matrix::Dense<ValueType>* const op)
+{
+    return {as_hip_type(op->get_const_values()), op->get_num_batch_items(),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1])};
+}
+
+
+/**
+ * Generates a uniform batch struct from a batch of dense matrices.
+ */
+template <typename ValueType>
+inline batch::matrix::dense::uniform_batch<hip_type<ValueType>>
+get_batch_struct(batch::matrix::Dense<ValueType>* const op)
+{
+    return {as_hip_type(op->get_values()), op->get_num_batch_items(),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1])};
+}
+
+
+/**
+ * Generates an immutable uniform batch struct from a batch of ell matrices.
+ */
+template <typename ValueType, typename IndexType>
+inline batch::matrix::ell::uniform_batch<const hip_type<ValueType>,
+                                         const IndexType>
+get_batch_struct(const batch::matrix::Ell<ValueType, IndexType>* const op)
+{
+    return {as_hip_type(op->get_const_values()),
+            op->get_const_col_idxs(),
+            op->get_num_batch_items(),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[1]),
+            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
+}
+
+
+/**
+ * Generates a uniform batch struct from a batch of ell matrices.
+ */
+template <typename ValueType, typename IndexType>
+inline batch::matrix::ell::uniform_batch<hip_type<ValueType>, IndexType>
+get_batch_struct(batch::matrix::Ell<ValueType, IndexType>* const op)
+{
+    return {as_hip_type(op->get_values()),
+            op->get_col_idxs(),
+            op->get_num_batch_items(),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[1]),
+            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
+}
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_MATRIX_BATCH_STRUCT_HIP_HPP_
diff --git a/hip/matrix/csr_kernels.instantiate.hip.cpp b/hip/matrix/csr_kernels.instantiate.hip.cpp
new file mode 100644
index 00000000000..156b170311f
--- /dev/null
+++ b/hip/matrix/csr_kernels.instantiate.hip.cpp
@@ -0,0 +1,157 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "hip/matrix/csr_kernels.template.hip.cpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The Compressed sparse row matrix format namespace.
+ *
+ * @ingroup csr
+ */
+namespace csr {
+
+
+// begin
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);
+
+
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int32);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int32);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int32);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int32);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int64);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int64);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int64);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(GKO_DECLARE_CSR_SPMV_KERNEL,
+                                                 int64);
+
+
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(
+    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(
+    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(
+    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(
+    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(
+    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(
+    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(
+    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64);
+// split
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(
+    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64);
+
+
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
+// end
+
+
+}  // namespace csr
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.template.hip.cpp
similarity index 68%
rename from hip/matrix/csr_kernels.hip.cpp
rename to hip/matrix/csr_kernels.template.hip.cpp
index b18cfa0f12b..52101385c92 100644
--- a/hip/matrix/csr_kernels.hip.cpp
+++ b/hip/matrix/csr_kernels.template.hip.cpp
@@ -493,9 +493,6 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_SPMV_KERNEL);
-
 
 template <typename MatrixValueType, typename InputValueType,
           typename OutputValueType, typename IndexType>
@@ -558,9 +555,6 @@ void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
 void spgemm(std::shared_ptr<const HipExecutor> exec,
@@ -634,56 +628,6 @@ void spgemm(std::shared_ptr<const HipExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
-
-
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void spgeam(syn::value_list<int, subwarp_size>,
-            std::shared_ptr<const HipExecutor> exec, const ValueType* alpha,
-            const IndexType* a_row_ptrs, const IndexType* a_col_idxs,
-            const ValueType* a_vals, const ValueType* beta,
-            const IndexType* b_row_ptrs, const IndexType* b_col_idxs,
-            const ValueType* b_vals, matrix::Csr<ValueType, IndexType>* c)
-{
-    auto m = static_cast<IndexType>(c->get_size()[0]);
-    auto c_row_ptrs = c->get_row_ptrs();
-    // count nnz for alpha * A + beta * B
-    auto subwarps_per_block = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(m, subwarps_per_block);
-    if (num_blocks > 0) {
-        kernel::spgeam_nnz<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs);
-    }
-
-    // build row pointers
-    components::prefix_sum_nonnegative(exec, c_row_ptrs, m + 1);
-
-    // accumulate non-zeros for alpha * A + beta * B
-    matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
-    auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m);
-    c_builder.get_col_idx_array().resize_and_reset(c_nnz);
-    c_builder.get_value_array().resize_and_reset(c_nnz);
-    auto c_col_idxs = c->get_col_idxs();
-    auto c_vals = c->get_values();
-    if (num_blocks > 0) {
-        kernel::spgeam<subwarp_size>
-            <<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                as_device_type(alpha), a_row_ptrs, a_col_idxs,
-                as_device_type(a_vals), as_device_type(beta), b_row_ptrs,
-                b_col_idxs, as_device_type(b_vals), m, c_row_ptrs, c_col_idxs,
-                as_device_type(c_vals));
-    }
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam);
-
-
-}  // namespace
-
 
 template <typename ValueType, typename IndexType>
 void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
@@ -775,61 +719,6 @@ void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void spgeam(std::shared_ptr<const DefaultExecutor> exec,
-            const matrix::Dense<ValueType>* alpha,
-            const matrix::Csr<ValueType, IndexType>* a,
-            const matrix::Dense<ValueType>* beta,
-            const matrix::Csr<ValueType, IndexType>* b,
-            matrix::Csr<ValueType, IndexType>* c)
-{
-    auto total_nnz =
-        a->get_num_stored_elements() + b->get_num_stored_elements();
-    auto nnz_per_row = total_nnz / a->get_size()[0];
-    select_spgeam(
-        spgeam_kernels(),
-        [&](int compiled_subwarp_size) {
-            return compiled_subwarp_size >= nnz_per_row ||
-                   compiled_subwarp_size == config::warp_size;
-        },
-        syn::value_list<int>(), syn::type_list<>(), exec,
-        alpha->get_const_values(), a->get_const_row_ptrs(),
-        a->get_const_col_idxs(), a->get_const_values(),
-        beta->get_const_values(), b->get_const_row_ptrs(),
-        b->get_const_col_idxs(), b->get_const_values(), c);
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void fill_in_dense(std::shared_ptr<const HipExecutor> exec,
-                   const matrix::Csr<ValueType, IndexType>* source,
-                   matrix::Dense<ValueType>* result)
-{
-    const auto num_rows = result->get_size()[0];
-    const auto num_cols = result->get_size()[1];
-    const auto stride = result->get_stride();
-    const auto row_ptrs = source->get_const_row_ptrs();
-    const auto col_idxs = source->get_const_col_idxs();
-    const auto vals = source->get_const_values();
-
-    auto grid_dim = ceildiv(num_rows, default_block_size);
-    if (grid_dim > 0) {
-        kernel::fill_in_dense<<<grid_dim, default_block_size, 0,
-                                exec->get_stream()>>>(
-            num_rows, as_device_type(row_ptrs), as_device_type(col_idxs),
-            as_device_type(vals), stride, as_device_type(result->get_values()));
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
 void transpose(std::shared_ptr<const HipExecutor> exec,
@@ -854,8 +743,6 @@ void transpose(std::shared_ptr<const HipExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_TRANSPOSE_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
 void conj_transpose(std::shared_ptr<const HipExecutor> exec,
@@ -888,183 +775,6 @@ void conj_transpose(std::shared_ptr<const HipExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void inv_symm_permute(std::shared_ptr<const HipExecutor> exec,
-                      const IndexType* perm,
-                      const matrix::Csr<ValueType, IndexType>* orig,
-                      matrix::Csr<ValueType, IndexType>* permuted)
-{
-    auto num_rows = orig->get_size()[0];
-    auto count_num_blocks = ceildiv(num_rows, default_block_size);
-    if (count_num_blocks > 0) {
-        kernel::inv_row_ptr_permute<<<count_num_blocks, default_block_size, 0,
-                                      exec->get_stream()>>>(
-            num_rows, perm, orig->get_const_row_ptrs(),
-            permuted->get_row_ptrs());
-    }
-    components::prefix_sum_nonnegative(exec, permuted->get_row_ptrs(),
-                                       num_rows + 1);
-    auto copy_num_blocks =
-        ceildiv(num_rows, default_block_size / config::warp_size);
-    if (copy_num_blocks > 0) {
-        kernel::inv_symm_permute<config::warp_size>
-            <<<copy_num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                num_rows, perm, orig->get_const_row_ptrs(),
-                orig->get_const_col_idxs(),
-                as_device_type(orig->get_const_values()),
-                permuted->get_row_ptrs(), permuted->get_col_idxs(),
-                as_device_type(permuted->get_values()));
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void row_permute(std::shared_ptr<const HipExecutor> exec, const IndexType* perm,
-                 const matrix::Csr<ValueType, IndexType>* orig,
-                 matrix::Csr<ValueType, IndexType>* row_permuted)
-{
-    auto num_rows = orig->get_size()[0];
-    auto count_num_blocks = ceildiv(num_rows, default_block_size);
-    if (count_num_blocks > 0) {
-        kernel::row_ptr_permute<<<count_num_blocks, default_block_size, 0,
-                                  exec->get_stream()>>>(
-            num_rows, perm, orig->get_const_row_ptrs(),
-            row_permuted->get_row_ptrs());
-    }
-    components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(),
-                                       num_rows + 1);
-    auto copy_num_blocks =
-        ceildiv(num_rows, default_block_size / config::warp_size);
-    if (copy_num_blocks > 0) {
-        kernel::row_permute<config::warp_size>
-            <<<copy_num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                num_rows, perm, orig->get_const_row_ptrs(),
-                orig->get_const_col_idxs(),
-                as_device_type(orig->get_const_values()),
-                row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(),
-                as_device_type(row_permuted->get_values()));
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void inverse_row_permute(std::shared_ptr<const HipExecutor> exec,
-                         const IndexType* perm,
-                         const matrix::Csr<ValueType, IndexType>* orig,
-                         matrix::Csr<ValueType, IndexType>* row_permuted)
-{
-    auto num_rows = orig->get_size()[0];
-    auto count_num_blocks = ceildiv(num_rows, default_block_size);
-    if (count_num_blocks > 0) {
-        kernel::inv_row_ptr_permute<<<count_num_blocks, default_block_size, 0,
-                                      exec->get_stream()>>>(
-            num_rows, perm, orig->get_const_row_ptrs(),
-            row_permuted->get_row_ptrs());
-    }
-    components::prefix_sum_nonnegative(exec, row_permuted->get_row_ptrs(),
-                                       num_rows + 1);
-    auto copy_num_blocks =
-        ceildiv(num_rows, default_block_size / config::warp_size);
-    if (copy_num_blocks > 0) {
-        kernel::inv_row_permute<config::warp_size>
-            <<<copy_num_blocks, default_block_size, 0, exec->get_stream()>>>(
-                num_rows, perm, orig->get_const_row_ptrs(),
-                orig->get_const_col_idxs(),
-                as_device_type(orig->get_const_values()),
-                row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(),
-                as_device_type(row_permuted->get_values()));
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void calculate_nonzeros_per_row_in_span(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* source, const span& row_span,
-    const span& col_span, array<IndexType>* row_nnz)
-{
-    const auto num_rows = source->get_size()[0];
-    auto row_ptrs = source->get_const_row_ptrs();
-    auto col_idxs = source->get_const_col_idxs();
-    auto grid_dim = ceildiv(row_span.length(), default_block_size);
-
-    if (grid_dim > 0) {
-        kernel::calculate_nnz_per_row_in_span<<<grid_dim, default_block_size, 0,
-                                                exec->get_stream()>>>(
-            row_span, col_span, as_device_type(row_ptrs),
-            as_device_type(col_idxs), as_device_type(row_nnz->get_data()));
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_SPAN_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void compute_submatrix(std::shared_ptr<const DefaultExecutor> exec,
-                       const matrix::Csr<ValueType, IndexType>* source,
-                       gko::span row_span, gko::span col_span,
-                       matrix::Csr<ValueType, IndexType>* result)
-{
-    auto row_offset = row_span.begin;
-    auto col_offset = col_span.begin;
-    auto num_rows = result->get_size()[0];
-    auto num_cols = result->get_size()[1];
-    auto row_ptrs = source->get_const_row_ptrs();
-    auto grid_dim = ceildiv(num_rows, default_block_size);
-    if (grid_dim > 0) {
-        kernel::compute_submatrix_idxs_and_vals<<<grid_dim, default_block_size,
-                                                  0, exec->get_stream()>>>(
-            num_rows, num_cols, row_offset, col_offset,
-            as_device_type(source->get_const_row_ptrs()),
-            as_device_type(source->get_const_col_idxs()),
-            as_device_type(source->get_const_values()),
-            as_device_type(result->get_const_row_ptrs()),
-            as_device_type(result->get_col_idxs()),
-            as_device_type(result->get_values()));
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void calculate_nonzeros_per_row_in_index_set(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* source,
-    const gko::index_set<IndexType>& row_index_set,
-    const gko::index_set<IndexType>& col_index_set,
-    IndexType* row_nnz) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_CALC_NNZ_PER_ROW_IN_INDEX_SET_KERNEL);
-
-
-template <typename ValueType, typename IndexType>
-void compute_submatrix_from_index_set(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* source,
-    const gko::index_set<IndexType>& row_index_set,
-    const gko::index_set<IndexType>& col_index_set,
-    matrix::Csr<ValueType, IndexType>* result) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_COMPUTE_SUB_MATRIX_FROM_INDEX_SET_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
 void sort_by_column_index(std::shared_ptr<const HipExecutor> exec,
@@ -1110,109 +820,6 @@ void sort_by_column_index(std::shared_ptr<const HipExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_SORT_BY_COLUMN_INDEX);
-
-
-template <typename ValueType, typename IndexType>
-void is_sorted_by_column_index(
-    std::shared_ptr<const HipExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* to_check, bool* is_sorted)
-{
-    *is_sorted = true;
-    auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted);
-    auto gpu_array = array<bool>{exec, cpu_array};
-    auto block_size = default_block_size;
-    auto num_rows = static_cast<IndexType>(to_check->get_size()[0]);
-    auto num_blocks = ceildiv(num_rows, block_size);
-    if (num_blocks > 0) {
-        kernel::
-            check_unsorted<<<num_blocks, block_size, 0, exec->get_stream()>>>(
-                to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(),
-                num_rows, gpu_array.get_data());
-    }
-    cpu_array = gpu_array;
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_IS_SORTED_BY_COLUMN_INDEX);
-
-
-template <typename ValueType, typename IndexType>
-void extract_diagonal(std::shared_ptr<const HipExecutor> exec,
-                      const matrix::Csr<ValueType, IndexType>* orig,
-                      matrix::Diagonal<ValueType>* diag)
-{
-    const auto nnz = orig->get_num_stored_elements();
-    const auto diag_size = diag->get_size()[0];
-    const auto num_blocks =
-        ceildiv(config::warp_size * diag_size, default_block_size);
-
-    const auto orig_values = orig->get_const_values();
-    const auto orig_row_ptrs = orig->get_const_row_ptrs();
-    const auto orig_col_idxs = orig->get_const_col_idxs();
-    auto diag_values = diag->get_values();
-    if (num_blocks > 0) {
-        kernel::extract_diagonal<<<num_blocks, default_block_size, 0,
-                                   exec->get_stream()>>>(
-            diag_size, nnz, as_device_type(orig_values),
-            as_device_type(orig_row_ptrs), as_device_type(orig_col_idxs),
-            as_device_type(diag_values));
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_EXTRACT_DIAGONAL);
-
-
-template <typename ValueType, typename IndexType>
-void check_diagonal_entries_exist(
-    std::shared_ptr<const HipExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* const mtx, bool& has_all_diags)
-{
-    const size_type num_warps = mtx->get_size()[0];
-    if (num_warps > 0) {
-        const size_type num_blocks =
-            num_warps / (default_block_size / config::warp_size);
-        array<bool> has_diags(exec, {true});
-        kernel::check_diagonal_entries<<<num_blocks, default_block_size, 0,
-                                         exec->get_stream()>>>(
-            static_cast<IndexType>(
-                std::min(mtx->get_size()[0], mtx->get_size()[1])),
-            mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
-            has_diags.get_data());
-        has_all_diags = exec->copy_val_to_host(has_diags.get_const_data());
-    } else {
-        has_all_diags = true;
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST);
-
-
-template <typename ValueType, typename IndexType>
-void add_scaled_identity(std::shared_ptr<const HipExecutor> exec,
-                         const matrix::Dense<ValueType>* const alpha,
-                         const matrix::Dense<ValueType>* const beta,
-                         matrix::Csr<ValueType, IndexType>* const mtx)
-{
-    const auto nrows = mtx->get_size()[0];
-    if (nrows == 0) {
-        return;
-    }
-    const auto nthreads = nrows * config::warp_size;
-    const auto nblocks = ceildiv(nthreads, default_block_size);
-    kernel::add_scaled_identity<<<nblocks, default_block_size, 0,
-                                  exec->get_stream()>>>(
-        as_device_type(alpha->get_const_values()),
-        as_device_type(beta->get_const_values()), static_cast<IndexType>(nrows),
-        mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
-        as_device_type(mtx->get_values()));
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);
-
 
 }  // namespace csr
 }  // namespace hip
diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp
index db9d5aa11bb..1567548463f 100644
--- a/hip/matrix/ell_kernels.hip.cpp
+++ b/hip/matrix/ell_kernels.hip.cpp
@@ -125,10 +125,12 @@ void abstract_spmv(syn::value_list<int, info>,
                    const matrix::Dense<MatrixValueType>* alpha = nullptr,
                    const matrix::Dense<OutputValueType>* beta = nullptr)
 {
+    using arithmetic_type =
+        highest_precision<InputValueType, OutputValueType, MatrixValueType>;
     using a_accessor =
-        acc::reduced_row_major<1, OutputValueType, const MatrixValueType>;
+        acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>;
     using b_accessor =
-        acc::reduced_row_major<2, OutputValueType, const InputValueType>;
+        acc::reduced_row_major<2, arithmetic_type, const InputValueType>;
 
     const auto nrows = a->get_size()[0];
     const auto stride = a->get_stride();
diff --git a/hip/matrix/fbcsr_kernels.instantiate.hip.cpp b/hip/matrix/fbcsr_kernels.instantiate.hip.cpp
new file mode 100644
index 00000000000..8cf4944e08a
--- /dev/null
+++ b/hip/matrix/fbcsr_kernels.instantiate.hip.cpp
@@ -0,0 +1,75 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "hip/matrix/fbcsr_kernels.template.hip.cpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+/**
+ * @brief The fixed-size block compressed sparse row matrix format namespace.
+ *
+ * @ingroup fbcsr
+ */
+namespace fbcsr {
+
+
+// begin
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_FILL_IN_MATRIX_DATA_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_FILL_IN_DENSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_CONVERT_TO_CSR_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_IS_SORTED_BY_COLUMN_INDEX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_SORT_BY_COLUMN_INDEX);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_EXTRACT_DIAGONAL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL);
+// split
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
+// end
+
+
+}  // namespace fbcsr
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.template.hip.cpp
similarity index 96%
rename from hip/matrix/fbcsr_kernels.hip.cpp
rename to hip/matrix/fbcsr_kernels.template.hip.cpp
index 8a4d78e7e40..88cad66753c 100644
--- a/hip/matrix/fbcsr_kernels.hip.cpp
+++ b/hip/matrix/fbcsr_kernels.template.hip.cpp
@@ -182,8 +182,6 @@ void spmv(std::shared_ptr<const HipExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_FBCSR_SPMV_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
 void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
@@ -242,9 +240,6 @@ void advanced_spmv(std::shared_ptr<const HipExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_ADVANCED_SPMV_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
 void transpose(const std::shared_ptr<const DefaultExecutor> exec,
@@ -254,9 +249,6 @@ void transpose(const std::shared_ptr<const DefaultExecutor> exec,
     fallback_transpose(exec, input, output);
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_TRANSPOSE_KERNEL);
-
 
 template <typename ValueType, typename IndexType>
 void conj_transpose(std::shared_ptr<const HipExecutor> exec,
@@ -274,9 +266,6 @@ void conj_transpose(std::shared_ptr<const HipExecutor> exec,
     }
 }
 
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_FBCSR_CONJ_TRANSPOSE_KERNEL);
-
 
 }  // namespace fbcsr
 }  // namespace hip
diff --git a/hip/matrix/fft_kernels.hip.cpp b/hip/matrix/fft_kernels.hip.cpp
index 238aeddc40f..56c967d9e49 100644
--- a/hip/matrix/fft_kernels.hip.cpp
+++ b/hip/matrix/fft_kernels.hip.cpp
@@ -36,7 +36,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <array>
 
 
+#include <hip/hip_runtime.h>
+#if HIP_VERSION >= 50200000
+#include <hipfft/hipfft.h>
+#else
 #include <hipfft.h>
+#endif
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/hip/matrix/sparsity_csr_kernels.hip.cpp
index bc9cd0a31db..2084aa5656f 100644
--- a/hip/matrix/sparsity_csr_kernels.hip.cpp
+++ b/hip/matrix/sparsity_csr_kernels.hip.cpp
@@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <hip/hip_runtime.h>
+#include <thrust/sort.h>
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
@@ -42,9 +43,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "accessor/hip_helper.hpp"
 #include "accessor/reduced_row_major.hpp"
 #include "core/base/mixed_precision_types.hpp"
+#include "core/components/fill_array_kernels.hpp"
+#include "core/components/format_conversion_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/config.hip.hpp"
+#include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/math.hip.hpp"
+#include "hip/base/thrust.hip.hpp"
 #include "hip/base/types.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
@@ -64,6 +69,7 @@ namespace sparsity_csr {
 
 
 constexpr int classical_oversubscription = 32;
+constexpr int default_block_size = 512;
 constexpr int spmv_block_size = 256;
 constexpr int warps_in_block = 4;
 
@@ -71,6 +77,7 @@ constexpr int warps_in_block = 4;
 using classical_kernels = syn::value_list<int, 2>;
 
 
+#include "common/cuda_hip/matrix/csr_common.hpp.inc"
 #include "common/cuda_hip/matrix/sparsity_csr_kernels.hpp.inc"
 
 
@@ -181,6 +188,62 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_SPARSITY_CSR_ADVANCED_SPMV_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void sort_by_column_index(std::shared_ptr<const DefaultExecutor> exec,
+                          matrix::SparsityCsr<ValueType, IndexType>* to_sort)
+{
+    const auto nnz = static_cast<IndexType>(to_sort->get_num_nonzeros());
+    const auto num_rows = static_cast<IndexType>(to_sort->get_size()[0]);
+    const auto num_cols = static_cast<IndexType>(to_sort->get_size()[1]);
+    const auto row_ptrs = to_sort->get_const_row_ptrs();
+    const auto col_idxs = to_sort->get_col_idxs();
+    if (hipsparse::is_supported<ValueType, IndexType>::value) {
+        const auto handle = exec->get_hipsparse_handle();
+        auto descr = hipsparse::create_mat_descr();
+        array<IndexType> permutation_array(exec, to_sort->get_num_nonzeros());
+        auto permutation = permutation_array.get_data();
+        components::fill_seq_array(exec, permutation,
+                                   to_sort->get_num_nonzeros());
+        size_type buffer_size{};
+        hipsparse::csrsort_buffer_size(handle, num_rows, num_cols, nnz,
+                                       row_ptrs, col_idxs, buffer_size);
+        array<char> buffer_array{exec, buffer_size};
+        auto buffer = buffer_array.get_data();
+        hipsparse::csrsort(handle, num_rows, num_cols, nnz, descr, row_ptrs,
+                           col_idxs, permutation, buffer);
+        hipsparse::destroy(descr);
+    } else {
+        fallback_sort(exec, to_sort);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SPARSITY_CSR_SORT_BY_COLUMN_INDEX);
+
+
+template <typename ValueType, typename IndexType>
+void is_sorted_by_column_index(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::SparsityCsr<ValueType, IndexType>* to_check, bool* is_sorted)
+{
+    *is_sorted = true;
+    auto cpu_array = make_array_view(exec->get_master(), 1, is_sorted);
+    auto gpu_array = array<bool>{exec, cpu_array};
+    const auto num_rows = static_cast<IndexType>(to_check->get_size()[0]);
+    auto num_blocks = ceildiv(num_rows, default_block_size);
+    if (num_blocks > 0) {
+        kernel::check_unsorted<<<num_blocks, default_block_size, 0,
+                                 exec->get_stream()>>>(
+            to_check->get_const_row_ptrs(), to_check->get_const_col_idxs(),
+            num_rows, gpu_array.get_data());
+    }
+    cpu_array = gpu_array;
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SPARSITY_CSR_IS_SORTED_BY_COLUMN_INDEX);
+
+
 }  // namespace sparsity_csr
 }  // namespace hip
 }  // namespace kernels
diff --git a/hip/preconditioner/batch_preconditioners.hip.hpp b/hip/preconditioner/batch_preconditioners.hip.hpp
new file mode 100644
index 00000000000..5c95913d285
--- /dev/null
+++ b/hip/preconditioner/batch_preconditioners.hip.hpp
@@ -0,0 +1,60 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HIP_HPP_
+#define GKO_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HIP_HPP_
+
+
+#include <ginkgo/core/matrix/batch_identity.hpp>
+
+
+#include "core/matrix/batch_struct.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+namespace batch_preconditioner {
+
+
+#include "common/cuda_hip/preconditioner/batch_identity.hpp.inc"
+
+
+}  // namespace batch_preconditioner
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_HIP_PRECONDITIONER_BATCH_PRECONDITIONERS_HIP_HPP_
diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp
new file mode 100644
index 00000000000..217d314a5c9
--- /dev/null
+++ b/hip/solver/batch_bicgstab_kernels.hip.cpp
@@ -0,0 +1,270 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/solver/batch_bicgstab_kernels.hpp"
+
+
+#include <hip/hip_runtime.h>
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "core/solver/batch_dispatch.hpp"
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/base/config.hip.hpp"
+#include "hip/base/math.hip.hpp"
+#include "hip/base/thrust.hip.hpp"
+#include "hip/base/types.hip.hpp"
+#include "hip/components/cooperative_groups.hip.hpp"
+#include "hip/components/reduction.hip.hpp"
+#include "hip/components/thread_ids.hip.hpp"
+#include "hip/components/uninitialized_array.hip.hpp"
+#include "hip/matrix/batch_struct.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+constexpr int default_block_size = 256;
+constexpr int sm_oversubscription = 4;
+
+/**
+ * @brief The batch Bicgstab solver namespace.
+ *
+ * @ingroup batch_bicgstab
+ */
+namespace batch_bicgstab {
+
+
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
+#include "common/cuda_hip/components/uninitialized_array.hpp.inc"
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp.inc"
+#include "common/cuda_hip/matrix/batch_ell_kernels.hpp.inc"
+#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp.inc"
+
+
+template <typename BatchMatrixType>
+int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
+                              const int num_rows)
+{
+    int num_warps = std::max(num_rows / 4, 2);
+    constexpr int warp_sz = static_cast<int>(config::warp_size);
+    const int min_block_size = 2 * warp_sz;
+    const int device_max_threads =
+        ((std::max(num_rows, min_block_size)) / warp_sz) * warp_sz;
+    // This value has been taken from ROCm docs. This is the number of registers
+    // that maximizes the occupancy on an AMD GPU (MI200). HIP does not have an
+    // API to query the number of registers a function uses.
+    const int num_regs_used_per_thread = 64;
+    int max_regs_blk = 0;
+    GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute(
+        &max_regs_blk, hipDeviceAttributeMaxRegistersPerBlock,
+        exec->get_device_id()));
+    const int max_threads_regs = (max_regs_blk / num_regs_used_per_thread);
+    int max_threads = std::min(max_threads_regs, device_max_threads);
+    max_threads = max_threads <= 1024 ? max_threads : 1024;
+    return std::max(std::min(num_warps * warp_sz, max_threads), min_block_size);
+}
+
+
+template <typename T>
+using settings = gko::kernels::batch_bicgstab::settings<T>;
+
+
+template <typename HipValueType>
+class kernel_caller {
+public:
+    using value_type = HipValueType;
+
+    kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
+                  const settings<remove_complex<value_type>> settings)
+        : exec_{exec}, settings_{settings}
+    {}
+
+    template <typename StopType, const int n_shared,
+              const bool prec_shared_bool, typename PrecType, typename LogType,
+              typename BatchMatrixType>
+    void launch_apply_kernel(
+        const gko::kernels::batch_bicgstab::storage_config& sconf,
+        LogType& logger, PrecType& prec, const BatchMatrixType& mat,
+        const value_type* const __restrict__ b_values,
+        value_type* const __restrict__ x_values,
+        value_type* const __restrict__ workspace_data, const int& block_size,
+        const size_t& shared_size) const
+    {
+        apply_kernel<StopType, n_shared, prec_shared_bool>
+            <<<mat.num_batch_items, block_size, shared_size,
+               exec_->get_stream()>>>(sconf, settings_.max_iterations,
+                                      settings_.residual_tol, logger, prec, mat,
+                                      b_values, x_values, workspace_data);
+    }
+
+
+    template <typename BatchMatrixType, typename PrecType, typename StopType,
+              typename LogType>
+    void call_kernel(
+        LogType logger, const BatchMatrixType& mat, PrecType prec,
+        const gko::batch::multi_vector::uniform_batch<const value_type>& b,
+        const gko::batch::multi_vector::uniform_batch<value_type>& x) const
+    {
+        using real_type = gko::remove_complex<value_type>;
+        const size_type num_batch_items = mat.num_batch_items;
+        constexpr int align_multiple = 8;
+        const int padded_num_rows =
+            ceildiv(mat.num_rows, align_multiple) * align_multiple;
+        int shmem_per_blk = 0;
+        GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute(
+            &shmem_per_blk, hipDeviceAttributeMaxSharedMemoryPerBlock,
+            exec_->get_device_id()));
+        const int block_size =
+            get_num_threads_per_block<BatchMatrixType>(exec_, mat.num_rows);
+        GKO_ASSERT(block_size >= 2 * config::warp_size);
+
+        const size_t prec_size =
+            PrecType::dynamic_work_size(padded_num_rows,
+                                        mat.get_single_item_num_nnz()) *
+            sizeof(value_type);
+        const auto sconf =
+            gko::kernels::batch_bicgstab::compute_shared_storage<PrecType,
+                                                                 value_type>(
+                shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(),
+                b.num_rhs);
+        const size_t shared_size =
+            sconf.n_shared * padded_num_rows * sizeof(value_type) +
+            (sconf.prec_shared ? prec_size : 0);
+        auto workspace = gko::array<value_type>(
+            exec_,
+            sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type));
+        assert(sconf.gmem_stride_bytes % sizeof(value_type) == 0);
+
+        value_type* const workspace_data = workspace.get_data();
+
+        // Template parameters launch_apply_kernel<StopType, n_shared,
+        // prec_shared)
+        if (sconf.prec_shared) {
+            launch_apply_kernel<StopType, 9, true>(
+                sconf, logger, prec, mat, b.values, x.values, workspace_data,
+                block_size, shared_size);
+        } else {
+            switch (sconf.n_shared) {
+            case 0:
+                launch_apply_kernel<StopType, 0, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 1:
+                launch_apply_kernel<StopType, 1, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 2:
+                launch_apply_kernel<StopType, 2, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 3:
+                launch_apply_kernel<StopType, 3, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 4:
+                launch_apply_kernel<StopType, 4, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 5:
+                launch_apply_kernel<StopType, 5, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 6:
+                launch_apply_kernel<StopType, 6, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 7:
+                launch_apply_kernel<StopType, 7, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 8:
+                launch_apply_kernel<StopType, 8, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            case 9:
+                launch_apply_kernel<StopType, 9, false>(
+                    sconf, logger, prec, mat, b.values, x.values,
+                    workspace_data, block_size, shared_size);
+                break;
+            default:
+                GKO_NOT_IMPLEMENTED;
+            }
+        }
+    }
+
+private:
+    std::shared_ptr<const DefaultExecutor> exec_;
+    const settings<remove_complex<value_type>> settings_;
+};
+
+
+template <typename ValueType>
+void apply(std::shared_ptr<const DefaultExecutor> exec,
+           const settings<remove_complex<ValueType>>& settings,
+           const batch::BatchLinOp* const mat,
+           const batch::BatchLinOp* const precon,
+           const batch::MultiVector<ValueType>* const b,
+           batch::MultiVector<ValueType>* const x,
+           batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
+{
+    using hip_value_type = hip_type<ValueType>;
+    auto dispatcher = batch::solver::create_dispatcher<ValueType>(
+        kernel_caller<hip_value_type>(exec, settings), settings, mat, precon);
+    dispatcher.apply(b, x, logdata);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL);
+
+
+}  // namespace batch_bicgstab
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/solver/common_trs_kernels.hip.hpp b/hip/solver/common_trs_kernels.hip.hpp
index 643c875561e..6cf2ca516f2 100644
--- a/hip/solver/common_trs_kernels.hip.hpp
+++ b/hip/solver/common_trs_kernels.hip.hpp
@@ -39,7 +39,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <hip/hip_runtime.h>
+#if HIP_VERSION >= 50200000
+#include <hipsparse/hipsparse.h>
+#else
 #include <hipsparse.h>
+#endif
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp
index 9e6f353abe4..1a3d2931897 100644
--- a/hip/solver/idr_kernels.hip.cpp
+++ b/hip/solver/idr_kernels.hip.cpp
@@ -106,6 +106,7 @@ void initialize_subspace_vectors(std::shared_ptr<const DefaultExecutor> exec,
             gen,
             subspace_vectors->get_size()[0] * subspace_vectors->get_stride(),
             0.0, 1.0, subspace_vectors->get_values());
+        hiprand::destroy(gen);
     }
 }
 
diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp
index 2e9dd0d0ce3..283f5ee5284 100644
--- a/hip/solver/lower_trs_kernels.hip.cpp
+++ b/hip/solver/lower_trs_kernels.hip.cpp
@@ -37,7 +37,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <hip/hip_runtime.h>
+#if HIP_VERSION >= 50200000
+#include <hipsparse/hipsparse.h>
+#else
 #include <hipsparse.h>
+#endif
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp
index a3c6070614c..09e71826130 100644
--- a/hip/solver/upper_trs_kernels.hip.cpp
+++ b/hip/solver/upper_trs_kernels.hip.cpp
@@ -37,7 +37,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <hip/hip_runtime.h>
+#if HIP_VERSION >= 50200000
+#include <hipsparse/hipsparse.h>
+#else
 #include <hipsparse.h>
+#endif
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
diff --git a/hip/components/volatile.hip.hpp b/hip/stop/batch_criteria.hip.hpp
similarity index 87%
rename from hip/components/volatile.hip.hpp
rename to hip/stop/batch_criteria.hip.hpp
index de0202d8391..49477346ab2 100644
--- a/hip/components/volatile.hip.hpp
+++ b/hip/stop/batch_criteria.hip.hpp
@@ -30,29 +30,25 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#ifndef GKO_HIP_COMPONENTS_VOLATILE_HIP_HPP_
-#define GKO_HIP_COMPONENTS_VOLATILE_HIP_HPP_
-
-
-#include <type_traits>
+#ifndef GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_
+#define GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_
 
 
 #include <ginkgo/core/base/math.hpp>
 
 
-#include "hip/base/types.hip.hpp"
-
-
 namespace gko {
 namespace kernels {
 namespace hip {
+namespace batch_stop {
 
 
-#include "common/cuda_hip/components/volatile.hpp.inc"
+#include "common/cuda_hip/stop/batch_criteria.hpp.inc"
 
 
+}  // namespace batch_stop
 }  // namespace hip
 }  // namespace kernels
 }  // namespace gko
 
-#endif  // GKO_HIP_COMPONENTS_VOLATILE_HIP_HPP_
+#endif  // GKO_HIP_STOP_BATCH_CRITERIA_HIP_HPP_
diff --git a/hip/stop/residual_norm_kernels.hip.cpp b/hip/stop/residual_norm_kernels.hip.cpp
index 506d1416cc7..d9a37cfd3e1 100644
--- a/hip/stop/residual_norm_kernels.hip.cpp
+++ b/hip/stop/residual_norm_kernels.hip.cpp
@@ -70,7 +70,7 @@ __global__ __launch_bounds__(default_block_size) void residual_norm_kernel(
 {
     const auto tidx = thread::get_thread_id_flat();
     if (tidx < num_cols) {
-        if (tau[tidx] < rel_residual_goal * orig_tau[tidx]) {
+        if (tau[tidx] <= rel_residual_goal * orig_tau[tidx]) {
             stop_status[tidx].converge(stoppingId, setFinalized);
             device_storage[1] = true;
         }
@@ -151,7 +151,7 @@ __launch_bounds__(default_block_size) void implicit_residual_norm_kernel(
 {
     const auto tidx = thread::get_thread_id_flat();
     if (tidx < num_cols) {
-        if (sqrt(abs(tau[tidx])) < rel_residual_goal * orig_tau[tidx]) {
+        if (sqrt(abs(tau[tidx])) <= rel_residual_goal * orig_tau[tidx]) {
             stop_status[tidx].converge(stoppingId, setFinalized);
             device_storage[1] = true;
         }
diff --git a/hip/test/base/CMakeLists.txt b/hip/test/base/CMakeLists.txt
index 7ed0d2ceb52..bfe8c8be96a 100644
--- a/hip/test/base/CMakeLists.txt
+++ b/hip/test/base/CMakeLists.txt
@@ -1,14 +1,14 @@
 ginkgo_create_hip_test(hip_executor)
-ginkgo_create_test(index_set)
-ginkgo_create_test(hip_executor_reset ADDITIONAL_LIBRARIES Threads::Threads)
+ginkgo_create_test(index_set RESOURCE_TYPE hipgpu)
 if(GINKGO_HAVE_HWLOC)
     find_package(NUMA REQUIRED)
     ginkgo_create_hip_test(hip_executor_topology ADDITIONAL_LIBRARIES NUMA::NUMA)
 endif()
 ginkgo_create_hip_test(kernel_launch)
 # correct flags for kernel_launch.hpp are set in GINKGO_HIPCC_OPTIONS
-ginkgo_create_hip_test(lin_op)
+ginkgo_create_test(lin_op RESOURCE_TYPE hipgpu)
 ginkgo_create_hip_test(math)
+ginkgo_create_test(memory RESOURCE_TYPE hipgpu)
 # Only hcc needs the libraries. nvcc only requires the headers.
 if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}")
     ginkgo_create_hip_test(exception_helpers ADDITIONAL_LIBRARIES roc::hipblas roc::hipsparse hip::hiprand roc::rocrand)
diff --git a/hip/test/base/exception_helpers.hip.cpp b/hip/test/base/exception_helpers.hip.cpp
index 29dea03961f..7014738bd76 100644
--- a/hip/test/base/exception_helpers.hip.cpp
+++ b/hip/test/base/exception_helpers.hip.cpp
@@ -34,9 +34,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <hip/hip_runtime.h>
+#if HIP_VERSION >= 50200000
+#include <hipblas/hipblas.h>
+#include <hiprand/hiprand.h>
+#include <hipsparse/hipsparse.h>
+#else
 #include <hipblas.h>
 #include <hiprand.h>
 #include <hipsparse.h>
+#endif
 
 
 #include <gtest/gtest.h>
diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp
index d27dd58d132..42499384704 100644
--- a/hip/test/base/hip_executor.hip.cpp
+++ b/hip/test/base/hip_executor.hip.cpp
@@ -98,7 +98,7 @@ class HipExecutor : public ::testing::Test {
           stream(0),
           other_stream(gko::HipExecutor::get_num_devices() - 1),
 #endif
-          omp(gko::OmpExecutor::create()),
+          ref(gko::ReferenceExecutor::create()),
           hip(nullptr),
           hip2(nullptr),
           hip3(nullptr)
@@ -109,18 +109,19 @@ class HipExecutor : public ::testing::Test {
         ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
 #ifdef GKO_TEST_NONDEFAULT_STREAM
         hip = gko::HipExecutor::create(
-            0, omp, false, gko::default_hip_alloc_mode, stream.get());
-        hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1,
-                                        omp, false, gko::default_hip_alloc_mode,
-                                        other_stream.get());
+            0, ref, std::make_shared<gko::HipAllocator>(), stream.get());
+        hip2 = gko::HipExecutor::create(
+            gko::HipExecutor::get_num_devices() - 1, ref,
+            std::make_shared<gko::HipAllocator>(), other_stream.get());
         hip3 = gko::HipExecutor::create(
-            0, omp, false, gko::allocation_mode::unified_global, stream.get());
+            0, ref, std::make_shared<gko::HipUnifiedAllocator>(0),
+            stream.get());
 #else
-        hip = gko::HipExecutor::create(0, omp);
+        hip = gko::HipExecutor::create(0, ref);
         hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1,
-                                        omp);
-        hip3 = gko::HipExecutor::create(0, omp, false,
-                                        gko::allocation_mode::unified_global);
+                                        ref);
+        hip3 = gko::HipExecutor::create(
+            0, ref, std::make_shared<gko::HipUnifiedAllocator>(0));
 #endif
     }
 
@@ -136,7 +137,7 @@ class HipExecutor : public ::testing::Test {
     gko::hip_stream stream;
     gko::hip_stream other_stream;
 #endif
-    std::shared_ptr<gko::Executor> omp;
+    std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<gko::HipExecutor> hip;
     std::shared_ptr<gko::HipExecutor> hip2;
     std::shared_ptr<gko::HipExecutor> hip3;
@@ -145,8 +146,8 @@ class HipExecutor : public ::testing::Test {
 
 TEST_F(HipExecutor, CanInstantiateTwoExecutorsOnOneDevice)
 {
-    auto hip = gko::HipExecutor::create(0, omp);
-    auto hip2 = gko::HipExecutor::create(0, omp);
+    auto hip = gko::HipExecutor::create(0, ref);
+    auto hip2 = gko::HipExecutor::create(0, ref);
 
     // We want automatic deinitialization to not create any error
 }
@@ -204,7 +205,7 @@ TEST_F(HipExecutor, CopiesDataToHip)
     int orig[] = {3, 8};
     auto* copy = hip->alloc<int>(2);
 
-    hip->copy_from(omp, 2, orig, copy);
+    hip->copy_from(ref, 2, orig, copy);
 
     check_data<<<1, 1, 0, hip->get_stream()>>>(copy);
     ASSERT_NO_THROW(hip->synchronize());
@@ -232,7 +233,7 @@ TEST_F(HipExecutor, CanAllocateOnUnifiedMemory)
     int orig[] = {3, 8};
     auto* copy = hip3->alloc<int>(2);
 
-    hip3->copy_from(omp, 2, orig, copy);
+    hip3->copy_from(ref, 2, orig, copy);
 
     check_data<<<1, 1, 0, hip3->get_stream()>>>(copy);
     ASSERT_NO_THROW(hip3->synchronize());
@@ -257,7 +258,7 @@ TEST_F(HipExecutor, CopiesDataFromHip)
     auto orig = hip->alloc<int>(2);
     init_data<<<1, 1, 0, hip->get_stream()>>>(orig);
 
-    omp->copy_from(hip, 2, orig, copy);
+    ref->copy_from(hip, 2, orig, copy);
 
     EXPECT_EQ(3, copy[0]);
     ASSERT_EQ(8, copy[1]);
@@ -310,7 +311,7 @@ TEST_F(HipExecutor, CopiesDataFromHipToHip)
     hip2->run(ExampleOperation(value));
     ASSERT_EQ(value, hip2->get_device_id());
     // Put the results on OpenMP and run CPU side assertions
-    omp->copy_from(hip2, 2, copy_hip2, copy);
+    ref->copy_from(hip2, 2, copy_hip2, copy);
     EXPECT_EQ(3, copy[0]);
     ASSERT_EQ(8, copy[1]);
     hip2->free(copy_hip2);
diff --git a/hip/test/base/hip_executor_topology.hip.cpp b/hip/test/base/hip_executor_topology.hip.cpp
index 394b2776319..3d6e3f2bddc 100644
--- a/hip/test/base/hip_executor_topology.hip.cpp
+++ b/hip/test/base/hip_executor_topology.hip.cpp
@@ -65,15 +65,16 @@ namespace {
 
 class HipExecutor : public ::testing::Test {
 protected:
-    HipExecutor() : omp(gko::OmpExecutor::create()), hip(nullptr), hip2(nullptr)
+    HipExecutor()
+        : ref(gko::ReferenceExecutor::create()), hip(nullptr), hip2(nullptr)
     {}
 
     void SetUp()
     {
         ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
-        hip = gko::HipExecutor::create(0, omp);
+        hip = gko::HipExecutor::create(0, ref);
         hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1,
-                                        omp);
+                                        ref);
     }
 
     void TearDown()
@@ -84,7 +85,7 @@ class HipExecutor : public ::testing::Test {
         }
     }
 
-    std::shared_ptr<gko::Executor> omp;
+    std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::HipExecutor> hip;
     std::shared_ptr<const gko::HipExecutor> hip2;
 };
@@ -107,7 +108,7 @@ inline int get_core_os_id(int log_id)
 
 TEST_F(HipExecutor, CanBindToSinglePu)
 {
-    hip = gko::HipExecutor::create(0, gko::OmpExecutor::create());
+    hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create());
 
     const int bind_pu = 1;
     gko::machine_topology::get_instance()->bind_to_pu(bind_pu);
@@ -119,7 +120,7 @@ TEST_F(HipExecutor, CanBindToSinglePu)
 
 TEST_F(HipExecutor, CanBindToPus)
 {
-    hip = gko::HipExecutor::create(0, gko::OmpExecutor::create());
+    hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create());
 
     std::vector<int> bind_pus = {1, 3};
     gko::machine_topology::get_instance()->bind_to_pus(bind_pus);
@@ -131,7 +132,7 @@ TEST_F(HipExecutor, CanBindToPus)
 
 TEST_F(HipExecutor, CanBindToCores)
 {
-    hip = gko::HipExecutor::create(0, gko::OmpExecutor::create());
+    hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create());
 
     std::vector<int> bind_cores = {1, 3};
     gko::machine_topology::get_instance()->bind_to_cores(bind_cores);
@@ -143,7 +144,7 @@ TEST_F(HipExecutor, CanBindToCores)
 
 TEST_F(HipExecutor, ClosestCpusIsPopulated)
 {
-    hip = gko::HipExecutor::create(0, gko::OmpExecutor::create());
+    hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create());
     auto close_cpus = hip->get_closest_pus();
     if (close_cpus.size() == 0) {
         GTEST_SKIP();
@@ -155,7 +156,7 @@ TEST_F(HipExecutor, ClosestCpusIsPopulated)
 
 TEST_F(HipExecutor, KnowsItsNuma)
 {
-    hip = gko::HipExecutor::create(0, gko::OmpExecutor::create());
+    hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create());
     auto numa0 = hip->get_closest_numa();
     auto close_cpus = hip->get_closest_pus();
     if (close_cpus.size() == 0) {
diff --git a/hip/test/base/lin_op.hip.cpp b/hip/test/base/lin_op.cpp
similarity index 94%
rename from hip/test/base/lin_op.hip.cpp
rename to hip/test/base/lin_op.cpp
index 0fceb6dcfee..614048949ea 100644
--- a/hip/test/base/lin_op.hip.cpp
+++ b/hip/test/base/lin_op.cpp
@@ -44,6 +44,10 @@ class FactoryParameter : public ::testing::Test {
     FactoryParameter() {}
 
 public:
+    // FACTORY_PARAMETER macro needs self, which is usually available in
+    // enable_parameters_type. To reduce complexity, we add self here.
+    GKO_ENABLE_SELF(FactoryParameter);
+
     std::vector<int> GKO_FACTORY_PARAMETER_VECTOR(vector_parameter, 10, 11);
     int GKO_FACTORY_PARAMETER_SCALAR(scalar_parameter, -4);
 };
diff --git a/hip/test/base/memory.cpp b/hip/test/base/memory.cpp
new file mode 100644
index 00000000000..2dc0a3aa337
--- /dev/null
+++ b/hip/test/base/memory.cpp
@@ -0,0 +1,126 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/memory.hpp>
+
+
+#include <memory>
+#include <type_traits>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+class Memory : public HipTestFixture {
+protected:
+    Memory()
+        : host_exec_with_pinned{gko::OmpExecutor::create(
+              std::make_shared<gko::HipHostAllocator>(0))},
+          host_exec_with_unified{gko::OmpExecutor::create(
+              std::make_shared<gko::HipUnifiedAllocator>(0))},
+          exec_with_normal{gko::HipExecutor::create(
+              0, ref, std::make_shared<gko::HipAllocator>(),
+              exec->get_stream())},
+          exec_with_async{gko::HipExecutor::create(
+              0, host_exec_with_pinned,
+              std::make_shared<gko::HipAsyncAllocator>(exec->get_stream()),
+              exec->get_stream())},
+          exec_with_unified{gko::HipExecutor::create(
+              0, host_exec_with_unified,
+              std::make_shared<gko::HipUnifiedAllocator>(0),
+              exec->get_stream())}
+    {}
+
+    std::shared_ptr<gko::OmpExecutor> host_exec_with_pinned;
+    std::shared_ptr<gko::OmpExecutor> host_exec_with_unified;
+    std::shared_ptr<gko::HipExecutor> exec_with_normal;
+    std::shared_ptr<gko::HipExecutor> exec_with_async;
+    std::shared_ptr<gko::HipExecutor> exec_with_unified;
+};
+
+
+TEST_F(Memory, DeviceAllocationWorks)
+{
+    gko::array<int> data{exec_with_normal, {1, 2}};
+
+    GKO_ASSERT_ARRAY_EQ(data, I<int>({1, 2}));
+}
+
+
+TEST_F(Memory, AsyncDeviceAllocationWorks)
+{
+    gko::array<int> data{exec_with_async, {1, 2}};
+
+    GKO_ASSERT_ARRAY_EQ(data, I<int>({1, 2}));
+}
+
+
+TEST_F(Memory, UnifiedDeviceAllocationWorks)
+{
+    gko::array<int> data{exec_with_unified, {1, 2}};
+    exec->synchronize();
+
+    ASSERT_EQ(data.get_const_data()[0], 1);
+    ASSERT_EQ(data.get_const_data()[1], 2);
+}
+
+
+TEST_F(Memory, HostUnifiedAllocationWorks)
+{
+    gko::array<int> data{host_exec_with_unified, {1, 2}};
+
+    ASSERT_EQ(data.get_const_data()[0], 1);
+    ASSERT_EQ(data.get_const_data()[1], 2);
+}
+
+
+TEST_F(Memory, HostPinnedAllocationWorks)
+{
+    gko::array<int> data{host_exec_with_pinned, {1, 2}};
+
+    ASSERT_EQ(data.get_const_data()[0], 1);
+    ASSERT_EQ(data.get_const_data()[1], 2);
+}
+
+
+}  // namespace
diff --git a/hip/test/matrix/CMakeLists.txt b/hip/test/matrix/CMakeLists.txt
index 82db4b8b376..a52069daea0 100644
--- a/hip/test/matrix/CMakeLists.txt
+++ b/hip/test/matrix/CMakeLists.txt
@@ -1,4 +1,4 @@
-ginkgo_create_hip_test(fbcsr_kernels)
+ginkgo_create_test(fbcsr_kernels RESOURCE_TYPE hipgpu)
 if (hipfft_FOUND)
     ginkgo_create_hip_test(fft_kernels)
 endif()
diff --git a/hip/test/matrix/fbcsr_kernels.hip.cpp b/hip/test/matrix/fbcsr_kernels.cpp
similarity index 100%
rename from hip/test/matrix/fbcsr_kernels.hip.cpp
rename to hip/test/matrix/fbcsr_kernels.cpp
diff --git a/hip/test/matrix/fft_kernels.hip.cpp b/hip/test/matrix/fft_kernels.hip.cpp
index 59c24492b5b..8c213df8ad5 100644
--- a/hip/test/matrix/fft_kernels.hip.cpp
+++ b/hip/test/matrix/fft_kernels.hip.cpp
@@ -33,7 +33,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/fft.hpp>
 
 
+#include <hip/hip_runtime.h>
+#if HIP_VERSION >= 50200000
+#include <hipfft/hipfft.h>
+#else
 #include <hipfft.h>
+#endif
 
 
 #include <gtest/gtest.h>
diff --git a/hip/test/solver/CMakeLists.txt b/hip/test/solver/CMakeLists.txt
index a3b86589410..fcbb3de0c47 100644
--- a/hip/test/solver/CMakeLists.txt
+++ b/hip/test/solver/CMakeLists.txt
@@ -1,2 +1,2 @@
-ginkgo_create_test(lower_trs_kernels)
-ginkgo_create_test(upper_trs_kernels)
+ginkgo_create_test(lower_trs_kernels RESOURCE_TYPE hipgpu)
+ginkgo_create_test(upper_trs_kernels RESOURCE_TYPE hipgpu)
diff --git a/hip/test/utils.hip.hpp b/hip/test/utils.hip.hpp
index 9337da14139..9fc3edc3f82 100644
--- a/hip/test/utils.hip.hpp
+++ b/hip/test/utils.hip.hpp
@@ -38,33 +38,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/stream.hpp>
 
 
+#include "core/test/gtest/resources.hpp"
 #include "hip/base/device.hpp"
 
 
 namespace {
 
 
-class HipEnvironment : public ::testing::Environment {
-public:
-    void TearDown() override { gko::kernels::hip::reset_device(0); }
-};
-
-testing::Environment* hip_env =
-    testing::AddGlobalTestEnvironment(new HipEnvironment);
-
-
 class HipTestFixture : public ::testing::Test {
 protected:
     HipTestFixture()
         : ref(gko::ReferenceExecutor::create()),
 #ifdef GKO_TEST_NONDEFAULT_STREAM
-          exec(gko::HipExecutor::create(
-              0, ref, false, gko::default_hip_alloc_mode, stream.get()))
-#else
-          exec(gko::HipExecutor::create(0, ref))
+          stream(ResourceEnvironment::hip_device_id),
 #endif
+          exec(gko::HipExecutor::create(ResourceEnvironment::hip_device_id, ref,
+                                        std::make_shared<gko::HipAllocator>(),
+                                        stream.get())),
+          guard(exec->get_scoped_device_id_guard())
     {}
 
     void TearDown()
@@ -75,11 +69,10 @@ class HipTestFixture : public ::testing::Test {
         }
     }
 
-#ifdef GKO_TEST_NONDEFAULT_STREAM
     gko::hip_stream stream;
-#endif
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<gko::HipExecutor> exec;
+    gko::scoped_device_id_guard guard;
 };
 
 
diff --git a/hip/test/utils/CMakeLists.txt b/hip/test/utils/CMakeLists.txt
index a6c52f65d9c..d9ec2ff29a7 100644
--- a/hip/test/utils/CMakeLists.txt
+++ b/hip/test/utils/CMakeLists.txt
@@ -1 +1 @@
-ginkgo_create_hip_test(assertions_test)
+ginkgo_create_test(assertions_test RESOURCE_TYPE hipgpu)
diff --git a/hip/test/utils/assertions_test.hip.cpp b/hip/test/utils/assertions_test.cpp
similarity index 100%
rename from hip/test/utils/assertions_test.hip.cpp
rename to hip/test/utils/assertions_test.cpp
diff --git a/include/ginkgo/core/base/abstract_factory.hpp b/include/ginkgo/core/base/abstract_factory.hpp
index 1c5043c186f..a441d4102d9 100644
--- a/include/ginkgo/core/base/abstract_factory.hpp
+++ b/include/ginkgo/core/base/abstract_factory.hpp
@@ -34,6 +34,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_PUBLIC_CORE_BASE_ABSTRACT_FACTORY_HPP_
 
 
+#include <unordered_map>
+
+
 #include <ginkgo/core/base/polymorphic_object.hpp>
 
 
@@ -257,7 +260,11 @@ class enable_parameters_type {
      */
     std::unique_ptr<Factory> on(std::shared_ptr<const Executor> exec) const
     {
-        auto factory = std::unique_ptr<Factory>(new Factory(exec, *self()));
+        ConcreteParametersType copy = *self();
+        for (const auto& item : deferred_factories) {
+            item.second(exec, copy);
+        }
+        auto factory = std::unique_ptr<Factory>(new Factory(exec, copy));
         for (auto& logger : loggers) {
             factory->add_logger(logger);
         };
@@ -271,9 +278,373 @@ class enable_parameters_type {
      * Loggers to be attached to the factory and generated object.
      */
     std::vector<std::shared_ptr<const log::Logger>> loggers{};
+
+    /**
+     * Deferred factory parameter initialization functions that will be called
+     * in on(). Their names usually correspond to the variable names in the
+     * parameter type. They will be provided the executor and the parameter
+     * object currently being initialized from the generators.
+     */
+    std::unordered_map<std::string,
+                       std::function<void(std::shared_ptr<const Executor> exec,
+                                          ConcreteParametersType&)>>
+        deferred_factories;
+};
+
+
+/**
+ * This Macro will generate a new type containing the parameters for the factory
+ * `_factory_name`. For more details, see #GKO_ENABLE_LIN_OP_FACTORY().
+ * It is required to use this macro **before** calling the
+ * macro #GKO_ENABLE_LIN_OP_FACTORY().
+ * It is also required to use the same names for all parameters between both
+ * macros.
+ *
+ * @param _parameters_name  name of the parameters member in the class
+ * @param _factory_name  name of the generated factory type
+ *
+ * @ingroup LinOp
+ */
+#define GKO_CREATE_FACTORY_PARAMETERS(_parameters_name, _factory_name)  \
+public:                                                                 \
+    class _factory_name;                                                \
+    struct _parameters_name##_type                                      \
+        : public ::gko::enable_parameters_type<_parameters_name##_type, \
+                                               _factory_name>
+
+
+namespace detail {
+
+
+// Use pointer not the type because std::is_convertible<const type, type> can be
+// true.
+template <typename From, typename To>
+struct is_pointer_convertible : std::is_convertible<From*, To*> {};
+
+
+}  // namespace detail
+
+
+/**
+ * Represents a factory parameter of factory type that can either initialized by
+ * a pre-existing factory or by passing in a factory_parameters object whose
+ * `.on(exec)` will be called to instantiate a factory.
+ *
+ * @tparam FactoryType  the type of factory that can be instantiated from this
+ * object.
+ */
+template <typename FactoryType>
+class deferred_factory_parameter {
+public:
+    /** Creates an empty deferred factory parameter. */
+    deferred_factory_parameter() = default;
+
+    /** Creates a deferred factory parameter returning a nullptr. */
+    deferred_factory_parameter(std::nullptr_t)
+    {
+        generator_ = [](std::shared_ptr<const Executor>) { return nullptr; };
+    }
+
+    /**
+     * Creates a deferred factory parameter from a preexisting factory with
+     * shared ownership.
+     */
+    template <typename ConcreteFactoryType,
+              std::enable_if_t<detail::is_pointer_convertible<
+                  ConcreteFactoryType, FactoryType>::value>* = nullptr>
+    deferred_factory_parameter(std::shared_ptr<ConcreteFactoryType> factory)
+    {
+        generator_ = [factory =
+                          std::shared_ptr<FactoryType>(std::move(factory))](
+                         std::shared_ptr<const Executor>) { return factory; };
+    }
+
+    /**
+     * Creates a deferred factory parameter by taking ownership of a
+     * preexisting factory with unique ownership.
+     */
+    template <typename ConcreteFactoryType, typename Deleter,
+              std::enable_if_t<detail::is_pointer_convertible<
+                  ConcreteFactoryType, FactoryType>::value>* = nullptr>
+    deferred_factory_parameter(
+        std::unique_ptr<ConcreteFactoryType, Deleter> factory)
+    {
+        generator_ = [factory =
+                          std::shared_ptr<FactoryType>(std::move(factory))](
+                         std::shared_ptr<const Executor>) { return factory; };
+    }
+
+    /**
+     * Creates a deferred factory parameter object from a
+     * factory_parameters-like object. To instantiate the actual factory, the
+     * parameter's `.on(exec)` function will be called.
+     */
+    template <typename ParametersType,
+              typename U = decltype(std::declval<ParametersType>().on(
+                  std::shared_ptr<const Executor>{})),
+              std::enable_if_t<detail::is_pointer_convertible<
+                  typename U::element_type, FactoryType>::value>* = nullptr>
+    deferred_factory_parameter(ParametersType parameters)
+    {
+        generator_ = [parameters](std::shared_ptr<const Executor> exec)
+            -> std::shared_ptr<FactoryType> { return parameters.on(exec); };
+    }
+
+    /**
+     * Instantiates the deferred parameter into an actual factory. This will
+     * throw if the deferred factory parameter is empty.
+     */
+    std::shared_ptr<FactoryType> on(std::shared_ptr<const Executor> exec) const
+    {
+        if (this->is_empty()) {
+            GKO_NOT_SUPPORTED(*this);
+        }
+        return generator_(exec);
+    }
+
+    /** Returns true iff the parameter is empty. */
+    bool is_empty() const { return !bool(generator_); }
+
+private:
+    std::function<std::shared_ptr<FactoryType>(std::shared_ptr<const Executor>)>
+        generator_;
 };
 
 
+/**
+ * Defines a build method for the factory, simplifying its construction by
+ * removing the repetitive typing of factory's name.
+ *
+ * @param _factory_name  the factory for which to define the method
+ *
+ * @ingroup LinOp
+ */
+#define GKO_ENABLE_BUILD_METHOD(_factory_name)                               \
+    static auto build()->decltype(_factory_name::create())                   \
+    {                                                                        \
+        return _factory_name::create();                                      \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
+
+#if !(defined(__CUDACC__) || defined(__HIPCC__))
+/**
+ * Creates a factory parameter in the factory parameters structure.
+ *
+ * @param _name  name of the parameter
+ * @param __VA_ARGS__  default value of the parameter
+ *
+ * @see GKO_ENABLE_LIN_OP_FACTORY for more details, and usage example
+ *
+ * @deprecated Use GKO_FACTORY_PARAMETER_SCALAR or GKO_FACTORY_PARAMETER_VECTOR
+ *
+ * @ingroup LinOp
+ */
+#define GKO_FACTORY_PARAMETER(_name, ...)                                    \
+    _name{__VA_ARGS__};                                                      \
+                                                                             \
+    template <typename... Args>                                              \
+    auto with_##_name(Args&&... _value)                                      \
+        ->std::decay_t<decltype(*(this->self()))>&                           \
+    {                                                                        \
+        using type = decltype(this->_name);                                  \
+        this->_name = type{std::forward<Args>(_value)...};                   \
+        return *(this->self());                                              \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
+/**
+ * Creates a scalar factory parameter in the factory parameters structure.
+ *
+ * Scalar in this context means that the constructor for this type only takes
+ * a single parameter.
+ *
+ * @param _name  name of the parameter
+ * @param _default  default value of the parameter
+ *
+ * @see GKO_ENABLE_LIN_OP_FACTORY for more details, and usage example
+ *
+ * @ingroup LinOp
+ */
+#define GKO_FACTORY_PARAMETER_SCALAR(_name, _default) \
+    GKO_FACTORY_PARAMETER(_name, _default)
+
+/**
+ * Creates a vector factory parameter in the factory parameters structure.
+ *
+ * Vector in this context means that the constructor for this type takes
+ * multiple parameters.
+ *
+ * @param _name  name of the parameter
+ * @param _default  default value of the parameter
+ *
+ * @see GKO_ENABLE_LIN_OP_FACTORY for more details, and usage example
+ *
+ * @ingroup LinOp
+ */
+#define GKO_FACTORY_PARAMETER_VECTOR(_name, ...) \
+    GKO_FACTORY_PARAMETER(_name, __VA_ARGS__)
+#else  // defined(__CUDACC__) || defined(__HIPCC__)
+// A workaround for the NVCC compiler - parameter pack expansion does not work
+// properly, because while the assignment to a scalar value is translated by
+// cudafe into a C-style cast, the parameter pack expansion is not removed and
+// `Args&&... args` is still kept as a parameter pack.
+#define GKO_FACTORY_PARAMETER(_name, ...)                                    \
+    _name{__VA_ARGS__};                                                      \
+                                                                             \
+    template <typename... Args>                                              \
+    auto with_##_name(Args&&... _value)                                      \
+        ->std::decay_t<decltype(*(this->self()))>&                           \
+    {                                                                        \
+        GKO_NOT_IMPLEMENTED;                                                 \
+        return *(this->self());                                              \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
+#define GKO_FACTORY_PARAMETER_SCALAR(_name, _default)                         \
+    _name{_default};                                                          \
+                                                                              \
+    template <typename Arg>                                                   \
+    auto with_##_name(Arg&& _value)->std::decay_t<decltype(*(this->self()))>& \
+    {                                                                         \
+        using type = decltype(this->_name);                                   \
+        this->_name = type{std::forward<Arg>(_value)};                        \
+        return *(this->self());                                               \
+    }                                                                         \
+    static_assert(true,                                                       \
+                  "This assert is used to counter the false positive extra "  \
+                  "semi-colon warnings")
+
+#define GKO_FACTORY_PARAMETER_VECTOR(_name, ...)                             \
+    _name{__VA_ARGS__};                                                      \
+                                                                             \
+    template <typename... Args>                                              \
+    auto with_##_name(Args&&... _value)                                      \
+        ->std::decay_t<decltype(*(this->self()))>&                           \
+    {                                                                        \
+        using type = decltype(this->_name);                                  \
+        this->_name = type{std::forward<Args>(_value)...};                   \
+        return *(this->self());                                              \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+#endif  // defined(__CUDACC__) || defined(__HIPCC__)
+
+/**
+ * Creates a factory parameter of factory type. The parameter can either be set
+ * directly, or its creation can be deferred until the executor is set in the
+ * `.on(exec)` function call, by using a deferred_factory_parameter.
+ *
+ * @param _name  name of the parameter
+ * @param _type  pointee type of the parameter, e.g. LinOpFactory
+ *
+ */
+#define GKO_DEFERRED_FACTORY_PARAMETER(_name)                                  \
+    _name{};                                                                   \
+                                                                               \
+private:                                                                       \
+    using _name##_type = typename decltype(_name)::element_type;               \
+                                                                               \
+public:                                                                        \
+    auto with_##_name(::gko::deferred_factory_parameter<_name##_type> factory) \
+        ->std::decay_t<decltype(*(this->self()))>&                             \
+    {                                                                          \
+        this->_name##_generator_ = std::move(factory);                         \
+        this->deferred_factories[#_name] = [](const auto& exec,                \
+                                              auto& params) {                  \
+            if (!params._name##_generator_.is_empty()) {                       \
+                params._name = params._name##_generator_.on(exec);             \
+            }                                                                  \
+        };                                                                     \
+        return *(this->self());                                                \
+    }                                                                          \
+                                                                               \
+private:                                                                       \
+    ::gko::deferred_factory_parameter<_name##_type> _name##_generator_;        \
+                                                                               \
+public:                                                                        \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
+                  "semi-colon warnings")
+
+/**
+ * Creates a factory parameter representing a vector of factories type. The
+ * parameter can either be set directly, or its creation can be deferred until
+ * the executor is set in the
+ * `.on(exec)` function call, by using a vector of deferred_factory_parameters.
+ *
+ * @param _name  name of the parameter
+ * @param _type  pointee type of the vector entries, e.g. LinOpFactory
+ *
+ */
+#define GKO_DEFERRED_FACTORY_VECTOR_PARAMETER(_name)                           \
+    _name{};                                                                   \
+                                                                               \
+private:                                                                       \
+    using _name##_type = typename decltype(_name)::value_type::element_type;   \
+                                                                               \
+public:                                                                        \
+    template <typename... Args,                                                \
+              typename = std::enable_if_t<::gko::xstd::conjunction<            \
+                  std::is_convertible<Args, ::gko::deferred_factory_parameter< \
+                                                _name##_type>>...>::value>>    \
+    auto with_##_name(Args&&... factories)                                     \
+        ->std::decay_t<decltype(*(this->self()))>&                             \
+    {                                                                          \
+        this->_name##_generator_ = {                                           \
+            ::gko::deferred_factory_parameter<_name##_type>{                   \
+                std::forward<Args>(factories)}...};                            \
+        this->deferred_factories[#_name] = [](const auto& exec,                \
+                                              auto& params) {                  \
+            if (!params._name##_generator_.empty()) {                          \
+                params._name.clear();                                          \
+                for (auto& generator : params._name##_generator_) {            \
+                    params._name.push_back(generator.on(exec));                \
+                }                                                              \
+            }                                                                  \
+        };                                                                     \
+        return *(this->self());                                                \
+    }                                                                          \
+    template <typename FactoryType,                                            \
+              typename = std::enable_if_t<std::is_convertible<                 \
+                  FactoryType,                                                 \
+                  ::gko::deferred_factory_parameter<_name##_type>>::value>>    \
+    auto with_##_name(const std::vector<FactoryType>& factories)               \
+        ->std::decay_t<decltype(*(this->self()))>&                             \
+    {                                                                          \
+        this->_name##_generator_.clear();                                      \
+        for (const auto& factory : factories) {                                \
+            this->_name##_generator_.push_back(factory);                       \
+        }                                                                      \
+        this->deferred_factories[#_name] = [](const auto& exec,                \
+                                              auto& params) {                  \
+            if (!params._name##_generator_.empty()) {                          \
+                params._name.clear();                                          \
+                for (auto& generator : params._name##_generator_) {            \
+                    params._name.push_back(generator.on(exec));                \
+                }                                                              \
+            }                                                                  \
+        };                                                                     \
+        return *(this->self());                                                \
+    }                                                                          \
+                                                                               \
+private:                                                                       \
+    std::vector<::gko::deferred_factory_parameter<_name##_type>>               \
+        _name##_generator_;                                                    \
+                                                                               \
+public:                                                                        \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
+                  "semi-colon warnings")
+
+
 }  // namespace gko
 
 
diff --git a/include/ginkgo/core/base/array.hpp b/include/ginkgo/core/base/array.hpp
index 1140f1e400c..846817f9938 100644
--- a/include/ginkgo/core/base/array.hpp
+++ b/include/ginkgo/core/base/array.hpp
@@ -160,7 +160,7 @@ class const_array_view {
 
 
 template <typename ValueType>
-using ConstArrayView [[deprecated("please use const_array_view")]] =
+using ConstArrayView GKO_DEPRECATED("please use const_array_view") =
     const_array_view<ValueType>;
 
 
@@ -714,7 +714,7 @@ class array {
 
 
 template <typename ValueType>
-using Array [[deprecated("please use array")]] = array<ValueType>;
+using Array GKO_DEPRECATED("please use array") = array<ValueType>;
 
 
 /**
diff --git a/include/ginkgo/core/base/batch_dim.hpp b/include/ginkgo/core/base/batch_dim.hpp
new file mode 100644
index 00000000000..e0ade2c872f
--- /dev/null
+++ b/include/ginkgo/core/base/batch_dim.hpp
@@ -0,0 +1,159 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_BASE_BATCH_DIM_HPP_
+#define GKO_PUBLIC_CORE_BASE_BATCH_DIM_HPP_
+
+
+#include <iostream>
+
+
+#include <ginkgo/core/base/dim.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+
+
+/**
+ * A type representing the dimensions of a multidimensional batch object.
+ *
+ * @tparam Dimensionality  number of dimensions of the object
+ * @tparam DimensionType  datatype used to represent each dimension
+ *
+ * @ingroup batch_dim
+ */
+template <size_type Dimensionality = 2, typename DimensionType = size_type>
+struct batch_dim {
+    static constexpr size_type dimensionality = Dimensionality;
+    using dimension_type = DimensionType;
+
+    /**
+     * Get the number of batch items stored
+     *
+     * @return num_batch_items
+     */
+    size_type get_num_batch_items() const { return num_batch_items_; }
+
+    /**
+     * Get the common size of the batch items
+     *
+     * @return common_size
+     */
+    dim<dimensionality, dimension_type> get_common_size() const
+    {
+        return common_size_;
+    }
+
+    /**
+     * Checks if two batch_dim objects are equal.
+     *
+     * @param x  first object
+     * @param y  second object
+     *
+     * @return true if and only if all dimensions of both objects are equal.
+     */
+    friend bool operator==(const batch_dim& x, const batch_dim& y)
+    {
+        return x.num_batch_items_ == y.num_batch_items_ &&
+               x.common_size_ == y.common_size_;
+    }
+
+
+    /**
+     * Checks if two batch_dim objects are different.
+     *
+     * @tparam Dimensionality  number of dimensions of the dim objects
+     * @tparam DimensionType  datatype used to represent each dimension
+     *
+     * @param x  first object
+     * @param y  second object
+     *
+     * @return `!(x == y)`
+     */
+    friend bool operator!=(const batch_dim<Dimensionality, DimensionType>& x,
+                           const batch_dim<Dimensionality, DimensionType>& y)
+    {
+        return !(x == y);
+    }
+
+
+    /**
+     * The default constructor
+     */
+    batch_dim()
+        : common_size_(dim<dimensionality, dimension_type>{}),
+          num_batch_items_(0)
+    {}
+
+    /**
+     * Creates a batch_dim object which stores a uniform size for all batch
+     * entries.
+     *
+     * @param num_batch_items  the number of batch items to be stored
+     * @param common_size  the common size of all the batch items stored
+     *
+     * @note  Use this constructor when uniform batches need to be stored.
+     */
+    explicit batch_dim(const size_type num_batch_items,
+                       const dim<dimensionality, dimension_type>& common_size)
+        : common_size_(common_size), num_batch_items_(num_batch_items)
+    {}
+
+private:
+    size_type num_batch_items_{};
+    dim<dimensionality, dimension_type> common_size_{};
+};
+
+
+/**
+ * Returns a batch_dim object with its dimensions swapped for batched operators
+ *
+ * @tparam DimensionType  datatype used to represent each dimension
+ *
+ * @param dimensions original object
+ *
+ * @return a batch_dim object with dimensions swapped
+ */
+template <typename DimensionType>
+inline batch_dim<2, DimensionType> transpose(
+    const batch_dim<2, DimensionType>& input)
+{
+    return batch_dim<2, DimensionType>(input.get_num_batch_items(),
+                                       transpose(input.get_common_size()));
+}
+
+
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_BASE_BATCH_DIM_HPP_
diff --git a/include/ginkgo/core/base/batch_lin_op.hpp b/include/ginkgo/core/base/batch_lin_op.hpp
new file mode 100644
index 00000000000..03f3a6dc6f4
--- /dev/null
+++ b/include/ginkgo/core/base/batch_lin_op.hpp
@@ -0,0 +1,431 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_BASE_BATCH_LIN_OP_HPP_
+#define GKO_PUBLIC_CORE_BASE_BATCH_LIN_OP_HPP_
+
+
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+
+#include <ginkgo/core/base/abstract_factory.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/dim.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/matrix_assembly_data.hpp>
+#include <ginkgo/core/base/matrix_data.hpp>
+#include <ginkgo/core/base/polymorphic_object.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/log/logger.hpp>
+
+
+namespace gko {
+namespace batch {
+
+
+/**
+ * @addtogroup BatchLinOp
+ *
+ * @section batch_linop_concept Batched Linear operator as a concept
+ *
+ * A batch linear operator (BatchLinOp) forms the base class for all batched
+ * linear algebra objects. In general, it follows the same structure as the
+ * LinOp class, but has some crucial differences which make it not strictly
+ * representable through or with the LinOp class.
+ *
+ * A batched operator is defined as a set of independent linear operators which
+ * have no communication/information exchange between them. Therefore, any
+ * collective operations between the batches is not possible and not
+ * implemented. This allows for each batch to be computed and operated on in an
+ * embarrassingly parallel fashion.
+ *
+ * A key difference between the LinOp and the BatchLinOp class is that the apply
+ * between BatchLinOps is no longer supported. The user can apply a BatchLinOp
+ * to a batch::MultiVector but not to any general BatchLinOp.
+ *
+ * Therefore, the BatchLinOp serves only as a base class providing necessary
+ * core functionality from Polymorphic object and store the dimensions of the
+ * batched object.
+ *
+ * @note Apply to batch::MultiVector objects are handled by the concrete LinOp
+ * and may be moved to the base BatchLinOp class in the future.
+ *
+ * @ref BatchLinOp
+ */
+class BatchLinOp : public EnableAbstractPolymorphicObject<BatchLinOp> {
+public:
+    /**
+     * Returns the number of items in the batch operator.
+     *
+     * @return  number of items in the batch operator
+     */
+    size_type get_num_batch_items() const noexcept
+    {
+        return get_size().get_num_batch_items();
+    }
+
+    /**
+     * Returns the common size of the batch items.
+     *
+     * @return  the common size stored
+     */
+    dim<2> get_common_size() const { return get_size().get_common_size(); }
+
+    /**
+     * Returns the size of the batch operator.
+     *
+     * @return  size of the batch operator, a batch_dim object
+     */
+    const batch_dim<2>& get_size() const noexcept { return size_; }
+
+    /**
+     * Validates the sizes for the apply(b,x) operation in the
+     * concrete BatchLinOp.
+     *
+     */
+    template <typename ValueType>
+    void validate_application_parameters(const MultiVector<ValueType>* b,
+                                         MultiVector<ValueType>* x) const
+    {
+        GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items());
+        GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items());
+
+        GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size());
+        GKO_ASSERT_EQUAL_ROWS(this->get_common_size(), x->get_common_size());
+        GKO_ASSERT_EQUAL_COLS(b->get_common_size(), x->get_common_size());
+    }
+
+    /**
+     * Validates the sizes for the apply(alpha, b , beta, x) operation in the
+     * concrete BatchLinOp.
+     *
+     */
+    template <typename ValueType>
+    void validate_application_parameters(const MultiVector<ValueType>* alpha,
+                                         const MultiVector<ValueType>* b,
+                                         const MultiVector<ValueType>* beta,
+                                         MultiVector<ValueType>* x) const
+    {
+        GKO_ASSERT_EQ(b->get_num_batch_items(), this->get_num_batch_items());
+        GKO_ASSERT_EQ(this->get_num_batch_items(), x->get_num_batch_items());
+
+        GKO_ASSERT_CONFORMANT(this->get_common_size(), b->get_common_size());
+        GKO_ASSERT_EQUAL_ROWS(this->get_common_size(), x->get_common_size());
+        GKO_ASSERT_EQUAL_COLS(b->get_common_size(), x->get_common_size());
+        GKO_ASSERT_EQUAL_DIMENSIONS(alpha->get_common_size(),
+                                    gko::dim<2>(1, 1));
+        GKO_ASSERT_EQUAL_DIMENSIONS(beta->get_common_size(), gko::dim<2>(1, 1));
+    }
+
+protected:
+    /**
+     * Sets the size of the batch operator.
+     *
+     * @param size to be set
+     */
+    void set_size(const batch_dim<2>& size) { size_ = size; }
+
+    /**
+     * Creates a batch operator storing items of uniform sizes.
+     *
+     * @param exec        the executor where all the operations are performed
+     * @param batch_size  the size the batched operator, as a batch_dim object
+     */
+    explicit BatchLinOp(std::shared_ptr<const Executor> exec,
+                        const batch_dim<2>& batch_size)
+        : EnableAbstractPolymorphicObject<BatchLinOp>(exec), size_{batch_size}
+    {}
+
+    /**
+     * Creates a batch operator storing items of uniform sizes.
+     *
+     * @param exec        the executor where all the operations are performed
+     * @param num_batch_items the number of batch items to be stored in the
+     * operator
+     * @param size        the common size of the items in the batched operator
+     */
+    explicit BatchLinOp(std::shared_ptr<const Executor> exec,
+                        const size_type num_batch_items = 0,
+                        const dim<2>& common_size = dim<2>{})
+        : BatchLinOp{std::move(exec),
+                     num_batch_items > 0
+                         ? batch_dim<2>(num_batch_items, common_size)
+                         : batch_dim<2>{}}
+    {}
+
+private:
+    batch_dim<2> size_{};
+};
+
+
+/**
+ * A BatchLinOpFactory represents a higher order mapping which transforms one
+ * batch linear operator into another.
+ *
+ * In a similar fashion to LinOps, BatchLinOps are also "generated" from the
+ * BatchLinOpFactory. A function of this class is to provide a generate method,
+ * which internally cals the generate_impl(), which the concrete BatchLinOps
+ * have to implement.
+ *
+ * Example: using BatchCG in Ginkgo
+ * ---------------------------
+ *
+ * ```c++
+ * // Suppose A is a batch matrix, batch_b, a batch rhs vector, and batch_x, an
+ * // initial guess
+ * // Create a BatchCG which runs for at most 1000 iterations, and stops after
+ * // reducing the residual norm by 6 orders of magnitude
+ * auto batch_cg_factory = solver::BatchCg<>::build()
+ *     .with_max_iters(1000)
+ *     .with_rel_residual_goal(1e-6)
+ *     .on(cuda);
+ * // create a batch linear operator which represents the solver
+ * auto batch_cg = batch_cg_factory->generate(A);
+ * // solve the system
+ * batch_cg->apply(batch_b, batch_x);
+ * ```
+ *
+ * @ingroup BatchLinOp
+ */
+class BatchLinOpFactory
+    : public AbstractFactory<BatchLinOp, std::shared_ptr<const BatchLinOp>> {
+public:
+    using AbstractFactory<BatchLinOp,
+                          std::shared_ptr<const BatchLinOp>>::AbstractFactory;
+
+    std::unique_ptr<BatchLinOp> generate(
+        std::shared_ptr<const BatchLinOp> input) const
+    {
+        this->template log<
+            gko::log::Logger::batch_linop_factory_generate_started>(
+            this, input.get());
+        const auto exec = this->get_executor();
+        std::unique_ptr<BatchLinOp> generated;
+        if (input->get_executor() == exec) {
+            generated = this->AbstractFactory::generate(input);
+        } else {
+            generated =
+                this->AbstractFactory::generate(gko::clone(exec, input));
+        }
+        this->template log<
+            gko::log::Logger::batch_linop_factory_generate_completed>(
+            this, input.get(), generated.get());
+        return generated;
+    }
+};
+
+
+/**
+ * The EnableBatchLinOp mixin can be used to provide sensible default
+ * implementations of the majority of the BatchLinOp and PolymorphicObject
+ * interface.
+ *
+ * The goal of the mixin is to facilitate the development of new BatchLinOp, by
+ * enabling the implementers to focus on the important parts of their operator,
+ * while the library takes care of generating the trivial utility functions.
+ * The mixin will provide default implementations for the entire
+ * PolymorphicObject interface, including a default implementation of
+ * `copy_from` between objects of the new BatchLinOp type.
+ *
+ * Implementers of new BatchLinOps are required to specify only the following
+ * aspects:
+ *
+ * 1.  Creation of the BatchLinOp: This can be facilitated via either
+ *     EnableCreateMethod mixin (used mostly for matrix formats),
+ *     or GKO_ENABLE_BATCH_LIN_OP_FACTORY macro (used for operators created from
+ *     other operators, like preconditioners and solvers).
+ *
+ * @tparam ConcreteBatchLinOp  the concrete BatchLinOp which is being
+ *                             implemented [CRTP parameter]
+ * @tparam PolymorphicBase  parent of ConcreteBatchLinOp in the polymorphic
+ *                          hierarchy, has to be a subclass of BatchLinOp
+ *
+ * @ingroup BatchLinOp
+ */
+template <typename ConcreteBatchLinOp, typename PolymorphicBase = BatchLinOp>
+class EnableBatchLinOp
+    : public EnablePolymorphicObject<ConcreteBatchLinOp, PolymorphicBase>,
+      public EnablePolymorphicAssignment<ConcreteBatchLinOp> {
+public:
+    using EnablePolymorphicObject<ConcreteBatchLinOp,
+                                  PolymorphicBase>::EnablePolymorphicObject;
+};
+
+
+/**
+ * This is an alias for the EnableDefaultFactory mixin, which correctly sets the
+ * template parameters to enable a subclass of BatchLinOpFactory.
+ *
+ * @tparam ConcreteFactory  the concrete factory which is being implemented
+ *                          [CRTP parameter]
+ * @tparam ConcreteBatchLinOp  the concrete BatchLinOp type which this factory
+ * produces, needs to have a constructor which takes a const ConcreteFactory *,
+ * and an std::shared_ptr<const BatchLinOp> as parameters.
+ * @tparam ParametersType  a subclass of enable_parameters_type template which
+ *                         defines all of the parameters of the factory
+ * @tparam PolymorphicBase  parent of ConcreteFactory in the polymorphic
+ *                          hierarchy, has to be a subclass of BatchLinOpFactory
+ *
+ * @ingroup BatchLinOp
+ */
+template <typename ConcreteFactory, typename ConcreteBatchLinOp,
+          typename ParametersType, typename PolymorphicBase = BatchLinOpFactory>
+using EnableDefaultBatchLinOpFactory =
+    EnableDefaultFactory<ConcreteFactory, ConcreteBatchLinOp, ParametersType,
+                         PolymorphicBase>;
+
+
+/**
+ * This macro will generate a default implementation of a BatchLinOpFactory for
+ * the BatchLinOp subclass it is defined in.
+ *
+ * It is required to first call the macro #GKO_CREATE_FACTORY_PARAMETERS()
+ * before this one in order to instantiate the parameters type first.
+ *
+ * The list of parameters for the factory should be defined in a code block
+ * after the macro definition, and should contain a list of
+ * GKO_FACTORY_PARAMETER_* declarations. The class should provide a constructor
+ * with signature
+ * _batch_lin_op(const _factory_name *, std::shared_ptr<const BatchLinOp>)
+ * which the factory will use a callback to construct the object.
+ *
+ * A minimal example of a batch linear operator is the following:
+ *
+ * ```c++
+ * struct MyBatchLinOp : public EnableBatchLinOp<MyBatchLinOp> {
+ *     GKO_ENABLE_BATCH_LIN_OP_FACTORY(MyBatchLinOp, my_parameters, Factory) {
+ *         // a factory parameter named "my_value", of type int and default
+ *         // value of 5
+ *         int GKO_FACTORY_PARAMETER_SCALAR(my_value, 5);
+ *         // a factory parameter named `my_pair` of type `std::pair<int,int>`
+ *         // and default value {5, 5}
+ *         std::pair<int, int> GKO_FACTORY_PARAMETER_VECTOR(my_pair, 5, 5);
+ *     };
+ *     // constructor needed by EnableBatchLinOp
+ *     explicit MyBatchLinOp(std::shared_ptr<const Executor> exec) {
+ *         : EnableBatchLinOp<MyBatchLinOp>(exec) {}
+ *     // constructor needed by the factory
+ *     explicit MyBatchLinOp(const Factory *factory,
+ *                      std::shared_ptr<const BatchLinOp> matrix)
+ *         : EnableBatchLinOp<MyBatchLinOp>(factory->get_executor()),
+ *                                          matrix->get_size()),
+ *           // store factory's parameters locally
+ *           my_parameters_{factory->get_parameters()}
+ *     {
+ *          int value = my_parameters_.my_value;
+ *          // do something with value
+ *     }
+ * ```
+ *
+ * MyBatchLinOp can then be created as follows:
+ *
+ * ```c++
+ * auto exec = gko::ReferenceExecutor::create();
+ * // create a factory with default `my_value` parameter
+ * auto fact = MyBatchLinOp::build().on(exec);
+ * // create a operator using the factory:
+ * auto my_op = fact->generate(gko::batch::matrix::Identity::create(exec, 2));
+ * std::cout << my_op->get_my_parameters().my_value;  // prints 5
+ *
+ * // create a factory with custom `my_value` parameter
+ * auto fact = MyLinOp::build().with_my_value(0).on(exec);
+ * // create a operator using the factory:
+ * auto my_op = fact->generate(gko::batch::matrix::Identity::create(exec, 2));
+ * std::cout << my_op->get_my_parameters().my_value;  // prints 0
+ * ```
+ *
+ * @note It is possible to combine both the #GKO_CREATE_FACTORY_PARAMETER_*()
+ * macros with this one in a unique macro for class __templates__ (not with
+ * regular classes). Splitting this into two distinct macros allows to use them
+ * in all contexts. See <https://stackoverflow.com/q/50202718/9385966> for more
+ * details.
+ *
+ * @param _batch_lin_op  concrete operator for which the factory is to be
+ *                       created [CRTP parameter]
+ * @param _parameters_name  name of the parameters member in the class
+ *                          (its type is `<_parameters_name>_type`, the
+ *                          protected member's name is `<_parameters_name>_`,
+ *                          and the public getter's name is
+ *                          `get_<_parameters_name>()`)
+ * @param _factory_name  name of the generated factory type
+ *
+ * @ingroup BatchLinOp
+ */
+#define GKO_ENABLE_BATCH_LIN_OP_FACTORY(_batch_lin_op, _parameters_name,     \
+                                        _factory_name)                       \
+public:                                                                      \
+    const _parameters_name##_type& get_##_parameters_name() const            \
+    {                                                                        \
+        return _parameters_name##_;                                          \
+    }                                                                        \
+                                                                             \
+    class _factory_name                                                      \
+        : public ::gko::batch::EnableDefaultBatchLinOpFactory<               \
+              _factory_name, _batch_lin_op, _parameters_name##_type> {       \
+        friend class ::gko::EnablePolymorphicObject<                         \
+            _factory_name, ::gko::batch::BatchLinOpFactory>;                 \
+        friend class ::gko::enable_parameters_type<_parameters_name##_type,  \
+                                                   _factory_name>;           \
+        explicit _factory_name(std::shared_ptr<const ::gko::Executor> exec)  \
+            : ::gko::batch::EnableDefaultBatchLinOpFactory<                  \
+                  _factory_name, _batch_lin_op, _parameters_name##_type>(    \
+                  std::move(exec))                                           \
+        {}                                                                   \
+        explicit _factory_name(std::shared_ptr<const ::gko::Executor> exec,  \
+                               const _parameters_name##_type& parameters)    \
+            : ::gko::batch::EnableDefaultBatchLinOpFactory<                  \
+                  _factory_name, _batch_lin_op, _parameters_name##_type>(    \
+                  std::move(exec), parameters)                               \
+        {}                                                                   \
+    };                                                                       \
+    friend ::gko::batch::EnableDefaultBatchLinOpFactory<                     \
+        _factory_name, _batch_lin_op, _parameters_name##_type>;              \
+                                                                             \
+                                                                             \
+private:                                                                     \
+    _parameters_name##_type _parameters_name##_;                             \
+                                                                             \
+public:                                                                      \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
+
+}  // namespace batch
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_BASE_BATCH_LIN_OP_HPP_
diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp
new file mode 100644
index 00000000000..405603269ff
--- /dev/null
+++ b/include/ginkgo/core/base/batch_multi_vector.hpp
@@ -0,0 +1,453 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_BASE_BATCH_MULTI_VECTOR_HPP_
+#define GKO_PUBLIC_CORE_BASE_BATCH_MULTI_VECTOR_HPP_
+
+
+#include <initializer_list>
+#include <vector>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/batch_dim.hpp>
+#include <ginkgo/core/base/dim.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/mtx_io.hpp>
+#include <ginkgo/core/base/polymorphic_object.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+namespace gko {
+namespace batch {
+
+
+/**
+ * MultiVector stores multiple vectors in a batched fashion and is useful
+ * for batched operations. For example, if you want to store two batch items
+ * with multi-vectors of size (3 x 2) given below:
+ *
+ * [1 2 ; 3 4
+ *  1 2 ; 3 4
+ *  1 2 ; 3 4]
+ *
+ * In memory, they would be stored as a single array:
+ * [1 2 1 2 1 2 3 4 3 4 3 4].
+ *
+ * Access functions @at can help access individual
+ * item if necessary.
+ *
+ * The values of the different batch items are stored consecutively and in each
+ * batch item, the multi-vectors are stored in a row-major fashion.
+ *
+ * @tparam ValueType  precision of multi-vector elements
+ *
+ * @ingroup batch_multi_vector
+ * @ingroup batched
+ */
+template <typename ValueType = default_precision>
+class MultiVector
+    : public EnablePolymorphicObject<MultiVector<ValueType>>,
+      public EnablePolymorphicAssignment<MultiVector<ValueType>>,
+      public EnableCreateMethod<MultiVector<ValueType>>,
+      public ConvertibleTo<MultiVector<next_precision<ValueType>>> {
+    friend class EnableCreateMethod<MultiVector>;
+    friend class EnablePolymorphicObject<MultiVector>;
+    friend class MultiVector<to_complex<ValueType>>;
+    friend class MultiVector<next_precision<ValueType>>;
+
+public:
+    using EnablePolymorphicAssignment<MultiVector>::convert_to;
+    using EnablePolymorphicAssignment<MultiVector>::move_to;
+    using ConvertibleTo<MultiVector<next_precision<ValueType>>>::convert_to;
+    using ConvertibleTo<MultiVector<next_precision<ValueType>>>::move_to;
+
+    using value_type = ValueType;
+    using index_type = int32;
+    using unbatch_type = gko::matrix::Dense<ValueType>;
+    using absolute_type = remove_complex<MultiVector<ValueType>>;
+    using complex_type = to_complex<MultiVector<ValueType>>;
+
+    /**
+     * Creates a MultiVector with the configuration of another
+     * MultiVector.
+     *
+     * @param other  The other multi-vector whose configuration needs to copied.
+     */
+    static std::unique_ptr<MultiVector> create_with_config_of(
+        ptr_param<const MultiVector> other);
+
+    void convert_to(
+        MultiVector<next_precision<ValueType>>* result) const override;
+
+    void move_to(MultiVector<next_precision<ValueType>>* result) override;
+
+    /**
+     * Creates a mutable view (of matrix::Dense type) of one item of the Batch
+     * MultiVector object. Does not perform any deep copies, but only returns a
+     * view of the data.
+     *
+     * @param item_id  The index of the batch item
+     *
+     * @return  a matrix::Dense object with the data from the batch item at the
+     *          given index.
+     */
+    std::unique_ptr<unbatch_type> create_view_for_item(size_type item_id);
+
+    /**
+     * @copydoc create_view_for_item(size_type)
+     */
+    std::unique_ptr<const unbatch_type> create_const_view_for_item(
+        size_type item_id) const;
+
+    /**
+     * Returns the batch size.
+     *
+     * @return the batch size
+     */
+    batch_dim<2> get_size() const { return batch_size_; }
+
+    /**
+     * Returns the number of batch items.
+     *
+     * @return the number of batch items
+     */
+    size_type get_num_batch_items() const
+    {
+        return batch_size_.get_num_batch_items();
+    }
+
+    /**
+     * Returns the common size of the batch items.
+     *
+     * @return the common size stored
+     */
+    dim<2> get_common_size() const { return batch_size_.get_common_size(); }
+
+    /**
+     * Returns a pointer to the array of values of the multi-vector
+     *
+     * @return the pointer to the array of values
+     */
+    value_type* get_values() noexcept { return values_.get_data(); }
+
+    /**
+     * @copydoc get_values()
+     *
+     * @note This is the constant version of the function, which can be
+     *       significantly more memory efficient than the non-constant version,
+     *       so always prefer this version.
+     */
+    const value_type* get_const_values() const noexcept
+    {
+        return values_.get_const_data();
+    }
+
+    /**
+     * Returns a pointer to the array of values of the multi-vector for a
+     * specific batch item.
+     *
+     * @param batch_id  the id of the batch item.
+     *
+     * @return the pointer to the array of values
+     */
+    value_type* get_values_for_item(size_type batch_id) noexcept
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return values_.get_data() + this->get_cumulative_offset(batch_id);
+    }
+
+    /**
+     * @copydoc get_values_for_item(size_type)
+     *
+     * @note This is the constant version of the function, which can be
+     *       significantly more memory efficient than the non-constant version,
+     *       so always prefer this version.
+     */
+    const value_type* get_const_values_for_item(
+        size_type batch_id) const noexcept
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return values_.get_const_data() + this->get_cumulative_offset(batch_id);
+    }
+
+    /**
+     * Returns the number of elements explicitly stored in the batch matrix,
+     * cumulative across all the batch items.
+     *
+     * @return the number of elements explicitly stored in the vector,
+     *         cumulative across all the batch items
+     */
+    size_type get_num_stored_elements() const noexcept
+    {
+        return values_.get_num_elems();
+    }
+
+    /**
+     * Get the cumulative storage size offset
+     *
+     * @param batch_id the batch id
+     *
+     * @return the cumulative offset
+     */
+    size_type get_cumulative_offset(size_type batch_id) const
+    {
+        return batch_id * this->get_common_size()[0] *
+               this->get_common_size()[1];
+    }
+
+    /**
+     * Returns a single element for a particular batch item.
+     *
+     * @param batch_id  the batch item index to be queried
+     * @param row  the row of the requested element
+     * @param col  the column of the requested element
+     *
+     * @note  the method has to be called on the same Executor the vector is
+     *        stored at (e.g. trying to call this method on a GPU multi-vector
+     *        from the OMP results in a runtime error)
+     */
+    value_type& at(size_type batch_id, size_type row, size_type col)
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return values_.get_data()[linearize_index(batch_id, row, col)];
+    }
+
+    /**
+     * @copydoc MultiVector::at(size_type, size_type, size_type)
+     */
+    value_type at(size_type batch_id, size_type row, size_type col) const
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return values_.get_const_data()[linearize_index(batch_id, row, col)];
+    }
+
+    /**
+     * Returns a single element for a particular batch item.
+     *
+     * Useful for iterating across all elements of the vector.
+     * However, it is less efficient than the two-parameter variant of this
+     * method.
+     *
+     * @param batch_id  the batch item index to be queried
+     * @param idx  a linear index of the requested element
+     *
+     * @note  the method has to be called on the same Executor the vector is
+     *        stored at (e.g. trying to call this method on a GPU multi-vector
+     *        from the OMP results in a runtime error)
+     */
+    ValueType& at(size_type batch_id, size_type idx) noexcept
+    {
+        return values_.get_data()[linearize_index(batch_id, idx)];
+    }
+
+    /**
+     * @copydoc MultiVector::at(size_type, size_type, size_type)
+     */
+    ValueType at(size_type batch_id, size_type idx) const noexcept
+    {
+        return values_.get_const_data()[linearize_index(batch_id, idx)];
+    }
+
+    /**
+     * Scales the vector with a scalar (aka: BLAS scal).
+     *
+     * @param alpha  the scalar
+     *
+     * @note If alpha is 1x1 MultiVector matrix, the entire multi-vector
+     *      (all batches) is scaled by alpha. If it is a MultiVector row
+     *      vector of values, then i-th column of the vector is scaled with the
+     *      i-th element of alpha (the number of columns of alpha has to match
+     *      the number of columns of the multi-vector).
+     */
+    void scale(ptr_param<const MultiVector<ValueType>> alpha);
+
+    /**
+     * Adds `b` scaled by `alpha` to the vector (aka: BLAS axpy).
+     *
+     * @param alpha  the scalar
+     * @param b  a multi-vector of the same dimension as this
+     *
+     * @note If alpha is 1x1 MultiVector matrix, the entire multi-vector
+     *      (all batches) is scaled by alpha. If it is a MultiVector row
+     *      vector of values, then i-th column of the vector is scaled with the
+     *      i-th element of alpha (the number of columns of alpha has to match
+     *      the number of columns of the multi-vector).
+     */
+    void add_scaled(ptr_param<const MultiVector<ValueType>> alpha,
+                    ptr_param<const MultiVector<ValueType>> b);
+
+    /**
+     * Computes the column-wise dot product of each multi-vector in this batch
+     * and its corresponding entry in `b`.
+     *
+     * @param b  a MultiVector of same dimension as this
+     * @param result  a MultiVector row vector, used to store the dot
+     * product
+     */
+    void compute_dot(ptr_param<const MultiVector<ValueType>> b,
+                     ptr_param<MultiVector<ValueType>> result) const;
+
+    /**
+     * Computes the column-wise conjugate dot product of each multi-vector in
+     * this batch and its corresponding entry in `b`. If the vector has complex
+     * value_type, then the conjugate of this is taken.
+     *
+     * @param b  a MultiVector of same dimension as this
+     * @param result  a MultiVector row vector, used to store the dot
+     *                product (the number of column in the vector must match the
+     *                number of columns of this)
+     */
+    void compute_conj_dot(ptr_param<const MultiVector<ValueType>> b,
+                          ptr_param<MultiVector<ValueType>> result) const;
+
+    /**
+     * Computes the Euclidean (L^2) norm of each multi-vector in this batch.
+     *
+     * @param result  a MultiVector, used to store the norm
+     *                (the number of columns in the vector must match the number
+     *                of columns of this)
+     */
+    void compute_norm2(
+        ptr_param<MultiVector<remove_complex<ValueType>>> result) const;
+
+    /**
+     * Creates a constant (immutable) batch multi-vector from a constant
+     * array.
+     *
+     * @param exec  the executor to create the vector on
+     * @param size  the dimensions of the vector
+     * @param values  the value array of the vector
+     * @param stride  the row-stride of the vector
+     *
+     * @return A smart pointer to the constant multi-vector wrapping the input
+     * array (if it resides on the same executor as the vector) or a copy of the
+     * array on the correct executor.
+     */
+    static std::unique_ptr<const MultiVector<ValueType>> create_const(
+        std::shared_ptr<const Executor> exec, const batch_dim<2>& sizes,
+        gko::detail::const_array_view<ValueType>&& values);
+
+    /**
+     * Fills the input MultiVector with a given value
+     *
+     * @param value  the value to be filled
+     */
+    void fill(ValueType value);
+
+private:
+    inline size_type compute_num_elems(const batch_dim<2>& size)
+    {
+        return size.get_num_batch_items() * size.get_common_size()[0] *
+               size.get_common_size()[1];
+    }
+
+protected:
+    /**
+     * Sets the size of the MultiVector.
+     *
+     * @param value  the new size of the operator
+     */
+    void set_size(const batch_dim<2>& value) noexcept;
+
+    /**
+     * Creates an uninitialized multi-vector of the specified
+     * size.
+     *
+     * @param exec  Executor associated to the vector
+     * @param size  size of the batch multi vector
+     */
+    MultiVector(std::shared_ptr<const Executor> exec,
+                const batch_dim<2>& size = batch_dim<2>{});
+
+    /**
+     * Creates a MultiVector from an already allocated (and
+     * initialized) array.
+     *
+     * @tparam ValuesArray  type of array of values
+     *
+     * @param exec  Executor associated to the vector
+     * @param size  sizes of the batch matrices in a batch_dim object
+     * @param values  array of values
+     *
+     * @note If `values` is not an rvalue, not an array of ValueType, or is on
+     *       the wrong executor, an internal copy will be created, and the
+     *       original array data will not be used in the vector.
+     */
+    template <typename ValuesArray>
+    MultiVector(std::shared_ptr<const Executor> exec, const batch_dim<2>& size,
+                ValuesArray&& values)
+        : EnablePolymorphicObject<MultiVector<ValueType>>(exec),
+          batch_size_(size),
+          values_{exec, std::forward<ValuesArray>(values)}
+    {
+        // Ensure that the values array has the correct size
+        auto num_elems = compute_num_elems(size);
+        GKO_ENSURE_IN_BOUNDS(num_elems, values_.get_num_elems() + 1);
+    }
+
+    /**
+     * Creates a MultiVector with the same configuration as the
+     * callers object.
+     *
+     * @returns a MultiVector with the same configuration as the
+     * caller.
+     */
+    std::unique_ptr<MultiVector> create_with_same_config() const;
+
+    size_type linearize_index(size_type batch, size_type row,
+                              size_type col) const noexcept
+    {
+        return this->get_cumulative_offset(batch) +
+               row * batch_size_.get_common_size()[1] + col;
+    }
+
+    size_type linearize_index(size_type batch, size_type idx) const noexcept
+    {
+        return linearize_index(batch, idx / this->get_common_size()[1],
+                               idx % this->get_common_size()[1]);
+    }
+
+private:
+    batch_dim<2> batch_size_;
+    array<value_type> values_;
+};
+
+
+}  // namespace batch
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_BASE_BATCH_MULTI_VECTOR_HPP_
diff --git a/include/ginkgo/core/base/composition.hpp b/include/ginkgo/core/base/composition.hpp
index 44c24b901b3..5091b4a439e 100644
--- a/include/ginkgo/core/base/composition.hpp
+++ b/include/ginkgo/core/base/composition.hpp
@@ -176,7 +176,7 @@ class Composition : public EnableLinOp<Composition<ValueType>>,
      * @tparam Rest  types of trailing parameters
      *
      * @param oper  the first operator
-     * @param rest  remainging operators
+     * @param rest  remaining operators
      */
     template <typename... Rest>
     explicit Composition(std::shared_ptr<const LinOp> oper, Rest&&... rest)
@@ -217,7 +217,7 @@ class UseComposition {
     }
 
     /**
-     * Returns the operator at index-th poistion of composition
+     * Returns the operator at index-th position of composition
      *
      * @return index-th operator
      *
diff --git a/include/ginkgo/core/base/exception.hpp b/include/ginkgo/core/base/exception.hpp
index ad39adf7a36..97b60f07d05 100644
--- a/include/ginkgo/core/base/exception.hpp
+++ b/include/ginkgo/core/base/exception.hpp
@@ -73,7 +73,7 @@ namespace gko {
  *     try {
  *         auto y = apply(A, x);
  *     } catch(Error e) {
- *         // an error occured, write the message to screen and exit
+ *         // an error occurred, write the message to screen and exit
  *         std::cout << e.what() << std::endl;
  *         return -1;
  *     }
@@ -160,7 +160,7 @@ class NotSupported : public Error {
      *
      * @param file  The name of the offending source file
      * @param line  The source code line number where the error occurred
-     * @param func  The name of the function where the error occured
+     * @param func  The name of the function where the error occurred
      * @param obj_type  The object type on which the requested operation
                        cannot be performed.
      */
@@ -513,7 +513,7 @@ class BadDimension : public Error {
  * Error that denotes issues between block sizes and matrix dimensions
  *
  * \tparam IndexType  Type of index used by the linear algebra object that is
- *                    incompatible with the requried block size.
+ *                    incompatible with the required block size.
  */
 template <typename IndexType>
 class BlockSizeError : public Error {
@@ -683,6 +683,7 @@ class UnsupportedMatrixProperty : public Error {
 };
 
 
+/** Exception thrown if an object is in an invalid state. */
 class InvalidStateError : public Error {
 public:
     /**
diff --git a/include/ginkgo/core/base/exception_helpers.hpp b/include/ginkgo/core/base/exception_helpers.hpp
index 50ff0354105..dcf07ed093f 100644
--- a/include/ginkgo/core/base/exception_helpers.hpp
+++ b/include/ginkgo/core/base/exception_helpers.hpp
@@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <typeinfo>
 
 
+#include <ginkgo/core/base/batch_dim.hpp>
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
@@ -147,6 +148,22 @@ inline dim<2> get_size(const T& op)
 inline dim<2> get_size(const dim<2>& size) { return size; }
 
 
+template <typename T>
+inline batch_dim<2> get_batch_size(const T& op)
+{
+    return op->get_size();
+}
+
+inline batch_dim<2> get_batch_size(const batch_dim<2>& size) { return size; }
+
+
+template <typename T>
+inline size_type get_num_batch_items(const T& obj)
+{
+    return obj.get_num_batch_items();
+}
+
+
 }  // namespace detail
 
 
@@ -298,6 +315,169 @@ inline dim<2> get_size(const dim<2>& size) { return size; }
     }
 
 
+/**
+ * Asserts that _op1 and _op2 have equal number of items in the batch
+ *
+ * @throw ValueMismatch  if _op1 and _op2 do not have equal number of items
+ */
+#define GKO_ASSERT_BATCH_EQUAL_NUM_ITEMS(_op1, _op2)                       \
+    {                                                                      \
+        auto equal_num_items =                                             \
+            ::gko::detail::get_batch_size(_op1).get_num_batch_items() ==   \
+            ::gko::detail::get_batch_size(_op2).get_num_batch_items();     \
+        if (!equal_num_items) {                                            \
+            throw ::gko::ValueMismatch(                                    \
+                __FILE__, __LINE__, __func__,                              \
+                ::gko::detail::get_batch_size(_op1).get_num_batch_items(), \
+                ::gko::detail::get_batch_size(_op2).get_num_batch_items(), \
+                "expected equal number of batch items");                   \
+        }                                                                  \
+    }
+
+
+/**
+ * Asserts that _op1 can be applied to _op2.
+ *
+ * @throw DimensionMismatch  if _op1 cannot be applied to _op2.
+ */
+#define GKO_ASSERT_BATCH_CONFORMANT(_op1, _op2)                              \
+    {                                                                        \
+        GKO_ASSERT_BATCH_EQUAL_NUM_ITEMS(_op1, _op2);                        \
+        auto equal_inner_size =                                              \
+            ::gko::detail::get_batch_size(_op1).get_common_size()[1] ==      \
+            ::gko::detail::get_batch_size(_op2).get_common_size()[0];        \
+        if (!equal_inner_size) {                                             \
+            throw ::gko::DimensionMismatch(                                  \
+                __FILE__, __LINE__, __func__, #_op1,                         \
+                ::gko::detail::get_batch_size(_op1).get_common_size()[0],    \
+                ::gko::detail::get_batch_size(_op1).get_common_size()[1],    \
+                #_op2,                                                       \
+                ::gko::detail::get_batch_size(_op2).get_common_size()[0],    \
+                ::gko::detail::get_batch_size(_op2).get_common_size()[1],    \
+                "expected matching inner dimensions among all batch items"); \
+        }                                                                    \
+    }
+
+
+/**
+ * Asserts that _op1 can be applied to _op2 from the right.
+ *
+ * @throw DimensionMismatch  if _op1 cannot be applied to _op2 from the right.
+ */
+#define GKO_ASSERT_BATCH_REVERSE_CONFORMANT(_op1, _op2)                      \
+    {                                                                        \
+        GKO_ASSERT_BATCH_EQUAL_NUM_ITEMS(_op1, _op2);                        \
+        auto equal_outer_size =                                              \
+            ::gko::detail::get_batch_size(_op1).get_common_size()[0] ==      \
+            ::gko::detail::get_batch_size(_op2).get_common_size()[1];        \
+        if (!equal_outer_size) {                                             \
+            throw ::gko::DimensionMismatch(                                  \
+                __FILE__, __LINE__, __func__, #_op1,                         \
+                ::gko::detail::get_batch_size(_op1).get_common_size()[0],    \
+                ::gko::detail::get_batch_size(_op1).get_common_size()[1],    \
+                #_op2,                                                       \
+                ::gko::detail::get_batch_size(_op2).get_common_size()[0],    \
+                ::gko::detail::get_batch_size(_op2).get_common_size()[1],    \
+                "expected matching outer dimensions among all batch items"); \
+        }                                                                    \
+    }
+
+
+/**
+ * Asserts that `_op1` and `_op2` have the same number of rows.
+ *
+ * @throw DimensionMismatch  if `_op1` and `_op2` differ in the number of rows
+ */
+#define GKO_ASSERT_BATCH_EQUAL_ROWS(_op1, _op2)                            \
+    {                                                                      \
+        GKO_ASSERT_BATCH_EQUAL_NUM_ITEMS(_op1, _op2);                      \
+        auto equal_rows =                                                  \
+            ::gko::detail::get_batch_size(_op1).get_common_size()[0] ==    \
+            ::gko::detail::get_batch_size(_op2).get_common_size()[0];      \
+        if (!equal_rows) {                                                 \
+            throw ::gko::DimensionMismatch(                                \
+                __FILE__, __LINE__, __func__, #_op1,                       \
+                ::gko::detail::get_batch_size(_op1).get_common_size()[0],  \
+                ::gko::detail::get_batch_size(_op1).get_common_size()[1],  \
+                #_op2,                                                     \
+                ::gko::detail::get_batch_size(_op2).get_common_size()[0],  \
+                ::gko::detail::get_batch_size(_op2).get_common_size()[1],  \
+                "expected matching number of rows among all batch items"); \
+        }                                                                  \
+    }
+
+
+/**
+ * Asserts that `_op1` and `_op2` have the same number of columns.
+ *
+ * @throw DimensionMismatch  if `_op1` and `_op2` differ in the number of
+ *                           columns
+ */
+#define GKO_ASSERT_BATCH_EQUAL_COLS(_op1, _op2)                            \
+    {                                                                      \
+        GKO_ASSERT_BATCH_EQUAL_NUM_ITEMS(_op1, _op2);                      \
+        auto equal_cols =                                                  \
+            ::gko::detail::get_batch_size(_op1).get_common_size()[1] ==    \
+            ::gko::detail::get_batch_size(_op2).get_common_size()[1];      \
+        if (!equal_cols) {                                                 \
+            throw ::gko::DimensionMismatch(                                \
+                __FILE__, __LINE__, __func__, #_op1,                       \
+                ::gko::detail::get_batch_size(_op1).get_common_size()[0],  \
+                ::gko::detail::get_batch_size(_op1).get_common_size()[1],  \
+                #_op2,                                                     \
+                ::gko::detail::get_batch_size(_op2).get_common_size()[0],  \
+                ::gko::detail::get_batch_size(_op2).get_common_size()[1],  \
+                "expected matching number of cols among all batch items"); \
+        }                                                                  \
+    }
+
+
+/**
+ * Asserts that `_op1` and `_op2` have the same number of rows and columns.
+ *
+ * @throw DimensionMismatch  if `_op1` and `_op2` differ in the number of
+ *                           rows or columns
+ */
+#define GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(_op1, _op2)                     \
+    {                                                                     \
+        GKO_ASSERT_BATCH_EQUAL_NUM_ITEMS(_op1, _op2);                     \
+        auto equal_size =                                                 \
+            ::gko::detail::get_batch_size(_op1).get_common_size() ==      \
+            ::gko::detail::get_batch_size(_op2).get_common_size();        \
+        if (!equal_size) {                                                \
+            throw ::gko::DimensionMismatch(                               \
+                __FILE__, __LINE__, __func__, #_op1,                      \
+                ::gko::detail::get_batch_size(_op1).get_common_size()[0], \
+                ::gko::detail::get_batch_size(_op1).get_common_size()[1], \
+                #_op2,                                                    \
+                ::gko::detail::get_batch_size(_op2).get_common_size()[0], \
+                ::gko::detail::get_batch_size(_op2).get_common_size()[1], \
+                "expected matching size among all batch items");          \
+        }                                                                 \
+    }
+
+
+/**
+ * Asserts that `_op1` and `_op2` have the same number of rows and columns.
+ *
+ * @throw DimensionMismatch  if `_op1` and `_op2` differ in the number of
+ *                           rows or columns
+ */
+#define GKO_ASSERT_BATCH_HAS_SQUARE_DIMENSIONS(_op1)                      \
+    {                                                                     \
+        auto is_square =                                                  \
+            ::gko::detail::get_batch_size(_op1).get_common_size()[0] ==   \
+            ::gko::detail::get_batch_size(_op1).get_common_size()[1];     \
+        if (!is_square) {                                                 \
+            throw ::gko::BadDimension(                                    \
+                __FILE__, __LINE__, __func__, #_op1,                      \
+                ::gko::detail::get_batch_size(_op1).get_common_size()[0], \
+                ::gko::detail::get_batch_size(_op1).get_common_size()[1], \
+                "expected common size of matrices to be square");         \
+        }                                                                 \
+    }
+
+
 /**
  * Instantiates a MpiError.
  *
@@ -706,6 +886,13 @@ inline T ensure_allocated_impl(T ptr, const std::string& file, int line,
                   "semi-colon warnings")
 
 
+/**
+ * Throws an InvalidStateError with a user-specified message
+ *
+ * @param _message  message to be displayed.
+ *
+ * @throw  InvalidStateError.
+ */
 #define GKO_INVALID_STATE(_message)                                          \
     {                                                                        \
         throw ::gko::InvalidStateError(__FILE__, __LINE__, __func__,         \
@@ -716,6 +903,26 @@ inline T ensure_allocated_impl(T ptr, const std::string& file, int line,
                   "semi-colon warnings")
 
 
+/**
+ * Throws an InvalidStateError if condition is not satisfied
+ *
+ * @param _condition  the condition to check.
+ * @param _message  message to be displayed.
+ *
+ * @throw  InvalidStateError.
+ */
+#define GKO_THROW_IF_INVALID(_condition, _message)                           \
+    {                                                                        \
+        if (!(_condition)) {                                                 \
+            throw ::gko::InvalidStateError(__FILE__, __LINE__, __func__,     \
+                                           _message);                        \
+        }                                                                    \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
+
 }  // namespace gko
 
 
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 965cd562bff..a2a1d2ac0ff 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -47,7 +47,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/base/device.hpp>
+#include <ginkgo/core/base/fwd_decls.hpp>
 #include <ginkgo/core/base/machine_topology.hpp>
+#include <ginkgo/core/base/memory.hpp>
 #include <ginkgo/core/base/scoped_device_id_guard.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/log/logger.hpp>
@@ -83,8 +85,8 @@ enum class log_propagation_mode {
  * host through the Unified memory model.
  *
  * `unified_host` allocates memory on the
- * host and it is not available on devices which do not have concurrent acesses
- * switched on, but this access can be explictly switched on, when necessary.
+ * host and it is not available on devices which do not have concurrent accesses
+ * switched on, but this access can be explicitly switched on, when necessary.
  */
 enum class allocation_mode { device, unified_global, unified_host };
 
@@ -121,33 +123,6 @@ constexpr allocation_mode default_hip_alloc_mode =
 }  // namespace gko
 
 
-// after intel/llvm September'22 release, which uses major version 6, they
-// introduce another inline namespace _V1.
-#if GINKGO_DPCPP_MAJOR_VERSION >= 6
-namespace sycl {
-inline namespace _V1 {
-
-
-class queue;
-class event;
-
-
-}  // namespace _V1
-}  // namespace sycl
-#else  // GINKGO_DPCPP_MAJOR_VERSION < 6
-inline namespace cl {
-namespace sycl {
-
-
-class queue;
-class event;
-
-
-}  // namespace sycl
-}  // namespace cl
-#endif
-
-
 /**
  * The enum class is for the dpcpp queue property. It's legal to use a binary
  * or(|) operation to combine several properties.
@@ -172,29 +147,6 @@ GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
 }
 
 
-struct cublasContext;
-
-struct cusparseContext;
-
-struct CUstream_st;
-
-struct CUevent_st;
-
-struct hipblasContext;
-
-struct hipsparseContext;
-
-#if GINKGO_HIP_PLATFORM_HCC
-struct ihipStream_t;
-struct ihipEvent_t;
-#define GKO_HIP_STREAM_STRUCT ihipStream_t
-#define GKO_HIP_EVENT_STRUCT ihipEvent_t
-#else
-#define GKO_HIP_STREAM_STRUCT CUstream_st
-#define GKO_HIP_EVENT_STRUCT CUevent_st
-#endif
-
-
 namespace gko {
 
 
@@ -1355,14 +1307,20 @@ class EnableDeviceReset {
      *
      * @param device_reset  whether to allow a device reset or not
      */
-    void set_device_reset(bool device_reset) { device_reset_ = device_reset; }
+    GKO_DEPRECATED(
+        "device_reset is no longer supported, call "
+        "cudaDeviceReset/hipDeviceReset manually")
+    void set_device_reset(bool device_reset) {}
 
     /**
      * Returns the current status of the device reset boolean for this executor.
      *
      * @return the current status of the device reset boolean for this executor.
      */
-    bool get_device_reset() { return device_reset_; }
+    GKO_DEPRECATED(
+        "device_reset is no longer supported, call "
+        "cudaDeviceReset/hipDeviceReset manually")
+    bool get_device_reset() { return false; }
 
 protected:
     /**
@@ -1370,11 +1328,12 @@ class EnableDeviceReset {
      *
      * @param device_reset  the starting device_reset status. Defaults to false.
      */
-    EnableDeviceReset(bool device_reset = false) : device_reset_{device_reset}
-    {}
+    EnableDeviceReset() {}
 
-private:
-    bool device_reset_{};
+    GKO_DEPRECATED(
+        "device_reset is no longer supported, call "
+        "cudaDeviceReset/hipDeviceReset manually")
+    EnableDeviceReset(bool device_reset) {}
 };
 
 
@@ -1411,9 +1370,11 @@ class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
     /**
      * Creates a new OmpExecutor.
      */
-    static std::shared_ptr<OmpExecutor> create()
+    static std::shared_ptr<OmpExecutor> create(
+        std::shared_ptr<CpuAllocatorBase> alloc =
+            std::make_shared<CpuAllocator>())
     {
-        return std::shared_ptr<OmpExecutor>(new OmpExecutor());
+        return std::shared_ptr<OmpExecutor>(new OmpExecutor(std::move(alloc)));
     }
 
     std::shared_ptr<Executor> get_master() noexcept override;
@@ -1432,10 +1393,13 @@ class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
         return this->get_exec_info().num_pu_per_cu;
     }
 
+    static int get_num_omp_threads();
+
     scoped_device_id_guard get_scoped_device_id_guard() const override;
 
 protected:
-    OmpExecutor()
+    OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
+        : alloc_{std::move(alloc)}
     {
         this->OmpExecutor::populate_exec_info(machine_topology::get_instance());
     }
@@ -1457,6 +1421,8 @@ class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
 
     bool verify_memory_to(const DpcppExecutor* dest_exec) const override;
+
+    std::shared_ptr<CpuAllocatorBase> alloc_;
 };
 
 
@@ -1476,9 +1442,12 @@ using DefaultExecutor = OmpExecutor;
  */
 class ReferenceExecutor : public OmpExecutor {
 public:
-    static std::shared_ptr<ReferenceExecutor> create()
+    static std::shared_ptr<ReferenceExecutor> create(
+        std::shared_ptr<CpuAllocatorBase> alloc =
+            std::make_shared<CpuAllocator>())
     {
-        return std::shared_ptr<ReferenceExecutor>(new ReferenceExecutor());
+        return std::shared_ptr<ReferenceExecutor>(
+            new ReferenceExecutor(std::move(alloc)));
     }
 
     scoped_device_id_guard get_scoped_device_id_guard() const override
@@ -1495,7 +1464,8 @@ class ReferenceExecutor : public OmpExecutor {
     }
 
 protected:
-    ReferenceExecutor()
+    ReferenceExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
+        : OmpExecutor{std::move(alloc)}
     {
         this->ReferenceExecutor::populate_exec_info(
             machine_topology::get_instance());
@@ -1550,17 +1520,35 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
      * @param device_id  the CUDA device id of this device
      * @param master  an executor on the host that is used to invoke the device
      * kernels
-     * @param device_reset  whether to reset the device after the object exits
-     *                      the scope.
+     * @param device_reset  this option no longer has any effect.
      * @param alloc_mode  the allocation mode that the executor should operate
      *                    on. See @allocation_mode for more details
+     * @param stream  the stream to execute operations on.
      */
+    GKO_DEPRECATED(
+        "device_reset is deprecated entirely, call cudaDeviceReset directly. "
+        "alloc_mode was replaced by the Allocator type "
+        "hierarchy.")
     static std::shared_ptr<CudaExecutor> create(
-        int device_id, std::shared_ptr<Executor> master,
-        bool device_reset = false,
+        int device_id, std::shared_ptr<Executor> master, bool device_reset,
         allocation_mode alloc_mode = default_cuda_alloc_mode,
         CUstream_st* stream = nullptr);
 
+    /**
+     * Creates a new CudaExecutor with a custom allocator and device stream.
+     *
+     * @param device_id  the CUDA device id of this device
+     * @param master  an executor on the host that is used to invoke the device
+     *                kernels.
+     * @param alloc  the allocator to use for device memory allocations.
+     * @param stream  the stream to execute operations on.
+     */
+    static std::shared_ptr<CudaExecutor> create(
+        int device_id, std::shared_ptr<Executor> master,
+        std::shared_ptr<CudaAllocatorBase> alloc =
+            std::make_shared<CudaAllocator>(),
+        CUstream_st* stream = nullptr);
+
     std::shared_ptr<Executor> get_master() noexcept override;
 
     std::shared_ptr<const Executor> get_master() const noexcept override;
@@ -1616,7 +1604,7 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
     }
 
     /**
-     * Get the major verion of compute capability.
+     * Get the major version of compute capability.
      */
     int get_major_version() const noexcept
     {
@@ -1624,7 +1612,7 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
     }
 
     /**
-     * Get the minor verion of compute capability.
+     * Get the minor version of compute capability.
      */
     int get_minor_version() const noexcept
     {
@@ -1679,26 +1667,15 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
     void init_handles();
 
     CudaExecutor(int device_id, std::shared_ptr<Executor> master,
-                 bool device_reset = false,
-                 allocation_mode alloc_mode = default_cuda_alloc_mode,
-                 CUstream_st* stream = nullptr)
-        : EnableDeviceReset{device_reset},
-          master_(master),
-          alloc_mode_{alloc_mode},
-          stream_{stream}
+                 std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
+        : alloc_{std::move(alloc)}, master_(master), stream_{stream}
     {
         this->get_exec_info().device_id = device_id;
         this->get_exec_info().num_computing_units = 0;
         this->get_exec_info().num_pu_per_cu = 0;
         this->CudaExecutor::populate_exec_info(
             machine_topology::get_instance());
-
-        // it only gets attribute from device, so it should not be affected by
-        // DeviceReset.
         this->set_gpu_property();
-        // increase the number of executor before any operations may be affected
-        // by DeviceReset.
-        increase_num_execs(this->get_exec_info().device_id);
         this->init_handles();
     }
 
@@ -1718,12 +1695,6 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
 
     bool verify_memory_to(const CudaExecutor* dest_exec) const override;
 
-    static void increase_num_execs(unsigned device_id);
-
-    static void decrease_num_execs(unsigned device_id);
-
-    static unsigned get_num_execs(unsigned device_id);
-
     void populate_exec_info(const machine_topology* mach_topo) override;
 
 private:
@@ -1733,45 +1704,8 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
     using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
     handle_manager<cublasContext> cublas_handle_;
     handle_manager<cusparseContext> cusparse_handle_;
+    std::shared_ptr<CudaAllocatorBase> alloc_;
     CUstream_st* stream_;
-
-    allocation_mode alloc_mode_;
-};
-
-
-/**
- * An RAII wrapper for a custom CUDA stream.
- * The stream will be created on construction and destroyed when the lifetime of
- * the wrapper ends.
- */
-class cuda_stream {
-public:
-    /** Creates a new custom CUDA stream. */
-    cuda_stream(int device_id = 0);
-
-    /** Destroys the custom CUDA stream, if it wasn't moved-from already. */
-    ~cuda_stream();
-
-    cuda_stream(const cuda_stream&) = delete;
-
-    /** Move-constructs from an existing stream, which will be emptied. */
-    cuda_stream(cuda_stream&&);
-
-    cuda_stream& operator=(const cuda_stream&) = delete;
-
-    /** Move-assigns from an existing stream, which will be emptied. */
-    cuda_stream& operator=(cuda_stream&&) = delete;
-
-    /**
-     * Returns the native CUDA stream handle.
-     * In a moved-from cuda_stream, this will return nullptr.
-     */
-    CUstream_st* get() const;
-
-private:
-    CUstream_st* stream_;
-
-    int device_id_;
 };
 
 
@@ -1805,12 +1739,21 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
      * @param alloc_mode  the allocation mode that the executor should operate
      *                    on. See @allocation_mode for more details
      */
+    GKO_DEPRECATED(
+        "device_reset is deprecated entirely, call hipDeviceReset directly. "
+        "alloc_mode was replaced by the Allocator type "
+        "hierarchy.")
     static std::shared_ptr<HipExecutor> create(
-        int device_id, std::shared_ptr<Executor> master,
-        bool device_reset = false,
+        int device_id, std::shared_ptr<Executor> master, bool device_reset,
         allocation_mode alloc_mode = default_hip_alloc_mode,
         GKO_HIP_STREAM_STRUCT* stream = nullptr);
 
+    static std::shared_ptr<HipExecutor> create(
+        int device_id, std::shared_ptr<Executor> master,
+        std::shared_ptr<HipAllocatorBase> alloc =
+            std::make_shared<HipAllocator>(),
+        GKO_HIP_STREAM_STRUCT* stream = nullptr);
+
     std::shared_ptr<Executor> get_master() noexcept override;
 
     std::shared_ptr<const Executor> get_master() const noexcept override;
@@ -1849,7 +1792,7 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
     }
 
     /**
-     * Get the major verion of compute capability.
+     * Get the major version of compute capability.
      */
     int get_major_version() const noexcept
     {
@@ -1857,7 +1800,7 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
     }
 
     /**
-     * Get the minor verion of compute capability.
+     * Get the minor version of compute capability.
      */
     int get_minor_version() const noexcept
     {
@@ -1923,25 +1866,15 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
     void init_handles();
 
     HipExecutor(int device_id, std::shared_ptr<Executor> master,
-                bool device_reset = false,
-                allocation_mode alloc_mode = default_hip_alloc_mode,
-                GKO_HIP_STREAM_STRUCT* stream = nullptr)
-        : EnableDeviceReset{device_reset},
-          master_(master),
-          alloc_mode_(alloc_mode),
-          stream_{stream}
+                std::shared_ptr<HipAllocatorBase> alloc,
+                GKO_HIP_STREAM_STRUCT* stream)
+        : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream}
     {
         this->get_exec_info().device_id = device_id;
         this->get_exec_info().num_computing_units = 0;
         this->get_exec_info().num_pu_per_cu = 0;
         this->HipExecutor::populate_exec_info(machine_topology::get_instance());
-
-        // it only gets attribute from device, so it should not be affected by
-        // DeviceReset.
         this->set_gpu_property();
-        // increase the number of executor before any operations may be affected
-        // by DeviceReset.
-        increase_num_execs(this->get_exec_info().device_id);
         this->init_handles();
     }
 
@@ -1961,12 +1894,6 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
 
     bool verify_memory_to(const HipExecutor* dest_exec) const override;
 
-    static void increase_num_execs(int device_id);
-
-    static void decrease_num_execs(int device_id);
-
-    static int get_num_execs(int device_id);
-
     void populate_exec_info(const machine_topology* mach_topo) override;
 
 private:
@@ -1976,45 +1903,8 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
     using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
     handle_manager<hipblasContext> hipblas_handle_;
     handle_manager<hipsparseContext> hipsparse_handle_;
-
-    allocation_mode alloc_mode_;
-    GKO_HIP_STREAM_STRUCT* stream_;
-};
-
-
-/**
- * An RAII wrapper for a custom HIP stream.
- * The stream will be created on construction and destroyed when the lifetime of
- * the wrapper ends.
- */
-class hip_stream {
-public:
-    /** Creates a new custom HIP stream. */
-    hip_stream(int device_id = 0);
-
-    /** Destroys the custom HIP stream, if it wasn't moved-from already. */
-    ~hip_stream();
-
-    hip_stream(const hip_stream&) = delete;
-
-    /** Move-constructs from an existing stream, which will be emptied. */
-    hip_stream(hip_stream&&);
-
-    hip_stream& operator=(const hip_stream&) = delete;
-
-    /** Move-assigns from an existing stream, which will be emptied. */
-    hip_stream& operator=(hip_stream&&) = delete;
-
-    /**
-     * Returns the native HIP stream handle.
-     * In a moved-from hip_stream, this will return nullptr.
-     */
-    GKO_HIP_STREAM_STRUCT* get() const;
-
-private:
+    std::shared_ptr<HipAllocatorBase> alloc_;
     GKO_HIP_STREAM_STRUCT* stream_;
-
-    int device_id_;
 };
 
 
diff --git a/include/ginkgo/core/base/fwd_decls.hpp b/include/ginkgo/core/base/fwd_decls.hpp
new file mode 100644
index 00000000000..f99d3a0f90e
--- /dev/null
+++ b/include/ginkgo/core/base/fwd_decls.hpp
@@ -0,0 +1,90 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_BASE_FWD_DECLS_HPP_
+#define GKO_PUBLIC_CORE_BASE_FWD_DECLS_HPP_
+
+
+#include <ginkgo/config.hpp>
+
+
+struct cublasContext;
+
+struct cusparseContext;
+
+struct CUstream_st;
+
+struct CUevent_st;
+
+struct hipblasContext;
+
+struct hipsparseContext;
+
+#if GINKGO_HIP_PLATFORM_HCC
+struct ihipStream_t;
+struct ihipEvent_t;
+#define GKO_HIP_STREAM_STRUCT ihipStream_t
+#define GKO_HIP_EVENT_STRUCT ihipEvent_t
+#else
+#define GKO_HIP_STREAM_STRUCT CUstream_st
+#define GKO_HIP_EVENT_STRUCT CUevent_st
+#endif
+
+
+// after intel/llvm September'22 release, which uses major version 6, they
+// introduce another inline namespace _V1.
+#if GINKGO_DPCPP_MAJOR_VERSION >= 6
+namespace sycl {
+inline namespace _V1 {
+
+
+class queue;
+class event;
+
+
+}  // namespace _V1
+}  // namespace sycl
+#else  // GINKGO_DPCPP_MAJOR_VERSION < 6
+inline namespace cl {
+namespace sycl {
+
+
+class queue;
+class event;
+
+
+}  // namespace sycl
+}  // namespace cl
+#endif
+
+
+#endif  // GKO_PUBLIC_CORE_BASE_FWD_DECLS_HPP_
diff --git a/include/ginkgo/core/base/index_set.hpp b/include/ginkgo/core/base/index_set.hpp
index 3594d837f88..281690b7807 100644
--- a/include/ginkgo/core/base/index_set.hpp
+++ b/include/ginkgo/core/base/index_set.hpp
@@ -360,7 +360,7 @@ class index_set {
         const bool is_sorted = false) const;
 
     /**
-     * This function allows the user obtain a decompresed global_indices array
+     * This function allows the user obtain a decompressed global_indices array
      * from the indices stored in the index set
      *
      * @return  the decompressed set of indices.
diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp
index c7043f4ae25..e40b0500bde 100644
--- a/include/ginkgo/core/base/lin_op.hpp
+++ b/include/ginkgo/core/base/lin_op.hpp
@@ -520,6 +520,9 @@ class Permutable {
      * In the resulting LinOp, the entry at location `(i,j)` contains the input
      * value `(perm[i],perm[j])`.
      *
+     * From the linear algebra perspective, with $P_{ij} = \delta_{i
+     * \pi(i)}$, this represents the operation $P A P^T$.
+     *
      * @param permutation_indices  the array of indices containing the
      *                             permutation order.
      *
@@ -530,7 +533,7 @@ class Permutable {
     {
         return as<Permutable>(this->row_permute(permutation_indices))
             ->column_permute(permutation_indices);
-    };
+    }
 
     /**
      * Returns a LinOp representing the symmetric inverse row and column
@@ -538,6 +541,9 @@ class Permutable {
      * In the resulting LinOp, the entry at location `(perm[i],perm[j])`
      * contains the input value `(i,j)`.
      *
+     * From the linear algebra perspective, with $P_{ij} = \delta_{i
+     * \pi(i)}$, this represents the operation $P^{-1} A P^{-T}$.
+     *
      * @param permutation_indices  the array of indices containing the
      *                             permutation order.
      *
@@ -548,13 +554,16 @@ class Permutable {
     {
         return as<Permutable>(this->inverse_row_permute(permutation_indices))
             ->inverse_column_permute(permutation_indices);
-    };
+    }
 
     /**
      * Returns a LinOp representing the row permutation of the Permutable
      * object.
      * In the resulting LinOp, the row `i` contains the input row `perm[i]`.
      *
+     * From the linear algebra perspective, with $P_{ij} = \delta_{i
+     * \pi(i)}$, this represents the operation $P A$.
+     *
      * @param permutation_indices  the array of indices containing the
      *                             permutation order.
      *
@@ -569,6 +578,9 @@ class Permutable {
      * In the resulting LinOp, the column `i` contains the input column
      * `perm[i]`.
      *
+     * From the linear algebra perspective, with $P_{ij} = \delta_{i
+     * \pi(i)}$, this represents the operation $A P^T$.
+     *
      * @param permutation_indices  the array of indices containing the
      *                             permutation order `perm`.
      *
@@ -582,6 +594,9 @@ class Permutable {
      * object.
      * In the resulting LinOp, the row `perm[i]` contains the input row `i`.
      *
+     * From the linear algebra perspective, with $P_{ij} = \delta_{i
+     * \pi(i)}$, this represents the operation $P^{-1} A$.
+     *
      * @param permutation_indices  the array of indices containing the
      *                             permutation order `perm`.
      *
@@ -596,6 +611,9 @@ class Permutable {
      * In the resulting LinOp, the column `perm[i]` contains the input column
      * `i`.
      *
+     * From the linear algebra perspective, with $P_{ij} = \delta_{i
+     * \pi(i)}$, this represents the operation $A P^{-T}$.
+     *
      * @param permutation_indices  the array of indices containing the
      *                             permutation order `perm`.
      *
@@ -931,7 +949,7 @@ class EnableLinOp
  * template parameters to enable a subclass of LinOpFactory.
  *
  * @tparam ConcreteFactory  the concrete factory which is being implemented
- *                          [CRTP parmeter]
+ *                          [CRTP parameter]
  * @tparam ConcreteLinOp  the concrete LinOp type which this factory produces,
  *                        needs to have a constructor which takes a
  *                        const ConcreteFactory *, and an
@@ -949,26 +967,6 @@ using EnableDefaultLinOpFactory =
     EnableDefaultFactory<ConcreteFactory, ConcreteLinOp, ParametersType,
                          PolymorphicBase>;
 
-/**
- * This Macro will generate a new type containing the parameters for the factory
- * `_factory_name`. For more details, see #GKO_ENABLE_LIN_OP_FACTORY().
- * It is required to use this macro **before** calling the
- * macro #GKO_ENABLE_LIN_OP_FACTORY().
- * It is also required to use the same names for all parameters between both
- * macros.
- *
- * @param _parameters_name  name of the parameters member in the class
- * @param _factory_name  name of the generated factory type
- *
- * @ingroup LinOp
- */
-#define GKO_CREATE_FACTORY_PARAMETERS(_parameters_name, _factory_name)  \
-public:                                                                 \
-    class _factory_name;                                                \
-    struct _parameters_name##_type                                      \
-        : public ::gko::enable_parameters_type<_parameters_name##_type, \
-                                               _factory_name>
-
 
 /**
  * This macro will generate a default implementation of a LinOpFactory for the
@@ -1084,134 +1082,6 @@ public:                                                                      \
                   "semi-colon warnings")
 
 
-/**
- * Defines a build method for the factory, simplifying its construction by
- * removing the repetitive typing of factory's name.
- *
- * @param _factory_name  the factory for which to define the method
- *
- * @ingroup LinOp
- */
-#define GKO_ENABLE_BUILD_METHOD(_factory_name)                               \
-    static auto build()->decltype(_factory_name::create())                   \
-    {                                                                        \
-        return _factory_name::create();                                      \
-    }                                                                        \
-    static_assert(true,                                                      \
-                  "This assert is used to counter the false positive extra " \
-                  "semi-colon warnings")
-
-
-#if !(defined(__CUDACC__) || defined(__HIPCC__))
-/**
- * Creates a factory parameter in the factory parameters structure.
- *
- * @param _name  name of the parameter
- * @param __VA_ARGS__  default value of the parameter
- *
- * @see GKO_ENABLE_LIN_OP_FACTORY for more details, and usage example
- *
- * @deprecated Use GKO_FACTORY_PARAMETER_SCALAR or GKO_FACTORY_PARAMETER_VECTOR
- *
- * @ingroup LinOp
- */
-#define GKO_FACTORY_PARAMETER(_name, ...)                                    \
-    mutable _name{__VA_ARGS__};                                              \
-                                                                             \
-    template <typename... Args>                                              \
-    auto with_##_name(Args&&... _value)                                      \
-        const->const std::decay_t<decltype(*this)>&                          \
-    {                                                                        \
-        using type = decltype(this->_name);                                  \
-        this->_name = type{std::forward<Args>(_value)...};                   \
-        return *this;                                                        \
-    }                                                                        \
-    static_assert(true,                                                      \
-                  "This assert is used to counter the false positive extra " \
-                  "semi-colon warnings")
-
-/**
- * Creates a scalar factory parameter in the factory parameters structure.
- *
- * Scalar in this context means that the constructor for this type only takes
- * a single parameter.
- *
- * @param _name  name of the parameter
- * @param _default  default value of the parameter
- *
- * @see GKO_ENABLE_LIN_OP_FACTORY for more details, and usage example
- *
- * @ingroup LinOp
- */
-#define GKO_FACTORY_PARAMETER_SCALAR(_name, _default) \
-    GKO_FACTORY_PARAMETER(_name, _default)
-
-/**
- * Creates a vector factory parameter in the factory parameters structure.
- *
- * Vector in this context means that the constructor for this type takes
- * multiple parameters.
- *
- * @param _name  name of the parameter
- * @param _default  default value of the parameter
- *
- * @see GKO_ENABLE_LIN_OP_FACTORY for more details, and usage example
- *
- * @ingroup LinOp
- */
-#define GKO_FACTORY_PARAMETER_VECTOR(_name, ...) \
-    GKO_FACTORY_PARAMETER(_name, __VA_ARGS__)
-#else  // defined(__CUDACC__) || defined(__HIPCC__)
-// A workaround for the NVCC compiler - parameter pack expansion does not work
-// properly, because while the assignment to a scalar value is translated by
-// cudafe into a C-style cast, the parameter pack expansion is not removed and
-// `Args&&... args` is still kept as a parameter pack.
-#define GKO_FACTORY_PARAMETER(_name, ...)                                    \
-    mutable _name{__VA_ARGS__};                                              \
-                                                                             \
-    template <typename... Args>                                              \
-    auto with_##_name(Args&&... _value)                                      \
-        const->const std::decay_t<decltype(*this)>&                          \
-    {                                                                        \
-        GKO_NOT_IMPLEMENTED;                                                 \
-        return *this;                                                        \
-    }                                                                        \
-    static_assert(true,                                                      \
-                  "This assert is used to counter the false positive extra " \
-                  "semi-colon warnings")
-
-#define GKO_FACTORY_PARAMETER_SCALAR(_name, _default)                        \
-    mutable _name{_default};                                                 \
-                                                                             \
-    template <typename Arg>                                                  \
-    auto with_##_name(Arg&& _value)                                          \
-        const->const std::decay_t<decltype(*this)>&                          \
-    {                                                                        \
-        using type = decltype(this->_name);                                  \
-        this->_name = type{std::forward<Arg>(_value)};                       \
-        return *this;                                                        \
-    }                                                                        \
-    static_assert(true,                                                      \
-                  "This assert is used to counter the false positive extra " \
-                  "semi-colon warnings")
-
-#define GKO_FACTORY_PARAMETER_VECTOR(_name, ...)                             \
-    mutable _name{__VA_ARGS__};                                              \
-                                                                             \
-    template <typename... Args>                                              \
-    auto with_##_name(Args&&... _value)                                      \
-        const->const std::decay_t<decltype(*this)>&                          \
-    {                                                                        \
-        using type = decltype(this->_name);                                  \
-        this->_name = type{std::forward<Args>(_value)...};                   \
-        return *this;                                                        \
-    }                                                                        \
-    static_assert(true,                                                      \
-                  "This assert is used to counter the false positive extra " \
-                  "semi-colon warnings")
-#endif  // defined(__CUDACC__) || defined(__HIPCC__)
-
-
 }  // namespace gko
 
 
diff --git a/include/ginkgo/core/base/machine_topology.hpp b/include/ginkgo/core/base/machine_topology.hpp
index 4fa7c2f8e17..b0b9ce2b0ad 100644
--- a/include/ginkgo/core/base/machine_topology.hpp
+++ b/include/ginkgo/core/base/machine_topology.hpp
@@ -71,7 +71,7 @@ namespace gko {
 
 /**
  * The machine topology class represents the hierarchical topology of a machine,
- * including NUMA nodes, cores and PCI Devices. Various infomation of the
+ * including NUMA nodes, cores and PCI Devices. Various information of the
  * machine are gathered with the help of the Hardware Locality library (hwloc).
  *
  * This class also provides functionalities to bind objects in the topology to
@@ -415,7 +415,7 @@ class machine_topology {
 };
 
 
-using MachineTopology [[deprecated("please use machine_topology")]] =
+using MachineTopology GKO_DEPRECATED("please use machine_topology") =
     machine_topology;
 
 
diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp
index 3a6152c55d4..70e4db5bb2d 100644
--- a/include/ginkgo/core/base/math.hpp
+++ b/include/ginkgo/core/base/math.hpp
@@ -47,13 +47,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/utils.hpp>
 
 
-// Using SYCL_LANGUAGE_VERSION will lead the mismatch sycl namespace from 6.0.0
-// when using dpcpp compiler without dpcpp module
-#if GINKGO_DPCPP_MAJOR_VERSION
-#include <CL/sycl.hpp>
-#endif
-
-
 namespace gko {
 
 
diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp
new file mode 100644
index 00000000000..6997b6351e5
--- /dev/null
+++ b/include/ginkgo/core/base/memory.hpp
@@ -0,0 +1,274 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_BASE_MEMORY_HPP_
+#define GKO_PUBLIC_CORE_BASE_MEMORY_HPP_
+
+
+#include <ginkgo/core/base/fwd_decls.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+
+
+/**
+ * Provides generic allocation and deallocation functionality to be used by an
+ * Executor.
+ */
+class Allocator {
+public:
+    virtual ~Allocator() = default;
+
+    virtual void* allocate(size_type num_bytes) = 0;
+
+    virtual void deallocate(void* ptr) = 0;
+};
+
+
+/**
+ * Implement this interface to provide an allocator for OmpExecutor or
+ * ReferenceExecutor.
+ */
+class CpuAllocatorBase : public Allocator {};
+
+
+/**
+ * Implement this interface to provide an allocator for CudaExecutor.
+ */
+class CudaAllocatorBase : public Allocator {
+    friend class CudaExecutor;
+
+protected:
+    /**
+     * Checks if the allocator can be used safely with the provided device ID
+     * and stream. The check is necessary to ensure safe usage of stream-ordered
+     * allocators and unified shared memory allocators.
+     *
+     * @param device_id the device ID the allocator will be used in.
+     * @param stream the stream the allocator will be used with.
+     *
+     * @return true if and only if the allocator can be used by CudaExecutor in
+     *         the given environment.
+     */
+    virtual bool check_environment(int device_id, CUstream_st* stream) const
+    {
+        return true;
+    }
+};
+
+
+/**
+ * Implement this interface to provide an allocator for HipExecutor.
+ */
+class HipAllocatorBase : public Allocator {
+    friend class HipExecutor;
+
+protected:
+    /**
+     * Checks if the allocator can be used safely with the provided device ID
+     * and stream. The check is necessary to ensure safe usage of stream-ordered
+     * allocators and unified shared memory allocators.
+     *
+     * @param device_id the device ID the allocator will be used in.
+     * @param stream the stream the allocator will be used with.
+     *
+     * @return true if and only if the allocator can be used by HipExecutor in
+     *         the given environment.
+     */
+    virtual bool check_environment(int device_id,
+                                   GKO_HIP_STREAM_STRUCT* stream) const
+    {
+        return true;
+    }
+};
+
+
+/**
+ * Allocator using new/delete.
+ */
+class CpuAllocator : public CpuAllocatorBase {
+public:
+    void* allocate(size_type num_bytes) override;
+
+    void deallocate(void* ptr) override;
+};
+
+
+/**
+ * Allocator using cudaMalloc.
+ */
+class CudaAllocator : public CudaAllocatorBase {
+public:
+    void* allocate(size_type num_bytes) override;
+
+    void deallocate(void* ptr) override;
+};
+
+
+/*
+ * Allocator using cudaMallocAsync.
+ */
+class CudaAsyncAllocator : public CudaAllocatorBase {
+public:
+    void* allocate(size_type num_bytes) override;
+
+    void deallocate(void* ptr) override;
+
+    CudaAsyncAllocator(CUstream_st* stream);
+
+    bool check_environment(int device_id, CUstream_st* stream) const override;
+
+private:
+    CUstream_st* stream_;
+};
+
+
+/*
+ * Allocator using cudaMallocManaged
+ */
+class CudaUnifiedAllocator : public CudaAllocatorBase, public CpuAllocatorBase {
+public:
+    void* allocate(size_type num_bytes) override;
+
+    void deallocate(void* ptr) override;
+
+    CudaUnifiedAllocator(int device_id);
+
+    CudaUnifiedAllocator(int device_id, unsigned int flags);
+
+protected:
+    bool check_environment(int device_id, CUstream_st* stream) const override;
+
+private:
+    int device_id_;
+    unsigned int flags_;
+};
+
+
+/*
+ * Allocator using cudaHostMalloc.
+ */
+class CudaHostAllocator : public CudaAllocatorBase, public CpuAllocatorBase {
+public:
+    void* allocate(size_type num_bytes) override;
+
+    void deallocate(void* ptr) override;
+
+    CudaHostAllocator(int device_id);
+
+protected:
+    bool check_environment(int device_id, CUstream_st* stream) const override;
+
+private:
+    int device_id_;
+};
+
+
+/*
+ * Allocator using hipMalloc.
+ */
+class HipAllocator : public HipAllocatorBase {
+public:
+    void* allocate(size_type num_bytes) override;
+
+    void deallocate(void* ptr) override;
+};
+
+
+/*
+ * Allocator using hipMallocAsync.
+ */
+class HipAsyncAllocator : public HipAllocatorBase {
+public:
+    void* allocate(size_type num_bytes) override;
+
+    void deallocate(void* ptr) override;
+
+    HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream);
+
+protected:
+    bool check_environment(int device_id,
+                           GKO_HIP_STREAM_STRUCT* stream) const override;
+
+private:
+    GKO_HIP_STREAM_STRUCT* stream_;
+};
+
+
+/*
+ * Allocator using hipMallocManaged
+ */
+class HipUnifiedAllocator : public HipAllocatorBase, public CpuAllocatorBase {
+public:
+    void* allocate(size_type num_bytes) override;
+
+    void deallocate(void* ptr) override;
+
+    HipUnifiedAllocator(int device_id);
+
+    HipUnifiedAllocator(int device_id, unsigned int flags);
+
+protected:
+    bool check_environment(int device_id,
+                           GKO_HIP_STREAM_STRUCT* stream) const override;
+
+private:
+    int device_id_;
+    unsigned int flags_;
+};
+
+
+/*
+ * Allocator using hipHostAlloc.
+ */
+class HipHostAllocator : public HipAllocatorBase, public CpuAllocatorBase {
+public:
+    void* allocate(size_type num_bytes) override;
+
+    void deallocate(void* ptr) override;
+
+    HipHostAllocator(int device_id);
+
+protected:
+    bool check_environment(int device_id,
+                           GKO_HIP_STREAM_STRUCT* stream) const override;
+
+private:
+    int device_id_;
+};
+
+
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_BASE_MEMORY_HPP_
diff --git a/include/ginkgo/core/base/perturbation.hpp b/include/ginkgo/core/base/perturbation.hpp
index 4e9adc4e94e..e0378b8cec2 100644
--- a/include/ginkgo/core/base/perturbation.hpp
+++ b/include/ginkgo/core/base/perturbation.hpp
@@ -186,7 +186,7 @@ class Perturbation : public EnableLinOp<Perturbation<ValueType>>,
         cache_struct(const cache_struct& other) {}
         cache_struct& operator=(const cache_struct& other) { return *this; }
 
-        // allocate linops of cache. The dimenstion of `intermediate` is
+        // allocate linops of cache. The dimension of `intermediate` is
         // (the number of rows of projector, the number of columns of b). Others
         // are 1x1 scalar.
         void allocate(std::shared_ptr<const Executor> exec, dim<2> size)
diff --git a/include/ginkgo/core/base/polymorphic_object.hpp b/include/ginkgo/core/base/polymorphic_object.hpp
index 8d4c327ac33..6ef59078b5d 100644
--- a/include/ginkgo/core/base/polymorphic_object.hpp
+++ b/include/ginkgo/core/base/polymorphic_object.hpp
@@ -59,8 +59,9 @@ namespace gko {
  * @note Most of the public methods of this class should not be overridden
  *       directly, and are thus not virtual. Instead, there are equivalent
  *       protected methods (ending in <method_name>_impl) that should be
- *       overriden instead. This allows polymorphic objects to implement default
- *       behavior around virtual methods (parameter checking, type casting).
+ *       overridden instead. This allows polymorphic objects to implement
+ *       default behavior around virtual methods (parameter checking, type
+ *       casting).
  *
  * @see EnablePolymorphicObject if you wish to implement a concrete polymorphic
  *      object and have sensible defaults generated automatically.
@@ -181,14 +182,13 @@ class PolymorphicObject : public log::EnableLogging<PolymorphicObject> {
      * @tparam Deleter  the deleter of the unique_ptr parameter
      */
     template <typename Derived, typename Deleter>
-    [[deprecated(
+    GKO_DEPRECATED(
         "This function will be removed in a future release, the replacement "
         "will copy instead of move. If a move is intended, use move_from "
-        "instead.")]] std::
-        enable_if_t<
-            std::is_base_of<PolymorphicObject, std::decay_t<Derived>>::value,
-            PolymorphicObject>*
-        copy_from(std::unique_ptr<Derived, Deleter>&& other)
+        "instead.")
+    std::enable_if_t<
+        std::is_base_of<PolymorphicObject, std::decay_t<Derived>>::value,
+        PolymorphicObject>* copy_from(std::unique_ptr<Derived, Deleter>&& other)
     {
         this->template log<log::Logger::polymorphic_object_move_started>(
             exec_.get(), other.get(), this);
@@ -408,14 +408,13 @@ class EnableAbstractPolymorphicObject : public PolymorphicBase {
     }
 
     template <typename Derived>
-    [[deprecated(
+    GKO_DEPRECATED(
         "This function will be removed in a future release, the replacement "
         "will copy instead of move. If a move in intended, use move_to "
-        "instead.")]] std::
-        enable_if_t<
-            std::is_base_of<PolymorphicObject, std::decay_t<Derived>>::value,
-            AbstractObject>*
-        copy_from(std::unique_ptr<Derived>&& other)
+        "instead.")
+    std::enable_if_t<
+        std::is_base_of<PolymorphicObject, std::decay_t<Derived>>::value,
+        AbstractObject>* copy_from(std::unique_ptr<Derived>&& other)
     {
         return static_cast<AbstractObject*>(
             this->PolymorphicBase::copy_from(std::move(other)));
@@ -657,7 +656,7 @@ std::shared_ptr<const R> copy_and_convert_to(
  * The mixin changes parameter and return types of appropriate public methods of
  * PolymorphicObject in the same way EnableAbstractPolymorphicObject does.
  * In addition, it also provides default implementations of PolymorphicObject's
- * vritual methods by using the _executor default constructor_ and the
+ * virtual methods by using the _executor default constructor_ and the
  * assignment operator of ConcreteObject. Consequently, the following is a
  * minimal example of PolymorphicObject:
  *
diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp
index c9713f33572..815192ed112 100644
--- a/include/ginkgo/core/base/range.hpp
+++ b/include/ginkgo/core/base/range.hpp
@@ -188,6 +188,27 @@ GKO_ATTRIBUTES constexpr GKO_INLINE
            equal_dimensions<CurrentDimension + 1>(first, second);
 }
 
+/**
+ * Helper that stores the first type of a parameter pack, if its length is
+ * greater 0.
+ */
+template <class...>
+struct head;
+
+/**
+ * @copydoc head
+ */
+template <class First, class... Rest>
+struct head<First, Rest...> {
+    using type = First;
+};
+
+/**
+ * @copydoc head
+ */
+template <class... T>
+using head_t = typename head<T...>::type;
+
 
 }  // namespace detail
 
@@ -255,7 +276,7 @@ GKO_ATTRIBUTES constexpr GKO_INLINE
  * `x` an `y` are ranges, and `alpha` is a scalar.
  * Range operations are optimized for memory access, and the above code does not
  * allocate additional storage for intermediate ranges `alpha * x`
- * or `aplha * x + y`. In fact, the entire computation is done during the
+ * or `alpha * x + y`. In fact, the entire computation is done during the
  * assignment, and the results of operations `+` and `*` only register the data,
  * and the types of operations that will be computed once the results are
  * needed.
@@ -274,7 +295,7 @@ GKO_ATTRIBUTES constexpr GKO_INLINE
  *
  * __`mmul` is not a highly-optimized BLAS-3 version of the matrix
  * multiplication.__ The current design of ranges and accessors prevents that,
- * so if you need a high-perfromance matrix multiplication, you should use one
+ * so if you need a high-performance matrix multiplication, you should use one
  * of the libraries that provide that, or implement your own
  * (you can use pointwise range operations to help simplify that). However,
  * range design might get improved in the future to allow efficient
@@ -327,7 +348,12 @@ class range {
      *
      * @param params  parameters forwarded to Accessor constructor.
      */
-    template <typename... AccessorParams>
+    template <
+        typename... AccessorParams,
+        typename = std::enable_if_t<
+            sizeof...(AccessorParams) != 1 ||
+            !std::is_same<
+                range, std::decay<detail::head_t<AccessorParams...>>>::value>>
     GKO_ATTRIBUTES constexpr explicit range(AccessorParams&&... params)
         : accessor_{std::forward<AccessorParams>(params)...}
     {}
@@ -588,6 +614,17 @@ struct implement_binary_operation<operation_kind::range_by_scalar,
 
 }  // namespace detail
 
+#define GKO_DEPRECATED_UNARY_RANGE_OPERATION(_operation_deprecated_name,     \
+                                             _operation_name)                \
+    namespace accessor {                                                     \
+    template <typename Operand>                                              \
+    struct GKO_DEPRECATED("Please use " #_operation_name)                    \
+        _operation_deprecated_name : _operation_name<Operand> {};            \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+
 
 #define GKO_ENABLE_UNARY_RANGE_OPERATION(_operation_name, _operator_name, \
                                          _operator)                       \
@@ -682,21 +719,30 @@ GKO_ENABLE_UNARY_RANGE_OPERATION(bitwise_not, operator~,
                                  accessor::detail::bitwise_not);
 
 // common unary functions
+
 GKO_ENABLE_UNARY_RANGE_OPERATION(zero_operation, zero,
                                  accessor::detail::zero_operation);
-GKO_ENABLE_UNARY_RANGE_OPERATION(one_operaton, one,
+GKO_ENABLE_UNARY_RANGE_OPERATION(one_operation, one,
                                  accessor::detail::one_operation);
-GKO_ENABLE_UNARY_RANGE_OPERATION(abs_operaton, abs,
+GKO_ENABLE_UNARY_RANGE_OPERATION(abs_operation, abs,
                                  accessor::detail::abs_operation);
-GKO_ENABLE_UNARY_RANGE_OPERATION(real_operaton, real,
+GKO_ENABLE_UNARY_RANGE_OPERATION(real_operation, real,
                                  accessor::detail::real_operation);
-GKO_ENABLE_UNARY_RANGE_OPERATION(imag_operaton, imag,
+GKO_ENABLE_UNARY_RANGE_OPERATION(imag_operation, imag,
                                  accessor::detail::imag_operation);
-GKO_ENABLE_UNARY_RANGE_OPERATION(conj_operaton, conj,
+GKO_ENABLE_UNARY_RANGE_OPERATION(conj_operation, conj,
                                  accessor::detail::conj_operation);
-GKO_ENABLE_UNARY_RANGE_OPERATION(squared_norm_operaton, squared_norm,
+GKO_ENABLE_UNARY_RANGE_OPERATION(squared_norm_operation, squared_norm,
                                  accessor::detail::squared_norm_operation);
 
+GKO_DEPRECATED_UNARY_RANGE_OPERATION(one_operaton, one_operation);
+GKO_DEPRECATED_UNARY_RANGE_OPERATION(abs_operaton, abs_operation);
+GKO_DEPRECATED_UNARY_RANGE_OPERATION(real_operaton, real_operation);
+GKO_DEPRECATED_UNARY_RANGE_OPERATION(imag_operaton, imag_operation);
+GKO_DEPRECATED_UNARY_RANGE_OPERATION(conj_operaton, conj_operation);
+GKO_DEPRECATED_UNARY_RANGE_OPERATION(squared_norm_operaton,
+                                     squared_norm_operation);
+
 namespace accessor {
 
 
@@ -740,6 +786,7 @@ struct transpose_operation {
 GKO_BIND_UNARY_RANGE_OPERATION_TO_OPERATOR(transpose_operation, transpose);
 
 
+#undef GKO_DEPRECATED_UNARY_RANGE_OPERATION
 #undef GKO_DEFINE_SIMPLE_UNARY_OPERATION
 #undef GKO_ENABLE_UNARY_RANGE_OPERATION
 
@@ -815,6 +862,9 @@ GKO_BIND_UNARY_RANGE_OPERATION_TO_OPERATOR(transpose_operation, transpose);
                   "semi-colon warnings")
 
 
+#define GKO_DEPRECATED_SIMPLE_BINARY_OPERATION(_deprecated_name, _name) \
+    struct GKO_DEPRECATED("Please use " #_name) _deprecated_name : _name {}
+
 #define GKO_DEFINE_SIMPLE_BINARY_OPERATION(_name, ...)                         \
     struct _name {                                                             \
     private:                                                                   \
@@ -893,6 +943,8 @@ GKO_DEFINE_SIMPLE_BINARY_OPERATION(right_shift, first >> second);
 GKO_DEFINE_SIMPLE_BINARY_OPERATION(max_operation, max(first, second));
 GKO_DEFINE_SIMPLE_BINARY_OPERATION(min_operation, min(first, second));
 
+GKO_DEPRECATED_SIMPLE_BINARY_OPERATION(max_operaton, max_operation);
+GKO_DEPRECATED_SIMPLE_BINARY_OPERATION(min_operaton, min_operation);
 }  // namespace detail
 }  // namespace accessor
 
@@ -935,9 +987,9 @@ GKO_ENABLE_BINARY_RANGE_OPERATION(right_shift, operator>>,
                                   accessor::detail::right_shift);
 
 // common binary functions
-GKO_ENABLE_BINARY_RANGE_OPERATION(max_operaton, max,
+GKO_ENABLE_BINARY_RANGE_OPERATION(max_operation, max,
                                   accessor::detail::max_operation);
-GKO_ENABLE_BINARY_RANGE_OPERATION(min_operaton, min,
+GKO_ENABLE_BINARY_RANGE_OPERATION(min_operation, min,
                                   accessor::detail::min_operation);
 
 
diff --git a/include/ginkgo/core/base/scoped_device_id_guard.hpp b/include/ginkgo/core/base/scoped_device_id_guard.hpp
index 52fccdd241c..6b236a6a37e 100644
--- a/include/ginkgo/core/base/scoped_device_id_guard.hpp
+++ b/include/ginkgo/core/base/scoped_device_id_guard.hpp
@@ -58,7 +58,7 @@ class generic_scoped_device_id_guard {
 public:
     generic_scoped_device_id_guard() = default;
 
-    // TODO: this should be a purely virtual funtion, but somehow that leads to
+    // TODO: this should be a purely virtual function, but somehow that leads to
     // linker errors
     virtual ~generic_scoped_device_id_guard() = default;
 
diff --git a/include/ginkgo/core/base/std_extensions.hpp b/include/ginkgo/core/base/std_extensions.hpp
index 69629f98e06..1064ae464f0 100644
--- a/include/ginkgo/core/base/std_extensions.hpp
+++ b/include/ginkgo/core/base/std_extensions.hpp
@@ -128,6 +128,16 @@ constexpr bool less_equal(const T&& lhs, const T&& rhs)
 }
 
 
+// available in <type_traits> with C++17
+template <class...>
+struct conjunction : std::true_type {};
+template <class B1>
+struct conjunction<B1> : B1 {};
+template <class B1, class... Bn>
+struct conjunction<B1, Bn...>
+    : std::conditional_t<bool(B1::value), conjunction<Bn...>, B1> {};
+
+
 }  // namespace xstd
 }  // namespace gko
 
diff --git a/include/ginkgo/core/base/stream.hpp b/include/ginkgo/core/base/stream.hpp
new file mode 100644
index 00000000000..f7d45f59c5a
--- /dev/null
+++ b/include/ginkgo/core/base/stream.hpp
@@ -0,0 +1,132 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_BASE_STREAM_HPP_
+#define GKO_PUBLIC_CORE_BASE_STREAM_HPP_
+
+
+#include <ginkgo/core/base/executor.hpp>
+
+
+namespace gko {
+
+
+/**
+ * An RAII wrapper for a custom CUDA stream.
+ * The stream will be created on construction and destroyed when the lifetime of
+ * the wrapper ends.
+ */
+class cuda_stream {
+public:
+    /** Creates an empty stream wrapper, representing the default stream. */
+    cuda_stream();
+
+    /**
+     * Creates a new custom CUDA stream on the given device.
+     *
+     * @param device_id  the device ID to create the stream on.
+     */
+    cuda_stream(int device_id);
+
+    /** Destroys the custom CUDA stream, if it isn't empty. */
+    ~cuda_stream();
+
+    cuda_stream(const cuda_stream&) = delete;
+
+    /** Move-constructs from an existing stream, which will be emptied. */
+    cuda_stream(cuda_stream&&);
+
+    cuda_stream& operator=(const cuda_stream&) = delete;
+
+    /** Move-assigns from an existing stream, which will be emptied. */
+    cuda_stream& operator=(cuda_stream&&) = delete;
+
+    /**
+     * Returns the native CUDA stream handle.
+     * In an empty cuda_stream, this will return nullptr.
+     */
+    CUstream_st* get() const;
+
+private:
+    CUstream_st* stream_;
+
+    int device_id_;
+};
+
+
+/**
+ * An RAII wrapper for a custom HIP stream.
+ * The stream will be created on construction and destroyed when the lifetime of
+ * the wrapper ends.
+ */
+class hip_stream {
+public:
+    /** Creates an empty stream wrapper, representing the default stream. */
+    hip_stream();
+
+    /**
+     * Creates a new custom HIP stream on the given device.
+     *
+     * @param device_id  the device ID to create the stream on.
+     */
+    hip_stream(int device_id);
+
+    /** Destroys the custom HIP stream, if it isn't empty. */
+    ~hip_stream();
+
+    hip_stream(const hip_stream&) = delete;
+
+    /** Move-constructs from an existing stream, which will be emptied. */
+    hip_stream(hip_stream&&);
+
+    hip_stream& operator=(const hip_stream&) = delete;
+
+    /** Move-assigns from an existing stream, which will be emptied. */
+    hip_stream& operator=(hip_stream&&) = delete;
+
+    /**
+     * Returns the native HIP stream handle.
+     * In an empty hip_stream, this will return nullptr.
+     */
+    GKO_HIP_STREAM_STRUCT* get() const;
+
+private:
+    GKO_HIP_STREAM_STRUCT* stream_;
+
+    int device_id_;
+};
+
+
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_BASE_STREAM_HPP_
diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
index 68b5da6e3eb..c665e0ead79 100644
--- a/include/ginkgo/core/base/types.hpp
+++ b/include/ginkgo/core/base/types.hpp
@@ -89,11 +89,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 // Handle deprecated notices correctly on different systems
-#if defined(_WIN32) || defined(__CYGWIN__)
-#define GKO_DEPRECATED(msg) __declspec(deprecated(msg))
+// clang-format off
+#define GKO_DEPRECATED(_msg) [[deprecated(_msg)]]
+#ifdef __NVCOMPILER
+#define GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS _Pragma("diag_suppress 1445")
+#define GKO_END_DISABLE_DEPRECATION_WARNINGS _Pragma("diag_warning 1445")
+#elif defined(__GNUC__) || defined(__clang__)
+#define GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS                      \
+    _Pragma("GCC diagnostic push")                                  \
+    _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
+#define GKO_END_DISABLE_DEPRECATION_WARNINGS _Pragma("GCC diagnostic pop")
+#elif defined(_MSC_VER)
+#define GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS        \
+    _Pragma("warning(push)")                          \
+    _Pragma("warning(disable : 5211 4973 4974 4996)")
+#define GKO_END_DISABLE_DEPRECATION_WARNINGS _Pragma("warning(pop)")
 #else
-#define GKO_DEPRECATED(msg) __attribute__((deprecated(msg)))
-#endif  // defined(_WIN32) || defined(__CYGWIN__)
+#define GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
+#define GKO_END_DISABLE_DEPRECATION_WARNINGS
+#endif
+// clang-format on
 
 
 namespace gko {
@@ -531,6 +546,22 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x,
     template _macro(double, int64)
 #endif
 
+#if GINKGO_DPCPP_SINGLE_MODE
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro) \
+    template _macro(float, int32);                            \
+    template <>                                               \
+    _macro(double, int32) GKO_NOT_IMPLEMENTED;                \
+    template _macro(std::complex<float>, int32);              \
+    template <>                                               \
+    _macro(std::complex<double>, int32) GKO_NOT_IMPLEMENTED
+#else
+#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro) \
+    template _macro(float, int32);                            \
+    template _macro(double, int32);                           \
+    template _macro(std::complex<float>, int32);              \
+    template _macro(std::complex<double>, int32)
+#endif
+
 
 /**
  * Instantiates a template for each value and index type compiled by Ginkgo.
diff --git a/include/ginkgo/core/base/utils_helper.hpp b/include/ginkgo/core/base/utils_helper.hpp
index 3f26d5d7659..0fb509eb8e2 100644
--- a/include/ginkgo/core/base/utils_helper.hpp
+++ b/include/ginkgo/core/base/utils_helper.hpp
@@ -294,9 +294,9 @@ inline typename std::remove_reference<OwningPointer>::type&& give(
  *       same as calling .get() on the smart pointer.
  */
 template <typename Pointer>
-[[deprecated("no longer necessary, just pass the object without lend")]] inline
-    typename std::enable_if<detail::have_ownership_s<Pointer>::value,
-                            detail::pointee<Pointer>*>::type
+GKO_DEPRECATED("no longer necessary, just pass the object without lend")
+inline typename std::enable_if<detail::have_ownership_s<Pointer>::value,
+                               detail::pointee<Pointer>*>::type
     lend(const Pointer& p)
 {
     return p.get();
@@ -313,9 +313,9 @@ template <typename Pointer>
  *       returns `p`.
  */
 template <typename Pointer>
-[[deprecated("no longer necessary, just pass the object without lend")]] inline
-    typename std::enable_if<!detail::have_ownership_s<Pointer>::value,
-                            detail::pointee<Pointer>*>::type
+GKO_DEPRECATED("no longer necessary, just pass the object without lend")
+inline typename std::enable_if<!detail::have_ownership_s<Pointer>::value,
+                               detail::pointee<Pointer>*>::type
     lend(const Pointer& p)
 {
     return p;
diff --git a/include/ginkgo/core/distributed/partition.hpp b/include/ginkgo/core/distributed/partition.hpp
index 0096edf999c..bb36528a4a8 100644
--- a/include/ginkgo/core/distributed/partition.hpp
+++ b/include/ginkgo/core/distributed/partition.hpp
@@ -83,11 +83,11 @@ namespace distributed {
  * ```
  * starting_index[0] = 0,
  * starting_index[1] = 0,
- * starting_index[2] = 3,  // second range of part 1
+ * starting_index[2] = 3,  // second range of part 0
  * starting_index[3] = 0,
- * starting_index[4] = 5,  // third range of part 1
+ * starting_index[4] = 5,  // third range of part 0
  * ```
- * which you can use to iterate only over the the second range of part 1 (the
+ * which you can use to iterate only over the the second range of part 0 (the
  * third global range) with
  * ```
  * for(int i = 0; i < r[3] - r[2]; ++i){
@@ -231,7 +231,7 @@ class Partition
      *
      * @return  true if each part has no more than one contiguous range.
      */
-    bool has_connected_parts();
+    bool has_connected_parts() const;
 
     /**
      * Checks if the ranges are ordered by their part index.
@@ -240,7 +240,7 @@ class Partition
      *
      * @return  true if the ranges are ordered by their part index.
      */
-    bool has_ordered_parts();
+    bool has_ordered_parts() const;
 
     /**
      * Builds a partition from a given mapping global_index -> part_id.
@@ -260,15 +260,18 @@ class Partition
      *
      * @param exec  the Executor on which the partition should be built
      * @param ranges  the boundaries of the ranges representing each part.
-     *                Part i contains the indices [ranges[i], ranges[i + 1]).
-     *                Has to contain at least one element.
-     *                The first element has to be 0.
+     *                Part part_id[i] contains the indices
+     *                [ranges[i], ranges[i + 1]). Has to contain at least
+     *                one element. The first element has to be 0.
+     * @param part_ids  the part ids of the provided ranges. If empty, then
+     *                  it will assume range i belongs to part i.
      *
      * @return  a Partition representing the given contiguous partitioning.
      */
     static std::unique_ptr<Partition> build_from_contiguous(
         std::shared_ptr<const Executor> exec,
-        const array<global_index_type>& ranges);
+        const array<global_index_type>& ranges,
+        const array<comm_index_type>& part_ids = {});
 
     /**
      * Builds a partition by evenly distributing the global range.
diff --git a/include/ginkgo/core/distributed/partition_helpers.hpp b/include/ginkgo/core/distributed/partition_helpers.hpp
new file mode 100644
index 00000000000..6bc20350a7d
--- /dev/null
+++ b/include/ginkgo/core/distributed/partition_helpers.hpp
@@ -0,0 +1,101 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_DISTRIBUTED_PARTITION_HELPERS_HPP_
+#define GKO_PUBLIC_CORE_DISTRIBUTED_PARTITION_HELPERS_HPP_
+
+
+#include <ginkgo/config.hpp>
+
+
+#if GINKGO_BUILD_MPI
+
+
+#include <ginkgo/core/base/mpi.hpp>
+#include <ginkgo/core/base/range.hpp>
+
+
+namespace gko {
+namespace experimental {
+namespace distributed {
+
+template <typename LocalIndexType, typename GlobalIndexType>
+class Partition;
+
+
+/**
+ * Builds a partition from a local range.
+ *
+ * @param exec  the Executor on which the partition should be built.
+ * @param comm  the communicator used to determine the global partition.
+ * @param local_range the start and end indices of the local range.
+ *
+ * @warning  This throws, if the resulting partition would contain gaps.
+ *           That means that for a partition of size `n` every local range `r_i
+ *           = [s_i, e_i)` either `s_i != 0` and another local range `r_j =
+ *           [s_j, e_j = s_i)` exists, or `e_i != n` and another local range
+ *           `r_j = [s_j = e_i, e_j)` exists.
+ *
+ * @return a Partition where each range has the individual local_start
+ *         and local_ends.
+ */
+template <typename LocalIndexType, typename GlobalIndexType>
+std::unique_ptr<Partition<LocalIndexType, GlobalIndexType>>
+build_partition_from_local_range(std::shared_ptr<const Executor> exec,
+                                 mpi::communicator comm, span local_range);
+
+
+/**
+ * Builds a partition from a local size.
+ *
+ * @param exec  the Executor on which the partition should be built.
+ * @param comm  the communicator used to determine the global partition.
+ * @param local_range the number of the locally owned indices
+ *
+ * @return a Partition where each range has the specified local size. More
+ *         specifically, if this is called on process i with local_size `s_i`,
+ *         then the range `i` has size `s_i`, and range `r_i = [start, start +
+ *         s_i)`, where `start = sum_j^(i-1) s_j`.
+ */
+template <typename LocalIndexType, typename GlobalIndexType>
+std::unique_ptr<Partition<LocalIndexType, GlobalIndexType>>
+build_partition_from_local_size(std::shared_ptr<const Executor> exec,
+                                mpi::communicator comm, size_type local_size);
+
+
+}  // namespace distributed
+}  // namespace experimental
+}  // namespace gko
+
+
+#endif  // GINKGO_BUILD_MPI
+#endif  // GKO_PUBLIC_CORE_DISTRIBUTED_PARTITION_HELPERS_HPP_
diff --git a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
index 9016442df67..d2dbe8e6588 100644
--- a/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
+++ b/include/ginkgo/core/distributed/preconditioner/schwarz.hpp
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if GINKGO_BUILD_MPI
 
 
+#include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/distributed/matrix.hpp>
 #include <ginkgo/core/distributed/vector.hpp>
@@ -93,8 +94,14 @@ class Schwarz
         /**
          * Local solver factory.
          */
-        std::shared_ptr<const LinOpFactory> GKO_FACTORY_PARAMETER_SCALAR(
-            local_solver_factory, nullptr);
+        std::shared_ptr<const LinOpFactory> GKO_DEFERRED_FACTORY_PARAMETER(
+            local_solver);
+
+        /**
+         * Generated Inner solvers.
+         */
+        std::shared_ptr<const LinOp> GKO_FACTORY_PARAMETER_SCALAR(
+            generated_local_solver, nullptr);
     };
     GKO_ENABLE_LIN_OP_FACTORY(Schwarz, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
@@ -112,7 +119,7 @@ class Schwarz
     /**
      * Creates a Schwarz preconditioner from a matrix using a Schwarz::Factory.
      *
-     * @param factory  the factory to use to create the preconditoner
+     * @param factory  the factory to use to create the preconditioner
      * @param system_matrix  the matrix this preconditioner should be created
      *                       from
      */
@@ -126,11 +133,10 @@ class Schwarz
     }
 
     /**
-     * Generates the preconditoner.
+     * Generates the preconditioner.
      */
     void generate(std::shared_ptr<const LinOp> system_matrix);
 
-
     void apply_impl(const LinOp* b, LinOp* x) const override;
 
     template <typename VectorType>
@@ -140,6 +146,13 @@ class Schwarz
                     LinOp* x) const override;
 
 private:
+    /**
+     * Sets the solver operator used as the local solver.
+     *
+     * @param new_solver  the new local solver
+     */
+    void set_solver(std::shared_ptr<const LinOp> new_solver);
+
     std::shared_ptr<const LinOp> local_solver_;
 };
 
diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp
index 1ad0b171788..87afa3a01b5 100644
--- a/include/ginkgo/core/distributed/vector.hpp
+++ b/include/ginkgo/core/distributed/vector.hpp
@@ -337,7 +337,7 @@ class Vector
                           array<char>& tmp) const;
 
     /**
-     * Computes the square of the column-wise Euclidian ($L^2$) norm of this
+     * Computes the square of the column-wise Euclidean ($L^2$) norm of this
      * (multi-)vector using a global reduction.
      *
      * @param result  a Dense row vector, used to store the norm
@@ -347,7 +347,7 @@ class Vector
     void compute_squared_norm2(ptr_param<LinOp> result) const;
 
     /**
-     * Computes the square of the column-wise Euclidian ($L^2$) norm of this
+     * Computes the square of the column-wise Euclidean ($L^2$) norm of this
      * (multi-)vector using a global reduction.
      *
      * @param result  a Dense row vector, used to store the norm
@@ -360,7 +360,7 @@ class Vector
     void compute_squared_norm2(ptr_param<LinOp> result, array<char>& tmp) const;
 
     /**
-     * Computes the Euclidian (L^2) norm of this (multi-)vector using a global
+     * Computes the Euclidean (L^2) norm of this (multi-)vector using a global
      * reduction.
      *
      * @param result  a Dense row matrix, used to store the norm
@@ -370,7 +370,7 @@ class Vector
     void compute_norm2(ptr_param<LinOp> result) const;
 
     /**
-     * Computes the Euclidian (L^2) norm of this (multi-)vector using a global
+     * Computes the Euclidean (L^2) norm of this (multi-)vector using a global
      * reduction.
      *
      * @param result  a Dense row matrix, used to store the norm
@@ -404,6 +404,29 @@ class Vector
      */
     void compute_norm1(ptr_param<LinOp> result, array<char>& tmp) const;
 
+    /**
+     * Computes the column-wise mean of this (multi-)vector using a global
+     * reduction.
+     *
+     * @param result  a Dense row matrix, used to store the mean
+     *                (the number of columns in result must match the number
+     *                of columns of this)
+     */
+    void compute_mean(ptr_param<LinOp> result) const;
+
+    /**
+     * Computes the column-wise arithmetic mean of this (multi-)vector using a
+     * global reduction.
+     *
+     * @param result  a Dense row matrix, used to store the mean
+     *                (the number of columns in result must match the number
+     *                of columns of this)
+     * @param tmp  the temporary storage to use for partial sums during the
+     *             reduction computation. It may be resized and/or reset to the
+     *             correct executor.
+     */
+    void compute_mean(ptr_param<LinOp> result, array<char>& tmp) const;
+
     /**
      * Returns a single element of the multi-vector.
      *
diff --git a/include/ginkgo/core/factorization/cholesky.hpp b/include/ginkgo/core/factorization/cholesky.hpp
index 3b40cf45f71..7516611009e 100644
--- a/include/ginkgo/core/factorization/cholesky.hpp
+++ b/include/ginkgo/core/factorization/cholesky.hpp
@@ -96,6 +96,13 @@ class Cholesky
         bool GKO_FACTORY_PARAMETER_SCALAR(skip_sorting, false);
     };
 
+    /**
+     * Returns the parameters used to construct the factory.
+     *
+     * @return the parameters used to construct the factory.
+     */
+    const parameters_type& get_parameters() { return parameters_; }
+
     /**
      * @copydoc LinOpFactory::generate
      * @note This function overrides the default LinOpFactory::generate to
diff --git a/include/ginkgo/core/factorization/lu.hpp b/include/ginkgo/core/factorization/lu.hpp
index 675dfd10a8d..e8994007f27 100644
--- a/include/ginkgo/core/factorization/lu.hpp
+++ b/include/ginkgo/core/factorization/lu.hpp
@@ -46,6 +46,25 @@ namespace experimental {
 namespace factorization {
 
 
+enum class symbolic_type {
+    /** An LU factorization algorithm that works on all matrices. */
+    general,
+    /**
+     * An LU factorization algorithm that works best on matrices with an almost
+     * symmetric sparsity pattern. It is correct for general matrices, but may
+     * use excessive amounts of memory and time.
+     */
+    near_symmetric,
+    /**
+     * An LU factorization algorithm that works only on matrices with a
+     * symmetric sparsity pattern. Running it on a matrix with a non-symmetric
+     * sparsity pattern is undefined behavior and will likely lead to the
+     * application crashing.
+     */
+    symmetric
+};
+
+
 /**
  * Computes an LU factorization of a sparse matrix. This LinOpFactory returns a
  * Factorization storing the L and U factors for the provided system matrix in
@@ -85,12 +104,14 @@ class Lu
             GKO_FACTORY_PARAMETER_SCALAR(symbolic_factorization, nullptr);
 
         /**
-         * If the system matrix has a symmetric sparsity pattern, set this flag
-         * to `true` to use a symbolic Cholesky factorization instead of a
-         * symbolic LU factorization to determine the sparsity pattern of L & U.
-         * This will most likely significantly reduce the generation runtime.
+         * If the symbolic factorization of the matrix is not provided to the
+         * factory, this parameter controls which algorithm will be used to
+         * compute it.
+         * @note Only use symbolic_factorization_algorithm::symmetric if you are
+         *       sure your matrix has a symmetric sparsity pattern!
          */
-        bool GKO_FACTORY_PARAMETER_SCALAR(symmetric_sparsity, false);
+        symbolic_type GKO_FACTORY_PARAMETER_SCALAR(symbolic_algorithm,
+                                                   symbolic_type::general);
 
         /**
          * The `system_matrix`, which will be given to this factory, must be
@@ -105,6 +126,13 @@ class Lu
         bool GKO_FACTORY_PARAMETER_SCALAR(skip_sorting, false);
     };
 
+    /**
+     * Returns the parameters used to construct the factory.
+     *
+     * @return the parameters used to construct the factory.
+     */
+    const parameters_type& get_parameters() { return parameters_; }
+
     /**
      * @copydoc LinOpFactory::generate
      * @note This function overrides the default LinOpFactory::generate to
diff --git a/include/ginkgo/core/factorization/par_ic.hpp b/include/ginkgo/core/factorization/par_ic.hpp
index 365a431208a..2df350f31a2 100644
--- a/include/ginkgo/core/factorization/par_ic.hpp
+++ b/include/ginkgo/core/factorization/par_ic.hpp
@@ -130,7 +130,7 @@ class ParIc : public Composition<ValueType> {
          * The number of iterations the `compute` kernel will use when doing
          * the factorization. The default value `0` means `Auto`, so the
          * implementation decides on the actual value depending on the
-         * ressources that are available.
+         * resources that are available.
          */
         size_type GKO_FACTORY_PARAMETER_SCALAR(iterations, 0);
 
diff --git a/include/ginkgo/core/factorization/par_ict.hpp b/include/ginkgo/core/factorization/par_ict.hpp
index a9b41f33d90..173136fa682 100644
--- a/include/ginkgo/core/factorization/par_ict.hpp
+++ b/include/ginkgo/core/factorization/par_ict.hpp
@@ -236,7 +236,7 @@ class ParIct : public Composition<ValueType> {
      * matrix_type
      *
      * @param system_matrix  the source matrix used to generate the factors.
-     *                       @note: system_matrix must be convertable to a Csr
+     *                       @note: system_matrix must be convertible to a Csr
      *                              Matrix, otherwise, an exception is thrown.
      * @return  A Composition, containing the incomplete LU factors for the
      *          given system_matrix (first element is L, then L^T)
diff --git a/include/ginkgo/core/factorization/par_ilu.hpp b/include/ginkgo/core/factorization/par_ilu.hpp
index 539946befec..878721afbd5 100644
--- a/include/ginkgo/core/factorization/par_ilu.hpp
+++ b/include/ginkgo/core/factorization/par_ilu.hpp
@@ -128,7 +128,7 @@ class ParIlu : public Composition<ValueType> {
          * The number of iterations the `compute` kernel will use when doing
          * the factorization. The default value `0` means `Auto`, so the
          * implementation decides on the actual value depending on the
-         * ressources that are available.
+         * resources that are available.
          */
         size_type GKO_FACTORY_PARAMETER_SCALAR(iterations, 0);
 
diff --git a/include/ginkgo/core/factorization/par_ilut.hpp b/include/ginkgo/core/factorization/par_ilut.hpp
index ba4ce7d1629..76f3789a44e 100644
--- a/include/ginkgo/core/factorization/par_ilut.hpp
+++ b/include/ginkgo/core/factorization/par_ilut.hpp
@@ -242,7 +242,7 @@ class ParIlut : public Composition<ValueType> {
      * while the dynamic type of U is u_matrix_type.
      *
      * @param system_matrix  the source matrix used to generate the factors.
-     *                       @note: system_matrix must be convertable to a Csr
+     *                       @note: system_matrix must be convertible to a Csr
      *                              Matrix, otherwise, an exception is thrown.
      * @return  A Composition, containing the incomplete LU factors for the
      *          given system_matrix (first element is L, then U)
diff --git a/include/ginkgo/core/log/batch_logger.hpp b/include/ginkgo/core/log/batch_logger.hpp
new file mode 100644
index 00000000000..713b5dbe1d5
--- /dev/null
+++ b/include/ginkgo/core/log/batch_logger.hpp
@@ -0,0 +1,188 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_LOG_BATCH_LOGGER_HPP_
+#define GKO_PUBLIC_CORE_LOG_BATCH_LOGGER_HPP_
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/log/logger.hpp>
+
+
+namespace gko {
+namespace batch {
+/**
+ * @brief The logger namespace .
+ * @ref log
+ * @ingroup log
+ */
+namespace log {
+namespace detail {
+
+
+/**
+ * Stores logging data for batch solver kernels.
+ *
+ * @note Supports only single rhs
+ */
+template <typename ValueType>
+struct log_data final {
+    using real_type = remove_complex<ValueType>;
+
+    log_data(std::shared_ptr<const Executor> exec, size_type num_batch_items)
+        : res_norms(exec), iter_counts(exec)
+    {
+        const size_type workspace_size =
+            num_batch_items * (sizeof(real_type) + sizeof(int));
+        if (num_batch_items > 0) {
+            iter_counts.resize_and_reset(num_batch_items);
+            res_norms.resize_and_reset(num_batch_items);
+        } else {
+            GKO_INVALID_STATE("Invalid num batch items passed in");
+        }
+    }
+
+    log_data(std::shared_ptr<const Executor> exec, size_type num_batch_items,
+             array<unsigned char>& workspace)
+        : res_norms(exec), iter_counts(exec)
+    {
+        const size_type workspace_size =
+            num_batch_items * (sizeof(real_type) + sizeof(int));
+        if (num_batch_items > 0 && !workspace.is_owning() &&
+            workspace.get_num_elems() >= workspace_size) {
+            iter_counts =
+                array<int>::view(exec, num_batch_items,
+                                 reinterpret_cast<int*>(workspace.get_data()));
+            res_norms = array<real_type>::view(
+                exec, num_batch_items,
+                reinterpret_cast<real_type*>(workspace.get_data() +
+                                             (sizeof(int) * num_batch_items)));
+        } else {
+            GKO_INVALID_STATE("invalid workspace or num batch items passed in");
+        }
+    }
+
+    /**
+     * Stores residual norm values for every linear system in the batch.
+     */
+    array<real_type> res_norms;
+
+    /**
+     * Stores convergence iteration counts for every matrix in the batch
+     */
+    array<int> iter_counts;
+};
+
+
+}  // namespace detail
+
+
+/**
+ * Logs the final residuals and iteration counts for a batch solver.
+ *
+ * The purpose of this logger is to give simple access to standard data
+ * generated by the solver once it has converged.
+ *
+ * @note The final logged residuals are the implicit residuals that have been
+ * computed within the solver process. Depending on the solver algorithm, this
+ * may be significantly different from the true residual (||b - Ax||).
+ *
+ * @ingroup log
+ */
+template <typename ValueType = default_precision>
+class BatchConvergence final : public gko::log::Logger {
+public:
+    using real_type = remove_complex<ValueType>;
+    using mask_type = gko::log::Logger::mask_type;
+
+    void on_batch_solver_completed(
+        const array<int>& iteration_count,
+        const array<real_type>& residual_norm) const override;
+
+    /**
+     * Creates a convergence logger. This dynamically allocates the memory,
+     * constructs the object and returns an std::unique_ptr to this object.
+     * TODO: See if the objects can be pre-allocated beforehand instead of being
+     * copied in the `on_<>` event
+     *
+     * @param exec  the executor
+     * @param enabled_events  the events enabled for this logger. By default all
+     *                        events.
+     *
+     * @return an std::unique_ptr to the the constructed object
+     */
+    static std::unique_ptr<BatchConvergence> create(
+        const mask_type& enabled_events =
+            gko::log::Logger::batch_solver_completed_mask)
+    {
+        return std::unique_ptr<BatchConvergence>(
+            new BatchConvergence(enabled_events));
+    }
+
+    /**
+     * @return  The number of iterations for entire batch
+     */
+    const array<int>& get_num_iterations() const noexcept
+    {
+        return iteration_count_;
+    }
+
+    /**
+     * @return  The residual norms for the entire batch.
+     */
+    const array<real_type>& get_residual_norm() const noexcept
+    {
+        return residual_norm_;
+    }
+
+protected:
+    explicit BatchConvergence(const mask_type& enabled_events =
+                                  gko::log::Logger::batch_solver_completed_mask)
+        : gko::log::Logger(enabled_events)
+    {}
+
+private:
+    mutable array<int> iteration_count_{};
+    mutable array<real_type> residual_norm_{};
+};
+
+
+}  // namespace log
+}  // namespace batch
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_LOG_BATCH_LOGGER_HPP_
diff --git a/include/ginkgo/core/log/convergence.hpp b/include/ginkgo/core/log/convergence.hpp
index bc1e2a50816..1640d3d877a 100644
--- a/include/ginkgo/core/log/convergence.hpp
+++ b/include/ginkgo/core/log/convergence.hpp
@@ -102,11 +102,11 @@ class Convergence : public Logger {
      * dependencies. At the same time, this method is short enough that it
      * shouldn't be a problem.
      */
-    [[deprecated(
-        "use single-parameter create")]] static std::unique_ptr<Convergence>
-    create(std::shared_ptr<const Executor>,
-           const mask_type& enabled_events = Logger::criterion_events_mask |
-                                             Logger::iteration_complete_mask)
+    GKO_DEPRECATED("use single-parameter create")
+    static std::unique_ptr<Convergence> create(
+        std::shared_ptr<const Executor>,
+        const mask_type& enabled_events = Logger::criterion_events_mask |
+                                          Logger::iteration_complete_mask)
     {
         return std::unique_ptr<Convergence>(new Convergence(enabled_events));
     }
@@ -188,7 +188,8 @@ class Convergence : public Logger {
      * @param enabled_events  the events enabled for this logger. By default all
      *                        events.
      */
-    [[deprecated("use single-parameter constructor")]] explicit Convergence(
+    GKO_DEPRECATED("use single-parameter constructor")
+    explicit Convergence(
         std::shared_ptr<const gko::Executor>,
         const mask_type& enabled_events = Logger::criterion_events_mask |
                                           Logger::iteration_complete_mask)
diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp
index 0f22663347c..470e9e6438f 100644
--- a/include/ginkgo/core/log/logger.hpp
+++ b/include/ginkgo/core/log/logger.hpp
@@ -58,6 +58,20 @@ class PolymorphicObject;
 class Operation;
 class stopping_status;
 
+
+namespace batch {
+
+
+class BatchLinOp;
+class BatchLinOpFactory;
+
+template <typename ValueType>
+class MultiVector;
+
+
+}  // namespace batch
+
+
 /**
  * @brief The Stopping criterion namespace.
  * @ref stop
@@ -111,10 +125,10 @@ class Logger {
      * call only if the user activates this event through the mask. If the
      * event is activated, we rely on polymorphism and the virtual method
      * `on_##_event_name()` to either call the Logger class's function,
-     * which does nothing, or the overriden version in the derived class if
+     * which does nothing, or the overridden version in the derived class if
      * any. Therefore, to support a new event in any Logger (i.e. class
      * which derive from this class), the function `on_##_event_name()`
-     * should be overriden and implemented.
+     * should be overridden and implemented.
      *
      * @param _id  the unique id of the event
      *
@@ -448,12 +462,12 @@ public:                                                              \
      * @warning This on_iteration_complete function that this macro declares is
      * deprecated. Please use the version with the stopping information.
      */
-    [[deprecated(
+    GKO_DEPRECATED(
         "Please use the version with the additional stopping "
-        "information.")]] virtual void
-    on_iteration_complete(const LinOp* solver, const size_type& it,
-                          const LinOp* r, const LinOp* x = nullptr,
-                          const LinOp* tau = nullptr) const
+        "information.")
+    virtual void on_iteration_complete(const LinOp* solver, const size_type& it,
+                                       const LinOp* r, const LinOp* x = nullptr,
+                                       const LinOp* tau = nullptr) const
     {}
 
     /**
@@ -469,28 +483,17 @@ public:                                                              \
      * @warning This on_iteration_complete function that this macro declares is
      * deprecated. Please use the version with the stopping information.
      */
-    [[deprecated(
+    GKO_DEPRECATED(
         "Please use the version with the additional stopping "
-        "information.")]] virtual void
-    on_iteration_complete(const LinOp* solver, const size_type& it,
-                          const LinOp* r, const LinOp* x, const LinOp* tau,
-                          const LinOp* implicit_tau_sq) const
+        "information.")
+    virtual void on_iteration_complete(const LinOp* solver, const size_type& it,
+                                       const LinOp* r, const LinOp* x,
+                                       const LinOp* tau,
+                                       const LinOp* implicit_tau_sq) const
     {
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 5211, 4973, 4974)
-#endif
+        GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
         this->on_iteration_complete(solver, it, r, x, tau);
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
+        GKO_END_DISABLE_DEPRECATION_WARNINGS
     }
 
     /**
@@ -515,27 +518,9 @@ public:                                                              \
                                        const array<stopping_status>* status,
                                        bool stopped) const
     {
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif  // defined(__GNUC__) || defined(__clang__)
-#ifdef __NVCOMPILER
-#pragma diag_suppress 1445
-#endif  // __NVCOMPILER
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 5211, 4973, 4974)
-#endif  // _MSC_VER
+        GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
         this->on_iteration_complete(solver, it, r, x, tau, implicit_tau_sq);
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#endif  // defined(__GNUC__) || defined(__clang__)
-#ifdef __NVCOMPILER
-#pragma diag_warning 1445
-#endif  // __NVCOMPILER
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif  // _MSC_VER
+        GKO_END_DISABLE_DEPRECATION_WARNINGS
     }
 
 public:
@@ -563,6 +548,67 @@ public:                                                              \
                               const PolymorphicObject* input,
                               const PolymorphicObject* output)
 
+    /**
+     * BatchLinOp Factory's generate started event.
+     *
+     * @param factory  the factory used
+     * @param input  the BatchLinOp object used as input for the generation
+     * (usually a system matrix)
+     */
+    GKO_LOGGER_REGISTER_EVENT(24, batch_linop_factory_generate_started,
+                              const batch::BatchLinOpFactory* factory,
+                              const batch::BatchLinOp* input)
+
+    /**
+     * BatchLinOp Factory's generate completed event.
+     *
+     * @param factory  the factory used
+     * @param input  the BatchLinOp object used as input for the generation
+     * (usually a system matrix)
+     * @param output  the generated BatchLinOp object
+     */
+    GKO_LOGGER_REGISTER_EVENT(25, batch_linop_factory_generate_completed,
+                              const batch::BatchLinOpFactory* factory,
+                              const batch::BatchLinOp* input,
+                              const batch::BatchLinOp* output)
+
+public:
+    static constexpr size_type batch_solver_completed{26};
+    static constexpr mask_type batch_solver_completed_mask{mask_type{1} << 26};
+
+    template <size_type Event, typename... Params>
+    std::enable_if_t<Event == 26 && (26 < event_count_max)> on(
+        Params&&... params) const
+    {
+        if (enabled_events_ & batch_solver_completed_mask) {
+            this->on_batch_solver_completed(std::forward<Params>(params)...);
+        }
+    }
+
+protected:
+    /**
+     * Batch solver's event that records the iteration count and the residual
+     * norm.
+     *
+     * @param iters  the array of iteration counts.
+     * @param residual_norms  the array storing the residual norms.
+     */
+    virtual void on_batch_solver_completed(
+        const array<int>& iters, const array<double>& residual_norms) const
+    {}
+
+    /**
+     * Batch solver's event that records the iteration count and the residual
+     * norm.
+     *
+     * @param iters  the array of iteration counts.
+     * @param residual_norms  the array storing the residual norms.
+     */
+    virtual void on_batch_solver_completed(
+        const array<int>& iters, const array<float>& residual_norms) const
+    {}
+
+public:
 #undef GKO_LOGGER_REGISTER_EVENT
 
     /**
@@ -605,6 +651,13 @@ public:                                                              \
         linop_factory_generate_started_mask |
         linop_factory_generate_completed_mask;
 
+    /**
+     * Bitset Mask which activates all batch linop factory events
+     */
+    static constexpr mask_type batch_linop_factory_events_mask =
+        batch_linop_factory_generate_started_mask |
+        batch_linop_factory_generate_completed_mask;
+
     /**
      * Bitset Mask which activates all criterion events
      */
@@ -634,9 +687,9 @@ public:                                                              \
      *                           logs every event except linop's apply started
      *                           event.
      */
-    [[deprecated("use single-parameter constructor")]] explicit Logger(
-        std::shared_ptr<const gko::Executor> exec,
-        const mask_type& enabled_events = all_events_mask)
+    GKO_DEPRECATED("use single-parameter constructor")
+    explicit Logger(std::shared_ptr<const gko::Executor> exec,
+                    const mask_type& enabled_events = all_events_mask)
         : Logger{enabled_events}
     {}
 
diff --git a/include/ginkgo/core/log/papi.hpp b/include/ginkgo/core/log/papi.hpp
index 5d07879d116..ff459858a8d 100644
--- a/include/ginkgo/core/log/papi.hpp
+++ b/include/ginkgo/core/log/papi.hpp
@@ -44,18 +44,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <iostream>
 #include <map>
 #include <mutex>
-
-
-#include <papi.h>
+#include <sde_lib.h>
 
 
 #include <ginkgo/core/base/polymorphic_object.hpp>
 #include <ginkgo/core/log/logger.hpp>
 
 
-#include "third_party/papi_sde/papi_sde_interface.h"
-
-
 namespace gko {
 namespace log {
 
@@ -188,17 +183,18 @@ class Papi : public Logger {
                                const array<stopping_status>* status,
                                bool stopped) const override;
 
-    [[deprecated(
+    GKO_DEPRECATED(
         "Please use the version with the additional stopping "
-        "information.")]] void
-    on_iteration_complete(const LinOp* solver, const size_type& num_iterations,
-                          const LinOp* residual, const LinOp* solution,
-                          const LinOp* residual_norm) const override;
+        "information.")
+    void on_iteration_complete(const LinOp* solver,
+                               const size_type& num_iterations,
+                               const LinOp* residual, const LinOp* solution,
+                               const LinOp* residual_norm) const override;
 
-    [[deprecated(
+    GKO_DEPRECATED(
         "Please use the version with the additional stopping "
-        "information.")]] void
-    on_iteration_complete(
+        "information.")
+    void on_iteration_complete(
         const LinOp* solver, const size_type& num_iterations,
         const LinOp* residual, const LinOp* solution,
         const LinOp* residual_norm,
@@ -209,11 +205,12 @@ class Papi : public Logger {
      *
      * @param enabled_events  the events enabled for this Logger
      */
-    [[deprecated("use single-parameter create")]] static std::shared_ptr<Papi>
-    create(std::shared_ptr<const gko::Executor>,
-           const Logger::mask_type& enabled_events = Logger::all_events_mask)
+    GKO_DEPRECATED("use single-parameter create")
+    static std::shared_ptr<Papi> create(
+        std::shared_ptr<const gko::Executor>,
+        const Logger::mask_type& enabled_events = Logger::all_events_mask)
     {
-        return std::shared_ptr<Papi>(new Papi(enabled_events));
+        return Papi::create(enabled_events);
     }
 
     /**
@@ -224,7 +221,11 @@ class Papi : public Logger {
     static std::shared_ptr<Papi> create(
         const Logger::mask_type& enabled_events = Logger::all_events_mask)
     {
-        return std::shared_ptr<Papi>(new Papi(enabled_events));
+        return std::shared_ptr<Papi>(new Papi(enabled_events), [](auto logger) {
+            auto handle = logger->get_handle();
+            delete logger;
+            papi_sde_shutdown(handle);
+        });
     }
 
     /**
@@ -235,8 +236,16 @@ class Papi : public Logger {
      */
     const std::string get_handle_name() const { return name; }
 
+    /**
+     * Returns the corresponding papi_handle_t for this logger
+     *
+     * @return the corresponding papi_handle_t for this logger
+     */
+    const papi_handle_t get_handle() const { return papi_handle; }
+
 protected:
-    [[deprecated("use single-parameter constructor")]] explicit Papi(
+    GKO_DEPRECATED("use single-parameter constructor")
+    explicit Papi(
         std::shared_ptr<const gko::Executor> exec,
         const Logger::mask_type& enabled_events = Logger::all_events_mask)
         : Papi(enabled_events)
@@ -265,12 +274,10 @@ class Papi : public Logger {
 
         ~papi_queue()
         {
-            if (PAPI_is_initialized()) {
-                for (auto e : data) {
-                    std::ostringstream oss;
-                    oss << counter_name << "::" << e.first;
-                    papi_sde_unregister_counter(*handle, oss.str().c_str());
-                }
+            for (auto e : data) {
+                std::ostringstream oss;
+                oss << counter_name << "::" << e.first;
+                papi_sde_unregister_counter(*handle, oss.str().c_str());
             }
             data.clear();
         }
diff --git a/include/ginkgo/core/log/profiler_hook.hpp b/include/ginkgo/core/log/profiler_hook.hpp
index 9a26acd6ab0..710c9e26209 100644
--- a/include/ginkgo/core/log/profiler_hook.hpp
+++ b/include/ginkgo/core/log/profiler_hook.hpp
@@ -188,17 +188,18 @@ class ProfilerHook : public Logger {
         const LinOp* implicit_sq_residual_norm,
         const array<stopping_status>* status, bool stopped) const override;
 
-    [[deprecated(
+    GKO_DEPRECATED(
         "Please use the version with the additional stopping "
-        "information.")]] void
-    on_iteration_complete(const LinOp* solver, const size_type& num_iterations,
-                          const LinOp* residual, const LinOp* solution,
-                          const LinOp* residual_norm) const override;
+        "information.")
+    void on_iteration_complete(const LinOp* solver,
+                               const size_type& num_iterations,
+                               const LinOp* residual, const LinOp* solution,
+                               const LinOp* residual_norm) const override;
 
-    [[deprecated(
+    GKO_DEPRECATED(
         "Please use the version with the additional stopping "
-        "information.")]] void
-    on_iteration_complete(
+        "information.")
+    void on_iteration_complete(
         const LinOp* solver, const size_type& num_iterations,
         const LinOp* residual, const LinOp* solution,
         const LinOp* residual_norm,
@@ -298,7 +299,7 @@ class ProfilerHook : public Logger {
         std::vector<nested_summary_entry> children{};
     };
 
-    /** Recieves the results from ProfilerHook::create_summary(). */
+    /** Receives the results from ProfilerHook::create_summary(). */
     class SummaryWriter {
     public:
         virtual ~SummaryWriter() = default;
@@ -313,7 +314,7 @@ class ProfilerHook : public Logger {
                            std::chrono::nanoseconds overhead) = 0;
     };
 
-    /** Recieves the results from ProfilerHook::create_nested_summary(). */
+    /** Receives the results from ProfilerHook::create_nested_summary(). */
     class NestedSummaryWriter {
     public:
         virtual ~NestedSummaryWriter() = default;
diff --git a/include/ginkgo/core/log/record.hpp b/include/ginkgo/core/log/record.hpp
index 62cc3f0e7fc..1d27a57bb01 100644
--- a/include/ginkgo/core/log/record.hpp
+++ b/include/ginkgo/core/log/record.hpp
@@ -402,17 +402,18 @@ class Record : public Logger {
         const LinOp* residual_norm, const LinOp* implicit_resnorm_sq,
         const array<stopping_status>* status, bool stopped) const override;
 
-    [[deprecated(
+    GKO_DEPRECATED(
         "Please use the version with the additional stopping "
-        "information.")]] void
-    on_iteration_complete(const LinOp* solver, const size_type& num_iterations,
-                          const LinOp* residual, const LinOp* solution,
-                          const LinOp* residual_norm) const override;
+        "information.")
+    void on_iteration_complete(const LinOp* solver,
+                               const size_type& num_iterations,
+                               const LinOp* residual, const LinOp* solution,
+                               const LinOp* residual_norm) const override;
 
-    [[deprecated(
+    GKO_DEPRECATED(
         "Please use the version with the additional stopping "
-        "information.")]] void
-    on_iteration_complete(
+        "information.")
+    void on_iteration_complete(
         const LinOp* solver, const size_type& num_iterations,
         const LinOp* residual, const LinOp* solution,
         const LinOp* residual_norm,
@@ -436,10 +437,11 @@ class Record : public Logger {
      * dependencies. At the same time, this method is short enough that it
      * shouldn't be a problem.
      */
-    [[deprecated("use two-parameter create")]] static std::unique_ptr<Record>
-    create(std::shared_ptr<const Executor> exec,
-           const mask_type& enabled_events = Logger::all_events_mask,
-           size_type max_storage = 1)
+    GKO_DEPRECATED("use two-parameter create")
+    static std::unique_ptr<Record> create(
+        std::shared_ptr<const Executor> exec,
+        const mask_type& enabled_events = Logger::all_events_mask,
+        size_type max_storage = 1)
     {
         return std::unique_ptr<Record>(new Record(enabled_events, max_storage));
     }
@@ -493,10 +495,10 @@ class Record : public Logger {
      *                     storage. It is advised to control this to reduce
      *                     memory overhead of this logger.
      */
-    [[deprecated("use two-parameter constructor")]] explicit Record(
-        std::shared_ptr<const gko::Executor> exec,
-        const mask_type& enabled_events = Logger::all_events_mask,
-        size_type max_storage = 0)
+    GKO_DEPRECATED("use two-parameter constructor")
+    explicit Record(std::shared_ptr<const gko::Executor> exec,
+                    const mask_type& enabled_events = Logger::all_events_mask,
+                    size_type max_storage = 0)
         : Record(enabled_events, max_storage)
     {}
 
diff --git a/include/ginkgo/core/log/stream.hpp b/include/ginkgo/core/log/stream.hpp
index 6981beacce2..723a44e2051 100644
--- a/include/ginkgo/core/log/stream.hpp
+++ b/include/ginkgo/core/log/stream.hpp
@@ -164,17 +164,18 @@ class Stream : public Logger {
                                const array<stopping_status>* status,
                                bool stopped) const override;
 
-    [[deprecated(
+    GKO_DEPRECATED(
         "Please use the version with the additional stopping "
-        "information.")]] void
-    on_iteration_complete(const LinOp* solver, const size_type& num_iterations,
-                          const LinOp* residual, const LinOp* solution,
-                          const LinOp* residual_norm) const override;
+        "information.")
+    void on_iteration_complete(const LinOp* solver,
+                               const size_type& num_iterations,
+                               const LinOp* residual, const LinOp* solution,
+                               const LinOp* residual_norm) const override;
 
-    [[deprecated(
+    GKO_DEPRECATED(
         "Please use the version with the additional stopping "
-        "information.")]] void
-    on_iteration_complete(
+        "information.")
+    void on_iteration_complete(
         const LinOp* solver, const size_type& num_iterations,
         const LinOp* residual, const LinOp* solution,
         const LinOp* residual_norm,
@@ -198,10 +199,11 @@ class Stream : public Logger {
      * dependencies. At the same time, this method is short enough that it
      * shouldn't be a problem.
      */
-    [[deprecated("use three-parameter create")]] static std::unique_ptr<Stream>
-    create(std::shared_ptr<const Executor> exec,
-           const Logger::mask_type& enabled_events = Logger::all_events_mask,
-           std::ostream& os = std::cout, bool verbose = false)
+    GKO_DEPRECATED("use three-parameter create")
+    static std::unique_ptr<Stream> create(
+        std::shared_ptr<const Executor> exec,
+        const Logger::mask_type& enabled_events = Logger::all_events_mask,
+        std::ostream& os = std::cout, bool verbose = false)
     {
         return std::unique_ptr<Stream>(new Stream(enabled_events, os, verbose));
     }
@@ -243,7 +245,8 @@ class Stream : public Logger {
      *                 includes always printing residuals and other information
      *                 which can give a large output.
      */
-    [[deprecated("use three-parameter constructor")]] explicit Stream(
+    GKO_DEPRECATED("use three-parameter constructor")
+    explicit Stream(
         std::shared_ptr<const gko::Executor> exec,
         const Logger::mask_type& enabled_events = Logger::all_events_mask,
         std::ostream& os = std::cerr, bool verbose = false)
diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp
new file mode 100644
index 00000000000..47230c24e32
--- /dev/null
+++ b/include/ginkgo/core/matrix/batch_dense.hpp
@@ -0,0 +1,374 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_MATRIX_BATCH_DENSE_HPP_
+#define GKO_PUBLIC_CORE_MATRIX_BATCH_DENSE_HPP_
+
+
+#include <initializer_list>
+#include <vector>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/batch_lin_op.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/mtx_io.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+namespace gko {
+namespace batch {
+namespace matrix {
+
+
+/**
+ * Dense is a batch matrix format which explicitly stores all values of the
+ * matrix in each of the batches.
+ *
+ * The values in each of the batches are stored in row-major format (values
+ * belonging to the same row appear consecutive in the memory and the values of
+ * each batch item are also stored consecutively in memory).
+ *
+ * @note Though the storage layout is the same as the multi-vector object, the
+ * class semantics and the operations it aims to provide are different. Hence it
+ * is recommended to create multi-vector objects if the user means to view the
+ * data as a set of vectors.
+ *
+ * @tparam ValueType  precision of matrix elements
+ *
+ * @ingroup batch_dense
+ * @ingroup mat_formats
+ * @ingroup BatchLinOp
+ */
+template <typename ValueType = default_precision>
+class Dense final : public EnableBatchLinOp<Dense<ValueType>>,
+                    public EnableCreateMethod<Dense<ValueType>>,
+                    public ConvertibleTo<Dense<next_precision<ValueType>>> {
+    friend class EnableCreateMethod<Dense>;
+    friend class EnablePolymorphicObject<Dense, BatchLinOp>;
+    friend class Dense<to_complex<ValueType>>;
+    friend class Dense<next_precision<ValueType>>;
+
+public:
+    using EnableBatchLinOp<Dense>::convert_to;
+    using EnableBatchLinOp<Dense>::move_to;
+
+    using value_type = ValueType;
+    using index_type = int32;
+    using transposed_type = Dense<ValueType>;
+    using unbatch_type = gko::matrix::Dense<ValueType>;
+    using absolute_type = remove_complex<Dense>;
+    using complex_type = to_complex<Dense>;
+
+    void convert_to(Dense<next_precision<ValueType>>* result) const override;
+
+    void move_to(Dense<next_precision<ValueType>>* result) override;
+
+    /**
+     * Creates a mutable view (of gko::matrix::Dense type) of one item of the
+     * batch::matrix::Dense<value_type> object. Does not perform any deep
+     * copies, but only returns a view of the data.
+     *
+     * @param item_id  The index of the batch item
+     *
+     * @return  a gko::matrix::Dense object with the data from the batch item
+     * at the given index.
+     */
+    std::unique_ptr<unbatch_type> create_view_for_item(size_type item_id);
+
+    /**
+     * @copydoc create_view_for_item(size_type)
+     */
+    std::unique_ptr<const unbatch_type> create_const_view_for_item(
+        size_type item_id) const;
+
+    /**
+     * Get the cumulative storage size offset
+     *
+     * @param batch_id the batch id
+     *
+     * @return the cumulative offset
+     */
+    size_type get_cumulative_offset(size_type batch_id) const
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return batch_id * this->get_common_size()[0] *
+               this->get_common_size()[1];
+    }
+
+    /**
+     * Returns a pointer to the array of values of the multi-vector
+     *
+     * @return the pointer to the array of values
+     */
+    value_type* get_values() noexcept { return values_.get_data(); }
+
+    /**
+     * @copydoc get_values()
+     *
+     * @note This is the constant version of the function, which can be
+     *       significantly more memory efficient than the non-constant version,
+     *       so always prefer this version.
+     */
+    const value_type* get_const_values() const noexcept
+    {
+        return values_.get_const_data();
+    }
+
+    /**
+     * Returns a single element for a particular batch item.
+     *
+     * @param batch_id  the batch item index to be queried
+     * @param row  the row of the requested element
+     * @param col  the column of the requested element
+     *
+     * @note  the method has to be called on the same Executor the matrix is
+     *        stored at (e.g. trying to call this method on a GPU Dense object
+     *        from the OMP may result in incorrect behaviour)
+     */
+    value_type& at(size_type batch_id, size_type row, size_type col)
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return values_.get_data()[linearize_index(batch_id, row, col)];
+    }
+
+    /**
+     * @copydoc Dense::at(size_type, size_type, size_type)
+     */
+    value_type at(size_type batch_id, size_type row, size_type col) const
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return values_.get_const_data()[linearize_index(batch_id, row, col)];
+    }
+
+    /**
+     * Returns a single element for a particular batch item.
+     *
+     * Useful for iterating across all elements of the matrix.
+     * However, it is less efficient than the two-parameter variant of this
+     * method.
+     *
+     * @param batch_id  the batch item index to be queried
+     * @param idx  a linear index of the requested element
+     *
+     * @note  the method has to be called on the same Executor the matrix is
+     *        stored at (e.g. trying to call this method on a GPU Dense object
+     *        from the OMP may result in incorrect behaviour)
+     */
+    ValueType& at(size_type batch_id, size_type idx) noexcept
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return values_.get_data()[linearize_index(batch_id, idx)];
+    }
+
+    /**
+     * @copydoc Dense::at(size_type, size_type, size_type)
+     */
+    ValueType at(size_type batch_id, size_type idx) const noexcept
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return values_.get_const_data()[linearize_index(batch_id, idx)];
+    }
+
+    /**
+     * Returns a pointer to the array of values of the matrix for a
+     * specific batch item.
+     *
+     * @param batch_id  the id of the batch item.
+     *
+     * @return the pointer to the array of values
+     */
+    value_type* get_values_for_item(size_type batch_id) noexcept
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return values_.get_data() + this->get_cumulative_offset(batch_id);
+    }
+
+    /**
+     * @copydoc get_values_for_item(size_type)
+     *
+     * @note This is the constant version of the function, which can be
+     *       significantly more memory efficient than the non-constant version,
+     *       so always prefer this version.
+     */
+    const value_type* get_const_values_for_item(
+        size_type batch_id) const noexcept
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return values_.get_const_data() + this->get_cumulative_offset(batch_id);
+    }
+
+    /**
+     * Returns the number of elements explicitly stored in the batch matrix,
+     * cumulative across all the batch items.
+     *
+     * @return the number of elements explicitly stored in the vector,
+     *         cumulative across all the batch items
+     */
+    size_type get_num_stored_elements() const noexcept
+    {
+        return values_.get_num_elems();
+    }
+
+    /**
+     * Creates a constant (immutable) batch dense matrix from a constant
+     * array.
+     *
+     * @param exec  the executor to create the matrix on
+     * @param size  the dimensions of the matrix
+     * @param values  the value array of the matrix
+     *
+     * @return A smart pointer to the constant matrix wrapping the input
+     * array (if it resides on the same executor as the matrix) or a copy of the
+     * array on the correct executor.
+     */
+    static std::unique_ptr<const Dense<value_type>> create_const(
+        std::shared_ptr<const Executor> exec, const batch_dim<2>& sizes,
+        gko::detail::const_array_view<ValueType>&& values);
+
+    /**
+     * Apply the matrix to a multi-vector. Represents the matrix vector
+     * multiplication, x = A * b, where x and b are both multi-vectors.
+     *
+     * @param b  the multi-vector to be applied to
+     * @param x  the output multi-vector
+     */
+    Dense* apply(ptr_param<const MultiVector<value_type>> b,
+                 ptr_param<MultiVector<value_type>> x);
+
+    /**
+     * Apply the matrix to a multi-vector with a linear combination of the given
+     * input vector. Represents the matrix vector multiplication, x = alpha * A
+     * * b + beta * x, where x and b are both multi-vectors.
+     *
+     * @param alpha  the scalar to scale the matrix-vector product with
+     * @param b      the multi-vector to be applied to
+     * @param beta   the scalar to scale the x vector with
+     * @param x      the output multi-vector
+     */
+    Dense* apply(ptr_param<const MultiVector<value_type>> alpha,
+                 ptr_param<const MultiVector<value_type>> b,
+                 ptr_param<const MultiVector<value_type>> beta,
+                 ptr_param<MultiVector<value_type>> x);
+
+    /**
+     * @copydoc apply(const MultiVector<value_type>*, MultiVector<value_type>*)
+     */
+    const Dense* apply(ptr_param<const MultiVector<value_type>> b,
+                       ptr_param<MultiVector<value_type>> x) const;
+
+    /**
+     * @copydoc apply(const MultiVector<value_type>*, const
+     * MultiVector<value_type>*, const MultiVector<value_type>*,
+     * MultiVector<value_type>*)
+     */
+    const Dense* apply(ptr_param<const MultiVector<value_type>> alpha,
+                       ptr_param<const MultiVector<value_type>> b,
+                       ptr_param<const MultiVector<value_type>> beta,
+                       ptr_param<MultiVector<value_type>> x) const;
+
+private:
+    inline size_type compute_num_elems(const batch_dim<2>& size)
+    {
+        return size.get_num_batch_items() * size.get_common_size()[0] *
+               size.get_common_size()[1];
+    }
+
+    /**
+     * Creates an uninitialized Dense matrix of the specified size.
+     *
+     * @param exec  Executor associated to the matrix
+     * @param size  size of the matrix
+     */
+    Dense(std::shared_ptr<const Executor> exec,
+          const batch_dim<2>& size = batch_dim<2>{});
+
+    /**
+     * Creates a Dense matrix from an already allocated (and initialized)
+     * array.
+     *
+     * @tparam ValuesArray  type of array of values
+     *
+     * @param exec  Executor associated to the matrix
+     * @param size  sizes of the batch matrices in a batch_dim object
+     * @param values  array of matrix values
+     *
+     * @note If `values` is not an rvalue, not an array of ValueType, or is on
+     *       the wrong executor, an internal copy will be created, and the
+     *       original array data will not be used in the matrix.
+     */
+    template <typename ValuesArray>
+    Dense(std::shared_ptr<const Executor> exec, const batch_dim<2>& size,
+          ValuesArray&& values)
+        : EnableBatchLinOp<Dense>(exec, size),
+          values_{exec, std::forward<ValuesArray>(values)}
+    {
+        // Ensure that the values array has the correct size
+        auto num_elems = compute_num_elems(size);
+        GKO_ENSURE_IN_BOUNDS(num_elems, values_.get_num_elems() + 1);
+    }
+
+    void apply_impl(const MultiVector<value_type>* b,
+                    MultiVector<value_type>* x) const;
+
+    void apply_impl(const MultiVector<value_type>* alpha,
+                    const MultiVector<value_type>* b,
+                    const MultiVector<value_type>* beta,
+                    MultiVector<value_type>* x) const;
+
+    size_type linearize_index(size_type batch, size_type row,
+                              size_type col) const noexcept
+    {
+        return this->get_cumulative_offset(batch) +
+               row * this->get_size().get_common_size()[1] + col;
+    }
+
+    size_type linearize_index(size_type batch, size_type idx) const noexcept
+    {
+        return linearize_index(batch, idx / this->get_common_size()[1],
+                               idx % this->get_common_size()[1]);
+    }
+
+    array<value_type> values_;
+};
+
+
+}  // namespace matrix
+}  // namespace batch
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_MATRIX_BATCH_DENSE_HPP_
diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp
new file mode 100644
index 00000000000..fa00a0631fd
--- /dev/null
+++ b/include/ginkgo/core/matrix/batch_ell.hpp
@@ -0,0 +1,385 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_MATRIX_BATCH_ELL_HPP_
+#define GKO_PUBLIC_CORE_MATRIX_BATCH_ELL_HPP_
+
+
+#include <initializer_list>
+#include <vector>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/batch_lin_op.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/mtx_io.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/matrix/ell.hpp>
+
+
+namespace gko {
+namespace batch {
+namespace matrix {
+
+
+/**
+ * Ell is a sparse matrix format that stores the same number of nonzeros in each
+ * row, enabling coalesced accesses. It is suitable for sparsity patterns that
+ * have a similar number of nonzeros in every row. The values are stored in a
+ * column-major fashion similar to the monolithic gko::matrix::Ell class.
+ *
+ * Similar to the monolithic gko::matrix::Ell class, invalid_index<IndexType> is
+ * used as the column index for padded zero entries.
+ *
+ * @note It is also assumed that the sparsity pattern of all the items in the
+ * batch is the same and therefore only a single copy of the sparsity pattern is
+ * stored.
+ *
+ * @note Currently only IndexType of int32 is supported.
+ *
+ * @tparam ValueType  value precision of matrix elements
+ * @tparam IndexType  index precision of matrix elements
+ *
+ * @ingroup batch_ell
+ * @ingroup mat_formats
+ * @ingroup BatchLinOp
+ */
+template <typename ValueType = default_precision, typename IndexType = int32>
+class Ell final
+    : public EnableBatchLinOp<Ell<ValueType, IndexType>>,
+      public EnableCreateMethod<Ell<ValueType, IndexType>>,
+      public ConvertibleTo<Ell<next_precision<ValueType>, IndexType>> {
+    friend class EnableCreateMethod<Ell>;
+    friend class EnablePolymorphicObject<Ell, BatchLinOp>;
+    friend class Ell<to_complex<ValueType>, IndexType>;
+    friend class Ell<next_precision<ValueType>, IndexType>;
+    static_assert(std::is_same<IndexType, int32>::value,
+                  "IndexType must be a 32 bit integer");
+
+public:
+    using EnableBatchLinOp<Ell>::convert_to;
+    using EnableBatchLinOp<Ell>::move_to;
+
+    using value_type = ValueType;
+    using index_type = IndexType;
+    using unbatch_type = gko::matrix::Ell<value_type, index_type>;
+    using absolute_type = remove_complex<Ell>;
+    using complex_type = to_complex<Ell>;
+
+    void convert_to(
+        Ell<next_precision<ValueType>, IndexType>* result) const override;
+
+    void move_to(Ell<next_precision<ValueType>, IndexType>* result) override;
+
+    /**
+     * Creates a mutable view (of matrix::Ell type) of one item of the
+     * batch::matrix::Ell<value_type> object. Does not perform any deep
+     * copies, but only returns a view of the data.
+     *
+     * @param item_id  The index of the batch item
+     *
+     * @return  a batch::matrix::Ell object with the data from the batch item
+     * at the given index.
+     */
+    std::unique_ptr<unbatch_type> create_view_for_item(size_type item_id);
+
+    /**
+     * @copydoc create_view_for_item(size_type)
+     */
+    std::unique_ptr<const unbatch_type> create_const_view_for_item(
+        size_type item_id) const;
+
+    /**
+     * Returns a pointer to the array of values of the matrix
+     *
+     * @return the pointer to the array of values
+     */
+    value_type* get_values() noexcept { return values_.get_data(); }
+
+    /**
+     * @copydoc get_values()
+     *
+     * @note This is the constant version of the function, which can be
+     *       significantly more memory efficient than the non-constant version,
+     *       so always prefer this version.
+     */
+    const value_type* get_const_values() const noexcept
+    {
+        return values_.get_const_data();
+    }
+
+    /**
+     * Returns a pointer to the array of column indices of the matrix
+     *
+     * @return the pointer to the array of column indices
+     */
+    index_type* get_col_idxs() noexcept { return col_idxs_.get_data(); }
+
+    /**
+     * @copydoc get_col_idxs()
+     *
+     * @note This is the constant version of the function, which can be
+     *       significantly more memory efficient than the non-constant version,
+     *       so always prefer this version.
+     */
+    const index_type* get_const_col_idxs() const noexcept
+    {
+        return col_idxs_.get_const_data();
+    }
+
+    /**
+     * Returns the number of elements per row explicitly stored.
+     *
+     * @return the number of elements stored in each row of the ELL matrix. Same
+     * for each batch item
+     */
+    index_type get_num_stored_elements_per_row() const noexcept
+    {
+        return num_elems_per_row_;
+    }
+
+    /**
+     * Returns the number of elements explicitly stored in the batch matrix,
+     * cumulative across all the batch items.
+     *
+     * @return the number of elements explicitly stored in the vector,
+     *         cumulative across all the batch items
+     */
+    size_type get_num_stored_elements() const noexcept
+    {
+        return values_.get_num_elems();
+    }
+
+    /**
+     * Returns the number of stored elements in each batch item.
+     *
+     * @return the number of stored elements per batch item.
+     */
+    size_type get_num_elements_per_item() const noexcept
+    {
+        return this->get_num_stored_elements() / this->get_num_batch_items();
+    }
+
+    /**
+     * Returns a pointer to the array of col_idxs of the matrix. This is shared
+     * across all batch items.
+     *
+     * @param batch_id  the id of the batch item.
+     *
+     * @return the pointer to the array of col_idxs
+     */
+    index_type* get_col_idxs_for_item(size_type batch_id) noexcept
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return col_idxs_.get_data();
+    }
+
+    /**
+     * @copydoc get_col_idxs_for_item(size_type)
+     *
+     * @note This is the constant version of the function, which can be
+     *       significantly more memory efficient than the non-constant version,
+     *       so always prefer this version.
+     */
+    const index_type* get_const_col_idxs_for_item(
+        size_type batch_id) const noexcept
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return col_idxs_.get_const_data();
+    }
+
+    /**
+     * Returns a pointer to the array of values of the matrix for a
+     * specific batch item.
+     *
+     * @param batch_id  the id of the batch item.
+     *
+     * @return the pointer to the array of values
+     */
+    value_type* get_values_for_item(size_type batch_id) noexcept
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return values_.get_data() +
+               batch_id * this->get_num_elements_per_item();
+    }
+
+    /**
+     * @copydoc get_values_for_item(size_type)
+     *
+     * @note This is the constant version of the function, which can be
+     *       significantly more memory efficient than the non-constant version,
+     *       so always prefer this version.
+     */
+    const value_type* get_const_values_for_item(
+        size_type batch_id) const noexcept
+    {
+        GKO_ASSERT(batch_id < this->get_num_batch_items());
+        return values_.get_const_data() +
+               batch_id * this->get_num_elements_per_item();
+    }
+
+    /**
+     * Creates a constant (immutable) batch ell matrix from a constant
+     * array. The column indices array needs to be the same for all batch items.
+     *
+     * @param exec  the executor to create the matrix on
+     * @param size  the dimensions of the matrix
+     * @param num_elems_per_row  the number of elements to be stored in each row
+     * @param values  the value array of the matrix
+     * @param col_idxs the col_idxs array of a single batch item of the matrix.
+     *
+     * @return A smart pointer to the constant matrix wrapping the input
+     * array (if it resides on the same executor as the matrix) or a copy of the
+     * array on the correct executor.
+     */
+    static std::unique_ptr<const Ell> create_const(
+        std::shared_ptr<const Executor> exec, const batch_dim<2>& sizes,
+        const index_type num_elems_per_row,
+        gko::detail::const_array_view<value_type>&& values,
+        gko::detail::const_array_view<index_type>&& col_idxs);
+
+    /**
+     * Apply the matrix to a multi-vector. Represents the matrix vector
+     * multiplication, x = A * b, where x and b are both multi-vectors.
+     *
+     * @param b  the multi-vector to be applied to
+     * @param x  the output multi-vector
+     */
+    Ell* apply(ptr_param<const MultiVector<value_type>> b,
+               ptr_param<MultiVector<value_type>> x);
+
+    /**
+     * Apply the matrix to a multi-vector with a linear combination of the given
+     * input vector. Represents the matrix vector multiplication, x = alpha * A
+     * * b + beta * x, where x and b are both multi-vectors.
+     *
+     * @param alpha  the scalar to scale the matrix-vector product with
+     * @param b      the multi-vector to be applied to
+     * @param beta   the scalar to scale the x vector with
+     * @param x      the output multi-vector
+     */
+    Ell* apply(ptr_param<const MultiVector<value_type>> alpha,
+               ptr_param<const MultiVector<value_type>> b,
+               ptr_param<const MultiVector<value_type>> beta,
+               ptr_param<MultiVector<value_type>> x);
+
+    /**
+     * @copydoc apply(const MultiVector<value_type>*, MultiVector<value_type>*)
+     */
+    const Ell* apply(ptr_param<const MultiVector<value_type>> b,
+                     ptr_param<MultiVector<value_type>> x) const;
+
+    /**
+     * @copydoc apply(const MultiVector<value_type>*, const
+     * MultiVector<value_type>*, const MultiVector<value_type>*,
+     * MultiVector<value_type>*)
+     */
+    const Ell* apply(ptr_param<const MultiVector<value_type>> alpha,
+                     ptr_param<const MultiVector<value_type>> b,
+                     ptr_param<const MultiVector<value_type>> beta,
+                     ptr_param<MultiVector<value_type>> x) const;
+
+private:
+    size_type compute_num_elems(const batch_dim<2>& size,
+                                IndexType num_elems_per_row)
+    {
+        return size.get_num_batch_items() * size.get_common_size()[0] *
+               num_elems_per_row;
+    }
+
+    /**
+     * Creates an uninitialized Ell matrix of the specified size.
+     *
+     * @param exec  Executor associated to the matrix
+     * @param size  size of the matrix
+     * @param num_elems_per_row  the number of elements to be stored in each row
+     */
+    Ell(std::shared_ptr<const Executor> exec,
+        const batch_dim<2>& size = batch_dim<2>{},
+        const IndexType num_elems_per_row = 0);
+
+    /**
+     * Creates a Ell matrix from an already allocated (and initialized)
+     * array. The column indices array needs to be the same for all batch items.
+     *
+     * @tparam ValuesArray  type of array of values
+     *
+     * @param exec  Executor associated to the matrix
+     * @param size  size of the matrix
+     * @param num_elems_per_row  the number of elements to be stored in each row
+     * @param values  array of matrix values
+     * @param col_idxs the col_idxs array of a single batch item of the matrix.
+     *
+     * @note If `values` is not an rvalue, not an array of ValueType, or is on
+     *       the wrong executor, an internal copy will be created, and the
+     *       original array data will not be used in the matrix.
+     */
+    template <typename ValuesArray, typename IndicesArray>
+    Ell(std::shared_ptr<const Executor> exec, const batch_dim<2>& size,
+        const IndexType num_elems_per_row, ValuesArray&& values,
+        IndicesArray&& col_idxs)
+        : EnableBatchLinOp<Ell>(exec, size),
+          num_elems_per_row_{num_elems_per_row},
+          values_{exec, std::forward<ValuesArray>(values)},
+          col_idxs_{exec, std::forward<IndicesArray>(col_idxs)}
+    {
+        // Ensure that the value and col_idxs arrays have the correct size
+        auto num_elems = this->get_common_size()[0] * num_elems_per_row *
+                         this->get_num_batch_items();
+        GKO_ASSERT_EQ(num_elems, values_.get_num_elems());
+        GKO_ASSERT_EQ(this->get_num_elements_per_item(),
+                      col_idxs_.get_num_elems());
+    }
+
+    void apply_impl(const MultiVector<value_type>* b,
+                    MultiVector<value_type>* x) const;
+
+    void apply_impl(const MultiVector<value_type>* alpha,
+                    const MultiVector<value_type>* b,
+                    const MultiVector<value_type>* beta,
+                    MultiVector<value_type>* x) const;
+
+    index_type num_elems_per_row_;
+    array<value_type> values_;
+    array<index_type> col_idxs_;
+};
+
+
+}  // namespace matrix
+}  // namespace batch
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_MATRIX_BATCH_ELL_HPP_
diff --git a/include/ginkgo/core/matrix/batch_identity.hpp b/include/ginkgo/core/matrix/batch_identity.hpp
new file mode 100644
index 00000000000..15b7623ac0f
--- /dev/null
+++ b/include/ginkgo/core/matrix/batch_identity.hpp
@@ -0,0 +1,141 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_MATRIX_BATCH_IDENTITY_HPP_
+#define GKO_PUBLIC_CORE_MATRIX_BATCH_IDENTITY_HPP_
+
+
+#include <ginkgo/core/base/batch_lin_op.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/matrix/identity.hpp>
+
+
+namespace gko {
+namespace batch {
+namespace matrix {
+
+
+/**
+ * The batch Identity matrix, which represents a batch of Identity matrices.
+ *
+ * @tparam ValueType  precision of matrix elements
+ *
+ * @ingroup batch_identity
+ * @ingroup mat_formats
+ * @ingroup BatchLinOp
+ */
+template <typename ValueType = default_precision>
+class Identity final : public EnableBatchLinOp<Identity<ValueType>>,
+                       public EnableCreateMethod<Identity<ValueType>> {
+    friend class EnableCreateMethod<Identity>;
+    friend class EnablePolymorphicObject<Identity, BatchLinOp>;
+
+public:
+    using EnableBatchLinOp<Identity>::convert_to;
+    using EnableBatchLinOp<Identity>::move_to;
+
+    using value_type = ValueType;
+    using index_type = int32;
+    using unbatch_type = gko::matrix::Identity<ValueType>;
+    using absolute_type = remove_complex<Identity>;
+    using complex_type = to_complex<Identity>;
+
+    /**
+     * Apply the matrix to a multi-vector. Represents the matrix vector
+     * multiplication, x = I * b, where x and b are both multi-vectors.
+     *
+     * @param b  the multi-vector to be applied to
+     * @param x  the output multi-vector
+     */
+    Identity* apply(ptr_param<const MultiVector<value_type>> b,
+                    ptr_param<MultiVector<value_type>> x);
+
+    /**
+     * Apply the matrix to a multi-vector with a linear combination of the given
+     * input vector. Represents the matrix vector multiplication, x = alpha * I
+     * * b + beta * x, where x and b are both multi-vectors.
+     *
+     * @param alpha  the scalar to scale the matrix-vector product with
+     * @param b      the multi-vector to be applied to
+     * @param beta   the scalar to scale the x vector with
+     * @param x      the output multi-vector
+     */
+    Identity* apply(ptr_param<const MultiVector<value_type>> alpha,
+                    ptr_param<const MultiVector<value_type>> b,
+                    ptr_param<const MultiVector<value_type>> beta,
+                    ptr_param<MultiVector<value_type>> x);
+
+    /**
+     * @copydoc apply(const MultiVector<value_type>*, MultiVector<value_type>*)
+     */
+    const Identity* apply(ptr_param<const MultiVector<value_type>> b,
+                          ptr_param<MultiVector<value_type>> x) const;
+
+    /**
+     * @copydoc apply(const MultiVector<value_type>*, const
+     * MultiVector<value_type>*, const MultiVector<value_type>*,
+     * MultiVector<value_type>*)
+     */
+    const Identity* apply(ptr_param<const MultiVector<value_type>> alpha,
+                          ptr_param<const MultiVector<value_type>> b,
+                          ptr_param<const MultiVector<value_type>> beta,
+                          ptr_param<MultiVector<value_type>> x) const;
+
+private:
+    /**
+     * Creates an Identity matrix of the specified size.
+     *
+     * @param exec  Executor associated to the matrix
+     * @param size  size of the batch matrices in a batch_dim object
+     */
+    Identity(std::shared_ptr<const Executor> exec,
+             const batch_dim<2>& size = batch_dim<2>{});
+
+    void apply_impl(const MultiVector<value_type>* b,
+                    MultiVector<value_type>* x) const;
+
+    void apply_impl(const MultiVector<value_type>* alpha,
+                    const MultiVector<value_type>* b,
+                    const MultiVector<value_type>* beta,
+                    MultiVector<value_type>* x) const;
+};
+
+
+}  // namespace matrix
+}  // namespace batch
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_MATRIX_BATCH_IDENTITY_HPP_
diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp
index 9ccd02d48db..15662294607 100644
--- a/include/ginkgo/core/matrix/coo.hpp
+++ b/include/ginkgo/core/matrix/coo.hpp
@@ -63,7 +63,7 @@ class Hybrid;
 /**
  * COO stores a matrix in the coordinate matrix format.
  *
- * The nonzero elements are stored in an array row-wise (but not neccessarily
+ * The nonzero elements are stored in an array row-wise (but not necessarily
  * sorted by column index within a row). Two extra arrays contain the row and
  * column indexes of each nonzero element of the matrix.
  *
diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp
index 611e5d33c64..63ee74d6dcc 100644
--- a/include/ginkgo/core/matrix/csr.hpp
+++ b/include/ginkgo/core/matrix/csr.hpp
@@ -38,6 +38,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/index_set.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
+#include <ginkgo/core/matrix/scaled_permutation.hpp>
 
 
 namespace gko {
@@ -763,6 +765,74 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
 
     std::unique_ptr<LinOp> conj_transpose() const override;
 
+    /**
+     * Creates a permuted copy $A'$ of this matrix $A$ with the given
+     * permutation $P$. By default, this computes a symmetric permutation
+     * (permute_mode::symmetric). For the effect of the different permutation
+     * modes, see @ref permute_mode
+     *
+     * @param permutation  The input permutation.
+     * @param mode  The permutation mode. If permute_mode::inverse is set, we
+     *              use the inverse permutation $P^{-1}$ instead of $P$.
+     *              If permute_mode::rows is set, the rows will be permuted.
+     *              If permute_mode::columns is set, the columns will be
+     *              permuted.
+     * @return  The permuted matrix.
+     */
+    std::unique_ptr<Csr> permute(
+        ptr_param<const Permutation<index_type>> permutation,
+        permute_mode mode = permute_mode::symmetric) const;
+
+    /**
+     * Creates a non-symmetrically permuted copy $A'$ of this matrix $A$ with
+     * the given row and column permutations $P$ and $Q$. The operation will
+     * compute $A'(i, j) = A(p[i], q[j])$, or $A' = P A Q^T$ if `invert` is
+     * `false`, and $A'(p[i], q[j]) = A(i,j)$, or $A' = P^{-1} A Q^{-T}$ if
+     * `invert` is `true`.
+     *
+     * @param row_permutation  The permutation $P$ to apply to the rows
+     * @param column_permutation  The permutation $Q$ to apply to the columns
+     * @param invert  If set to `false`, uses the input permutations, otherwise
+     *                uses their inverses $P^{-1}, Q^{-1}$
+     * @return  The permuted matrix.
+     */
+    std::unique_ptr<Csr> permute(
+        ptr_param<const Permutation<index_type>> row_permutation,
+        ptr_param<const Permutation<index_type>> column_permutation,
+        bool invert = false) const;
+
+    /**
+     * Creates a scaled and permuted copy of this matrix.
+     * For an explanation of the permutation modes, see
+     * @ref permute(ptr_param<const Permutation<index_type>>, permute_mode)
+     *
+     * @param permutation  The scaled permutation.
+     * @param mode  The permutation mode.
+     * @return The permuted matrix.
+     */
+    std::unique_ptr<Csr> scale_permute(
+        ptr_param<const ScaledPermutation<value_type, index_type>> permutation,
+        permute_mode = permute_mode::symmetric) const;
+
+    /**
+     * Creates a scaled and permuted copy of this matrix.
+     * For an explanation of the parameters, see
+     * @ref permute(ptr_param<const Permutation<index_type>>, ptr_param<const
+     * Permutation<index_type>>, permute_mode)
+     *
+     * @param row_permutation  The scaled row permutation.
+     * @param column_permutation  The scaled column permutation.
+     * @param invert  If set to `false`, uses the input permutations, otherwise
+     *                uses their inverses $P^{-1}, Q^{-1}$
+     * @return The permuted matrix.
+     */
+    std::unique_ptr<Csr> scale_permute(
+        ptr_param<const ScaledPermutation<value_type, index_type>>
+            row_permutation,
+        ptr_param<const ScaledPermutation<value_type, index_type>>
+            column_permutation,
+        bool invert = false) const;
+
     std::unique_ptr<LinOp> permute(
         const array<IndexType>* permutation_indices) const override;
 
diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp
index a1e08d38c65..7d68f45f063 100644
--- a/include/ginkgo/core/matrix/dense.hpp
+++ b/include/ginkgo/core/matrix/dense.hpp
@@ -45,6 +45,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/range_accessors.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
+#include <ginkgo/core/matrix/scaled_permutation.hpp>
 
 
 namespace gko {
@@ -401,6 +403,181 @@ class Dense
      */
     void fill(const ValueType value);
 
+    /**
+     * Creates a permuted copy $A'$ of this matrix $A$ with the given
+     * permutation $P$. By default, this computes a symmetric permutation
+     * (permute_mode::symmetric). For the effect of the different permutation
+     * modes, see @ref permute_mode.
+     *
+     * @param permutation  The input permutation.
+     * @param mode  The permutation mode. If permute_mode::inverse is set, we
+     *              use the inverse permutation $P^{-1}$ instead of $P$.
+     *              If permute_mode::rows is set, the rows will be permuted.
+     *              If permute_mode::columns is set, the columns will be
+     *              permuted.
+     * @return  The permuted matrix.
+     */
+    std::unique_ptr<Dense> permute(
+        ptr_param<const Permutation<int32>> permutation,
+        permute_mode mode = permute_mode::symmetric) const;
+
+    /**
+     * @copydoc permute(ptr_param<const Permutation<int32>>, permute_mode)
+     */
+    std::unique_ptr<Dense> permute(
+        ptr_param<const Permutation<int64>> permutation,
+        permute_mode mode = permute_mode::symmetric) const;
+
+    /**
+     * Overload of permute(ptr_param<const Permutation<int32>>, permute_mode)
+     * that writes the permuted copy into an existing Dense matrix.
+     * @param output  the output matrix.
+     */
+    void permute(ptr_param<const Permutation<int32>> permutation,
+                 ptr_param<Dense> output, permute_mode mode) const;
+
+    /**
+     * @copydoc permute(ptr_param<const Permutation<int32>>, ptr_param<Dense>,
+     * permute_mode)
+     */
+    void permute(ptr_param<const Permutation<int64>> permutation,
+                 ptr_param<Dense> output, permute_mode mode) const;
+
+    /**
+     * Creates a non-symmetrically permuted copy $A'$ of this matrix $A$ with
+     * the given row and column permutations $P$ and $Q$. The operation will
+     * compute $A'(i, j) = A(p[i], q[j])$, or $A' = P A Q^T$ if `invert` is
+     * `false`, and $A'(p[i], q[j]) = A(i,j)$, or $A' = P^{-1} A Q^{-T}$ if
+     * `invert` is `true`.
+     *
+     * @param row_permutation  The permutation $P$ to apply to the rows
+     * @param column_permutation  The permutation $Q$ to apply to the columns
+     * @param invert  If set to `false`, uses the input permutations, otherwise
+     *                uses their inverses $P^{-1}, Q^{-1}$
+     * @return  The permuted matrix.
+     */
+    std::unique_ptr<Dense> permute(
+        ptr_param<const Permutation<int32>> row_permutation,
+        ptr_param<const Permutation<int32>> column_permutation,
+        bool invert = false) const;
+
+    /**
+     * @copydoc permute(ptr_param<const Permutation<int32>>, ptr_param<const
+     * Permutation<int32>>, permute_mode)
+     */
+    std::unique_ptr<Dense> permute(
+        ptr_param<const Permutation<int64>> row_permutation,
+        ptr_param<const Permutation<int64>> column_permutation,
+        bool invert = false) const;
+
+    /**
+     * Overload of permute(ptr_param<const Permutation<int32>>, ptr_param<const
+     * Permutation<int32>>, permute_mode) that writes the permuted copy into an
+     * existing Dense matrix.
+     * @param output  the output matrix.
+     */
+    void permute(ptr_param<const Permutation<int32>> row_permutation,
+                 ptr_param<const Permutation<int32>> column_permutation,
+                 ptr_param<Dense> output, bool invert = false) const;
+
+    /**
+     * @copydoc permute(ptr_param<const Permutation<int32>>, ptr_param<const
+     * Permutation<int32>>, ptr_param<Dense>, permute_mode)
+     */
+    void permute(ptr_param<const Permutation<int64>> row_permutation,
+                 ptr_param<const Permutation<int64>> column_permutation,
+                 ptr_param<Dense> output, bool invert = false) const;
+
+    /**
+     * Creates a scaled and permuted copy of this matrix.
+     * For an explanation of the permutation modes, see
+     * @ref permute(ptr_param<const Permutation<index_type>>, permute_mode)
+     *
+     * @param permutation  The scaled permutation.
+     * @param mode  The permutation mode.
+     * @return The permuted matrix.
+     */
+    std::unique_ptr<Dense> scale_permute(
+        ptr_param<const ScaledPermutation<value_type, int32>> permutation,
+        permute_mode mode = permute_mode::symmetric) const;
+
+    /**
+     * @copydoc scale_permute(ptr_param<const ScaledPermutation<value_type,
+     * int32>>, permute_mode)
+     */
+    std::unique_ptr<Dense> scale_permute(
+        ptr_param<const ScaledPermutation<value_type, int64>> permutation,
+        permute_mode mode = permute_mode::symmetric) const;
+
+    /**
+     * Overload of scale_permute(ptr_param<const ScaledPermutation<value_type,
+     * int32>>, permute_mode) that writes the permuted copy into an
+     * existing Dense matrix.
+     * @param output  the output matrix.
+     */
+    void scale_permute(
+        ptr_param<const ScaledPermutation<value_type, int32>> permutation,
+        ptr_param<Dense> output, permute_mode mode) const;
+
+    /**
+     * @copydoc scale_permute(ptr_param<const ScaledPermutation<value_type,
+     * int32>>, ptr_param<Dense>, permute_mode)
+     */
+    void scale_permute(
+        ptr_param<const ScaledPermutation<value_type, int64>> permutation,
+        ptr_param<Dense> output, permute_mode mode) const;
+
+    /**
+     * Creates a scaled and permuted copy of this matrix.
+     * For an explanation of the parameters, see
+     * @ref permute(ptr_param<const Permutation<index_type>>, ptr_param<const
+     * Permutation<index_type>>, permute_mode)
+     *
+     * @param row_permutation  The scaled row permutation.
+     * @param column_permutation  The scaled column permutation.
+     * @param invert  If set to `false`, uses the input permutations, otherwise
+     *                uses their inverses $P^{-1}, Q^{-1}$
+     * @return The permuted matrix.
+     */
+    std::unique_ptr<Dense> scale_permute(
+        ptr_param<const ScaledPermutation<value_type, int32>> row_permutation,
+        ptr_param<const ScaledPermutation<value_type, int32>>
+            column_permutation,
+        bool invert = false) const;
+
+    /**
+     * @copydoc scale_permute(ptr_param<const ScaledPermutation<value_type,
+     * int32>>, ptr_param<const ScaledPermutation<value_type, int32>>, bool)
+     */
+    std::unique_ptr<Dense> scale_permute(
+        ptr_param<const ScaledPermutation<value_type, int64>> row_permutation,
+        ptr_param<const ScaledPermutation<value_type, int64>>
+            column_permutation,
+        bool invert = false) const;
+
+    /**
+     * Overload of scale_permute(ptr_param<const ScaledPermutation<value_type,
+     * int32>>, ptr_param<const ScaledPermutation<value_type, int32>>, bool)
+     * that writes the permuted copy into an existing Dense matrix.
+     * @param output  the output matrix.
+     */
+    void scale_permute(
+        ptr_param<const ScaledPermutation<value_type, int32>> row_permutation,
+        ptr_param<const ScaledPermutation<value_type, int32>>
+            column_permutation,
+        ptr_param<Dense> output, bool invert = false) const;
+
+    /**
+     * @copydoc scale_permute(ptr_param<const ScaledPermutation<value_type,
+     * int32>>, ptr_param<const ScaledPermutation<value_type, int32>>,
+     * ptr_param<Dense>, bool)
+     */
+    void scale_permute(
+        ptr_param<const ScaledPermutation<value_type, int64>> row_permutation,
+        ptr_param<const ScaledPermutation<value_type, int64>>
+            column_permutation,
+        ptr_param<Dense> output, bool invert = false) const;
+
     std::unique_ptr<LinOp> permute(
         const array<int32>* permutation_indices) const override;
 
@@ -792,7 +969,7 @@ class Dense
     void add_scaled(ptr_param<const LinOp> alpha, ptr_param<const LinOp> b);
 
     /**
-     * Subtracts `b` scaled by `alpha` fron the matrix (aka: BLAS axpy).
+     * Subtracts `b` scaled by `alpha` from the matrix (aka: BLAS axpy).
      *
      * @param alpha  If alpha is 1x1 Dense matrix, b is scaled
      *               by alpha. If it is a Dense row vector of values,
@@ -853,7 +1030,7 @@ class Dense
                           array<char>& tmp) const;
 
     /**
-     * Computes the column-wise Euclidian (L^2) norm of this matrix.
+     * Computes the column-wise Euclidean (L^2) norm of this matrix.
      *
      * @param result  a Dense row vector, used to store the norm
      *                (the number of columns in the vector must match the number
@@ -862,7 +1039,7 @@ class Dense
     void compute_norm2(ptr_param<LinOp> result) const;
 
     /**
-     * Computes the column-wise Euclidian (L^2) norm of this matrix.
+     * Computes the column-wise Euclidean (L^2) norm of this matrix.
      *
      * @param result  a Dense row vector, used to store the norm
      *                (the number of columns in the vector must match the
@@ -895,7 +1072,7 @@ class Dense
     void compute_norm1(ptr_param<LinOp> result, array<char>& tmp) const;
 
     /**
-     * Computes the square of the column-wise Euclidian (L^2) norm of this
+     * Computes the square of the column-wise Euclidean (L^2) norm of this
      * matrix.
      *
      * @param result  a Dense row vector, used to store the norm
@@ -905,7 +1082,7 @@ class Dense
     void compute_squared_norm2(ptr_param<LinOp> result) const;
 
     /**
-     * Computes the square of the column-wise Euclidian (L^2) norm of this
+     * Computes the square of the column-wise Euclidean (L^2) norm of this
      * matrix.
      *
      * @param result  a Dense row vector, used to store the norm
@@ -917,6 +1094,27 @@ class Dense
      */
     void compute_squared_norm2(ptr_param<LinOp> result, array<char>& tmp) const;
 
+    /**
+     * Computes the column-wise arithmetic mean of this matrix.
+     *
+     * @param result  a Dense row vector, used to store the mean
+     *                (the number of columns in the vector must match the number
+     *                of columns of this)
+     */
+    void compute_mean(ptr_param<LinOp> result) const;
+
+    /**
+     * Computes the column-wise arithmetic mean of this matrix.
+     *
+     * @param result  a Dense row vector, used to store the mean
+     *                (the number of columns in the vector must match the
+     *                number of columns of this)
+     * @param tmp  the temporary storage to use for partial sums during the
+     *             reduction computation. It may be resized and/or reset to the
+     *             correct executor.
+     */
+    void compute_mean(ptr_param<LinOp> result, array<char>& tmp) const;
+
     /**
      * Create a submatrix from the original matrix.
      * Warning: defining stride for this create_submatrix method might cause
@@ -1215,6 +1413,11 @@ class Dense
      */
     virtual void compute_squared_norm2_impl(LinOp* result) const;
 
+    /**
+     * @copydoc compute_mean(LinOp*) const
+     */
+    virtual void compute_mean_impl(LinOp* result) const;
+
     /**
      * Resizes the matrix to the given size.
      *
@@ -1252,19 +1455,24 @@ class Dense
     }
 
     template <typename IndexType>
-    void permute_impl(const array<IndexType>* permutation, Dense* output) const;
+    void permute_impl(const Permutation<IndexType>* permutation,
+                      permute_mode mode, Dense* output) const;
 
     template <typename IndexType>
-    void inverse_permute_impl(const array<IndexType>* permutation,
-                              Dense* output) const;
+    void permute_impl(const Permutation<IndexType>* row_permutation,
+                      const Permutation<IndexType>* col_permutation,
+                      bool invert, Dense* output) const;
 
     template <typename IndexType>
-    void row_permute_impl(const array<IndexType>* permutation,
-                          Dense* output) const;
+    void scale_permute_impl(
+        const ScaledPermutation<ValueType, IndexType>* permutation,
+        permute_mode mode, Dense* output) const;
 
     template <typename IndexType>
-    void inverse_row_permute_impl(const array<IndexType>* permutation,
-                                  Dense* output) const;
+    void scale_permute_impl(
+        const ScaledPermutation<ValueType, IndexType>* row_permutation,
+        const ScaledPermutation<ValueType, IndexType>* column_permutation,
+        bool invert, Dense* output) const;
 
     template <typename OutputType, typename IndexType>
     void row_gather_impl(const array<IndexType>* row_idxs,
@@ -1276,14 +1484,6 @@ class Dense
                          const Dense<ValueType>* beta,
                          Dense<OutputType>* row_collection) const;
 
-    template <typename IndexType>
-    void column_permute_impl(const array<IndexType>* permutation,
-                             Dense* output) const;
-
-    template <typename IndexType>
-    void inverse_column_permute_impl(const array<IndexType>* permutation,
-                                     Dense* output) const;
-
 private:
     array<value_type> values_;
     size_type stride_;
diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp
index a923e7b9079..db65b57b6fb 100644
--- a/include/ginkgo/core/matrix/hybrid.hpp
+++ b/include/ginkgo/core/matrix/hybrid.hpp
@@ -279,7 +279,7 @@ class Hybrid
         /**
          * Get the percent setting
          *
-         * @retrun percent
+         * @return percent
          */
         auto get_percentage() const { return percent_; }
 
@@ -314,14 +314,14 @@ class Hybrid
         /**
          * Get the percent setting
          *
-         * @retrun percent
+         * @return percent
          */
         auto get_percentage() const { return strategy_.get_percentage(); }
 
         /**
          * Get the ratio setting
          *
-         * @retrun ratio
+         * @return ratio
          */
         auto get_ratio() const { return ratio_; }
 
@@ -356,7 +356,7 @@ class Hybrid
         /**
          * Get the percent setting
          *
-         * @retrun percent
+         * @return percent
          */
         auto get_percentage() const { return strategy_.get_percentage(); }
 
diff --git a/include/ginkgo/core/matrix/permutation.hpp b/include/ginkgo/core/matrix/permutation.hpp
index 163160a2af6..1b61249dcc4 100644
--- a/include/ginkgo/core/matrix/permutation.hpp
+++ b/include/ginkgo/core/matrix/permutation.hpp
@@ -52,36 +52,100 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace gko {
 namespace matrix {
 
-/** @internal std::bitset allows to store any number of bits */
+
+/**
+ * Specifies how a permutation will be applied to a matrix.
+ * For the effect of the different permutation
+ * modes, see the following table.
+ *
+ * mode              | entry mapping              | matrix representation
+ * ------------------|----------------------------|----------------------
+ * none              | $A'(i, j) = A(i, j)$       | $A' = A$
+ * rows              | $A'(i, j) = A(p[i], j)$    | $A' = P A$
+ * columns           | $A'(i, j) = A(i, p[j])$    | $A' = A P^T$
+ * inverse_rows      | $A'(p[i], j) = A(i, j)$    | $A' = P^{-1} A$
+ * inverse_columns   | $A'(i, p[j]) = A(i, j)$    | $A' = A P^{-T}$
+ * symmetric         | $A'(i, j) = A(p[i], p[j])$ | $A' = P A P^T$
+ * inverse_symmetric | $A'(p[i], p[j]) = A(i, j)$ | $A' = P^{-1} A P^{-T}$
+ */
+enum class permute_mode : unsigned {
+    /** Neither rows nor columns will be permuted. */
+    none = 0b000u,
+    /** The rows will be permuted. */
+    rows = 0b001u,
+    /** The columns will be permuted. */
+    columns = 0b010u,
+    /**
+     * The rows and columns will be permuted. This is equivalent to
+     * `permute_mode::rows | permute_mode::columns`.
+     */
+    symmetric = 0b011u,
+    /** The permutation will be inverted before being applied. */
+    inverse = 0b100u,
+    /**
+     * The rows will be permuted using the inverse permutation. This is
+     * equivalent to `permute_mode::rows | permute_mode::inverse`.
+     */
+    inverse_rows = 0b101u,
+    /**
+     * The columns will be permuted using the inverse permutation. This is
+     * equivalent to `permute_mode::columns | permute_mode::inverse`.
+     */
+    inverse_columns = 0b110u,
+    /**
+     * The rows and columns will be permuted using the inverse permutation. This
+     * is equivalent to `permute_mode::symmetric | permute_mode::inverse`.
+     */
+    inverse_symmetric = 0b111u
+};
+
+
+/** Combines two permutation modes. */
+permute_mode operator|(permute_mode a, permute_mode b);
+
+
+/** Computes the intersection of two permutation modes. */
+permute_mode operator&(permute_mode a, permute_mode b);
+
+
+/** Computes the symmetric difference of two permutation modes. */
+permute_mode operator^(permute_mode a, permute_mode b);
+
+
+/** Prints a permutation mode. */
+std::ostream& operator<<(std::ostream& stream, permute_mode mode);
+
+
 using mask_type = gko::uint64;
 
 static constexpr mask_type row_permute = mask_type{1};
+GKO_DEPRECATED("permute mask is no longer supported")
 static constexpr mask_type column_permute = mask_type{1 << 2};
+GKO_DEPRECATED("permute mask is no longer supported")
 static constexpr mask_type inverse_permute = mask_type{1 << 3};
 
 /**
- * Permutation is a matrix "format" which stores the row and column permutation
- * arrays which can be used for re-ordering the rows and columns a matrix.
+ * Permutation is a matrix format that represents a permutation matrix,
+ * i.e. a matrix where each row and column has exactly one entry.
+ * The matrix can only be applied to Dense inputs, where it represents
+ * a row permutation: $A' = PA$ means $A'(i, j) = A(p[i], j)$.
  *
  * @tparam IndexType  precision of permutation array indices.
  *
- * @note This format is used mainly to allow for an abstraction of the
- * permutation/re-ordering and provides the user with an apply method which
- * calls the respective LinOp's permute operation if the respective LinOp
- * implements the Permutable interface. As such it only stores an array of the
- * permutation indices.
- *
  * @ingroup permutation
  * @ingroup mat_formats
  * @ingroup LinOp
  */
 template <typename IndexType = int32>
 class Permutation : public EnableLinOp<Permutation<IndexType>>,
-                    public EnableCreateMethod<Permutation<IndexType>> {
+                    public EnableCreateMethod<Permutation<IndexType>>,
+                    public WritableToMatrixData<default_precision, IndexType> {
     friend class EnableCreateMethod<Permutation>;
     friend class EnablePolymorphicObject<Permutation, LinOp>;
 
 public:
+    // value_type is only available to enable the usage of gko::write
+    using value_type = default_precision;
     using index_type = IndexType;
 
     /**
@@ -110,27 +174,38 @@ class Permutation : public EnableLinOp<Permutation<IndexType>>,
      * @return the number of elements explicitly stored in the permutation
      * array.
      */
-    size_type get_permutation_size() const noexcept
-    {
-        return permutation_.get_num_elems();
-    }
+    GKO_DEPRECATED("use get_size()[0] instead")
+    size_type get_permutation_size() const noexcept;
+
+    GKO_DEPRECATED("permute mask is no longer supported")
+    mask_type get_permute_mask() const;
+
+    GKO_DEPRECATED("permute mask is no longer supported")
+    void set_permute_mask(mask_type permute_mask);
 
     /**
-     * Get the permute masks
+     * Returns the inverse permutation.
      *
-     * @return  permute_mask the permute masks
+     * @return a newly created Permutation object storing the inverse
+     *         permutation of this Permutation.
      */
-    mask_type get_permute_mask() const { return enabled_permute_; }
+    std::unique_ptr<Permutation> compute_inverse() const;
 
     /**
-     * Set the permute masks
+     * Composes this permutation with another permutation.
+     * The resulting permutation fulfills `result[i] = this[other[i]]`
+     * or `result = other * this` from the matrix perspective, which is
+     * equivalent to first permuting by `this` and then by `other`:
+     * Combining permutations $P_1$ and $P_2$ with `P = P_1.combine(P_2)`
+     * performs the operation permute(A, P) = permute(permute(A, P_1), P_2).
      *
-     * @param permute_mask the permute masks
+     * @param other  the other permutation
+     * @return the combined permutation
      */
-    void set_permute_mask(mask_type permute_mask)
-    {
-        enabled_permute_ = permute_mask;
-    }
+    std::unique_ptr<Permutation> compose(
+        ptr_param<const Permutation> other) const;
+
+    void write(gko::matrix_data<value_type, index_type>& data) const override;
 
     /**
      * Creates a constant (immutable) Permutation matrix from a constant array.
@@ -143,50 +218,38 @@ class Permutation : public EnableLinOp<Permutation<IndexType>>,
      *          (if it resides on the same executor as the matrix) or a copy of
      *          the array on the correct executor.
      */
+    GKO_DEPRECATED("use create_const without size and permute mask")
     static std::unique_ptr<const Permutation> create_const(
         std::shared_ptr<const Executor> exec, size_type size,
         gko::detail::const_array_view<IndexType>&& perm_idxs,
-        mask_type enabled_permute = row_permute)
-    {
-        // cast const-ness away, but return a const object afterwards,
-        // so we can ensure that no modifications take place.
-        return std::unique_ptr<const Permutation>(new Permutation{
-            exec, size, gko::detail::array_const_cast(std::move(perm_idxs)),
-            enabled_permute});
-    }
-
-protected:
+        mask_type enabled_permute = row_permute);
     /**
-     * Creates an uninitialized Permutation arrays on the specified executor.
+     * Creates a constant (immutable) Permutation matrix from a constant array.
      *
-     * @param exec  Executor associated to the LinOp
+     * @param exec  the executor to create the matrix on
+     * @param size  the size of the square matrix
+     * @param perm_idxs  the permutation index array of the matrix
+     * @param enabled_permute  the mask describing the type of permutation
+     * @returns A smart pointer to the constant matrix wrapping the input array
+     *          (if it resides on the same executor as the matrix) or a copy of
+     *          the array on the correct executor.
      */
-    Permutation(std::shared_ptr<const Executor> exec)
-        : Permutation(std::move(exec), dim<2>{})
-    {}
+    static std::unique_ptr<const Permutation> create_const(
+        std::shared_ptr<const Executor> exec,
+        gko::detail::const_array_view<IndexType>&& perm_idxs);
 
+protected:
     /**
-     * Creates uninitialized Permutation arrays of the specified size.
+     * Creates an uninitialized Permutation arrays on the specified executor.
      *
-     * @param exec  Executor associated to the matrix
-     * @param size  size of the permutable matrix
-     * @param enabled_permute  mask for the type of permutation to apply.
+     * @param exec  Executor associated to the LinOp
      */
-    Permutation(std::shared_ptr<const Executor> exec, const dim<2>& size,
-                const mask_type& enabled_permute = row_permute)
-        : EnableLinOp<Permutation>(exec, size),
-          permutation_(exec, size[0]),
-          row_size_(size[0]),
-          col_size_(size[1]),
-          enabled_permute_(enabled_permute)
-    {}
+    Permutation(std::shared_ptr<const Executor> exec, size_type = 0);
 
     /**
      * Creates a Permutation matrix from an already allocated (and initialized)
      * row and column permutation arrays.
      *
-     * @tparam IndicesArray  type of array of indices
-     *
      * @param exec  Executor associated to the matrix
      * @param size  size of the permutation array.
      * @param permutation_indices array of permutation array
@@ -196,71 +259,50 @@ class Permutation : public EnableLinOp<Permutation<IndexType>>,
      * IndexType, or is on the wrong executor, an internal copy will be created,
      * and the original array data will not be used in the matrix.
      */
+    Permutation(std::shared_ptr<const Executor> exec,
+                array<IndexType> permutation_indices);
+
+    GKO_DEPRECATED(
+        "dim<2> is no longer supported as a dimension parameter, use size_type "
+        "instead")
+    Permutation(std::shared_ptr<const Executor> exec, const dim<2>& size);
+
+    GKO_DEPRECATED("permute mask is no longer supported")
+    Permutation(std::shared_ptr<const Executor> exec, const dim<2>& size,
+                const mask_type& enabled_permute);
+
     template <typename IndicesArray>
+    GKO_DEPRECATED("use the overload without dimensions")
     Permutation(std::shared_ptr<const Executor> exec, const dim<2>& size,
-                IndicesArray&& permutation_indices,
-                const mask_type& enabled_permute = row_permute)
-        : EnableLinOp<Permutation>(exec, size),
-          permutation_{exec, std::forward<IndicesArray>(permutation_indices)},
-          row_size_(size[0]),
-          col_size_(size[1]),
-          enabled_permute_(enabled_permute)
+                IndicesArray&& permutation_indices)
+        : Permutation{exec, array<IndexType>{exec, std::forward<IndicesArray>(
+                                                       permutation_indices)}}
     {
-        if (enabled_permute_ & row_permute) {
-            GKO_ASSERT_EQ(size[0], permutation_.get_num_elems());
-        }
-        if (enabled_permute_ & column_permute) {
-            GKO_ASSERT_EQ(size[1], permutation_.get_num_elems());
-        }
+        GKO_ASSERT_EQ(size[0], permutation_.get_num_elems());
+        GKO_ASSERT_IS_SQUARE_MATRIX(size);
     }
 
-    void apply_impl(const LinOp* in, LinOp* out) const
+    template <typename IndicesArray>
+    GKO_DEPRECATED("permute mask is no longer supported")
+    Permutation(std::shared_ptr<const Executor> exec, const dim<2>& size,
+                IndicesArray&& permutation_indices,
+                const mask_type& enabled_permute)
+        : Permutation{std::move(exec),
+                      array<IndexType>{exec, std::forward<IndicesArray>(
+                                                 permutation_indices)}}
     {
-        auto perm = as<Permutable<index_type>>(in);
-        std::unique_ptr<gko::LinOp> tmp{};
-        if (enabled_permute_ & inverse_permute) {
-            if (enabled_permute_ & row_permute) {
-                tmp = perm->inverse_row_permute(&permutation_);
-            }
-            if (enabled_permute_ & column_permute) {
-                if (enabled_permute_ & row_permute) {
-                    tmp = as<Permutable<index_type>>(tmp.get())
-                              ->inverse_column_permute(&permutation_);
-                } else {
-                    tmp = perm->inverse_column_permute(&permutation_);
-                }
-            }
-        } else {
-            if (enabled_permute_ & row_permute) {
-                tmp = perm->row_permute(&permutation_);
-            }
-            if (enabled_permute_ & column_permute) {
-                if (enabled_permute_ & row_permute) {
-                    tmp = as<Permutable<index_type>>(tmp.get())->column_permute(
-                        &permutation_);
-                } else {
-                    tmp = perm->column_permute(&permutation_);
-                }
-            }
-        }
-        out->move_from(tmp);
+        GKO_ASSERT_EQ(enabled_permute, row_permute);
+        GKO_ASSERT_EQ(size[0], permutation_.get_num_elems());
+        GKO_ASSERT_IS_SQUARE_MATRIX(size);
     }
 
+    void apply_impl(const LinOp* in, LinOp* out) const override;
 
     void apply_impl(const LinOp*, const LinOp* in, const LinOp*,
-                    LinOp* out) const
-    {
-        // Ignores alpha and beta and just performs a normal permutation as an
-        // advanced apply does not really make sense here.
-        this->apply_impl(in, out);
-    }
-
+                    LinOp* out) const override;
 
 private:
     array<index_type> permutation_;
-    size_type row_size_;
-    size_type col_size_;
-    mask_type enabled_permute_;
 };
 
 
diff --git a/include/ginkgo/core/matrix/scaled_permutation.hpp b/include/ginkgo/core/matrix/scaled_permutation.hpp
new file mode 100644
index 00000000000..d4a4687feb3
--- /dev/null
+++ b/include/ginkgo/core/matrix/scaled_permutation.hpp
@@ -0,0 +1,202 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_MATRIX_SCALED_PERMUTATION_HPP_
+#define GKO_PUBLIC_CORE_MATRIX_SCALED_PERMUTATION_HPP_
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
+
+
+namespace gko {
+namespace matrix {
+
+
+/**
+ * ScaledPermutation is a matrix combining a permutation with scaling factors.
+ * It is a combination of Diagonal and Permutation, and can be read as
+ * $SP = P \cdot S$, i.e. the scaling gets applied before the permutation.
+ *
+ * @tparam IndexType  index type of permutation indices
+ * @tparam ValueType  value type of the scaling factors
+ *
+ * @ingroup permutation
+ * @ingroup mat_formats
+ * @ingroup LinOp
+ */
+template <typename ValueType = default_precision, typename IndexType = int32>
+class ScaledPermutation final
+    : public EnableLinOp<ScaledPermutation<ValueType, IndexType>>,
+      public WritableToMatrixData<ValueType, IndexType> {
+    friend class EnablePolymorphicObject<ScaledPermutation, LinOp>;
+
+public:
+    using value_type = ValueType;
+    using index_type = IndexType;
+
+    /**
+     * Returns a pointer to the scaling factors.
+     *
+     * @return the pointer to the scaling factors.
+     */
+    value_type* get_scaling_factors() noexcept { return scale_.get_data(); }
+
+    /**
+     * @copydoc get_scaling_factors()
+     *
+     * @note This is the constant version of the function, which can be
+     *       significantly more memory efficient than the non-constant version,
+     *       so always prefer this version.
+     */
+    const value_type* get_const_scaling_factors() const noexcept
+    {
+        return scale_.get_const_data();
+    }
+
+    /**
+     * Returns a pointer to the permutation indices.
+     *
+     * @return the pointer to the permutation indices.
+     */
+    index_type* get_permutation() noexcept { return permutation_.get_data(); }
+
+    /**
+     * @copydoc get_permutation()
+     *
+     * @note This is the constant version of the function, which can be
+     *       significantly more memory efficient than the non-constant version,
+     *       so always prefer this version.
+     */
+    const index_type* get_const_permutation() const noexcept
+    {
+        return permutation_.get_const_data();
+    }
+
+    /**
+     * Returns the inverse of this operator as a scaled permutation.
+     * It is computed via $(P S)^-1 = P^{-1} (P S P^{-1})$.
+     *
+     * @return a newly created ScaledPermutation object storing the inverse
+     *         of the permutation and scaling factors of this
+     *         ScaledPermutation.
+     */
+    std::unique_ptr<ScaledPermutation> compute_inverse() const;
+
+    /**
+     * Composes this scaled permutation with another scaled permutation. This
+     * means `result = other * this` from the matrix perspective, which is
+     * equivalent to first scaling and permuting by `this` and then by `other`.
+     *
+     * @param other  the other permutation
+     * @return the combined permutation
+     */
+    std::unique_ptr<ScaledPermutation> compose(
+        ptr_param<const ScaledPermutation> other) const;
+
+    void write(gko::matrix_data<value_type, index_type>& data) const override;
+
+    /**
+     * Creates an uninitialized ScaledPermutation matrix.
+     *
+     * @param exec  Executor associated to the matrix
+     * @param size  dimensions of the (square) scaled permutation matrix
+     */
+    static std::unique_ptr<ScaledPermutation> create(
+        std::shared_ptr<const Executor> exec, size_type size = 0);
+
+    /**
+     * Create a ScaledPermutation from a Permutation.
+     * The permutation will be copied, the scaling factors are all set to 1.0.
+     *
+     * @param permutation  the permutation
+     * @return  the scaled permutation.
+     */
+    static std::unique_ptr<ScaledPermutation> create(
+        ptr_param<const Permutation<IndexType>> permutation);
+
+    /**
+     * Creates a ScaledPermutation matrix from already allocated arrays.
+     *
+     * @param exec  Executor associated to the matrix
+     * @param permutation_indices  array of permutation indices
+     * @param scaling_factors  array of scaling factors
+     */
+    static std::unique_ptr<ScaledPermutation> create(
+        std::shared_ptr<const Executor> exec, array<value_type> scaling_factors,
+        array<index_type> permutation_indices);
+
+    /**
+     * Creates a constant (immutable) ScaledPermutation matrix from constant
+     * arrays.
+     *
+     * @param exec  the executor to create the object on
+     * @param perm_idxs  the permutation index array of the matrix
+     * @param scale  the scaling factor array
+     * @returns A smart pointer to the constant matrix wrapping the input arrays
+     *          (if it resides on the same executor as the matrix) or a copy of
+     *          the arrays on the correct executor.
+     */
+    static std::unique_ptr<const ScaledPermutation> create_const(
+        std::shared_ptr<const Executor> exec,
+        gko::detail::const_array_view<value_type>&& scale,
+        gko::detail::const_array_view<index_type>&& perm_idxs);
+
+private:
+    ScaledPermutation(std::shared_ptr<const Executor> exec, size_type size = 0);
+
+    ScaledPermutation(std::shared_ptr<const Executor> exec,
+                      array<value_type> scaling_factors,
+                      array<index_type> permutation_indices);
+
+    void apply_impl(const LinOp* in, LinOp* out) const override;
+
+    void apply_impl(const LinOp*, const LinOp* in, const LinOp*,
+                    LinOp* out) const override;
+
+    array<value_type> scale_;
+    array<index_type> permutation_;
+};
+
+
+}  // namespace matrix
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_MATRIX_SCALED_PERMUTATION_HPP_
diff --git a/include/ginkgo/core/multigrid/pgm.hpp b/include/ginkgo/core/multigrid/pgm.hpp
index a90507ce740..5856b53d108 100644
--- a/include/ginkgo/core/multigrid/pgm.hpp
+++ b/include/ginkgo/core/multigrid/pgm.hpp
@@ -196,10 +196,9 @@ class Pgm : public EnableLinOp<Pgm<ValueType, IndexType>>,
 
 
 template <typename ValueType = default_precision, typename IndexType = int32>
-using AmgxPgm
-    [[deprecated("This class is deprecated and will be removed in the next "
-                 "major release. Please use Pgm instead.")]] =
-        Pgm<ValueType, IndexType>;
+using AmgxPgm GKO_DEPRECATED(
+    "This class is deprecated and will be removed in the next "
+    "major release. Please use Pgm instead.") = Pgm<ValueType, IndexType>;
 
 
 }  // namespace multigrid
diff --git a/include/ginkgo/core/preconditioner/ic.hpp b/include/ginkgo/core/preconditioner/ic.hpp
index aa19a004dc1..357be6b34df 100644
--- a/include/ginkgo/core/preconditioner/ic.hpp
+++ b/include/ginkgo/core/preconditioner/ic.hpp
@@ -67,7 +67,7 @@ namespace preconditioner {
  * is a direct triangular solvers. The solver for L^H is the
  * conjugate-transposed solver for L, ensuring that the preconditioner is
  * symmetric and positive-definite. For this L solver, a factory can be provided
- * (using `with_l_solver_factory`) to have more control over their behavior. In
+ * (using `with_l_solver`) to have more control over their behavior. In
  * particular, it is possible to use an iterative method for solving the
  * triangular systems. The default parameters for an iterative triangluar solver
  * are:
@@ -119,19 +119,70 @@ class Ic : public EnableLinOp<Ic<LSolverType, IndexType>>, public Transposable {
     using index_type = IndexType;
     using transposed_type = Ic<LSolverType, IndexType>;
 
-    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
-    {
+    class Factory;
+
+    struct parameters_type
+        : public enable_parameters_type<parameters_type, Factory> {
         /**
          * Factory for the L solver
          */
-        std::shared_ptr<typename l_solver_type::Factory>
-            GKO_FACTORY_PARAMETER_SCALAR(l_solver_factory, nullptr);
+        std::shared_ptr<const typename l_solver_type::Factory>
+            l_solver_factory{};
 
         /**
          * Factory for the factorization
          */
-        std::shared_ptr<LinOpFactory> GKO_FACTORY_PARAMETER_SCALAR(
-            factorization_factory, nullptr);
+        std::shared_ptr<const LinOpFactory> factorization_factory{};
+
+        GKO_DEPRECATED("use with_l_solver instead")
+        parameters_type& with_l_solver_factory(
+            deferred_factory_parameter<const typename l_solver_type::Factory>
+                solver)
+        {
+            return with_l_solver(std::move(solver));
+        }
+
+        parameters_type& with_l_solver(
+            deferred_factory_parameter<const typename l_solver_type::Factory>
+                solver)
+        {
+            this->l_solver_generator = std::move(solver);
+            this->deferred_factories["l_solver"] = [](const auto& exec,
+                                                      auto& params) {
+                if (!params.l_solver_generator.is_empty()) {
+                    params.l_solver_factory =
+                        params.l_solver_generator.on(exec);
+                }
+            };
+            return *this;
+        }
+
+        GKO_DEPRECATED("use with_factorization instead")
+        parameters_type& with_factorization_factory(
+            deferred_factory_parameter<const LinOpFactory> factorization)
+        {
+            return with_factorization(std::move(factorization));
+        }
+
+        parameters_type& with_factorization(
+            deferred_factory_parameter<const LinOpFactory> factorization)
+        {
+            this->factorization_generator = std::move(factorization);
+            this->deferred_factories["factorization"] = [](const auto& exec,
+                                                           auto& params) {
+                if (!params.factorization_generator.is_empty()) {
+                    params.factorization_factory =
+                        params.factorization_generator.on(exec);
+                }
+            };
+            return *this;
+        }
+
+    private:
+        deferred_factory_parameter<const typename l_solver_type::Factory>
+            l_solver_generator;
+
+        deferred_factory_parameter<const LinOpFactory> factorization_generator;
     };
 
     GKO_ENABLE_LIN_OP_FACTORY(Ic, parameters, Factory);
@@ -365,12 +416,10 @@ class Ic : public EnableLinOp<Ic<LSolverType, IndexType>>, public Transposable {
             static_cast<unsigned int>(mtx->get_size()[0])};
 
         return SolverType::build()
-            .with_criteria(gko::stop::Iteration::build()
-                               .with_max_iters(default_max_iters)
-                               .on(exec),
-                           gko::stop::ResidualNorm<value_type>::build()
-                               .with_reduction_factor(default_reduce_residual)
-                               .on(exec))
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(default_max_iters),
+                gko::stop::ResidualNorm<value_type>::build()
+                    .with_reduction_factor(default_reduce_residual))
             .on(exec)
             ->generate(mtx);
     }
diff --git a/include/ginkgo/core/preconditioner/ilu.hpp b/include/ginkgo/core/preconditioner/ilu.hpp
index 7db9d19c7c2..ed4b68b5ef4 100644
--- a/include/ginkgo/core/preconditioner/ilu.hpp
+++ b/include/ginkgo/core/preconditioner/ilu.hpp
@@ -66,11 +66,10 @@ namespace preconditioner {
  * It allows to set both the solver for L and the solver for U independently,
  * while providing the defaults solver::LowerTrs and solver::UpperTrs, which
  * are direct triangular solvers.
- * For these solvers, a factory can be provided (with `with_l_solver_factory`
- * and `with_u_solver_factory`) to have more control over their behavior.
- * In particular, it is possible to use an iterative method for solving the
- * triangular systems. The default parameters for an iterative triangluar
- * solver are:
+ * For these solvers, a factory can be provided (with `with_l_solver` and
+ * `with_u_solver`) to have more control over their behavior. In particular, it
+ * is possible to use an iterative method for solving the triangular systems.
+ * The default parameters for an iterative triangluar solver are:
  * - reduction factor = 1e-4
  * - max iteration = <number of rows of the matrix given to the solver>
  * Solvers without such criteria can also be used, in which case none are set.
@@ -131,25 +130,102 @@ class Ilu : public EnableLinOp<
         Ilu<typename USolverType::transposed_type,
             typename LSolverType::transposed_type, ReverseApply, IndexType>;
 
-    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
-    {
+    class Factory;
+
+    struct parameters_type
+        : public enable_parameters_type<parameters_type, Factory> {
         /**
          * Factory for the L solver
          */
-        std::shared_ptr<typename l_solver_type::Factory>
-            GKO_FACTORY_PARAMETER_SCALAR(l_solver_factory, nullptr);
+        std::shared_ptr<const typename l_solver_type::Factory>
+            l_solver_factory{};
 
         /**
          * Factory for the U solver
          */
-        std::shared_ptr<typename u_solver_type::Factory>
-            GKO_FACTORY_PARAMETER_SCALAR(u_solver_factory, nullptr);
+        std::shared_ptr<const typename u_solver_type::Factory>
+            u_solver_factory{};
 
         /**
          * Factory for the factorization
          */
-        std::shared_ptr<LinOpFactory> GKO_FACTORY_PARAMETER_SCALAR(
-            factorization_factory, nullptr);
+        std::shared_ptr<const LinOpFactory> factorization_factory{};
+
+        GKO_DEPRECATED("use with_l_solver instead")
+        parameters_type& with_l_solver_factory(
+            deferred_factory_parameter<const typename l_solver_type::Factory>
+                solver)
+        {
+            return with_l_solver(std::move(solver));
+        }
+
+        parameters_type& with_l_solver(
+            deferred_factory_parameter<const typename l_solver_type::Factory>
+                solver)
+        {
+            this->l_solver_generator = std::move(solver);
+            this->deferred_factories["l_solver"] = [](const auto& exec,
+                                                      auto& params) {
+                if (!params.l_solver_generator.is_empty()) {
+                    params.l_solver_factory =
+                        params.l_solver_generator.on(exec);
+                }
+            };
+            return *this;
+        }
+
+        GKO_DEPRECATED("use with_u_solver instead")
+        parameters_type& with_u_solver_factory(
+            deferred_factory_parameter<const typename u_solver_type::Factory>
+                solver)
+        {
+            return with_u_solver(std::move(solver));
+        }
+
+        parameters_type& with_u_solver(
+            deferred_factory_parameter<const typename u_solver_type::Factory>
+                solver)
+        {
+            this->u_solver_generator = std::move(solver);
+            this->deferred_factories["u_solver"] = [](const auto& exec,
+                                                      auto& params) {
+                if (!params.u_solver_generator.is_empty()) {
+                    params.u_solver_factory =
+                        params.u_solver_generator.on(exec);
+                }
+            };
+            return *this;
+        }
+
+        GKO_DEPRECATED("use with_factorization instead")
+        parameters_type& with_factorization_factory(
+            deferred_factory_parameter<const LinOpFactory> factorization)
+        {
+            return with_factorization(std::move(factorization));
+        }
+
+        parameters_type& with_factorization(
+            deferred_factory_parameter<const LinOpFactory> factorization)
+        {
+            this->factorization_generator = std::move(factorization);
+            this->deferred_factories["factorization"] = [](const auto& exec,
+                                                           auto& params) {
+                if (!params.factorization_generator.is_empty()) {
+                    params.factorization_factory =
+                        params.factorization_generator.on(exec);
+                }
+            };
+            return *this;
+        }
+
+    private:
+        deferred_factory_parameter<const typename l_solver_type::Factory>
+            l_solver_generator;
+
+        deferred_factory_parameter<const typename u_solver_type::Factory>
+            u_solver_generator;
+
+        deferred_factory_parameter<const LinOpFactory> factorization_generator;
     };
 
     GKO_ENABLE_LIN_OP_FACTORY(Ilu, parameters, Factory);
@@ -393,12 +469,10 @@ class Ilu : public EnableLinOp<
             static_cast<unsigned int>(mtx->get_size()[0])};
 
         return SolverType::build()
-            .with_criteria(gko::stop::Iteration::build()
-                               .with_max_iters(default_max_iters)
-                               .on(exec),
-                           gko::stop::ResidualNorm<value_type>::build()
-                               .with_reduction_factor(default_reduce_residual)
-                               .on(exec))
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(default_max_iters),
+                gko::stop::ResidualNorm<value_type>::build()
+                    .with_reduction_factor(default_reduce_residual))
             .on(exec)
             ->generate(mtx);
     }
diff --git a/include/ginkgo/core/preconditioner/isai.hpp b/include/ginkgo/core/preconditioner/isai.hpp
index 7f03deae0a1..c5d5ddc6471 100644
--- a/include/ginkgo/core/preconditioner/isai.hpp
+++ b/include/ginkgo/core/preconditioner/isai.hpp
@@ -233,7 +233,7 @@ class Isai : public EnableLinOp<Isai<IsaiType, ValueType, IndexType>>,
     /**
      * Creates an Isai preconditioner from a matrix using an Isai::Factory.
      *
-     * @param factory  the factory to use to create the preconditoner
+     * @param factory  the factory to use to create the preconditioner
      * @param system_matrix  the matrix for which an ISAI is to be computed
      */
     explicit Isai(const Factory* factory,
diff --git a/include/ginkgo/core/preconditioner/jacobi.hpp b/include/ginkgo/core/preconditioner/jacobi.hpp
index bf215082a85..f48d8e34c8c 100644
--- a/include/ginkgo/core/preconditioner/jacobi.hpp
+++ b/include/ginkgo/core/preconditioner/jacobi.hpp
@@ -546,7 +546,7 @@ class Jacobi : public EnableLinOp<Jacobi<ValueType, IndexType>>,
     /**
      * Creates a Jacobi preconditioner from a matrix using a Jacobi::Factory.
      *
-     * @param factory  the factory to use to create the preconditoner
+     * @param factory  the factory to use to create the preconditioner
      * @param system_matrix  the matrix this preconditioner should be created
      *                       from
      */
@@ -593,7 +593,7 @@ class Jacobi : public EnableLinOp<Jacobi<ValueType, IndexType>>,
             max_block_stride = param_max_block_stride;
             if (this->get_executor() != this->get_executor()->get_master() &&
                 max_block_stride != default_block_stride) {
-                // only support the default value on the gpu devive
+                // only support the default value on the gpu device
                 GKO_NOT_SUPPORTED(this);
             }
         }
@@ -612,7 +612,7 @@ class Jacobi : public EnableLinOp<Jacobi<ValueType, IndexType>>,
     }
 
     /**
-     * Generates the preconditoner.
+     * Generates the preconditioner.
      *
      * @param system_matrix  the source matrix used to generate the
      *                       preconditioner
diff --git a/include/ginkgo/core/reorder/amd.hpp b/include/ginkgo/core/reorder/amd.hpp
index f15aa7ff0f0..3fcfd78db66 100644
--- a/include/ginkgo/core/reorder/amd.hpp
+++ b/include/ginkgo/core/reorder/amd.hpp
@@ -88,6 +88,13 @@ class Amd : public EnablePolymorphicObject<Amd<IndexType>, LinOpFactory>,
         bool GKO_FACTORY_PARAMETER_SCALAR(skip_sorting, false);
     };
 
+    /**
+     * Returns the parameters used to construct the factory.
+     *
+     * @return the parameters used to construct the factory.
+     */
+    const parameters_type& get_parameters() { return parameters_; }
+
     /**
      * @copydoc LinOpFactory::generate
      * @note This function overrides the default LinOpFactory::generate to
diff --git a/include/ginkgo/core/reorder/mc64.hpp b/include/ginkgo/core/reorder/mc64.hpp
new file mode 100644
index 00000000000..6d323934df5
--- /dev/null
+++ b/include/ginkgo/core/reorder/mc64.hpp
@@ -0,0 +1,172 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_REORDER_MC64_HPP_
+#define GKO_PUBLIC_CORE_REORDER_MC64_HPP_
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/abstract_factory.hpp>
+#include <ginkgo/core/base/composition.hpp>
+#include <ginkgo/core/base/dim.hpp>
+#include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/base/polymorphic_object.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/diagonal.hpp>
+#include <ginkgo/core/matrix/identity.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
+#include <ginkgo/core/reorder/reordering_base.hpp>
+
+
+namespace gko {
+namespace experimental {
+/**
+ * @brief The Reorder namespace.
+ *
+ * @ingroup reorder
+ */
+namespace reorder {
+
+
+/**
+ * Strategy defining the goal of the MC64 reordering.
+ * max_diagonal_product aims at maximizing the product of
+ * absolute diagonal entries.
+ * max_diag_sum aims at maximizing the sum of absolute values
+ * for the diagonal entries.
+ */
+enum class mc64_strategy { max_diagonal_product, max_diagonal_sum };
+
+
+/**
+ * MC64 is an algorithm for permuting large entries to the diagonal of a
+ * sparse matrix. This approach can increase numerical stability of e.g.
+ * an LU factorization without pivoting. Under the assumption of working
+ * on a nonsingular square matrix, the algorithm computes a minimum weight
+ * perfect matching on a weighted edge bipartite graph of the matrix. It is
+ * described in detail in "On Algorithms for Permuting Large Entries to the
+ * Diagonal of a Sparse Matrix" (Duff, Koster, 2001,
+ * DOI: 10.1137/S0895479899358443). There are two strategies for choosing the
+ * weights supported:
+ *  - Maximizing the product of the absolute values on the diagonal.
+ *    For this strategy, the weights are computed as
+ *    $c(i, j) = \log_2(a_i) - \log_2(|a(i, j)|)$ if $a(i, j) \neq 0 $ and
+ *    $c(i, j) = \infty$ otherwise. Here, a_i is the maximum absolute value in
+ *    row i of the matrix A. In this case, the implementation computes a row
+ *    permutation P and row and column scaling coefficients L and R such that
+ *    the matrix P*L*A*R has values with unity absolute value on the diagonal
+ *    and smaller or equal entries everywhere else.
+ *  - Maximizing the sum of the absolute values on the diagonal.
+ *    For this strategy, the weights are computed as
+ *    $c(i, j) = a_i - |a(i, j)|$ if $a(i, j) \neq 0$ and $c(i, j) =
+ *    \infty$ otherwise. In this case, no scaling coefficients are computed.
+ *
+ * This class creates a Combination of two ScaledPermutations representing the
+ * row and column permutation and scaling factors computed by this algorithm.
+ *
+ * @tparam ValueType  Type of the values of all matrices used in this class
+ * @tparam IndexType  Type of the indices of all matrices used in this class
+ */
+template <typename ValueType = default_precision, typename IndexType = int32>
+class Mc64 final
+    : public EnablePolymorphicObject<Mc64<ValueType, IndexType>, LinOpFactory>,
+      public EnablePolymorphicAssignment<Mc64<ValueType, IndexType>> {
+public:
+    struct parameters_type;
+    friend class EnablePolymorphicObject<Mc64<ValueType, IndexType>,
+                                         LinOpFactory>;
+    friend class enable_parameters_type<parameters_type,
+                                        Mc64<ValueType, IndexType>>;
+
+    using value_type = ValueType;
+    using index_type = IndexType;
+    using result_type = Composition<value_type>;
+    using matrix_type = matrix::Csr<value_type, index_type>;
+
+    struct parameters_type
+        : public enable_parameters_type<parameters_type, Mc64> {
+        /**
+         * This parameter controls the goal of the permutation.
+         */
+        mc64_strategy GKO_FACTORY_PARAMETER_SCALAR(
+            strategy, mc64_strategy::max_diagonal_product);
+
+        /**
+         * This parameter controls the tolerance below which a weight is
+         * considered to be zero.
+         */
+        remove_complex<ValueType> GKO_FACTORY_PARAMETER_SCALAR(tolerance,
+                                                               1e-14);
+    };
+
+    /**
+     * Returns the parameters used to construct the factory.
+     *
+     * @return the parameters used to construct the factory.
+     */
+    const parameters_type& get_parameters() const { return parameters_; }
+
+    /**
+     * @copydoc LinOpFactory::generate
+     * @note This function overrides the default LinOpFactory::generate to
+     *       return a Permutation instead of a generic LinOp, which would
+     *       need to be cast to ScaledPermutation again to access its indices.
+     *       It is only necessary because smart pointers aren't covariant.
+     */
+    std::unique_ptr<result_type> generate(
+        std::shared_ptr<const LinOp> system_matrix) const;
+
+    /** Creates a new parameter_type to set up the factory. */
+    static parameters_type build() { return {}; }
+
+private:
+    explicit Mc64(std::shared_ptr<const Executor> exec,
+                  const parameters_type& params = {});
+
+    std::unique_ptr<LinOp> generate_impl(
+        std::shared_ptr<const LinOp> system_matrix) const override;
+
+    parameters_type parameters_;
+};
+
+
+}  // namespace reorder
+}  // namespace experimental
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_REORDER_MC64_HPP_
diff --git a/include/ginkgo/core/reorder/nested_dissection.hpp b/include/ginkgo/core/reorder/nested_dissection.hpp
index 8e870a61407..9faf68706c8 100644
--- a/include/ginkgo/core/reorder/nested_dissection.hpp
+++ b/include/ginkgo/core/reorder/nested_dissection.hpp
@@ -112,6 +112,8 @@ class NestedDissection
 
     /**
      * Returns the parameters used to construct the factory.
+     *
+     * @return the parameters used to construct the factory.
      */
     const parameters_type& get_parameters() { return parameters_; }
 
diff --git a/include/ginkgo/core/reorder/rcm.hpp b/include/ginkgo/core/reorder/rcm.hpp
index bb6e7986206..6aac61f61cd 100644
--- a/include/ginkgo/core/reorder/rcm.hpp
+++ b/include/ginkgo/core/reorder/rcm.hpp
@@ -64,20 +64,25 @@ enum class starting_strategy { minimum_degree, pseudo_peripheral };
 
 
 /**
- * Rcm is a reordering algorithm minimizing the bandwidth of a matrix. Such a
- * reordering typically also significantly reduces fill-in, though usually not
- * as effective as more complex algorithms, specifically AMD and nested
- * dissection schemes. The advantage of this algorithm is its low runtime.
+ * Rcm (Reverse Cuthill-McKee) is a reordering algorithm minimizing the
+ * bandwidth of a matrix. Such a reordering typically also significantly reduces
+ * fill-in, though usually not as effective as more complex algorithms,
+ * specifically AMD and nested dissection schemes. The advantage of this
+ * algorithm is its low runtime.
+ * It requires the input matrix to be structurally symmetric.
  *
  * @note  This class is derived from polymorphic object but is not a LinOp as it
  * does not make sense for this class to implement the apply methods. The
  * objective of this class is to generate a reordering/permutation vector (in
  * the form of the Permutation matrix), which can be used to apply to reorder a
  * matrix as required.
+ * @deprecated  This class is deprecated and should be replaced by
+ * gko::experimental::reorder::Rcm, which integrates more cleanly with the other
+ * reordering-related functionality of Ginkgo.
  *
  * There are two "starting strategies" currently available: minimum degree and
  * pseudo-peripheral. These strategies control how a starting vertex for a
- * connected component is choosen, which is then renumbered as first vertex in
+ * connected component is chosen, which is then renumbered as first vertex in
  * the component, starting the algorithm from there.
  * In general, the bandwidths obtained by choosing a pseudo-peripheral vertex
  * are slightly smaller than those obtained from choosing a vertex of minimum
@@ -149,73 +154,9 @@ class Rcm : public EnablePolymorphicObject<Rcm<ValueType, IndexType>,
     GKO_ENABLE_BUILD_METHOD(Factory);
 
 protected:
-    /**
-     * Generates the permutation matrix and if required the inverse permutation
-     * matrix.
-     */
-    void generate(std::shared_ptr<const Executor>& exec,
-                  std::unique_ptr<SparsityMatrix> adjacency_matrix) const;
-
-    explicit Rcm(std::shared_ptr<const Executor> exec)
-        : EnablePolymorphicObject<Rcm, ReorderingBase<IndexType>>(
-              std::move(exec))
-    {}
-
-    explicit Rcm(const Factory* factory, const ReorderingBaseArgs& args)
-        : EnablePolymorphicObject<Rcm, ReorderingBase<IndexType>>(
-              factory->get_executor()),
-          parameters_{factory->get_parameters()}
-    {
-        // Always execute the reordering on the cpu.
-        const auto is_gpu_executor =
-            this->get_executor() != this->get_executor()->get_master();
-        auto cpu_exec = is_gpu_executor ? this->get_executor()->get_master()
-                                        : this->get_executor();
-
-        auto adjacency_matrix = SparsityMatrix::create(cpu_exec);
-        array<IndexType> degrees;
-
-        // The adjacency matrix has to be square.
-        GKO_ASSERT_IS_SQUARE_MATRIX(args.system_matrix);
-        // This is needed because it does not make sense to call the copy and
-        // convert if the existing matrix is empty.
-        if (args.system_matrix->get_size()) {
-            auto tmp = copy_and_convert_to<SparsityMatrix>(cpu_exec,
-                                                           args.system_matrix);
-            // This function provided within the Sparsity matrix format removes
-            // the diagonal elements and outputs an adjacency matrix.
-            adjacency_matrix = tmp->to_adjacency_matrix();
-        }
-
-        auto const dim = adjacency_matrix->get_size();
-        permutation_ = PermutationMatrix::create(cpu_exec, dim);
-
-        // To make it explicit.
-        inv_permutation_ = nullptr;
-        if (parameters_.construct_inverse_permutation) {
-            inv_permutation_ = PermutationMatrix::create(cpu_exec, dim);
-        }
-
-        this->generate(cpu_exec, std::move(adjacency_matrix));
-
-        // Copy back results to gpu if necessary.
-        if (is_gpu_executor) {
-            const auto gpu_exec = this->get_executor();
-            auto gpu_perm = share(PermutationMatrix::create(gpu_exec, dim));
-            gpu_perm->copy_from(permutation_);
-            permutation_ = gpu_perm;
-            if (inv_permutation_) {
-                auto gpu_inv_perm =
-                    share(PermutationMatrix::create(gpu_exec, dim));
-                gpu_inv_perm->copy_from(inv_permutation_);
-                inv_permutation_ = gpu_inv_perm;
-            }
-        }
-        auto permutation_array =
-            make_array_view(this->get_executor(), permutation_->get_size()[0],
-                            permutation_->get_permutation());
-        this->set_permutation_array(permutation_array);
-    }
+    explicit Rcm(std::shared_ptr<const Executor> exec);
+
+    explicit Rcm(const Factory* factory, const ReorderingBaseArgs& args);
 
 private:
     std::shared_ptr<PermutationMatrix> permutation_;
@@ -224,6 +165,100 @@ class Rcm : public EnablePolymorphicObject<Rcm<ValueType, IndexType>,
 
 
 }  // namespace reorder
+
+
+namespace experimental {
+namespace reorder {
+
+
+using rcm_starting_strategy = gko::reorder::starting_strategy;
+
+
+/**
+ * Rcm (Reverse Cuthill-McKee) is a reordering algorithm minimizing the
+ * bandwidth of a matrix. Such a reordering typically also significantly reduces
+ * fill-in, though usually not as effective as more complex algorithms,
+ * specifically AMD and nested dissection schemes. The advantage of this
+ * algorithm is its low runtime.
+ *
+ * The class is a LinOpFactory generating a Permutation matrix out of a Csr
+ * system matrix, to be used with `Csr::permute(...)`.
+ *
+ * There are two "starting strategies" currently available: minimum degree and
+ * pseudo-peripheral. These strategies control how a starting vertex for a
+ * connected component is chosen, which is then renumbered as first vertex in
+ * the component, starting the algorithm from there.
+ * In general, the bandwidths obtained by choosing a pseudo-peripheral vertex
+ * are slightly smaller than those obtained from choosing a vertex of minimum
+ * degree. On the other hand, this strategy is much more expensive, relatively.
+ * The algorithm for finding a pseudo-peripheral vertex as
+ * described in "Computer Solution of Sparse Linear Systems" (George, Liu, Ng,
+ * Oak Ridge National Laboratory, 1994) is implemented here.
+ *
+ * @tparam IndexType  Type of the indices of all matrices used in this class
+ *
+ * @ingroup reorder
+ */
+template <typename IndexType = int32>
+class Rcm : public EnablePolymorphicObject<Rcm<IndexType>, LinOpFactory>,
+            public EnablePolymorphicAssignment<Rcm<IndexType>> {
+public:
+    struct parameters_type;
+    friend class EnablePolymorphicObject<Rcm<IndexType>, LinOpFactory>;
+    friend class enable_parameters_type<parameters_type, Rcm<IndexType>>;
+
+    using index_type = IndexType;
+    using permutation_type = matrix::Permutation<index_type>;
+
+    struct parameters_type
+        : public enable_parameters_type<parameters_type, Rcm<IndexType>> {
+        /**
+         * If set to false, computes the RCM reordering on A + A^T, otherwise
+         * assumes that A is symmetric and uses it directly.
+         */
+        bool GKO_FACTORY_PARAMETER_SCALAR(skip_symmetrize, false);
+
+        /**
+         * This parameter controls the strategy used to determine a starting
+         * vertex.
+         */
+        rcm_starting_strategy GKO_FACTORY_PARAMETER_SCALAR(
+            strategy, rcm_starting_strategy::pseudo_peripheral);
+    };
+
+    /**
+     * Returns the parameters used to construct the factory.
+     *
+     * @return the parameters used to construct the factory.
+     */
+    const parameters_type& get_parameters() { return parameters_; }
+
+    /**
+     * @copydoc LinOpFactory::generate
+     * @note This function overrides the default LinOpFactory::generate to
+     *       return a Permutation instead of a generic LinOp, which would
+     *       need to be cast to Permutation again to access its indices.
+     *       It is only necessary because smart pointers aren't covariant.
+     */
+    std::unique_ptr<permutation_type> generate(
+        std::shared_ptr<const LinOp> system_matrix) const;
+
+    /** Creates a new parameter_type to set up the factory. */
+    static parameters_type build() { return {}; }
+
+protected:
+    explicit Rcm(std::shared_ptr<const Executor> exec,
+                 const parameters_type& params = {});
+
+    std::unique_ptr<LinOp> generate_impl(
+        std::shared_ptr<const LinOp> system_matrix) const override;
+
+    parameters_type parameters_;
+};
+
+
+}  // namespace reorder
+}  // namespace experimental
 }  // namespace gko
 
 
diff --git a/include/ginkgo/core/reorder/reordering_base.hpp b/include/ginkgo/core/reorder/reordering_base.hpp
index 8cfb4c10c48..e0b80adb4cd 100644
--- a/include/ginkgo/core/reorder/reordering_base.hpp
+++ b/include/ginkgo/core/reorder/reordering_base.hpp
@@ -113,7 +113,7 @@ using ReorderingBaseFactory =
  * template parameters to enable a subclass of ReorderingBaseFactory.
  *
  * @tparam ConcreteFactory  the concrete factory which is being implemented
- *                          [CRTP parmeter]
+ *                          [CRTP parameter]
  * @tparam ConcreteReorderingBase  the concrete ReorderingBase type which this
  * factory produces, needs to have a constructor which takes a const
  * ConcreteFactory *, and a const ReorderingBaseArgs * as parameters.
diff --git a/include/ginkgo/core/solver/batch_bicgstab.hpp b/include/ginkgo/core/solver/batch_bicgstab.hpp
new file mode 100644
index 00000000000..4ce8ad7c1bd
--- /dev/null
+++ b/include/ginkgo/core/solver/batch_bicgstab.hpp
@@ -0,0 +1,120 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_SOLVER_BATCH_BICGSTAB_HPP_
+#define GKO_PUBLIC_CORE_SOLVER_BATCH_BICGSTAB_HPP_
+
+
+#include <vector>
+
+
+#include <ginkgo/core/base/batch_lin_op.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/lin_op.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/solver/batch_solver_base.hpp>
+#include <ginkgo/core/stop/batch_stop_enum.hpp>
+
+
+namespace gko {
+namespace batch {
+namespace solver {
+
+
+/**
+ * BiCGSTAB or the Bi-Conjugate Gradient-Stabilized is a Krylov subspace solver.
+ * Being a generic solver, it is capable of solving general matrices, including
+ * non-s.p.d matrices.
+ *
+ * This solver solves a batch of linear systems using the Bicgstab algorithm.
+ * Each linear system in the batch can converge independently.
+ *
+ * Unless otherwise specified via the `preconditioner` factory parameter, this
+ * implementation does not use any preconditioner by default. The type of
+ * tolerance (absolute or relative) and the maximum number of iterations to be
+ * used in the stopping criterion can be set via the factory parameters.
+ *
+ * @note The tolerance check is against the internal residual computed within
+ * the solver process. This implicit (internal) residual, can diverge from the
+ * true residual (||b - Ax||). A posterori checks (by computing the true
+ * residual, ||b - Ax||) are recommended to ensure that the solution has
+ * converged to the desired tolerance.
+ *
+ * @tparam ValueType  precision of matrix elements
+ *
+ * @ingroup solvers
+ * @ingroup BatchLinOp
+ */
+template <typename ValueType = default_precision>
+class Bicgstab final
+    : public EnableBatchSolver<Bicgstab<ValueType>, ValueType> {
+    friend class EnableBatchLinOp<Bicgstab>;
+    friend class EnablePolymorphicObject<Bicgstab, BatchLinOp>;
+
+public:
+    using value_type = ValueType;
+    using real_type = gko::remove_complex<ValueType>;
+
+    class Factory;
+
+    struct parameters_type
+        : enable_preconditioned_iterative_solver_factory_parameters<
+              parameters_type, Factory> {};
+    GKO_ENABLE_BATCH_LIN_OP_FACTORY(Bicgstab, parameters, Factory);
+    GKO_ENABLE_BUILD_METHOD(Factory);
+
+private:
+    explicit Bicgstab(std::shared_ptr<const Executor> exec)
+        : EnableBatchSolver<Bicgstab, ValueType>(std::move(exec))
+    {}
+
+    explicit Bicgstab(const Factory* factory,
+                      std::shared_ptr<const BatchLinOp> system_matrix)
+        : EnableBatchSolver<Bicgstab, ValueType>(factory->get_executor(),
+                                                 std::move(system_matrix),
+                                                 factory->get_parameters()),
+          parameters_{factory->get_parameters()}
+    {}
+
+    void solver_apply(
+        const MultiVector<ValueType>* b, MultiVector<ValueType>* x,
+        log::detail::log_data<real_type>* log_data) const override;
+};
+
+
+}  // namespace solver
+}  // namespace batch
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_SOLVER_BATCH_BICGSTAB_HPP_
diff --git a/include/ginkgo/core/solver/batch_solver_base.hpp b/include/ginkgo/core/solver/batch_solver_base.hpp
new file mode 100644
index 00000000000..cd4ae8d1590
--- /dev/null
+++ b/include/ginkgo/core/solver/batch_solver_base.hpp
@@ -0,0 +1,433 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_SOLVER_BATCH_SOLVER_BASE_HPP_
+#define GKO_PUBLIC_CORE_SOLVER_BATCH_SOLVER_BASE_HPP_
+
+
+#include <ginkgo/core/base/abstract_factory.hpp>
+#include <ginkgo/core/base/batch_lin_op.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/utils_helper.hpp>
+#include <ginkgo/core/log/batch_logger.hpp>
+#include <ginkgo/core/matrix/batch_identity.hpp>
+#include <ginkgo/core/stop/batch_stop_enum.hpp>
+
+
+namespace gko {
+namespace batch {
+namespace solver {
+
+
+/**
+ * The BatchSolver is a base class for all batched solvers and provides the
+ * common getters and setter for these batched solver classes.
+ *
+ * @ingroup solvers
+ */
+class BatchSolver {
+public:
+    /**
+     * Returns the system operator (matrix) of the linear system.
+     *
+     * @return the system operator (matrix)
+     */
+    std::shared_ptr<const BatchLinOp> get_system_matrix() const
+    {
+        return this->system_matrix_;
+    }
+
+    /**
+     * Returns the generated preconditioner.
+     *
+     * @return the generated preconditioner.
+     */
+    std::shared_ptr<const BatchLinOp> get_preconditioner() const
+    {
+        return this->preconditioner_;
+    }
+
+    /**
+     * Get the residual tolerance used by the solver.
+     *
+     * @return The residual tolerance.
+     */
+    double get_tolerance() const { return this->residual_tol_; }
+
+    /**
+     * Update the residual tolerance to be used by the solver.
+     *
+     * @param res_tol  The residual tolerance to be used for subsequent
+     *                 invocations of the solver.
+     */
+    void reset_tolerance(double res_tol)
+    {
+        if (res_tol < 0) {
+            GKO_INVALID_STATE("Tolerance cannot be negative!");
+        }
+        this->residual_tol_ = res_tol;
+    }
+
+    /**
+     * Get the maximum number of iterations set on the solver.
+     *
+     * @return  Maximum number of iterations.
+     */
+    int get_max_iterations() const { return this->max_iterations_; }
+
+    /**
+     * Set the maximum number of iterations for the solver to use,
+     * independent of the factory that created it.
+     *
+     * @param max_iterations  The maximum number of iterations for the solver.
+     */
+    void reset_max_iterations(int max_iterations)
+    {
+        if (max_iterations < 0) {
+            GKO_INVALID_STATE("Max iterations cannot be negative!");
+        }
+        this->max_iterations_ = max_iterations;
+    }
+
+    /**
+     * Get the tolerance type.
+     *
+     * @return  The tolerance type.
+     */
+    ::gko::batch::stop::tolerance_type get_tolerance_type() const
+    {
+        return this->tol_type_;
+    }
+
+    /**
+     * Set the type of tolerance check to use inside the solver
+     *
+     * @param tol_type  The tolerance type.
+     */
+    void reset_tolerance_type(::gko::batch::stop::tolerance_type tol_type)
+    {
+        if (tol_type == ::gko::batch::stop::tolerance_type::absolute ||
+            tol_type == ::gko::batch::stop::tolerance_type::relative) {
+            this->tol_type_ = tol_type;
+        } else {
+            GKO_INVALID_STATE("Invalid tolerance type specified!");
+        }
+    }
+
+protected:
+    BatchSolver() {}
+
+    BatchSolver(std::shared_ptr<const BatchLinOp> system_matrix,
+                std::shared_ptr<const BatchLinOp> gen_preconditioner,
+                const double res_tol, const int max_iterations,
+                const ::gko::batch::stop::tolerance_type tol_type)
+        : system_matrix_{std::move(system_matrix)},
+          preconditioner_{std::move(gen_preconditioner)},
+          residual_tol_{res_tol},
+          max_iterations_{max_iterations},
+          tol_type_{tol_type},
+          workspace_{}
+    {}
+
+    void set_system_matrix_base(std::shared_ptr<const BatchLinOp> system_matrix)
+    {
+        this->system_matrix_ = std::move(system_matrix);
+    }
+
+    void set_preconditioner_base(std::shared_ptr<const BatchLinOp> precond)
+    {
+        this->preconditioner_ = std::move(precond);
+    }
+
+    std::shared_ptr<const BatchLinOp> system_matrix_{};
+    std::shared_ptr<const BatchLinOp> preconditioner_{};
+    double residual_tol_{};
+    int max_iterations_{};
+    ::gko::batch::stop::tolerance_type tol_type_{};
+    mutable array<unsigned char> workspace_{};
+};
+
+
+template <typename Parameters, typename Factory>
+struct enable_preconditioned_iterative_solver_factory_parameters
+    : enable_parameters_type<Parameters, Factory> {
+    /**
+     * Default maximum number iterations allowed.
+     *
+     * Generated solvers are initialized with this value for their maximum
+     * iterations.
+     */
+    int GKO_FACTORY_PARAMETER_SCALAR(max_iterations, 100);
+
+    /**
+     * Default residual tolerance.
+     *
+     * Generated solvers are initialized with this value for their residual
+     * tolerance.
+     */
+    double GKO_FACTORY_PARAMETER_SCALAR(tolerance, 1e-11);
+
+    /**
+     * To specify which type of tolerance check is to be considered, absolute or
+     * relative (to the rhs l2 norm)
+     */
+    ::gko::batch::stop::tolerance_type GKO_FACTORY_PARAMETER_SCALAR(
+        tolerance_type, ::gko::batch::stop::tolerance_type::absolute);
+
+    /**
+     * The preconditioner to be used by the iterative solver. By default, no
+     * preconditioner is used.
+     */
+    std::shared_ptr<const BatchLinOpFactory> GKO_DEFERRED_FACTORY_PARAMETER(
+        preconditioner);
+
+    /**
+     * Already generated preconditioner. If one is provided, the factory
+     * `preconditioner` will be ignored.
+     */
+    std::shared_ptr<const BatchLinOp> GKO_FACTORY_PARAMETER_SCALAR(
+        generated_preconditioner, nullptr);
+};
+
+
+/**
+ * This mixin provides apply and common iterative solver functionality to all
+ * the batched solvers.
+ *
+ * @tparam ConcreteSolver  The concrete solver class.
+ * @tparam ValueType  The value type of the multivectors.
+ * @tparam PolymorphicBase  The base class; must be a subclass of BatchLinOp.
+ */
+template <typename ConcreteSolver, typename ValueType,
+          typename PolymorphicBase = BatchLinOp>
+class EnableBatchSolver
+    : public BatchSolver,
+      public EnableBatchLinOp<ConcreteSolver, PolymorphicBase> {
+public:
+    using real_type = remove_complex<ValueType>;
+
+    const ConcreteSolver* apply(ptr_param<const MultiVector<ValueType>> b,
+                                ptr_param<MultiVector<ValueType>> x) const
+    {
+        this->validate_application_parameters(b.get(), x.get());
+        auto exec = this->get_executor();
+        this->apply_impl(make_temporary_clone(exec, b).get(),
+                         make_temporary_clone(exec, x).get());
+        return self();
+    }
+
+    const ConcreteSolver* apply(ptr_param<const MultiVector<ValueType>> alpha,
+                                ptr_param<const MultiVector<ValueType>> b,
+                                ptr_param<const MultiVector<ValueType>> beta,
+                                ptr_param<MultiVector<ValueType>> x) const
+    {
+        this->validate_application_parameters(alpha.get(), b.get(), beta.get(),
+                                              x.get());
+        auto exec = this->get_executor();
+        this->apply_impl(make_temporary_clone(exec, alpha).get(),
+                         make_temporary_clone(exec, b).get(),
+                         make_temporary_clone(exec, beta).get(),
+                         make_temporary_clone(exec, x).get());
+        return self();
+    }
+
+    ConcreteSolver* apply(ptr_param<const MultiVector<ValueType>> b,
+                          ptr_param<MultiVector<ValueType>> x)
+    {
+        this->validate_application_parameters(b.get(), x.get());
+        auto exec = this->get_executor();
+        this->apply_impl(make_temporary_clone(exec, b).get(),
+                         make_temporary_clone(exec, x).get());
+        return self();
+    }
+
+    ConcreteSolver* apply(ptr_param<const MultiVector<ValueType>> alpha,
+                          ptr_param<const MultiVector<ValueType>> b,
+                          ptr_param<const MultiVector<ValueType>> beta,
+                          ptr_param<MultiVector<ValueType>> x)
+    {
+        this->validate_application_parameters(alpha.get(), b.get(), beta.get(),
+                                              x.get());
+        auto exec = this->get_executor();
+        this->apply_impl(make_temporary_clone(exec, alpha).get(),
+                         make_temporary_clone(exec, b).get(),
+                         make_temporary_clone(exec, beta).get(),
+                         make_temporary_clone(exec, x).get());
+        return self();
+    }
+
+protected:
+    GKO_ENABLE_SELF(ConcreteSolver);
+
+    explicit EnableBatchSolver(std::shared_ptr<const Executor> exec)
+        : EnableBatchLinOp<ConcreteSolver, PolymorphicBase>(std::move(exec))
+    {}
+
+    template <typename FactoryParameters>
+    explicit EnableBatchSolver(std::shared_ptr<const Executor> exec,
+                               std::shared_ptr<const BatchLinOp> system_matrix,
+                               const FactoryParameters& params)
+        : BatchSolver(system_matrix, nullptr, params.tolerance,
+                      params.max_iterations, params.tolerance_type),
+          EnableBatchLinOp<ConcreteSolver, PolymorphicBase>(
+              exec, gko::transpose(system_matrix->get_size()))
+    {
+        GKO_ASSERT_BATCH_HAS_SQUARE_DIMENSIONS(system_matrix_);
+
+        using value_type = typename ConcreteSolver::value_type;
+        using Identity = matrix::Identity<value_type>;
+        using real_type = remove_complex<value_type>;
+
+        if (params.generated_preconditioner) {
+            GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(params.generated_preconditioner,
+                                              this);
+            preconditioner_ = std::move(params.generated_preconditioner);
+        } else if (params.preconditioner) {
+            preconditioner_ = params.preconditioner->generate(system_matrix_);
+        } else {
+            auto id = Identity::create(exec, system_matrix->get_size());
+            preconditioner_ = std::move(id);
+        }
+        const size_type workspace_size = system_matrix->get_num_batch_items() *
+                                         (sizeof(real_type) + sizeof(int));
+        workspace_.set_executor(exec);
+        workspace_.resize_and_reset(workspace_size);
+    }
+
+    void set_system_matrix(std::shared_ptr<const BatchLinOp> new_system_matrix)
+    {
+        auto exec = self()->get_executor();
+        if (new_system_matrix) {
+            GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(self(), new_system_matrix);
+            GKO_ASSERT_BATCH_HAS_SQUARE_DIMENSIONS(new_system_matrix);
+            if (new_system_matrix->get_executor() != exec) {
+                new_system_matrix = gko::clone(exec, new_system_matrix);
+            }
+        }
+        this->set_system_matrix_base(new_system_matrix);
+    }
+
+    void set_preconditioner(std::shared_ptr<const BatchLinOp> new_precond)
+    {
+        auto exec = self()->get_executor();
+        if (new_precond) {
+            GKO_ASSERT_BATCH_EQUAL_DIMENSIONS(self(), new_precond);
+            GKO_ASSERT_BATCH_HAS_SQUARE_DIMENSIONS(new_precond);
+            if (new_precond->get_executor() != exec) {
+                new_precond = gko::clone(exec, new_precond);
+            }
+        }
+        this->set_preconditioner_base(new_precond);
+    }
+
+    EnableBatchSolver& operator=(const EnableBatchSolver& other)
+    {
+        if (&other != this) {
+            this->set_size(other.get_size());
+            this->set_system_matrix(other.get_system_matrix());
+            this->set_preconditioner(other.get_preconditioner());
+            this->reset_tolerance(other.get_tolerance());
+            this->reset_max_iterations(other.get_max_iterations());
+            this->reset_tolerance_type(other.get_tolerance_type());
+        }
+        return *this;
+    }
+
+    EnableBatchSolver& operator=(EnableBatchSolver&& other)
+    {
+        if (&other != this) {
+            this->set_size(other.get_size());
+            this->set_system_matrix(other.get_system_matrix());
+            this->set_preconditioner(other.get_preconditioner());
+            this->reset_tolerance(other.get_tolerance());
+            this->reset_max_iterations(other.get_max_iterations());
+            this->reset_tolerance_type(other.get_tolerance_type());
+            other.set_system_matrix(nullptr);
+            other.set_preconditioner(nullptr);
+        }
+        return *this;
+    }
+
+    EnableBatchSolver(const EnableBatchSolver& other)
+        : EnableBatchLinOp<ConcreteSolver, PolymorphicBase>(
+              other.self()->get_executor(), other.self()->get_size())
+    {
+        *this = other;
+    }
+
+    EnableBatchSolver(EnableBatchSolver&& other)
+        : EnableBatchLinOp<ConcreteSolver, PolymorphicBase>(
+              other.self()->get_executor(), other.self()->get_size())
+    {
+        *this = std::move(other);
+    }
+
+    void apply_impl(const MultiVector<ValueType>* b,
+                    MultiVector<ValueType>* x) const
+    {
+        auto exec = this->get_executor();
+        if (b->get_common_size()[1] > 1) {
+            GKO_NOT_IMPLEMENTED;
+        }
+        auto workspace_view = workspace_.as_view();
+        auto log_data_ = std::make_unique<log::detail::log_data<real_type>>(
+            exec, b->get_num_batch_items(), workspace_view);
+
+        this->solver_apply(b, x, log_data_.get());
+
+        this->template log<gko::log::Logger::batch_solver_completed>(
+            log_data_->iter_counts, log_data_->res_norms);
+    }
+
+    void apply_impl(const MultiVector<ValueType>* alpha,
+                    const MultiVector<ValueType>* b,
+                    const MultiVector<ValueType>* beta,
+                    MultiVector<ValueType>* x) const
+    {
+        auto x_clone = x->clone();
+        this->apply(b, x_clone.get());
+        x->scale(beta);
+        x->add_scaled(alpha, x_clone.get());
+    }
+
+    virtual void solver_apply(const MultiVector<ValueType>* b,
+                              MultiVector<ValueType>* x,
+                              log::detail::log_data<real_type>* info) const = 0;
+};
+
+
+}  // namespace solver
+}  // namespace batch
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_SOLVER_BATCH_SOLVER_BASE_HPP_
diff --git a/include/ginkgo/core/solver/bicg.hpp b/include/ginkgo/core/solver/bicg.hpp
index c7b47a0e807..205be85df6c 100644
--- a/include/ginkgo/core/solver/bicg.hpp
+++ b/include/ginkgo/core/solver/bicg.hpp
@@ -99,27 +99,12 @@ class Bicg
      */
     bool apply_uses_initial_guess() const override { return true; }
 
-    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
-    {
-        /**
-         * Criterion factories.
-         */
-        std::vector<std::shared_ptr<const stop::CriterionFactory>>
-            GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr);
-
-        /**
-         * Preconditioner factory.
-         */
-        std::shared_ptr<const LinOpFactory> GKO_FACTORY_PARAMETER_SCALAR(
-            preconditioner, nullptr);
-
-        /**
-         * Already generated preconditioner. If one is provided, the factory
-         * `preconditioner` will be ignored.
-         */
-        std::shared_ptr<const LinOp> GKO_FACTORY_PARAMETER_SCALAR(
-            generated_preconditioner, nullptr);
-    };
+    class Factory;
+
+    struct parameters_type
+        : enable_preconditioned_iterative_solver_factory_parameters<
+              parameters_type, Factory> {};
+
     GKO_ENABLE_LIN_OP_FACTORY(Bicg, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
 
diff --git a/include/ginkgo/core/solver/bicgstab.hpp b/include/ginkgo/core/solver/bicgstab.hpp
index 214e669b2ff..58d76c5e0df 100644
--- a/include/ginkgo/core/solver/bicgstab.hpp
+++ b/include/ginkgo/core/solver/bicgstab.hpp
@@ -98,27 +98,11 @@ class Bicgstab
      */
     bool apply_uses_initial_guess() const override { return true; }
 
-    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
-    {
-        /**
-         * Criterion factories.
-         */
-        std::vector<std::shared_ptr<const stop::CriterionFactory>>
-            GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr);
-
-        /**
-         * Preconditioner factory.
-         */
-        std::shared_ptr<const LinOpFactory> GKO_FACTORY_PARAMETER_SCALAR(
-            preconditioner, nullptr);
-
-        /**
-         * Already generated preconditioner. If one is provided, the factory
-         * `preconditioner` will be ignored.
-         */
-        std::shared_ptr<const LinOp> GKO_FACTORY_PARAMETER_SCALAR(
-            generated_preconditioner, nullptr);
-    };
+    class Factory;
+    struct parameters_type
+        : enable_preconditioned_iterative_solver_factory_parameters<
+              parameters_type, Factory> {};
+
     GKO_ENABLE_LIN_OP_FACTORY(Bicgstab, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
 
diff --git a/include/ginkgo/core/solver/cb_gmres.hpp b/include/ginkgo/core/solver/cb_gmres.hpp
index a2dbb1efce1..9bf4cf91a76 100644
--- a/include/ginkgo/core/solver/cb_gmres.hpp
+++ b/include/ginkgo/core/solver/cb_gmres.hpp
@@ -153,38 +153,23 @@ class CbGmres : public EnableLinOp<CbGmres<ValueType>>,
         return parameters_.storage_precision;
     }
 
-    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
-    {
+    class Factory;
+
+    struct parameters_type
+        : enable_preconditioned_iterative_solver_factory_parameters<
+              parameters_type, Factory> {
         /**
          * Determines which storage type is used.
          */
         cb_gmres::storage_precision GKO_FACTORY_PARAMETER_SCALAR(
             storage_precision, cb_gmres::storage_precision::reduce1);
 
-        /**
-         * Criterion factories.
-         */
-        std::vector<std::shared_ptr<const stop::CriterionFactory>>
-            GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr);
-
-        /**
-         * Preconditioner factory.
-         */
-        std::shared_ptr<const LinOpFactory> GKO_FACTORY_PARAMETER_SCALAR(
-            preconditioner, nullptr);
-
-        /**
-         * Already generated preconditioner. If one is provided, the factory
-         * `preconditioner` will be ignored.
-         */
-        std::shared_ptr<const LinOp> GKO_FACTORY_PARAMETER_SCALAR(
-            generated_preconditioner, nullptr);
-
         /**
          * Krylov dimension factory.
          */
         size_type GKO_FACTORY_PARAMETER_SCALAR(krylov_dim, 100u);
     };
+
     GKO_ENABLE_LIN_OP_FACTORY(CbGmres, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
 
diff --git a/include/ginkgo/core/solver/cg.hpp b/include/ginkgo/core/solver/cg.hpp
index bc0861cf270..c0fff29fedd 100644
--- a/include/ginkgo/core/solver/cg.hpp
+++ b/include/ginkgo/core/solver/cg.hpp
@@ -92,27 +92,12 @@ class Cg : public EnableLinOp<Cg<ValueType>>,
      */
     bool apply_uses_initial_guess() const override { return true; }
 
-    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
-    {
-        /**
-         * Criterion factories.
-         */
-        std::vector<std::shared_ptr<const stop::CriterionFactory>>
-            GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr);
-
-        /**
-         * Preconditioner factory.
-         */
-        std::shared_ptr<const LinOpFactory> GKO_FACTORY_PARAMETER_SCALAR(
-            preconditioner, nullptr);
-
-        /**
-         * Already generated preconditioner. If one is provided, the factory
-         * `preconditioner` will be ignored.
-         */
-        std::shared_ptr<const LinOp> GKO_FACTORY_PARAMETER_SCALAR(
-            generated_preconditioner, nullptr);
-    };
+    class Factory;
+
+    struct parameters_type
+        : enable_preconditioned_iterative_solver_factory_parameters<
+              parameters_type, Factory> {};
+
     GKO_ENABLE_LIN_OP_FACTORY(Cg, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
 
diff --git a/include/ginkgo/core/solver/cgs.hpp b/include/ginkgo/core/solver/cgs.hpp
index 22f81d8a292..57a834b0ead 100644
--- a/include/ginkgo/core/solver/cgs.hpp
+++ b/include/ginkgo/core/solver/cgs.hpp
@@ -90,27 +90,12 @@ class Cgs
      */
     bool apply_uses_initial_guess() const override { return true; }
 
-    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
-    {
-        /**
-         * Criterion factories.
-         */
-        std::vector<std::shared_ptr<const stop::CriterionFactory>>
-            GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr);
-
-        /**
-         * Preconditioner factory.
-         */
-        std::shared_ptr<const LinOpFactory> GKO_FACTORY_PARAMETER_SCALAR(
-            preconditioner, nullptr);
-
-        /**
-         * Already generated preconditioner. If one is provided, the factory
-         * `preconditioner` will be ignored.
-         */
-        std::shared_ptr<const LinOp> GKO_FACTORY_PARAMETER_SCALAR(
-            generated_preconditioner, nullptr);
-    };
+    class Factory;
+
+    struct parameters_type
+        : enable_preconditioned_iterative_solver_factory_parameters<
+              parameters_type, Factory> {};
+
     GKO_ENABLE_LIN_OP_FACTORY(Cgs, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
 
diff --git a/include/ginkgo/core/solver/direct.hpp b/include/ginkgo/core/solver/direct.hpp
index 4a9a69731be..d65dd93545d 100644
--- a/include/ginkgo/core/solver/direct.hpp
+++ b/include/ginkgo/core/solver/direct.hpp
@@ -74,8 +74,9 @@ class Direct : public EnableLinOp<Direct<ValueType, IndexType>>,
 
     std::unique_ptr<LinOp> conj_transpose() const override;
 
-    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
-    {
+    class Factory;
+
+    struct parameters_type : enable_parameters_type<parameters_type, Factory> {
         /**
          * Number of right hand sides.
          *
@@ -86,8 +87,8 @@ class Direct : public EnableLinOp<Direct<ValueType, IndexType>>,
         gko::size_type GKO_FACTORY_PARAMETER_SCALAR(num_rhs, 1u);
 
         /** The factorization factory to use for generating the factors. */
-        std::shared_ptr<const LinOpFactory> GKO_FACTORY_PARAMETER_SCALAR(
-            factorization, nullptr);
+        std::shared_ptr<const LinOpFactory> GKO_DEFERRED_FACTORY_PARAMETER(
+            factorization);
     };
     GKO_ENABLE_LIN_OP_FACTORY(Direct, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
diff --git a/include/ginkgo/core/solver/fcg.hpp b/include/ginkgo/core/solver/fcg.hpp
index cad7a29fc27..b6715f07512 100644
--- a/include/ginkgo/core/solver/fcg.hpp
+++ b/include/ginkgo/core/solver/fcg.hpp
@@ -98,27 +98,12 @@ class Fcg
      */
     bool apply_uses_initial_guess() const override { return true; }
 
-    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
-    {
-        /**
-         * Criterion factories.
-         */
-        std::vector<std::shared_ptr<const stop::CriterionFactory>>
-            GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr);
-
-        /**
-         * Preconditioner factory.
-         */
-        std::shared_ptr<const LinOpFactory> GKO_FACTORY_PARAMETER_SCALAR(
-            preconditioner, nullptr);
-
-        /**
-         * Already generated preconditioner. If one is provided, the factory
-         * `preconditioner` will be ignored.
-         */
-        std::shared_ptr<const LinOp> GKO_FACTORY_PARAMETER_SCALAR(
-            generated_preconditioner, nullptr);
-    };
+    class Factory;
+
+    struct parameters_type
+        : enable_preconditioned_iterative_solver_factory_parameters<
+              parameters_type, Factory> {};
+
     GKO_ENABLE_LIN_OP_FACTORY(Fcg, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
 
diff --git a/include/ginkgo/core/solver/gcr.hpp b/include/ginkgo/core/solver/gcr.hpp
index fdc95d30c8f..8dc68e6e33d 100644
--- a/include/ginkgo/core/solver/gcr.hpp
+++ b/include/ginkgo/core/solver/gcr.hpp
@@ -108,30 +108,12 @@ class Gcr
      */
     void set_krylov_dim(size_type other) { parameters_.krylov_dim = other; }
 
-    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
-    {
-        /**
-         * Criterion factories.
-         */
-        std::vector<std::shared_ptr<const stop::CriterionFactory>>
-            GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr);
-
-        /**
-         * Preconditioner factory.
-         */
-        std::shared_ptr<const LinOpFactory> GKO_FACTORY_PARAMETER_SCALAR(
-            preconditioner, nullptr);
-
-        /**
-         * Already generated preconditioner. If one is provided, the factory
-         * `preconditioner` will be ignored.
-         */
-        std::shared_ptr<const LinOp> GKO_FACTORY_PARAMETER_SCALAR(
-            generated_preconditioner, nullptr);
-
-        /**
-         * Krylov dimension factory.
-         */
+    class Factory;
+
+    struct parameters_type
+        : enable_preconditioned_iterative_solver_factory_parameters<
+              parameters_type, Factory> {
+        /** Krylov subspace dimension/restart value. */
         size_type GKO_FACTORY_PARAMETER_SCALAR(krylov_dim, 0u);
     };
     GKO_ENABLE_LIN_OP_FACTORY(Gcr, parameters, Factory);
diff --git a/include/ginkgo/core/solver/gmres.hpp b/include/ginkgo/core/solver/gmres.hpp
index d7d0f57a8a4..0ea056c9333 100644
--- a/include/ginkgo/core/solver/gmres.hpp
+++ b/include/ginkgo/core/solver/gmres.hpp
@@ -109,35 +109,16 @@ class Gmres
      */
     void set_krylov_dim(size_type other) { parameters_.krylov_dim = other; }
 
-    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
-    {
-        /**
-         * Criterion factories.
-         */
-        std::vector<std::shared_ptr<const stop::CriterionFactory>>
-            GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr);
-
-        /**
-         * Preconditioner factory.
-         */
-        std::shared_ptr<const LinOpFactory> GKO_FACTORY_PARAMETER_SCALAR(
-            preconditioner, nullptr);
-
-        /**
-         * Already generated preconditioner. If one is provided, the factory
-         * `preconditioner` will be ignored.
-         */
-        std::shared_ptr<const LinOp> GKO_FACTORY_PARAMETER_SCALAR(
-            generated_preconditioner, nullptr);
-
-        /**
-         * Krylov dimension factory.
-         */
+
+    class Factory;
+
+    struct parameters_type
+        : enable_preconditioned_iterative_solver_factory_parameters<
+              parameters_type, Factory> {
+        /** Krylov subspace dimension/restart value. */
         size_type GKO_FACTORY_PARAMETER_SCALAR(krylov_dim, 0u);
 
-        /**
-         * Flexible GMRES
-         */
+        /** Flexible GMRES */
         bool GKO_FACTORY_PARAMETER_SCALAR(flexible, false);
     };
     GKO_ENABLE_LIN_OP_FACTORY(Gmres, parameters, Factory);
diff --git a/include/ginkgo/core/solver/idr.hpp b/include/ginkgo/core/solver/idr.hpp
index 7ad152f6808..9c124cc184d 100644
--- a/include/ginkgo/core/solver/idr.hpp
+++ b/include/ginkgo/core/solver/idr.hpp
@@ -162,33 +162,29 @@ class Idr
      * Sets the complex_subspace parameter of the solver.
      *
      * @param other  the new complex_subspace parameter
+     * @deprecated Please use set_complex_subspace instead
      */
+    GKO_DEPRECATED("Use set_complex_subspace instead")
     void set_complex_subpsace(const bool other)
     {
-        parameters_.complex_subspace = other;
+        this->set_complex_subspace(other);
     }
 
-    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
+    /**
+     * Sets the complex_subspace parameter of the solver.
+     *
+     * @param other  the new complex_subspace parameter
+     */
+    void set_complex_subspace(const bool other)
     {
-        /**
-         * Criterion factories.
-         */
-        std::vector<std::shared_ptr<const stop::CriterionFactory>>
-            GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr);
-
-        /**
-         * Preconditioner factory.
-         */
-        std::shared_ptr<const LinOpFactory> GKO_FACTORY_PARAMETER_SCALAR(
-            preconditioner, nullptr);
+        parameters_.complex_subspace = other;
+    }
 
-        /**
-         * Already generated preconditioner. If one is provided, the factory
-         * `preconditioner` will be ignored.
-         */
-        std::shared_ptr<const LinOp> GKO_FACTORY_PARAMETER_SCALAR(
-            generated_preconditioner, nullptr);
+    class Factory;
 
+    struct parameters_type
+        : enable_preconditioned_iterative_solver_factory_parameters<
+              parameters_type, Factory> {
         /**
          * Dimension of the subspace S. Determines how many intermediate
          * residuals are computed in each iteration.
diff --git a/include/ginkgo/core/solver/ir.hpp b/include/ginkgo/core/solver/ir.hpp
index 251924b70ff..9f2e92eb5a6 100644
--- a/include/ginkgo/core/solver/ir.hpp
+++ b/include/ginkgo/core/solver/ir.hpp
@@ -177,19 +177,15 @@ class Ir : public EnableLinOp<Ir<ValueType>>,
      */
     Ir(Ir&&);
 
-    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
-    {
-        /**
-         * Criterion factories.
-         */
-        std::vector<std::shared_ptr<const stop::CriterionFactory>>
-            GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr);
+    class Factory;
 
+    struct parameters_type
+        : enable_iterative_solver_factory_parameters<parameters_type, Factory> {
         /**
          * Inner solver factory.
          */
-        std::shared_ptr<const LinOpFactory> GKO_FACTORY_PARAMETER_SCALAR(
-            solver, nullptr);
+        std::shared_ptr<const LinOpFactory> GKO_DEFERRED_FACTORY_PARAMETER(
+            solver);
 
         /**
          * Already generated solver. If one is provided, the factory `solver`
@@ -306,7 +302,7 @@ struct workspace_traits<Ir<ValueType>> {
  * limited stop criterion(iterations and relacation_factor).
  *
  * @param factory  the shared pointer of factory
- * @param iteration  the maximum number of iteraion, which default is 1
+ * @param iteration  the maximum number of iteration, which default is 1
  * @param relaxation_factor  the relaxation factor for Richardson
  *
  * @return the pointer of Ir(Richardson)
@@ -319,8 +315,7 @@ auto build_smoother(std::shared_ptr<const LinOpFactory> factory,
     return Ir<ValueType>::build()
         .with_solver(factory)
         .with_relaxation_factor(relaxation_factor)
-        .with_criteria(
-            gko::stop::Iteration::build().with_max_iters(iteration).on(exec))
+        .with_criteria(gko::stop::Iteration::build().with_max_iters(iteration))
         .on(exec);
 }
 
@@ -329,7 +324,7 @@ auto build_smoother(std::shared_ptr<const LinOpFactory> factory,
  * limited stop criterion(iterations and relacation_factor).
  *
  * @param solver  the shared pointer of solver
- * @param iteration  the maximum number of iteraion, which default is 1
+ * @param iteration  the maximum number of iteration, which default is 1
  * @param relaxation_factor  the relaxation factor for Richardson
  *
  * @return the pointer of Ir(Richardson)
@@ -344,8 +339,7 @@ auto build_smoother(std::shared_ptr<const LinOp> solver,
     return Ir<ValueType>::build()
         .with_generated_solver(solver)
         .with_relaxation_factor(relaxation_factor)
-        .with_criteria(
-            gko::stop::Iteration::build().with_max_iters(iteration).on(exec))
+        .with_criteria(gko::stop::Iteration::build().with_max_iters(iteration))
         .on(exec);
 }
 
diff --git a/include/ginkgo/core/solver/multigrid.hpp b/include/ginkgo/core/solver/multigrid.hpp
index 2d04a889445..a62a35ca0df 100644
--- a/include/ginkgo/core/solver/multigrid.hpp
+++ b/include/ginkgo/core/solver/multigrid.hpp
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <utility>
 
 
+#include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
@@ -215,19 +216,17 @@ class Multigrid : public EnableLinOp<Multigrid>,
      */
     void set_cycle(multigrid::cycle cycle) { parameters_.cycle = cycle; }
 
-    GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory)
-    {
-        /**
-         * Criterion factories.
-         */
-        std::vector<std::shared_ptr<const stop::CriterionFactory>>
-            GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr);
 
+    class Factory;
+
+    struct parameters_type
+        : public enable_iterative_solver_factory_parameters<parameters_type,
+                                                            Factory> {
         /**
          * MultigridLevel Factory list
          */
-        std::vector<std::shared_ptr<const gko::LinOpFactory>>
-            GKO_FACTORY_PARAMETER_VECTOR(mg_level, nullptr);
+        std::vector<std::shared_ptr<const LinOpFactory>>
+            GKO_DEFERRED_FACTORY_VECTOR_PARAMETER(mg_level);
 
         /**
          * Custom selector size_type (size_type level, const LinOp* fine_matrix)
@@ -272,17 +271,16 @@ class Multigrid : public EnableLinOp<Multigrid>,
          * If any element in the vector is a `nullptr` then the smoother
          * application at the corresponding level is skipped.
          */
-        using smoother_list = std::vector<std::shared_ptr<const LinOpFactory>>;
-        smoother_list GKO_FACTORY_PARAMETER_VECTOR(pre_smoother,
-                                                   smoother_list{});
+        std::vector<std::shared_ptr<const LinOpFactory>>
+            GKO_DEFERRED_FACTORY_VECTOR_PARAMETER(pre_smoother);
 
         /**
          * Post-smooth Factory list.
          * It is similar to Pre-smooth Factory list. It is ignored if
          * the factory parameter post_uses_pre is set to true.
          */
-        smoother_list GKO_FACTORY_PARAMETER_VECTOR(post_smoother,
-                                                   smoother_list{});
+        std::vector<std::shared_ptr<const LinOpFactory>>
+            GKO_DEFERRED_FACTORY_VECTOR_PARAMETER(post_smoother);
 
         /**
          * Mid-smooth Factory list. If it contains available elements, multigrid
@@ -291,8 +289,8 @@ class Multigrid : public EnableLinOp<Multigrid>,
          * Pre-smooth Factory list. It is ignored if the factory parameter
          * mid_case is not mid.
          */
-        smoother_list GKO_FACTORY_PARAMETER_VECTOR(mid_smoother,
-                                                   smoother_list{});
+        std::vector<std::shared_ptr<const LinOpFactory>>
+            GKO_DEFERRED_FACTORY_VECTOR_PARAMETER(mid_smoother);
 
         /**
          * Whether post-smoothing-related calls use corresponding
@@ -333,7 +331,7 @@ class Multigrid : public EnableLinOp<Multigrid>,
          * coarsest level.
          */
         std::vector<std::shared_ptr<const LinOpFactory>>
-            GKO_FACTORY_PARAMETER_VECTOR(coarsest_solver, nullptr);
+            GKO_DEFERRED_FACTORY_VECTOR_PARAMETER(coarsest_solver);
 
         /**
          * Custom coarsest_solver selector
diff --git a/include/ginkgo/core/solver/solver_base.hpp b/include/ginkgo/core/solver/solver_base.hpp
index 6687a6df82e..070cc4e6b4a 100644
--- a/include/ginkgo/core/solver/solver_base.hpp
+++ b/include/ginkgo/core/solver/solver_base.hpp
@@ -35,11 +35,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <memory>
+#include <type_traits>
 #include <utility>
 
 
 #include <ginkgo/core/base/lin_op.hpp>
 #include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/log/logger.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/identity.hpp>
 #include <ginkgo/core/solver/workspace.hpp>
@@ -47,14 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/stop/criterion.hpp>
 
 
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 5211, 4973, 4974)
-#endif
+GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
 
 
 namespace gko {
@@ -244,7 +239,7 @@ class EnableApplyWithInitialGuess : public ApplyWithInitialGuess {
             self(), alpha, b, beta, x);
     }
 
-    // TODO: should we provide the defaule implementation?
+    // TODO: should we provide the default implementation?
     /**
      * The class should override this method and must modify the input vectors
      * according to the initial_guess_mode
@@ -535,10 +530,9 @@ class SolverBaseLinOp {
 template <typename MatrixType>
 class
     // clang-format off
-    [[deprecated("This class will be replaced by the template-less detail::SolverBaseLinOp in a future release")]] SolverBase
+    GKO_DEPRECATED("This class will be replaced by the template-less detail::SolverBaseLinOp in a future release") SolverBase
     // clang-format on
-    : public detail::SolverBaseLinOp
-{
+    : public detail::SolverBaseLinOp {
 public:
     using detail::SolverBaseLinOp::SolverBaseLinOp;
 
@@ -859,14 +853,41 @@ class EnablePreconditionedIterativeSolver
 };
 
 
+template <typename Parameters, typename Factory>
+struct enable_iterative_solver_factory_parameters
+    : enable_parameters_type<Parameters, Factory> {
+    /**
+     * Stopping criteria to be used by the solver.
+     */
+    std::vector<std::shared_ptr<const stop::CriterionFactory>>
+        GKO_DEFERRED_FACTORY_VECTOR_PARAMETER(criteria);
+};
+
+
+template <typename Parameters, typename Factory>
+struct enable_preconditioned_iterative_solver_factory_parameters
+    : enable_iterative_solver_factory_parameters<Parameters, Factory> {
+    /**
+     * The preconditioner to be used by the iterative solver. By default, no
+     * preconditioner is used.
+     */
+    std::shared_ptr<const LinOpFactory> GKO_DEFERRED_FACTORY_PARAMETER(
+        preconditioner);
+
+    /**
+     * Already generated preconditioner. If one is provided, the factory
+     * `preconditioner` will be ignored.
+     */
+    std::shared_ptr<const LinOp> GKO_FACTORY_PARAMETER_SCALAR(
+        generated_preconditioner, nullptr);
+};
+
+
 }  // namespace solver
 }  // namespace gko
 
 
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#endif
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
+GKO_END_DISABLE_DEPRECATION_WARNINGS
+
+
 #endif  // GKO_PUBLIC_CORE_SOLVER_SOLVER_BASE_HPP_
diff --git a/include/ginkgo/core/stop/batch_stop_enum.hpp b/include/ginkgo/core/stop/batch_stop_enum.hpp
new file mode 100644
index 00000000000..3c463b8730c
--- /dev/null
+++ b/include/ginkgo/core/stop/batch_stop_enum.hpp
@@ -0,0 +1,63 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_STOP_BATCH_STOP_ENUM_HPP_
+#define GKO_PUBLIC_CORE_STOP_BATCH_STOP_ENUM_HPP_
+
+
+namespace gko {
+namespace batch {
+namespace stop {
+
+
+/**
+ * This enum provides two types of options for the convergence of an iterative
+ * solver.
+ *
+ * `absolute` tolerance implies that the convergence criteria check is
+ * against the computed residual ($||r|| \leq \tau$)
+ *
+ * With the `relative` tolerance type, the solver
+ * convergence criteria checks against the relative residual norm
+ * ($||r|| \leq ||b|| \times \tau$, where $||b||$ is the L2 norm of the rhs).
+ *
+ * @note the computed residual norm, $||r||$ may be implicit or explicit
+ * depending on the solver algorithm.
+ */
+enum class tolerance_type { absolute, relative };
+
+
+}  // namespace stop
+}  // namespace batch
+}  // namespace gko
+
+#endif  // GKO_PUBLIC_CORE_STOP_BATCH_STOP_ENUM_HPP_
diff --git a/include/ginkgo/core/stop/combined.hpp b/include/ginkgo/core/stop/combined.hpp
index f17d05abb14..7e113279de6 100644
--- a/include/ginkgo/core/stop/combined.hpp
+++ b/include/ginkgo/core/stop/combined.hpp
@@ -70,7 +70,7 @@ class Combined : public EnablePolymorphicObject<Combined, Criterion> {
          * too costly.
          */
         std::vector<std::shared_ptr<const CriterionFactory>>
-            GKO_FACTORY_PARAMETER_VECTOR(criteria, nullptr);
+            GKO_DEFERRED_FACTORY_VECTOR_PARAMETER(criteria);
     };
 
     class Factory
diff --git a/include/ginkgo/core/stop/criterion.hpp b/include/ginkgo/core/stop/criterion.hpp
index e094cc90206..1a52da3efae 100644
--- a/include/ginkgo/core/stop/criterion.hpp
+++ b/include/ginkgo/core/stop/criterion.hpp
@@ -259,7 +259,7 @@ using CriterionFactory = AbstractFactory<Criterion, CriterionArgs>;
  * template parameters to enable a subclass of CriterionFactory.
  *
  * @tparam ConcreteFactory  the concrete factory which is being implemented
- *                          [CRTP parmeter]
+ *                          [CRTP parameter]
  * @tparam ConcreteCriterion  the concrete Criterion type which this factory
  *                            produces, needs to have a constructor which takes
  *                            a const ConcreteFactory *, and a
diff --git a/include/ginkgo/core/stop/residual_norm.hpp b/include/ginkgo/core/stop/residual_norm.hpp
index fa42a3b1919..f13f60a213d 100644
--- a/include/ginkgo/core/stop/residual_norm.hpp
+++ b/include/ginkgo/core/stop/residual_norm.hpp
@@ -52,13 +52,13 @@ namespace stop {
  * The mode for the residual norm criterion.
  *
  * - absolute:        Check for tolerance against residual norm.
- *                    $ || r || < \tau $
+ *                    $ || r || \leq \tau $
  *
  * - initial_resnorm: Check for tolerance relative to the initial residual norm.
- *                    $ \frac{|| r ||}{|| r_0||} < \tau $
+ *                    $ || r || \leq \tau \times || r_0|| $
  *
  * - rhs_norm:        Check for tolerance relative to the rhs norm.
- *                    $ \frac{|| r ||}{|| b ||} < \tau $
+ *                    $ || r || \leq \tau \times || b || $
  *
  * @ingroup stop
  */
@@ -118,10 +118,11 @@ class ResidualNormBase
  * The ResidualNorm class is a stopping criterion which
  * stops the iteration process when the actual residual norm is below a
  * certain threshold relative to
- * 1. the norm of the right-hand side, norm(residual) / norm(right_hand_side)
- *                                                                  < threshold
- * 2. the initial residual, norm(residual) / norm(initial_residual) < threshold.
- * 3. one,  norm(residual) < threshold.
+ * 1. the norm of the right-hand side, norm(residual) $\leq$ < threshold *
+ *    norm(right_hand_side).
+ * 2. the initial residual, norm(residual) $\leq$ threshold *
+ *    norm(initial_residual).
+ * 3. one,  norm(residual) $\leq$ threshold.
  *
  * For better performance, the checks are run on the executor
  * where the algorithm is executed.
@@ -176,11 +177,11 @@ class ResidualNorm : public ResidualNormBase<ValueType> {
  * The ImplicitResidualNorm class is a stopping criterion which
  * stops the iteration process when the implicit residual norm is below a
  * certain threshold relative to
- * 1. the norm of the right-hand side, implicit_resnorm / norm(right_hand_side)
- *                                                          < threshold
- * 2. the initial residual, implicit_resnorm / norm(initial_residual) <
- *                                                          < threshold.
- * 3. one, implicit_resnorm < threshold.
+ * 1. the norm of the right-hand side, implicit_resnorm $\leq$ < threshold *
+ * norm(right_hand_side)
+ * 2. the initial residual, implicit_resnorm $\leq$ threshold *
+ * norm(initial_residual) .
+ * 3. one,  implicit_resnorm $\leq$ threshold.
  *
  * @note To use this stopping criterion there are some dependencies. The
  * constructor depends on either `b` or the `initial_residual` in order to
@@ -239,13 +240,7 @@ class ImplicitResidualNorm : public ResidualNormBase<ValueType> {
 // The following classes are deprecated, but they internally reference
 // themselves. To reduce unnecessary warnings, we disable deprecation warnings
 // for the definition of these classes.
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#elif defined(_MSC_BUILD) || defined(__INTEL_LLVM_COMPILER)
-#pragma warning(push)
-#pragma warning(disable : 4996)
-#endif
+GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
 
 
 /**
@@ -268,11 +263,10 @@ class ImplicitResidualNorm : public ResidualNormBase<ValueType> {
  * @ingroup stop
  */
 template <typename ValueType = default_precision>
-class [[deprecated(
+class GKO_DEPRECATED(
     "Please use the class ResidualNorm with the factory parameter baseline = "
-    "mode::initial_resnorm")]] ResidualNormReduction
-    : public ResidualNormBase<ValueType>
-{
+    "mode::initial_resnorm") ResidualNormReduction
+    : public ResidualNormBase<ValueType> {
 public:
     using ComplexVector = matrix::Dense<to_complex<ValueType>>;
     using NormVector = matrix::Dense<remove_complex<ValueType>>;
@@ -325,11 +319,10 @@ class [[deprecated(
  * @ingroup stop
  */
 template <typename ValueType = default_precision>
-class [[deprecated(
+class GKO_DEPRECATED(
     "Please use the class ResidualNorm with the factory parameter baseline = "
-    "mode::rhs_norm")]] RelativeResidualNorm
-    : public ResidualNormBase<ValueType>
-{
+    "mode::rhs_norm") RelativeResidualNorm
+    : public ResidualNormBase<ValueType> {
 public:
     using ComplexVector = matrix::Dense<to_complex<ValueType>>;
     using NormVector = matrix::Dense<remove_complex<ValueType>>;
@@ -380,11 +373,10 @@ class [[deprecated(
  * @ingroup stop
  */
 template <typename ValueType = default_precision>
-class [[deprecated(
+class GKO_DEPRECATED(
     "Please use the class ResidualNorm with the factory parameter baseline = "
-    "mode::absolute")]] AbsoluteResidualNorm
-    : public ResidualNormBase<ValueType>
-{
+    "mode::absolute") AbsoluteResidualNorm
+    : public ResidualNormBase<ValueType> {
 public:
     using NormVector = matrix::Dense<remove_complex<ValueType>>;
     using Vector = matrix::Dense<ValueType>;
@@ -416,11 +408,7 @@ class [[deprecated(
 };
 
 
-#if defined(__GNUC__) || defined(__clang__)
-#pragma GCC diagnostic pop
-#elif defined(_MSC_BUILD) || defined(__INTEL_LLVM_COMPILER)
-#pragma warning(pop)
-#endif
+GKO_END_DISABLE_DEPRECATION_WARNINGS
 
 
 }  // namespace stop
diff --git a/include/ginkgo/core/stop/stopping_status.hpp b/include/ginkgo/core/stop/stopping_status.hpp
index c644e1977df..ee7d7890cf4 100644
--- a/include/ginkgo/core/stop/stopping_status.hpp
+++ b/include/ginkgo/core/stop/stopping_status.hpp
@@ -96,7 +96,7 @@ class stopping_status {
     GKO_ATTRIBUTES GKO_INLINE void reset() noexcept { data_ = uint8{0}; }
 
     /**
-     * Call if a stop occured due to a hard limit (and convergence was not
+     * Call if a stop occurred due to a hard limit (and convergence was not
      * reached).
      * @param id  id of the stopping criteria.
      * @param set_finalized  Controls if the current version should count as
@@ -114,7 +114,7 @@ class stopping_status {
     }
 
     /**
-     * Call if convergence occured.
+     * Call if convergence occurred.
      * @param id  id of the stopping criteria.
      * @param set_finalized  Controls if the current version should count as
      * finalized (set to true) or not (set to false).
diff --git a/include/ginkgo/core/stop/time.hpp b/include/ginkgo/core/stop/time.hpp
index d1a752c5042..3d39b1de082 100644
--- a/include/ginkgo/core/stop/time.hpp
+++ b/include/ginkgo/core/stop/time.hpp
@@ -45,7 +45,7 @@ namespace stop {
 
 /**
  * The Time class is a stopping criterion which stops the iteration process
- * after a certain amout of time has passed.
+ * after a certain amount of time has passed.
  *
  * @ingroup stop
  */
diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp
index 93663b02290..b35069c6720 100644
--- a/include/ginkgo/ginkgo.hpp
+++ b/include/ginkgo/ginkgo.hpp
@@ -39,6 +39,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/abstract_factory.hpp>
 #include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/batch_dim.hpp>
+#include <ginkgo/core/base/batch_lin_op.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
 #include <ginkgo/core/base/combination.hpp>
 #include <ginkgo/core/base/composition.hpp>
 #include <ginkgo/core/base/dense_cache.hpp>
@@ -48,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/fwd_decls.hpp>
 #include <ginkgo/core/base/index_set.hpp>
 #include <ginkgo/core/base/intrinsics.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
@@ -55,6 +59,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/matrix_assembly_data.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
+#include <ginkgo/core/base/memory.hpp>
 #include <ginkgo/core/base/mpi.hpp>
 #include <ginkgo/core/base/mtx_io.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
@@ -65,6 +70,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/range_accessors.hpp>
 #include <ginkgo/core/base/scoped_device_id_guard.hpp>
 #include <ginkgo/core/base/std_extensions.hpp>
+#include <ginkgo/core/base/stream.hpp>
 #include <ginkgo/core/base/temporary_clone.hpp>
 #include <ginkgo/core/base/temporary_conversion.hpp>
 #include <ginkgo/core/base/timer.hpp>
@@ -77,6 +83,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/distributed/lin_op.hpp>
 #include <ginkgo/core/distributed/matrix.hpp>
 #include <ginkgo/core/distributed/partition.hpp>
+#include <ginkgo/core/distributed/partition_helpers.hpp>
 #include <ginkgo/core/distributed/polymorphic_object.hpp>
 
 #include <ginkgo/core/distributed/preconditioner/schwarz.hpp>
@@ -93,6 +100,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/factorization/par_ilu.hpp>
 #include <ginkgo/core/factorization/par_ilut.hpp>
 
+#include <ginkgo/core/log/batch_logger.hpp>
 #include <ginkgo/core/log/convergence.hpp>
 #include <ginkgo/core/log/logger.hpp>
 #include <ginkgo/core/log/papi.hpp>
@@ -101,6 +109,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/log/record.hpp>
 #include <ginkgo/core/log/stream.hpp>
 
+#include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+#include <ginkgo/core/matrix/batch_identity.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
@@ -112,6 +123,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/identity.hpp>
 #include <ginkgo/core/matrix/permutation.hpp>
 #include <ginkgo/core/matrix/row_gatherer.hpp>
+#include <ginkgo/core/matrix/scaled_permutation.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
@@ -125,11 +137,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/preconditioner/jacobi.hpp>
 
 #include <ginkgo/core/reorder/amd.hpp>
+#include <ginkgo/core/reorder/mc64.hpp>
 #include <ginkgo/core/reorder/nested_dissection.hpp>
 #include <ginkgo/core/reorder/rcm.hpp>
 #include <ginkgo/core/reorder/reordering_base.hpp>
 #include <ginkgo/core/reorder/scaled_reordered.hpp>
 
+#include <ginkgo/core/solver/batch_bicgstab.hpp>
+#include <ginkgo/core/solver/batch_solver_base.hpp>
 #include <ginkgo/core/solver/bicg.hpp>
 #include <ginkgo/core/solver/bicgstab.hpp>
 #include <ginkgo/core/solver/cb_gmres.hpp>
@@ -147,6 +162,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/solver/triangular.hpp>
 #include <ginkgo/core/solver/workspace.hpp>
 
+#include <ginkgo/core/stop/batch_stop_enum.hpp>
 #include <ginkgo/core/stop/combined.hpp>
 #include <ginkgo/core/stop/criterion.hpp>
 #include <ginkgo/core/stop/iteration.hpp>
diff --git a/matrices/CMakeLists.txt b/matrices/CMakeLists.txt
index a4a2c603d7c..391bb346ae0 100644
--- a/matrices/CMakeLists.txt
+++ b/matrices/CMakeLists.txt
@@ -36,3 +36,6 @@ configure_file("test/isai_spd_excess.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/is
 configure_file("test/isai_spd_excess_rhs.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_spd_excess_rhs.mtx")
 configure_file("test/isai_spd_inv.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_spd_inv.mtx")
 configure_file("test/isai_spd_inv_partial.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/isai_spd_inv_partial.mtx")
+configure_file("test/1138_bus_mc64_result.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/1138_bus_mc64_result.mtx")
+configure_file("test/nontrivial_mc64_example.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/nontrivial_mc64_example.mtx")
+configure_file("test/nontrivial_mc64_result.mtx" "${Ginkgo_BINARY_DIR}/matrices/test/nontrivial_mc64_result.mtx")
diff --git a/matrices/config.hpp.in b/matrices/config.hpp.in
index 8ea9cf76453..e802365fcd9 100644
--- a/matrices/config.hpp.in
+++ b/matrices/config.hpp.in
@@ -67,6 +67,12 @@ const char* location_ani4_amd_chol_mtx =
 const char* location_isai_mtxs = "@Ginkgo_BINARY_DIR@/matrices/test/";
 const char* location_1138_bus_mtx =
     "@Ginkgo_BINARY_DIR@/matrices/test/1138_bus.mtx";
+const char* location_1138_bus_mc64_result =
+    "@Ginkgo_BINARY_DIR@/matrices/test/1138_bus_mc64_result.mtx";
+const char* location_nontrivial_mc64_example =
+    "@Ginkgo_BINARY_DIR@/matrices/test/nontrivial_mc64_example.mtx";
+const char* location_nontrivial_mc64_result =
+    "@Ginkgo_BINARY_DIR@/matrices/test/nontrivial_mc64_result.mtx";
 
 
 }  // namespace matrices
diff --git a/matrices/test/1138_bus_mc64_result.mtx b/matrices/test/1138_bus_mc64_result.mtx
new file mode 100644
index 00000000000..3bfcddb0f84
--- /dev/null
+++ b/matrices/test/1138_bus_mc64_result.mtx
@@ -0,0 +1,4057 @@
+%%MatrixMarket matrix coordinate real general
+% Generated 16-Mar-2022
+1138 1138 4054
+1 1  1
+5 1 -0.6492727920766413
+563 1 -0.1799809173071795
+2 2  1
+10 2 -0.5965460180056977
+563 2 -0.1799809173071795
+3 3  1
+11 3 -0.3240868407965323
+34 3 -0.5531896796194379
+35 3 -0.001603749139075492
+104 3 -0.2011814076823313
+475 3 -0.2728043514189628
+4 4  1
+7 4 -0.6560933806303378
+27 4 -0.05211953253771535
+101 4 -0.8358937496719409
+102 4 -0.01586529796895537
+103 4 -0.0004063658723966864
+1 5 -0.006114226606155904
+5 5  0.9999999999999999
+9 5 -0.6789259554494135
+6 6  1
+7 6 -0.2077280495519381
+37 6 -0.7072785174509324
+98 6 -0.1903083178071727
+103 6 -0.003788508991866723
+4 7 -0.5046605693847879
+6 7 -0.09382309841408967
+7 7  1
+37 7 -0.05942382962630921
+101 7 -0.02266501810886314
+102 7 -0.0228091649658566
+103 7 -5.576657673807354e-05
+8 8  1
+26 8 -0.8584300085831883
+35 8 -4.966777796632164e-05
+724 8 -0.1658543304651265
+5 9 -0.3507274959407548
+9 9  1
+10 9 -0.1586178957762868
+104 9 -0.05775643268922086
+2 10 -0.3727836251651863
+9 10 -0.1262302857737826
+10 10  0.9999999999999999
+104 10 -0.05775643268922086
+3 11 -0.1265619981302794
+11 11  0.9999999999999999
+12 11 -1
+38 11 -0.09892352745781073
+566 11 -0.08086159583567698
+11 12 -0.04556405099125133
+12 12  1
+13 13  1
+34 13 -0.03288526117299265
+104 13 -0.2185949619329241
+14 14  1
+413 14 -0.5229655372607853
+15 15  1
+16 15 -1
+17 15 -1
+18 15 -0.9999999999999999
+19 15 -0.9999999999999999
+411 15 -0.1341190427727194
+15 16 -0.01807852749599825
+16 16  1
+15 17 -0.0179682877208037
+17 17  1
+15 18 -0.01791367762954721
+18 18  0.9999999999999999
+15 19 -0.01791367762954721
+19 19  0.9999999999999999
+20 20  1
+21 20 -0.9963844951248662
+37 20 -0.2271550870477122
+102 20 -0.2888303918407034
+20 21 -0.9944597417179796
+21 21  1
+22 21 -1
+23 21 -1
+24 21 -1
+25 21 -1
+21 22 -0.0009076899792189114
+22 22  1
+21 23 -0.0009043944382427825
+23 23  1
+21 24 -0.0009034572142879311
+24 24  1
+21 25 -0.0009001921875800999
+25 25  1
+8 26 -0.1240168180982937
+26 26  1
+35 26 -6.177197728157473e-05
+4 27 -0.006931543040297045
+27 27  1
+28 27 -1
+29 27 -1
+30 27 -1
+101 27 -0.07713031382863941
+27 28 -0.2601646039655509
+28 28  1
+27 29 -0.2673350085723925
+29 29  1
+27 30 -0.1304693804136451
+30 30  1
+31 31  0.9999999999999999
+32 31 -0.3704915964445265
+100 31 -0.0260860216426223
+31 32 -0.3704915964445265
+32 32  0.9999999999999999
+100 32 -0.0260860216426223
+33 33  1
+100 33 -0.01615606831422106
+3 34 -0.4475011592382526
+13 34 -0.2592764642122821
+34 34  1
+104 34 -0.4187353558373992
+553 34 -0.1937821077006709
+3 35 -0.2307967227601994
+8 35 -0.01644486328850534
+26 35 -0.1415700371694145
+35 35  1
+104 35 -0.04597532653564613
+710 35 -0.9982709946372883
+36 36  1
+711 36 -0.9975908181741096
+6 37 -0.4863371018105125
+7 37 -0.09046757402222515
+20 37 -0.003175736609838774
+37 37  1
+102 37 -0.01048944640359718
+11 38 -0.2728772405046156
+38 38  1
+39 38 -0.9547163927096949
+98 38 -0.8096916821928273
+412 38 -0.05015428838455092
+38 39 -0.2090108403342631
+39 39  1
+99 39 -0.2372121791869226
+40 40  1
+41 40 -0.1280029924124869
+43 40 -0.01254773538947213
+45 40 -0.5398229350705784
+40 41 -0.5147013318641015
+41 41  1
+42 41 -0.9999999999999999
+44 41 -0.07692309751479924
+41 42 -0.02931082794831556
+42 42  0.9999999999999999
+40 43 -0.2246486143611993
+43 43  1
+146 43 -0.5964911799322897
+41 44 -0.8426861897514321
+44 44  1
+486 44 -0.9899715878154298
+40 45 -0.2606500131133009
+45 45  1
+49 45 -0.1892434736834814
+46 46  1
+48 46 -0.4954576443169026
+47 47  1
+48 47 -0.005198926244193236
+46 48 -1
+47 48 -1
+48 48  1
+54 48 -0.04006438898404348
+506 48 -0.9847327041547841
+45 49 -0.4601768965046814
+49 49  1
+50 49 -0.1280029924124869
+53 49 -0.06779335198707773
+49 50 -0.438374187024249
+50 50  1
+51 50 -0.9999999999999999
+52 50 -0.9531250053613283
+50 51 -0.02931082794831556
+51 51  0.9999999999999999
+50 52 -0.8426861897514321
+52 52  0.9999999999999999
+125 52 -0.112813716587248
+49 53 -0.3723823739238244
+53 53  1
+54 53 -0.729744543538038
+55 53 -1
+48 54 -0.003885942182074739
+53 54 -0.9006833121282544
+54 54  1
+425 54 -0.131399813155743
+445 54 -0.6184459469197153
+447 54 -0.1174874450546378
+53 55 -0.03152390331492758
+55 55  1
+56 56  1
+63 56 -0.002915193935309169
+57 57  1
+66 57 -0.2542059170280336
+58 58  1
+70 58 -0.06939652171522302
+59 59  1
+67 59 -0.1453397137875275
+60 60  1
+68 60 -0.3600731023058959
+61 61  1
+130 61 -0.3502350217089675
+62 62  1
+129 62 -0.276184688174664
+56 63 -1
+63 63  1
+64 63 -0.1891643739723574
+65 63 -0.9984105304355466
+63 64 -0.00300326655095561
+64 64  1
+71 64 -0.4245283028640084
+131 64 -0.01790155758236807
+226 64 -0.4377343502707456
+63 65 -0.9940812402952818
+65 65  1
+1095 65 -0.1167582363193619
+57 66 -1
+66 66  1
+67 66 -0.5799478038864539
+68 66 -0.083900485735579
+74 66 -0.08483516860378112
+183 66 -0.3694771125075278
+226 66 -0.1725924051061115
+59 67 -1
+66 67 -0.2196535628886999
+67 67  1
+68 67 -0.106033774730965
+69 67 -0.08028172632075595
+60 68 -1
+66 68 -0.05491337941001665
+67 68 -0.183235059922787
+68 68  1
+74 68 -0.1236471895212916
+90 68 -0.4987844472802113
+201 68 -0.2909768179804689
+67 69 -0.09147724319936251
+69 69  0.9999999999999999
+70 69 -0.7779716612780849
+58 70 -1
+69 70 -0.919718483375054
+70 70  0.9999999999999999
+182 70 -0.08057177429241594
+196 70 -1
+1063 70 -0.3087247699192932
+1065 70 -0.2920428780860505
+1074 70 -0.01012736249796797
+64 71 -0.2566123516218751
+71 71  1
+72 71 -0.6542056206830282
+71 72 -0.5754716971359916
+72 72  1
+73 72 -0.9999999999999999
+121 72 -0.08744215700388705
+72 73 -0.06542055029260259
+73 73  0.9999999999999999
+66 74 -0.07387576942472925
+68 74 -0.1645116442418972
+74 74  1
+75 74 -0.05817680201285781
+76 74 -0.1726339178191859
+78 74 -0.130913693988655
+80 74 -0.02407728877274043
+119 74 -0.8263417928469541
+74 75 -0.1047605315009727
+75 75  1
+204 75 -1
+810 75 -0.116290886260006
+918 75 -0.01293657229246839
+74 76 -0.06610842125172511
+76 76  1
+77 76 -0.005970149283403877
+78 76 -0.1384384969733886
+127 76 -0.6542056206830282
+76 77 -0.2037391490184505
+77 77  1
+1050 77 -0.9382000144858081
+74 78 -0.0893397910743087
+76 78 -0.246709612638192
+78 78  1
+79 78 -0.1610659872216043
+452 78 -0.4255502636251993
+78 79 -0.3460962043628888
+79 79  1
+416 79 -0.1327936542221724
+417 79 -0.1327936542221724
+454 79 -0.9129721382228279
+74 80 -0.04110857713539848
+80 80  1
+81 80 -0.1597918618507438
+447 80 -0.380435616258874
+452 80 -0.4130341017047693
+80 81 -0.1021264623004273
+81 81  1
+448 81 -0.8894740381274725
+450 81 -0.07986076148735938
+464 81 -0.217002790610145
+82 82  1
+94 82 -0.01072028153451134
+83 83  1
+94 83 -0.03715132479984279
+84 84  1
+95 84 -0.2295183725025479
+85 85  1
+87 85 -0.03241182583534607
+126 85 -0.5576554300505696
+86 86  1
+87 86 -0.9399429116273194
+241 86 -0.01610503466391137
+251 86 -0.3412323027937502
+253 86 -0.3767441238224044
+267 86 -0.1674639809318766
+269 86 -0.5714286302040831
+283 86 -0.6470587687197322
+291 86 -0.05190200332526986
+293 86 -0.2438398111509525
+294 86 -0.2438398111509525
+302 86 -0.8625592571177644
+315 86 -0.1480342823712487
+85 87 -0.3926701367972388
+86 87 -0.7344844565461368
+87 87  1
+88 87 -1
+87 88 -0.02764537533048392
+88 88  1
+89 89  1
+105 89 -0.09105905051266536
+115 89 -0.3525008070064463
+68 90 -0.111777566842221
+90 90  1
+91 90 -0.05037407604921124
+90 91 -0.5012155527197887
+91 91  1
+92 91 -0.6321285211920449
+108 91 -0.06697729906335466
+115 91 -0.3156281444720873
+116 91 -0.8189040706406221
+91 92 -0.2537012290405794
+92 92  1
+93 92 -0.1555708087540219
+92 93 -0.3678714788079553
+93 93  1
+94 93 -0.02349809905594921
+252 93 -0.4565232400999932
+264 93 -0.03303164861788143
+271 93 -1
+288 93 -0.5127387105960322
+292 93 -0.03384616202130215
+293 93 -0.1634940914263915
+294 93 -0.1634940914263915
+322 93 -0.2987569115956855
+323 93 -0.2363949685875952
+324 93 -0.4632257298681418
+325 93 -0.1217920716761051
+326 93 -0.1930758910117183
+82 94 -1
+83 94 -1
+93 94 -0.01781344165790433
+94 94  1
+96 94 -0.0004789799359466821
+264 94 -0.6240474348763411
+318 94 -0.6516684822611513
+325 94 -0.6427657848026371
+84 95 -1
+95 95  1
+97 95 -0.6835343784212797
+262 95 -0.05282055763401513
+268 95 -0.04503491331724746
+300 95 -0.2769582090771682
+94 96 -0.01034458327910479
+96 96  1
+145 96 -0.0007225227274768234
+704 96 -0.520062458688176
+705 96 -1
+95 97 -0.1081988151288643
+97 97  1
+268 97 -0.02418445958598323
+6 98 -0.09313961968281793
+38 98 -0.6173561895170943
+98 98  0.9999999999999999
+39 99 -0.04528355247048562
+99 99  1
+413 99 -0.09651670409910426
+31 100 -0.6295078704477037
+32 100 -0.6295078704477037
+33 100 -1
+100 100  1
+725 100 -0.9402255715577669
+732 100 -1
+4 101 -0.4178502489611676
+7 101 -0.01472963348864429
+27 101 -0.2899113868247994
+101 101  1
+102 101 -0.01603039966361481
+103 101 -0.0001440539928890901
+4 102 -0.01087919632728707
+7 102 -0.02033406699533006
+20 102 -0.002364544296135724
+37 102 -0.006142357991737697
+101 102 -0.02198990452750739
+102 102  1
+103 102 -0.003016233798460693
+4 103 -0.05967834899344121
+6 103 -0.3266997521159166
+7 103 -0.01064730857732617
+101 103 -0.04232093804494264
+102 103 -0.6459754203273675
+103 103  1
+478 103 -0.9807835087137711
+3 104 -0.06994587923121962
+9 104 -0.1948438981602913
+10 104 -0.2448360862180153
+13 104 -0.740723535787718
+34 104 -0.1799673938849595
+35 104 -0.0001110723376221515
+104 104  1
+89 105 -0.5863707737231639
+105 105  1
+106 105 -1
+109 105 -0.03571428984693892
+105 106 -0.02870338989898798
+106 106  1
+107 107  1
+108 107 -0.03187343937875503
+91 108 -0.1047468281823594
+107 108 -1
+108 108  1
+110 108 -0.08283446881443744
+105 109 -0.8802377021868432
+109 109  1
+1029 109 -0.9467159842580067
+108 110 -0.9011490732177322
+110 110  1
+111 110 -0.004160861891527749
+328 110 -1
+367 110 -0.03504377681956558
+370 110 -0.325748648239424
+110 111 -0.003812882068814073
+111 111  1
+367 111 -0.02368132661919568
+368 111 -0.06986898994029898
+701 111 -1
+112 112  1
+113 112 -0.000739056659585339
+370 112 -0.2147776966477931
+686 112 -0.8697204492488572
+112 113 -0.02459076972632483
+113 113  1
+114 113 -0.0007619042670442453
+115 113 -0.3318709180961821
+690 113 -0.9824224966892362
+113 114 -0.0007594107458898919
+114 114  1
+1009 114 -0.9907189449239523
+89 115 -0.4136292262768361
+91 115 -0.03751599861097331
+113 115 -0.002534267749880733
+115 115  1
+91 116 -0.5536618526143466
+116 116  1
+117 116 -1
+126 116 -0.1422590186464617
+116 117 -0.02511424529318564
+117 117  1
+118 118  1
+119 118 -0.02418339277197503
+74 119 -0.4902001130676245
+118 119 -1
+119 119  1
+123 119 -0.3049226741952487
+120 120  1
+121 120 -0.4648825071027528
+72 121 -0.2803737701458679
+120 121 -1
+121 121  1
+122 121 -0.4642021340696652
+504 121 -0.2603369044751871
+121 122 -0.2602993953478807
+122 122  1
+123 122 -0.3617099232549205
+124 122 -0.4224685870527805
+119 123 -0.1494748187606835
+122 123 -0.2651595312696484
+123 123  1
+125 123 -0.1027109792650798
+122 124 -0.2706380726830699
+124 124  1
+125 124 -0.08995603185041415
+127 124 -0.2803737701458679
+52 125 -0.04687500607617215
+123 125 -0.3333669558380759
+124 125 -0.3341091105981521
+125 125  0.9999999999999999
+126 125 -0.1701968830132461
+129 125 -0.3858750391740373
+546 125 -0.5260663232856418
+85 126 -0.6073298632027613
+116 126 -0.1559817069955036
+125 126 -0.2240299852154929
+126 126  1
+129 126 -0.3379402726512989
+76 127 -0.3769174562144449
+124 127 -0.2434223023490674
+127 127  1
+128 127 -0.9999999999999999
+127 128 -0.06542055029260259
+128 128  0.9999999999999999
+62 129 -1
+125 129 -0.1952237040984547
+126 129 -0.1298886682897226
+129 129  1
+61 130 -1
+130 130  0.9999999999999999
+133 130 -0.1490958824850217
+173 130 -0.2512651889842554
+213 130 -0.1527244967847158
+64 131 -0.1005030361241799
+131 131  1
+132 131 -0.744358699698249
+133 131 -0.8372307665851201
+136 131 -0.3623902402130909
+131 132 -0.08385468735710638
+132 132  1
+144 132 -0.0004401727678495663
+742 132 -0.2204103966717673
+130 133 -0.3838191980346491
+131 133 -0.8578979887268778
+133 133  1
+134 133 -1
+133 134 -0.0136733618138577
+134 134  1
+135 135  1
+136 135 -0.2548965474408207
+139 135 -0.05272090100578946
+740 135 -0.9882272487852217
+131 136 -0.0403458555550351
+135 136 -0.002536870188899712
+136 136  1
+141 136 -0.4039087414105968
+137 137  0.9999999999999999
+139 137 -0.8045668283673773
+141 137 -0.2896499621302946
+761 137 -0.3010446739903038
+138 138  0.9999999999999999
+141 138 -0.3064412964591086
+877 138 -0.4155496676241109
+882 138 -0.6158803906472619
+135 139 -0.0004734045915381463
+137 139 -0.6707682979885402
+139 139  1
+140 139 -0.2823079572220138
+139 140 -0.1427123594510089
+140 140  1
+829 140 -0.4880628508730103
+830 140 -0.3928506707862064
+136 141 -0.3827132123460883
+137 141 -0.2536055391507993
+138 141 -0.1330232953033071
+141 141  1
+142 142  1
+258 142 -0.08230603486136646
+366 142 -0.3618845534505992
+143 143  1
+144 143 -0.001247889818317225
+743 143 -0.0320912765102787
+745 143 -0.9995472051160823
+826 143 -1
+132 144 -0.04365066369403546
+143 144 -0.0006245443948639468
+144 144  0.9999999999999999
+827 144 -1
+96 145 -0.0007224585698749863
+145 145  0.9999999999999999
+318 145 -0.08697806565582218
+703 145 -1
+43 146 -0.9874521783072058
+146 146  0.9999999999999999
+147 146 -0.2272727061983529
+146 147 -0.4035088200677102
+147 147  0.9999999999999999
+148 147 -1
+147 148 -0.7727270619835285
+148 148  1
+149 149  1
+166 149 -0.01525068634536729
+150 150  1
+173 150 -0.3565748655620015
+151 151  0.9999999999999999
+175 151 -0.2479331092379944
+152 152  1
+180 152 -0.3652302557109839
+153 153  1
+183 153 -0.08663602991517734
+154 154  1
+207 154 -0.01556766147450305
+155 155  0.9999999999999999
+198 155 -0.02414044938673427
+156 156  1
+213 156 -0.5135083040398597
+157 157  1
+219 157 -0.2547388402230167
+158 158  1
+226 158 -0.1692082047663807
+159 159  1
+182 159 -0.07691289229425455
+205 159 -0.1709794891075046
+160 160  1
+172 160 -0.1497323443529348
+233 160 -0.1866913624909005
+161 161  0.9999999999999999
+222 161 -0.05187792770755063
+743 161 -0.0947485123883092
+162 162  1
+163 162 -0.3157700891404107
+169 162 -0.4498732370194708
+209 162 -0.09119822038020085
+223 162 -0.2308438203347603
+232 162 -0.1209976281228946
+162 163 -0.5265917525104066
+163 163  1
+186 163 -0.3685517004209947
+201 163 -0.07910433501159535
+227 163 -0.4626865654239252
+164 164  1
+166 164 -0.01525068634536729
+165 165  1
+166 165 -0.1775549610956214
+167 165 -0.5
+175 165 -0.2085522283900542
+194 165 -0.3616755741328196
+149 166 -1
+164 166 -1
+165 166 -0.5149451714697921
+166 166  1
+192 166 -0.0970117824447721
+201 166 -0.08616722206620209
+218 166 -0.4007447552591316
+165 167 -0.1472546650557122
+167 167  1
+175 167 -0.3231294692206997
+168 168  1
+180 168 -0.3295644787942922
+209 168 -0.7372772140177369
+162 169 -0.1317998320682076
+169 169  1
+187 169 -0.1311741564673933
+170 170  1
+188 170 -0.4342783193140172
+194 170 -0.09503486190684758
+171 171  0.9999999999999999
+178 171 -0.4447402774953585
+217 171 -0.00747844137787042
+160 172 -0.4683341404018499
+172 172  1
+173 172 -0.2063582177999838
+222 172 -0.05344538530284641
+755 172 -0.2735978619364898
+130 173 -0.1451751343221355
+150 173 -1
+172 173 -0.6365217315862682
+173 173  1
+184 173 -0.1994128604433656
+174 174  1
+211 174 -0.1511691547654236
+216 174 -0.2557193721038249
+151 175 -0.9999999999999999
+165 175 -0.09504019739290467
+167 175 -0.5
+175 175  1
+179 175 -0.5
+176 176  1
+177 176 -0.62337678302584
+178 176 -0.2379405377457809
+207 176 -0.7574299629892081
+176 177 -0.2002941904928748
+177 177  1
+207 177 -0.118216976845217
+171 178 -0.7504892495239552
+176 178 -0.02437351504897521
+178 178  1
+214 178 -0.2288037099033966
+217 178 -0.007410808621842783
+175 179 -0.2203850067055207
+179 179  1
+203 179 -0.7146710247080633
+152 180 -1
+168 180 -0.1348497012234844
+180 180  1
+209 180 -0.1064243677495804
+181 181  1
+183 181 -0.05815844833421143
+70 182 -0.02197442038564537
+159 182 -0.490602429381578
+182 182  1
+183 182 -0.07522289781862007
+205 182 -0.5821309466619323
+221 182 -0.2713530595291866
+66 183 -0.3327104858956616
+153 183 -1
+181 183 -1
+182 183 -0.4868079477176213
+183 183  1
+184 183 -0.477493261245094
+201 183 -0.2035303722962272
+226 183 -0.2204647378200683
+173 184 -0.185801436688662
+183 184 -0.2304995217561386
+184 184  1
+193 184 -0.1676384474933909
+213 184 -0.0732370501941501
+185 185  1
+187 185 -0.07500200885383727
+163 186 -0.3187074948796855
+186 186  0.9999999999999999
+187 186 -0.3875104162791563
+218 186 -0.0868534158829041
+169 187 -0.5501267629805292
+185 187 -1
+186 187 -0.3301609098494387
+187 187  0.9999999999999999
+188 187 -0.5657216806859828
+206 187 -0.4602206035291696
+209 187 -0.06510001427044376
+170 188 -0.7521173590257062
+187 188 -0.2759717830437856
+188 188  1
+189 189  0.9999999999999999
+211 189 -0.1359135748244117
+216 189 -0.2027324636256603
+190 190  1
+199 190 -0.04094432996341432
+191 191  0.9999999999999999
+195 191 -0.2904941520967104
+197 191 -0.404217889580768
+198 191 -0.9253837127083051
+211 191 -0.1547214539052061
+166 192 -0.2343725509899457
+192 192  1
+199 192 -0.8270755148854977
+218 192 -0.1445724738419677
+220 192 -0.1455457958672313
+184 193 -0.215514568149774
+193 193  1
+198 193 -0.05047547978142804
+199 193 -0.1319801551510881
+208 193 -0.3139073579938568
+165 194 -0.242759881115637
+170 194 -0.2478825419204235
+194 194  1
+216 194 -0.2897421391460267
+191 195 -0.009323070927852626
+195 195  1
+211 195 -0.3102529690195143
+70 196 -0.02292884455830111
+196 196  1
+191 197 -0.01282953777422882
+197 197  1
+212 197 -0.5095949027218473
+155 198 -0.9999999999999999
+191 198 -0.9664919152459959
+193 198 -0.2453617103968713
+198 198  1
+190 199 -1
+192 199 -0.8084315607946769
+193 199 -0.4306881991198629
+199 199  1
+200 200  1
+201 200 -0.2540543780055993
+202 200 -0.7745855166056066
+68 201 -0.1737035644114691
+163 201 -0.0936094226121587
+166 201 -0.04359607765817291
+183 201 -0.08831090097023078
+200 201 -0.7301358100091028
+201 201  1
+218 201 -0.0339917365150735
+200 202 -0.2698641899908972
+202 202  0.9999999999999999
+203 202 -0.2853287939084159
+179 203 -0.5
+202 203 -0.225414340098492
+203 203  1
+75 204 -0.9308288610614192
+204 204  1
+159 205 -0.5093975706184219
+182 205 -0.2718961077307849
+205 205  0.9999999999999999
+221 205 -0.3733510001547689
+187 206 -0.0808717309623898
+206 206  1
+216 206 -0.06885254755398093
+154 207 -1
+176 207 -0.7753322828410874
+177 207 -0.3766235785326799
+207 207  1
+208 207 -0.6860928046101488
+224 207 -0.6120997068106087
+193 208 -0.1563115215358269
+207 208 -0.09577067835565496
+208 208  1
+162 209 -0.0851493262845251
+168 209 -0.865150169523072
+180 209 -0.3052053707905789
+187 209 -0.04946941148022767
+209 209  1
+210 210  1
+215 210 -0.9999999999999999
+217 210 -0.9559942548580751
+218 210 -0.271933892120588
+174 211 -0.2519083562412447
+189 211 -0.2763485495176626
+191 211 -0.0113556500204848
+195 211 -0.7095055768721543
+211 211  1
+212 211 -0.4904050972781527
+197 212 -0.5957821104192319
+211 212 -0.2479427289688081
+212 212  1
+130 213 -0.1207707019718512
+156 213 -1
+184 213 -0.1075791019747063
+213 213  1
+504 213 -0.3751913986439322
+178 214 -0.1707963143513909
+214 214  1
+217 214 -0.02911656970974873
+210 215 -0.006238858049167328
+215 215  0.9999999999999999
+174 216 -0.7480916437587553
+189 216 -0.7236514504823374
+194 216 -0.5432895259843995
+206 216 -0.5397795023215715
+216 216  1
+218 216 -0.06190365785298323
+171 217 -0.2495107504760447
+178 217 -0.1465229838162297
+210 217 -0.927510608400462
+214 217 -0.7711962900966035
+217 217  1
+166 218 -0.5139749028013086
+186 218 -0.3012874689681785
+192 218 -0.07674982880452016
+201 218 -0.08616722206620209
+210 218 -0.06625073389262212
+216 218 -0.1829536801002387
+218 218  0.9999999999999999
+157 219 -1
+219 219  1
+220 219 -0.8544540058994244
+230 219 -0.9999999999999999
+192 220 -0.01780686433544928
+219 220 -0.5709662997850107
+220 220  1
+182 221 -0.08381126170553799
+205 221 -0.2468895294191365
+221 221  1
+222 221 -0.03463990807904439
+225 221 -0.1203411291473983
+161 222 -0.4338510401982535
+172 222 -0.07289600463347098
+221 222 -0.1662744800763948
+222 222  1
+225 222 -0.8796590049126298
+233 222 -0.8133086375090995
+743 222 -0.0409641298590186
+162 223 -0.1440988704141571
+223 223  0.9999999999999999
+232 223 -0.5170361072912917
+207 224 -0.01301470898659068
+224 224  1
+1026 224 -0.43046349456815
+221 225 -0.189021512882145
+222 225 -0.2878476166912821
+225 225  1
+64 226 -0.4537203635084046
+66 226 -0.06464090797717569
+158 226 -1
+183 226 -0.09169506357364615
+226 226  1
+163 227 -0.2719131304119618
+227 227  1
+228 227 -0.4729984736306375
+227 228 -0.5373134345760748
+228 228  0.9999999999999999
+229 228 -1
+231 228 -0.408748003944203
+232 228 -0.1349282437957417
+228 229 -0.2835377610395939
+229 229  1
+219 230 -0.1742949924561643
+230 230  0.9999999999999999
+228 231 -0.1309191974690454
+231 231  1
+232 231 -0.2270379223464048
+162 232 -0.1123602872849488
+223 232 -0.7691561796652396
+228 232 -0.1125445678607232
+231 232 -0.5912516755973105
+232 232  1
+160 233 -0.53166585959815
+222 233 -0.5429204819467379
+233 233  0.9999999999999999
+234 234  0.9999999999999999
+236 234 -0.1613145392141977
+307 234 -0.1058823103391086
+235 235  1
+236 235 -0.748804717517474
+243 235 -1
+270 235 -0.8631102783055937
+272 235 -0.1305043949783015
+298 235 -0.1722019677170294
+299 235 -0.2951069778206046
+234 236 -0.4989059470960984
+235 236 -0.03191543922976967
+236 236  1
+286 236 -1
+237 237  0.9999999999999999
+261 237 -0.02151130788651816
+287 237 -1
+309 237 -0.8265861301689351
+366 237 -0.09565227983504497
+702 237 -0.5505971291504373
+238 238  1
+239 238 -0.6963025937406854
+270 238 -0.06339368056547534
+238 239 -0.2192636847968434
+239 239  1
+270 239 -0.006618942422344889
+281 239 -1
+240 240  1
+259 240 -0.3796659389180249
+86 241 -0.006832413856724998
+241 241  0.9999999999999999
+242 241 -0.5882783477841544
+246 241 -0.7890770034316641
+253 241 -0.6232557247264666
+257 241 -0.4999999460000043
+266 241 -0.5
+275 241 -0.5127701742505889
+279 241 -0.7935779392240763
+289 241 -0.005700326017321154
+291 241 -0.8823339824135301
+292 241 -0.8856410305461146
+293 241 -0.5926662254385504
+294 241 -0.5926662254385504
+299 241 -0.3049046395971649
+310 241 -0.2702981499217172
+321 241 -0.523206833127686
+327 241 -0.5254236869221548
+241 242 -0.06412189951724677
+242 242  1
+244 242 -0.4822483429344955
+245 242 -0.7553191411740608
+235 243 -0.006895598231716217
+243 243  1
+242 244 -0.1815258847276199
+244 244  1
+272 244 -0.8694955058122561
+242 245 -0.2301958627893102
+245 245  0.9999999999999999
+298 245 -0.3144909496457951
+241 246 -0.03091591534401793
+246 246  1
+310 246 -0.3838363613116763
+247 247  1
+248 247 -0.5375000696062514
+255 247 -0.3433098630232098
+247 248 -0.4637681432976252
+248 248  1
+255 248 -0.2554863662590013
+249 249  1
+282 249 -0.9200002870400321
+250 250  1
+257 250 -0.4999999460000043
+263 250 -0.5955413901937813
+289 250 -0.02259771740419979
+317 250 -0.07692309751479924
+86 251 -0.01056812225816952
+251 251  1
+267 251 -0.2453812432123873
+93 252 -0.2149332660769647
+252 252  0.9999999999999999
+310 252 -0.3458653279391815
+313 252 -1
+86 253 -0.01096245521113784
+241 253 -0.04274793301149785
+253 253  1
+254 254  1
+255 254 -0.2288732053957545
+285 254 -0.6686047109873622
+247 255 -0.5362318567023748
+248 255 -0.4624999303937486
+254 255 -0.5428570981877516
+255 255  1
+258 255 -0.1200943598705512
+260 255 -0.3766815662173065
+263 255 -0.4044586098062187
+256 256  1
+290 256 -0.9999999999999999
+317 256 -0.9230772071006792
+321 256 -0.4767933442394409
+241 257 -0.03206094629604109
+250 257 -0.08239871533148115
+257 257  1
+142 258 -0.5871884879917958
+255 258 -0.03454690348785244
+258 258  1
+261 258 -0.9784886081591408
+327 258 -0.4745761953829469
+240 259 -1
+259 259  1
+278 259 -0.46945783007923
+255 260 -0.07903536849617317
+260 260  0.9999999999999999
+289 260 -0.02259771740419979
+237 261 -0.02282970359744397
+258 261 -0.4896153731583824
+261 261  1
+95 262 -0.1263017153519744
+262 262  1
+268 262 -0.1650494208645136
+276 262 -0.05882352941176471
+314 262 -0.4957263617363166
+250 263 -0.07007135267514689
+255 263 -0.05874821643659963
+263 263  1
+93 264 -0.02070333652463496
+94 264 -0.5159559063723644
+264 264  1
+311 264 -0.6561082453021649
+265 265  1
+276 265 -0.9411763764705883
+300 265 -0.1435899342293394
+241 266 -0.04274793301149785
+266 266  1
+307 266 -0.8941175448138609
+86 267 -0.01392387744333599
+251 267 -0.6587679343626871
+267 267  1
+282 267 -0.07999998895999877
+284 267 -0.2711864537224941
+302 267 -0.1374407428822357
+95 268 -0.09328283535347889
+97 268 -0.3164657040813212
+262 268 -0.1429750129714083
+268 268  1
+270 268 -0.02325341219859649
+301 268 -0.439393852686889
+86 269 -0.0408046952947163
+269 269  1
+284 269 -0.7288134763114009
+235 270 -0.9446969671920914
+238 270 -0.7807365809480077
+239 270 -0.2588680600634418
+268 270 -0.3397252077618215
+270 270  1
+275 270 -0.487229825749411
+279 270 -0.2064220107805109
+280 270 -1
+318 270 -0.2023510083909907
+320 270 -1
+93 271 -0.1078213461325165
+271 271  1
+235 272 -0.004349433488846199
+244 272 -0.5177512350982256
+272 272  1
+273 273  0.9999999999999999
+274 273 -0.5
+303 273 -0.499999860000056
+315 273 -0.1644825491266569
+273 274 -0.2339832475480544
+274 274  1
+312 274 -0.2339832475480544
+241 275 -0.004654008389144389
+270 275 -0.005511558988624638
+275 275  1
+262 276 -0.3413528496135886
+265 276 -0.9827284475345799
+276 276  1
+277 277  1
+285 277 -0.3313953652335733
+289 277 -0.9491042828330764
+259 278 -0.6203338692747337
+278 278  1
+309 278 -0.1734138698310648
+241 279 -0.01099232414756418
+270 279 -0.003563626116163423
+279 279  1
+270 280 -0.01265557454877674
+280 280  1
+239 281 -0.04482921117026357
+281 281  1
+249 282 -1
+267 282 -0.2560499084207828
+282 282  1
+86 283 -0.02040234764735815
+283 283  1
+292 283 -0.08051282868796882
+267 284 -0.136956962273108
+269 284 -0.428571369795917
+284 284  1
+254 285 -0.4571429018122484
+277 285 -0.01709401514500712
+285 285  1
+236 286 -0.08988085409142241
+286 286  1
+237 287 -0.1536997872135341
+287 287  1
+93 288 -0.1067642635419462
+288 288  0.9999999999999999
+291 288 -0.06576402484920824
+241 289 -0.01039814502541189
+250 289 -0.1059411914419505
+260 289 -0.6233181719890193
+277 289 -0.9829058904960274
+289 289  1
+256 290 -0.005166882463238922
+290 290  0.9999999999999999
+86 291 -0.007200828322190714
+241 291 -0.2885485132017875
+288 291 -0.487261054056874
+291 291  1
+93 292 -0.05202209332507221
+241 292 -0.2885485132017875
+283 292 -0.3529411381038126
+292 292  1
+86 293 -0.008394108494518396
+93 293 -0.06258594421391314
+241 293 -0.04809142463793508
+293 293  1
+86 294 -0.008394108494518396
+93 294 -0.06258594421391314
+241 294 -0.04809142463793508
+294 294  1
+295 295  1
+298 295 -0.5133069486640399
+296 296  1
+297 296 -0.8528974928165347
+296 297 -1
+297 297  1
+299 297 -0.399988152988994
+235 298 -0.006071317063447265
+245 298 -0.244680858825939
+295 298 -1
+298 298  1
+235 299 -0.006071317063447265
+241 299 -0.004598382004544293
+297 299 -0.1471025071834653
+299 299  1
+95 300 -0.4426981441267596
+265 300 -0.01727114954671646
+300 300  1
+696 300 -1
+268 301 -0.4260059606411062
+301 301  1
+314 301 -0.5042733457851207
+86 302 -0.1013082049552435
+267 302 -0.1941477638222605
+302 302  1
+273 303 -0.5515320123481384
+303 303  1
+312 303 -0.5515320123481384
+304 304  1
+305 304 -0.5730337519050649
+315 304 -0.1233618822381362
+329 304 -0.45
+304 305 -0.1906137411613363
+305 305  0.9999999999999999
+315 305 -0.1161053009300106
+306 306  1
+322 306 -0.3519447239589286
+234 307 -0.5010940529039015
+266 307 -0.5
+307 307  1
+308 308  0.9999999999999999
+323 308 -0.5515883957284556
+237 309 -0.5644585042091543
+278 309 -0.53054216992077
+309 309  1
+241 310 -0.005819465114137099
+246 310 -0.2109227314384797
+252 310 -0.07461455592005399
+310 310  1
+264 311 -0.3429205777001247
+311 311  1
+326 311 -0.8069241089882817
+274 312 -0.5
+303 312 -0.499999860000056
+312 312  0.9999999999999999
+315 312 -0.1644825491266569
+252 313 -0.4688617529348292
+313 313  1
+262 314 -0.4628513066987083
+301 314 -0.5606058871919597
+314 314  1
+86 315 -0.03672422282730684
+273 315 -0.2144846628893347
+304 315 -0.1509025179236003
+305 315 -0.4269662480949349
+312 315 -0.2144846628893347
+315 315  1
+316 315 -0.8956228451423282
+319 315 -0.1011236045953802
+323 315 -0.2120167713746828
+315 316 -0.1910119505115473
+316 316  0.9999999999999999
+322 316 -0.3492985502721898
+250 317 -0.7415884290842688
+256 317 -0.9868745682423764
+317 317  1
+94 318 -0.2384682812552523
+145 318 -0.001473861031868857
+270 318 -0.01483007300618289
+318 318  1
+704 318 -0.479937541311824
+315 319 -0.07401714118562433
+319 319  1
+329 319 -0.5499999504999995
+270 320 -0.007063094656009557
+320 320  1
+241 321 -0.03064231940382643
+256 321 -0.007958665745583737
+321 321  1
+93 322 -0.05252390250069872
+306 322 -1
+316 322 -0.1043771270933597
+322 322  1
+93 323 -0.05691613759176354
+308 323 -0.9999999999999999
+315 323 -0.01850428529640608
+323 323  1
+93 324 -0.03926666600237935
+324 324  1
+325 324 -0.2354420589975488
+93 325 -0.02353736591673529
+94 325 -0.163861334082621
+324 325 -0.5367740774299422
+325 325  1
+93 326 -0.02695533653312913
+311 326 -0.3438913557838784
+326 326  0.9999999999999999
+241 327 -0.03091591534401793
+258 327 -0.30798392658964
+327 327  1
+110 328 -0.9111791660706036
+328 328  1
+304 329 -0.6584838133482835
+319 329 -0.8988763954046198
+329 329  1
+330 330  1
+335 330 -0.07961856004948324
+331 331  1
+336 331 -0.004036503469976048
+332 332  1
+337 332 -0.006387721320127105
+333 333  1
+350 333 -0.0921005122339543
+334 334  1
+350 334 -0.05105154464484288
+330 335 -1
+335 335  1
+337 335 -0.1067585399664951
+331 336 -1
+336 336  1
+337 336 -0.8331808584559168
+332 337 -1
+335 337 -0.9203814069088204
+336 337 -0.9959633009029084
+337 337  1
+338 337 -0.2012269867927049
+339 337 -0.2060301256276172
+340 337 -0.1039862296139679
+337 338 -0.01471824980558974
+338 338  1
+340 338 -0.2553330263644302
+337 339 -0.01516072717293947
+339 339  1
+340 339 -0.2553330263644302
+337 340 -0.02379370398206635
+338 340 -0.7987728036092778
+339 340 -0.793969666034709
+340 340  1
+341 340 -0.3883928934870693
+342 340 -0.2232543326004295
+340 341 -0.1358463020436659
+341 341  1
+342 341 -0.1914151337886629
+340 342 -0.2495012313651646
+341 342 -0.6116071065129307
+342 342  1
+343 342 -0.3747337582460827
+344 342 -0.3811732129030788
+345 342 -0.1582682305842391
+370 342 -0.4594735804023605
+342 343 -0.08506133760186088
+343 343  0.9999999999999999
+346 343 -0.09236818336109996
+342 344 -0.1605549385574835
+344 344  0.9999999999999999
+346 344 -0.1696361337947664
+342 345 -0.0631863628937859
+345 345  1
+346 345 -0.2187014002957438
+343 346 -0.6252659776415691
+344 346 -0.6188264312716342
+345 346 -0.8417318069569906
+346 346  1
+347 346 -0.2580982636355207
+349 346 -0.2476966786950227
+367 346 -0.06003346222776961
+346 347 -0.137381439155999
+347 347  1
+348 347 -0.5118576987648027
+347 348 -0.7419017363644793
+348 348  1
+349 348 -0.4619209334789083
+346 349 -0.201947792261503
+348 349 -0.4881423012351971
+349 349  1
+350 349 -0.856847731308403
+333 350 -1
+334 350 -1
+349 350 -0.2903822681885516
+350 350  0.9999999999999999
+351 351  1
+366 351 -0.1761384366913647
+352 352  1
+353 352 -0.3102438686768026
+408 352 -1
+409 352 -0.6541753911724465
+352 353 -0.3546893357009125
+353 353  1
+363 353 -0.03948596574208448
+382 353 -0.5616295349317373
+354 354  0.9999999999999999
+382 354 -0.4383704650682625
+383 354 -0.270321998160119
+355 355  1
+375 355 -0.01774115593578681
+356 356  1
+383 356 -0.06342671662889754
+398 356 -0.09754281701496809
+357 357  1
+394 357 -0.02827324335128412
+395 357 -0.0009380818989512124
+358 358  1
+479 358 -0.3946138994678621
+359 359  1
+474 359 -0.2681610793000471
+360 360  1
+715 360 -0.2430060015603755
+361 361  1
+479 361 -0.07944494922793342
+480 361 -0.9822696400885458
+362 362  1
+403 362 -1
+714 362 -0.1117773054875538
+353 363 -0.2238729351343457
+363 363  1
+364 363 -1
+365 363 -1
+373 363 -1
+374 363 -1
+388 363 -0.3718125198786338
+395 363 -0.00626805062936796
+407 363 -0.1075371685714914
+474 363 -0.08092183564139589
+363 364 -0.09465071293757095
+364 364  1
+363 365 -0.09465071293757095
+365 365  1
+142 366 -0.4128115120082042
+237 366 -0.03243884717849358
+351 366 -1
+366 366  1
+699 366 -0.5652160454017799
+110 367 -0.0009813454289330364
+111 367 -0.0007236806387622489
+346 367 -0.1799645341646411
+367 367  0.9999999999999999
+368 367 -0.9301312422203849
+372 367 -0.500000150000015
+111 368 -0.0007780416527873758
+367 368 -0.3389391133225481
+368 368  1
+369 369  1
+371 369 -0.3045684834394144
+110 370 -0.001191863352282823
+112 370 -0.02858042770526963
+342 370 -0.276527744679703
+370 370  1
+369 371 -1
+371 371  1
+372 371 -0.500000150000015
+367 372 -0.5423025162397879
+371 372 -0.6954315165605854
+372 372  1
+363 373 -0.09465071293757095
+373 373  1
+363 374 -0.09465071293757095
+374 374  1
+355 375 -1
+375 375  1
+376 375 -1
+377 375 -1
+378 375 -1
+379 375 -1
+380 375 -1
+381 375 -1
+389 375 -0.8215297736331268
+404 375 -0.373579930708927
+375 376 -0.01654813940719981
+376 376  1
+375 377 -0.01654813940719981
+377 377  1
+375 378 -0.01654813940719981
+378 378  1
+375 379 -0.01605847694438298
+379 379  1
+375 380 -0.01605847694438298
+380 380  1
+375 381 -0.01605847694438298
+381 381  1
+353 382 -0.4658833134982665
+354 382 -0.2139376306780124
+382 382  1
+354 383 -0.7860623003057075
+356 383 -0.3931236374228056
+383 383  1
+384 383 -1
+385 383 -1
+386 383 -1
+387 383 -1
+392 383 -0.6685895441854749
+383 384 -0.09663793316238328
+384 384  1
+383 385 -0.09531836681606604
+385 385  1
+383 386 -0.1958273483403571
+386 386  1
+383 387 -0.1958273483403571
+387 387  1
+363 388 -0.3617245197804639
+388 388  1
+389 388 -0.1784703040014409
+390 388 -1
+391 388 -1
+395 388 -0.006030399251820348
+401 388 -0.3234717941970725
+405 388 -0.4204597559445792
+375 389 -0.8671575255467873
+388 389 -0.2444561274895281
+389 389  1
+388 390 -0.07525718206866674
+390 390  1
+388 391 -0.07525718206866674
+391 391  1
+383 392 -0.08264022921613286
+392 392  1
+395 392 -0.004392179741254537
+393 393  1
+474 393 -0.2331070676156656
+479 393 -0.135568940489809
+357 394 -0.436057357650007
+394 394  1
+395 394 -0.02492975318342171
+357 395 -0.5639426423499929
+363 395 -0.05096217708626882
+388 395 -0.05039733873277172
+392 395 -0.3314099757672116
+394 395 -0.9717267764874861
+395 395  1
+398 395 -0.9024572420959797
+399 395 -1
+405 395 -0.5795402440554207
+406 395 -0.04473415615634896
+407 395 -0.8924627314726891
+474 395 -0.3586263690444103
+396 396  1
+406 396 -0.008160603445508431
+716 396 -0.01440107594234689
+717 396 -0.01518780397874969
+397 397  1
+716 397 -0.9638436089137943
+717 397 -0.9616916812314886
+356 398 -0.6068764361306191
+395 398 -0.09713087099944968
+398 398  0.9999999999999999
+395 399 -0.09863677341162923
+399 399  1
+400 400  1
+473 400 -0.99959116721261
+388 401 -0.1001303177040349
+401 401  1
+402 401 -1
+714 401 -0.1139920054066689
+401 402 -0.2761471656099203
+402 402  1
+362 403 -0.4912971098677563
+403 403  1
+375 404 -0.01728097778433349
+404 404  1
+409 404 -0.3458247066267777
+388 405 -0.08268926825466046
+395 405 -0.01363787723109908
+405 405  1
+395 406 -0.6028540254419255
+396 406 -0.890751778274275
+406 406  0.9999999999999999
+479 406 -0.2014764706920652
+483 406 -0.9921875155029299
+716 406 -0.009499336509416116
+717 406 -0.01001831734328415
+363 407 -0.1112998442822022
+395 407 -0.113608433562474
+407 407  1
+352 408 -0.1967652099974787
+408 408  1
+352 409 -0.448545588416744
+404 409 -0.626420069291073
+409 409  1
+410 410  1
+411 410 -0.8516560025140886
+412 410 -0.7178859939985532
+731 410 -0.3523725805191165
+15 411 -0.9281257882689085
+410 411 -0.9846890696559202
+411 411  1
+476 411 -0.6017526963944623
+477 411 -0.2988908409683733
+486 411 -0.004925231405858653
+38 412 -0.07470946936061723
+410 412 -0.007896463883081959
+412 412  1
+413 412 -0.3805177586401105
+478 412 -0.0003190680987413605
+709 412 -0.01455597090867175
+724 412 -0.0802655551807238
+14 413 -1
+99 413 -0.7627878527204898
+412 413 -0.08437359900596564
+413 413  1
+414 414  0.9999999999999999
+431 414 -0.6507934735298834
+433 414 -0.1523900201385848
+415 415  0.9999999999999999
+432 415 -0.6507934735298834
+433 415 -0.1523900201385848
+79 416 -0.08517912325790625
+416 416  1
+433 416 -0.1523900201385848
+462 416 -0.4507657586936158
+79 417 -0.08517912325790625
+417 417  1
+433 417 -0.1523900201385848
+462 417 -0.4507657586936158
+418 418  1
+419 418 -1
+545 418 -0.413164800558622
+418 419 -0.2391770137487025
+419 419  1
+420 420  1
+421 420 -1
+422 420 -1
+470 420 -0.6000000719999943
+420 421 -0.02257424779570259
+421 421  1
+420 422 -0.006733126109865712
+422 422  1
+423 423  1
+425 423 -0.5809255943077843
+516 423 -0.09337482905823084
+424 424  1
+466 424 -0.5138873189634434
+516 424 -0.09337482905823084
+54 425 -0.06081201124010476
+423 425 -0.8671329569993718
+425 425  1
+426 425 -0.13712928823864
+427 425 -1
+466 425 -0.3659501043879658
+425 426 -0.08115871916319203
+426 426  1
+437 426 -0.8291218645184498
+519 426 -0.08737927029685764
+425 427 -0.03927966726534533
+427 427  1
+428 428  1
+429 428 -1
+430 428 -1
+431 428 -0.3492062401209539
+432 428 -0.3492062401209539
+428 429 -0.08171550257485548
+429 429  1
+428 430 -0.121060013499401
+430 430  1
+414 431 -0.7124180957751635
+428 431 -0.3986121439042686
+431 431  1
+415 432 -0.7124180957751635
+428 432 -0.3986121439042686
+432 432  1
+414 433 -0.2875816534536366
+415 433 -0.2875816534536366
+416 433 -0.253404422632383
+417 433 -0.253404422632383
+433 433  1
+434 433 -1
+510 433 -0.06870259929044649
+564 433 -0.1867630698817527
+433 434 -0.05323881730496434
+434 434  1
+435 435  1
+510 435 -0.03162044064262619
+436 436  0.9999999999999999
+534 436 -0.02896297492007903
+426 437 -0.8477084324092463
+437 437  1
+438 437 -1
+439 437 -0.2825888032115383
+441 437 -0.2669272691397696
+458 437 -0.02241746129357353
+437 438 -0.08329077174854423
+438 438  1
+437 439 -0.01135076424797693
+439 439  1
+440 439 -1
+460 439 -0.08957359368376369
+439 440 -0.5071701043568294
+440 440  1
+437 441 -0.02210991030693226
+441 441  0.9999999999999999
+442 441 -1
+441 442 -0.7330727308602304
+442 442  1
+443 443  0.9999999999999999
+444 443 -0.0555555715061778
+470 443 -0.3999999280000058
+506 443 -0.01526717767729127
+516 443 -0.330813701925721
+443 444 -0.04523789924166079
+444 444  1
+458 444 -0.9443355730454898
+54 445 -0.09425862763858191
+445 445  1
+446 445 -1
+447 445 -0.05361844470471311
+445 446 -0.1566157975298198
+446 446  1
+54 447 -0.07512071976718823
+80 447 -0.724611536861429
+445 447 -0.2249381550031186
+447 447  1
+449 447 -0.03546099909863812
+537 447 -0.539325834000758
+81 448 -0.3779204994172751
+448 448  1
+464 448 -0.2285592214937007
+447 449 -0.05874373051646643
+449 449  1
+450 449 -0.9104126627476437
+81 450 -0.4177015043707345
+449 450 -0.9645391986318976
+450 450  1
+451 450 -1
+450 451 -0.009726630389756765
+451 451  1
+78 452 -0.3845513381809875
+80 452 -0.1491847272822447
+452 452  1
+453 452 -1
+452 453 -0.1614155925405549
+453 453  1
+79 454 -0.6685758194143596
+454 454  1
+455 454 -1
+454 455 -0.08702793435845524
+455 455  1
+456 456  1
+457 456 -1
+564 456 -0.7166843852531996
+456 457 -0.229906405727996
+457 457  1
+437 458 -0.0541266435763989
+444 458 -0.9444447382716963
+458 458  1
+459 458 -1
+460 458 -0.4898200084705083
+458 459 -0.01412090668302508
+459 459  1
+439 460 -0.2102408653715157
+458 460 -0.01912577945458199
+460 460  1
+461 460 -1
+460 461 -0.420606397845728
+461 461  1
+416 462 -0.6138017574189535
+417 462 -0.6138017574189535
+462 462  1
+463 462 -1
+462 463 -0.09846825948369556
+463 463  1
+81 464 -0.04458611055226133
+448 464 -0.1105263541304762
+464 464  1
+465 464 -0.2900763128134739
+545 464 -0.2899948915629452
+464 465 -0.311504030173168
+465 465  1
+968 465 -0.8426778594247277
+424 466 -0.7251463901672432
+425 466 -0.1672362061079352
+466 466  1
+467 466 -1
+466 467 -0.1201626732593874
+467 467  1
+468 468  1
+544 468 -0.9981852991261886
+469 469  1
+543 469 -0.9925459796925092
+420 470 -0.9706928474123566
+443 470 -0.68359486992077
+470 470  1
+471 471  1
+472 471 -1
+478 471 -0.00107992012499105
+481 471 -0.09361663201601354
+492 471 -0.0002235220153358096
+707 471 -0.0005052306304767164
+708 471 -0.3266172835000307
+471 472 -0.3460054965830081
+472 472  1
+400 473 -1
+473 473  1
+474 473 -0.05918361949320084
+359 474 -1
+363 474 -0.0579245379054694
+393 474 -0.5
+395 474 -0.03157356304654174
+473 474 -0.0004093329828100307
+474 474  1
+3 475 -0.1251942406400489
+475 475  1
+710 475 -0.001226376915182903
+724 475 -0.06980953516683448
+411 476 -0.004164577412549833
+476 476  1
+482 476 -0.291284452389053
+411 477 -0.005823107641653814
+477 477  1
+478 477 -0.01573028649667073
+103 478 -0.9925893280765803
+412 478 -0.02912273602944862
+471 478 -0.1544236610002906
+477 478 -0.7011088967467495
+478 478  1
+708 478 -0.2620084430920087
+709 478 -0.01357229245502209
+358 479 -1
+361 479 -0.0185528758559691
+393 479 -0.5
+406 479 -0.002263226571543491
+479 479  1
+480 479 -0.0177304974292039
+714 479 -0.08225833079854974
+726 479 -0.111260350496243
+361 480 -0.9814471162924541
+479 480 -0.07585988069551974
+480 480  1
+471 481 -0.03530816880422673
+481 481  1
+484 481 -0.2060182480051046
+485 481 -0.3476151913290253
+491 481 -0.06959016411189313
+492 481 -0.0003428948333306764
+733 481 -0.5625245168985686
+476 482 -0.3982473036055377
+482 482  1
+483 482 -0.007812500372070319
+406 483 -0.9441818569791089
+482 483 -0.7087157276247361
+483 483  1
+481 484 -0.1709676363767527
+484 484  1
+709 484 -0.2410462936113267
+733 484 -0.4374752335655681
+481 485 -0.1645665681127969
+485 485  1
+491 485 -0.1998058630326408
+44 486 -0.9230772071006792
+411 486 -0.004237094217851903
+486 486  1
+731 486 -0.2412117210433243
+487 487  1
+491 487 -0.138177046659509
+488 488  1
+491 488 -0.138177046659509
+489 489  1
+492 489 -0.0006943201111128825
+490 490  1
+492 490 -0.0007051305018412538
+481 491 -0.1075688033309249
+485 491 -0.6523848086709747
+487 491 -1
+488 491 -1
+491 491  1
+492 491 -0.00168515758261729
+734 491 -0.04417811668870859
+471 492 -0.03146328150156909
+481 492 -0.1279742141131472
+489 492 -1
+490 492 -1
+491 492 -0.4068774043093595
+492 492  1
+707 492 -0.9932834175306575
+493 493  1
+707 493 -0.001116049274803777
+494 494  1
+505 494 -0.4168904076153283
+495 495  1
+513 495 -0.2498359477364026
+496 496  1
+521 496 -0.5822657287297308
+497 497  1
+522 497 -0.3998740109359429
+498 498  1
+523 498 -0.08112378897000688
+499 499  1
+500 499 -1
+819 499 -0.07041320998812001
+914 499 -0.1324665116304762
+499 500 -0.4508108223233004
+500 500  1
+501 501  1
+502 501 -0.2747150549780659
+505 501 -0.0635409252790694
+507 501 -0.1966114184340081
+521 501 -0.0627933662930675
+531 501 -0.208452852038472
+536 501 -0.06131576799783524
+501 502 -0.1456089431347203
+502 502  1
+607 502 -0.3226529423038578
+609 502 -0.5428428081954032
+613 502 -0.4454544728292564
+937 502 -0.07988497981065862
+503 503  1
+507 503 -0.1451380721103738
+513 503 -0.2998031372836831
+553 503 -0.3388953833156444
+121 504 -0.1873760874483269
+213 504 -0.2605300072529472
+504 504  1
+505 504 -0.2441787015025683
+494 505 -1
+501 505 -0.08152969501633238
+504 505 -0.3644716458548493
+505 505  1
+521 505 -0.07972932154571288
+526 505 -0.2453931027561656
+578 505 -0.4666797005313729
+48 506 -0.4954576443169026
+443 506 -0.09538536502534155
+506 506  1
+501 507 -0.2550172885925116
+503 507 -0.353193296328452
+507 507  1
+508 507 -0.0454025610058394
+509 507 -0.05815667212600774
+515 507 -0.02977840794404767
+521 507 -0.07505769234464378
+522 507 -0.1541682341722879
+547 507 -0.3353174792622302
+507 508 -0.04069453020853694
+508 508  1
+619 508 -0.9722897428820663
+639 508 -0.5639797596028586
+781 508 -0.5734986656347684
+507 509 -0.1794014226205734
+509 509  1
+511 509 -0.4059238789079981
+515 509 -0.1200980639341044
+520 509 -0.08759754206863871
+534 509 -0.7103549994776759
+433 510 -0.3020093404231428
+435 510 -1
+510 510  1
+520 510 -0.8759754206863871
+509 511 -0.01325578438014554
+511 511  1
+914 511 -0.03744986273220905
+926 511 -0.05148352503430811
+512 512  1
+520 512 -0.03642670612621217
+495 513 -1
+503 513 -0.4571890828742012
+513 513  1
+514 513 -0.3311924195687763
+540 513 -0.7753743459087041
+513 514 -0.05062341479103843
+514 514  1
+790 514 -0.1660027056546063
+795 514 -0.05988144097700143
+507 515 -0.06929746188636214
+509 515 -0.09059944222635875
+515 515  1
+516 515 -0.4824366631147753
+517 515 -0.3756343960723746
+518 515 -1
+536 515 -0.1154615517875781
+547 515 -0.6646825207377697
+423 516 -0.1328671418537833
+424 516 -0.274853814323998
+443 516 -0.1757815643468599
+515 516 -0.6054944401809566
+516 516  1
+515 517 -0.01502777984049534
+517 517  1
+993 517 -0.4405442145405706
+995 517 -0.149188085081202
+515 518 -0.02642157609996412
+518 518  1
+426 519 -0.01516226256748818
+519 519  1
+957 519 -0.1071748112080725
+967 519 -0.7594728487360221
+509 520 -0.2610126310296668
+510 520 -0.8996766615541634
+512 520 -1
+520 520  0.9999999999999999
+496 521 -1
+501 521 -0.1433430694521997
+505 521 -0.1418465311435778
+507 521 -0.1320983078566071
+521 521  0.9999999999999999
+522 521 -0.3998740109359429
+497 522 -1
+507 522 -0.1358119062128222
+521 522 -0.2001539103016142
+522 522  1
+523 522 -0.04849897122292566
+498 523 -1
+522 523 -0.04608389750739734
+523 523  0.9999999999999999
+582 523 -0.9364088751349131
+770 523 -0.0003674442805047742
+779 523 -0.2204038881448743
+802 523 -0.0004067669457320515
+524 524  1
+533 524 -1
+985 524 -0.5971998104086521
+1001 524 -0.2766233478085622
+525 525  1
+568 525 -0.4799622565372934
+505 526 -0.06018487941918286
+526 526  1
+527 526 -0.1433119470755233
+531 526 -0.1600863532646021
+526 527 -0.2991058560504902
+527 527  1
+921 527 -0.03688742360236372
+933 527 -0.0299909671787933
+1068 527 -0.3660187912510047
+1073 527 -0.1576227343708651
+1074 527 -0.08188246918212248
+1124 527 -0.7101449040705746
+528 528  1
+529 528 -1
+809 528 -0.2776735825024428
+819 528 -0.2467421630512848
+528 529 -0.5830603065551999
+529 529  1
+530 530  1
+578 530 -0.2111385610004441
+583 530 -0.3088121221459691
+780 530 -0.1000075133138704
+797 530 -0.04471075373065521
+803 530 -0.8013450274060447
+501 531 -0.1866509607965867
+526 531 -0.4555010411933442
+531 531  1
+532 531 -0.1017617299002866
+534 531 -0.1014792837115616
+531 532 -0.2631417833604881
+532 532  0.9999999999999999
+554 532 -0.1022922985661368
+647 532 -0.8580645063858513
+652 532 -0.02231579502961252
+663 532 -0.01357466031211483
+668 532 -0.2850678741183841
+672 532 -0.02092114750264132
+673 532 -0.5746163645773398
+677 532 -0.02987668404501208
+908 532 -0.06846725232738664
+912 532 -0.09550556340629787
+918 532 -0.0276024558479934
+1047 532 -0.1074318830518355
+524 533 -0.719724171828545
+533 533  1
+436 534 -0.9999999999999999
+509 534 -0.5769753935000953
+531 534 -0.3683189623500158
+534 534  0.9999999999999999
+535 534 -0.5724771087641494
+536 534 -0.08088832701704754
+542 534 -0.155661526463627
+534 535 -0.02896297492007903
+535 535  1
+885 535 -0.2389382708060702
+501 536 -0.1878499991446719
+515 536 -0.1171924610426119
+534 536 -0.07625280158678528
+536 536  1
+537 536 -0.460674165999242
+538 536 -0.1996180737509501
+539 536 -1
+542 536 -0.2432211350994171
+447 537 -0.38971453178003
+536 537 -0.5965514081714174
+537 537  1
+536 538 -0.03026899216704202
+538 538  1
+809 538 -0.1669793513444584
+914 538 -0.03868245568487514
+948 538 -0.1007759433217734
+953 538 -0.318198132925804
+964 538 -0.4999999999999999
+536 539 -0.0260313346953794
+539 539  1
+513 540 -0.3997374624136837
+540 540  1
+800 540 -0.4229102194028947
+541 541  1
+543 541 -0.001389147502258042
+534 542 -0.05398697888056379
+536 542 -0.08948270406709628
+542 542  1
+543 542 -0.004259854492759377
+545 542 -0.2968403078784328
+546 542 -0.4739336767143581
+469 543 -1
+541 543 -1
+542 543 -0.1670188249267029
+543 543  1
+544 543 -0.001814882543535852
+468 544 -1
+543 544 -0.001804629234449286
+544 544  1
+418 545 -0.7608230117236505
+464 545 -0.2429339577229863
+542 545 -0.08350941246335145
+545 545  1
+125 546 -0.2752654453505877
+542 546 -0.35058906213152
+546 546  1
+507 547 -0.1009467792192157
+515 547 -0.08598737268087776
+547 547  1
+548 548  1
+551 548 -0.1210293052360318
+549 549  1
+563 549 -0.2679749952654444
+550 550  1
+566 550 -0.2940161986573785
+548 551 -1
+551 551  1
+552 551 -0.4989148902469458
+555 551 -0.2635769128672333
+566 551 -0.2054483669876222
+551 552 -0.266857878092583
+552 552  1
+558 552 -0.02915535043456461
+560 552 -0.7271280724192077
+34 553 -0.23395766532261
+503 553 -0.1896175385033185
+553 553  1
+561 553 -0.1332681894485847
+710 553 -0.0005026541030934422
+532 554 -0.06814369435690237
+554 554  1
+574 554 -1
+909 554 -0.236042788519142
+551 555 -0.235876050136544
+555 555  1
+556 555 -0.1079488330221924
+557 555 -0.341628902596535
+564 555 -0.09655219225636062
+566 555 -0.06195576238862799
+555 556 -0.2768627906370897
+556 556  1
+559 556 -0.8785489123309654
+579 556 -0.09337267097732671
+621 556 -0.328171501383863
+903 556 -0.5968064283730297
+555 557 -0.1462806165772434
+557 557  1
+558 557 -0.1602578832894687
+552 558 -0.08580736837667124
+557 558 -0.658370898575437
+558 558  1
+979 558 -0.02160215504205074
+980 558 -0.6940015360166825
+986 558 -0.6110065977333146
+989 558 -0.2950958967151079
+991 558 -0.1403406325935518
+998 558 -0.2584449826266326
+1000 558 -1
+556 559 -0.4310944682410488
+559 559  1
+579 559 -0.1062805574061592
+552 560 -0.4152776844002867
+560 560  1
+980 560 -0.3059984080468032
+553 561 -0.3932637367421524
+561 561  0.9999999999999999
+562 561 -0.9410823745050327
+566 561 -0.3225061323250914
+561 562 -0.6735444055009948
+562 562  1
+563 562 -0.2657079047762322
+1 563 -0.003885774749979488
+2 563 -0.6272163748348137
+549 563 -1
+562 563 -0.05891719372626142
+563 563  0.9999999999999999
+567 563 -0.2679981417964429
+433 564 -0.03519176171755352
+456 564 -0.7700932153861747
+555 564 -0.1864957331028096
+564 564  1
+565 565  1
+567 565 -0.3970889990867262
+11 566 -0.3574718677076007
+550 566 -1
+551 566 -0.3762367970100193
+555 566 -0.1267838105991092
+561 566 -0.1931872056812246
+566 566  0.9999999999999999
+567 566 -0.3349125425574516
+563 567 -0.1063551397172834
+565 567 -1
+566 567 -0.03521225999061429
+567 567  1
+525 568 -1
+568 568  1
+773 568 -0.1566391664744798
+795 568 -0.6157886834004855
+569 569  1
+785 569 -0.2361169169313239
+570 570  1
+579 570 -0.1741125261996736
+571 571  0.9999999999999999
+776 571 -0.07859332663980086
+572 572  1
+783 572 -0.4191118647815879
+573 573  1
+796 573 -0.230679743715748
+554 574 -0.2657687129924692
+574 574  1
+575 575  1
+581 575 -0.9353465108067598
+576 576  1
+582 576 -0.06359102935136633
+784 576 -0.4170212987191654
+577 577  1
+786 577 -0.5935894596561302
+802 577 -0.0008895072277967837
+1090 577 -0.9619564997814242
+505 578 -0.07335841830019779
+530 578 -0.0370962223800683
+578 578  1
+780 578 -0.2748353517557316
+556 579 -0.05235689949571262
+559 579 -0.1214511011986866
+570 579 -1
+579 579  0.9999999999999999
+580 579 -0.1740910304173178
+782 579 -0.06588825253594437
+783 579 -0.2215729147980039
+895 579 -0.03747752678371188
+927 579 -0.02471142985934896
+579 580 -0.0345179419887753
+580 580  1
+587 580 -0.2069733953428794
+575 581 -1
+581 581  1
+776 581 -0.066411555971227
+796 581 -0.05842732851632546
+797 581 -0.06919204760188463
+799 581 -0.1549773221234991
+523 582 -0.7921498633077858
+576 582 -0.5608186964555192
+582 582  1
+530 583 -0.03537880319137759
+583 583  1
+780 583 -0.2418695276871182
+803 583 -0.02648565847385529
+584 584  1
+585 584 -0.374641251962961
+1068 584 -0.0684023244486532
+584 585 -0.8738478885659794
+585 585  1
+607 585 -0.3766376408137823
+613 585 -0.5545453641343804
+659 585 -0.05009393432454906
+586 586  1
+597 586 -0.2061081168433045
+580 587 -0.8259090890091247
+587 587  1
+598 587 -1
+614 587 -1
+588 588  1
+595 588 -0.5348727181080468
+597 588 -0.4010202797242549
+589 589  0.9999999999999999
+591 589 -0.3830584606499214
+590 590  0.9999999999999999
+595 590 -0.465127176308076
+616 590 -1
+589 591 -0.9999999999999999
+591 591  1
+601 591 -0.4090902857563337
+592 592  1
+611 592 -0.3907833701421984
+593 593  1
+604 593 -0.3421239234570931
+594 594  1
+600 594 -0.1839917141395476
+588 595 -0.5063764665193276
+590 595 -0.3739656975599155
+595 595  1
+596 596  1
+597 596 -0.3928717658456408
+603 596 -0.1320406839878019
+607 596 -0.1039697919469714
+586 597 -1
+588 597 -0.4936233335632385
+596 597 -0.2191827597861238
+597 597  0.9999999999999999
+587 598 -0.2107630726658843
+598 598  1
+599 599  1
+1105 599 -0.2711267887040513
+594 600 -1
+600 600  1
+601 600 -0.1968409935831506
+782 600 -0.8931520820659508
+591 601 -0.6169415393500786
+600 601 -0.01360014050489904
+601 601  1
+602 601 -0.5145183576883209
+633 601 -1
+601 602 -0.2193548198281817
+602 602  1
+640 602 -1
+596 603 -0.2941888080142874
+603 603  0.9999999999999999
+604 603 -0.1570757188803369
+605 603 -0.3504272275082358
+1092 603 -0.737473491883647
+593 604 -1
+603 604 -0.09837573943019724
+604 604  1
+612 604 -1
+615 604 -0.3913244846817505
+603 605 -0.2229634062499809
+605 605  1
+607 605 -0.1967395281395254
+606 606  1
+610 606 -0.08339052453905878
+502 607 -0.2758489594841405
+585 607 -0.3192468629545578
+596 607 -0.4866282056741895
+605 607 -0.6495724529020662
+607 607  1
+608 608  1
+610 608 -0.9166093863665015
+1101 608 -0.2267653744566368
+502 609 -0.09124000427014103
+609 609  1
+1068 609 -0.09634756031884485
+606 610 -1
+608 610 -0.9329655179918697
+610 610  1
+592 611 -1
+611 611  1
+615 611 -0.60867556693395
+604 612 -0.2546474705503056
+612 612  1
+502 613 -0.2261056230971598
+585 613 -0.2790694621528
+613 613  1
+587 614 -0.5822636816329866
+614 614  1
+604 615 -0.2461527897096037
+611 615 -0.6092164232115411
+615 615  1
+590 616 -0.6260340477694245
+616 616  1
+617 617  1
+1123 617 -0.3509934645019262
+618 618  1
+629 618 -0.001968927809830688
+639 618 -0.1745182224467162
+508 619 -0.6619217994878592
+619 619  1
+620 619 -0.705232145496726
+619 620 -0.02771025711793374
+620 620  1
+632 620 -0.195558932571758
+556 621 -0.2990475235745716
+621 621  1
+624 621 -0.7919706249864572
+1061 621 -0.07947020956091519
+622 622  1
+625 622 -0.6230892554591451
+641 622 -0.5039609316151602
+623 623  1
+639 623 -0.1035522515894542
+642 623 -0.4487804075312052
+621 624 -0.6390707609679974
+624 624  0.9999999999999999
+638 624 -1
+622 625 -0.8560679954310355
+625 625  1
+635 625 -0.3454506343378822
+626 626  1
+634 626 -0.4517125459341182
+635 626 -0.1539421842584462
+627 627  0.9999999999999999
+639 627 -0.1579500658341977
+628 628  1
+631 628 -0.8485153033716532
+642 628 -0.5512193693351958
+618 629 -0.3659629192546133
+629 629  1
+637 629 -1
+643 629 -1
+630 630  1
+631 630 -0.1514846455477235
+635 630 -0.2326534849387227
+796 630 -0.2191153123425395
+628 631 -0.7707538647207384
+630 631 -0.2128229150706677
+631 631  1
+620 632 -0.294767854503274
+632 632  1
+634 632 -0.5482876948738408
+641 632 -0.4960390683848397
+601 633 -0.1747138590233077
+633 633  1
+626 634 -0.6617171107619035
+632 634 -0.6391731904007066
+634 634  1
+625 635 -0.376910627773944
+626 635 -0.3382828892380965
+630 635 -0.1560084494801962
+635 635  1
+636 635 -0.622851710496922
+635 636 -0.2679534289134358
+636 636  1
+1094 636 -0.4724886990502823
+629 637 -0.9756034741629955
+637 637  1
+624 638 -0.2080291041595944
+638 638  1
+508 639 -0.2131612715308346
+618 639 -0.6340370807453867
+623 639 -0.563207644487879
+627 639 -0.9999999999999999
+639 639  1
+602 640 -0.4854811519757107
+640 640  1
+622 641 -0.1439324858503279
+632 641 -0.1652683449023171
+641 641  0.9999999999999999
+623 642 -0.4367919754594565
+628 642 -0.2292459728814157
+642 642  1
+629 643 -0.02242766046579602
+643 643  1
+644 644  1
+652 644 -0.01357544319813741
+662 644 -0.3568464207014346
+645 645  1
+650 645 -0.6126687660539137
+1060 645 -1
+646 646  1
+664 646 -0.5075074575187769
+1051 646 -0.2558586820875031
+532 647 -0.2152721524790423
+647 647  1
+1041 647 -0.2199412920133198
+648 648  1
+678 648 -0.8855993933715476
+1042 648 -1
+649 649  1
+672 649 -0.06452081199441895
+1055 649 -0.1850421293191803
+645 650 -0.8317547057889145
+650 650  1
+653 650 -0.5297586117440403
+651 651  1
+670 651 -0.2186760697949558
+1045 651 -0.2989839507864746
+532 652 -0.1730040471840586
+644 652 -0.6326531123531862
+652 652  1
+653 652 -0.1387911208629091
+663 652 -0.9864252094797445
+666 652 -0.1557376991327603
+668 652 -0.7149323060445101
+676 652 -0.107615937447433
+1054 652 -0.9271842283721742
+650 653 -0.3873312339460863
+652 653 -0.00108507077950433
+653 653  1
+1048 653 -1
+654 654  1
+677 654 -0.04213683633169196
+1057 654 -0.716049205825331
+655 655  1
+657 655 -0.4285715196734902
+665 655 -0.7350993163282319
+656 656  1
+678 656 -0.05510395740508132
+655 657 -0.2439024803331427
+657 657  0.9999999999999999
+1055 657 -0.4019195360280249
+658 658  1
+671 658 -0.7121210973186182
+672 658 -0.2688367704107556
+585 659 -0.02704233267860278
+659 659  1
+677 659 -0.06610215776480366
+660 660  0.9999999999999999
+1050 660 -0.003196593808555408
+1058 660 -0.03676270879182294
+661 661  1
+677 661 -0.5288172621184293
+644 662 -0.3673468876468138
+662 662  1
+672 662 -0.093780242114347
+532 663 -0.08689882342956604
+652 663 -0.8145264452734696
+663 663  1
+646 664 -0.8652423335813741
+664 664  0.9999999999999999
+1056 664 -0.7929315886183289
+655 665 -0.7560977011303056
+665 665  1
+682 665 -0.4336734104435798
+652 666 -0.02372407392610176
+666 666  1
+681 666 -0.3214285583418375
+667 667  1
+1051 667 -0.2960650248620035
+532 668 -0.1198983771102606
+652 668 -0.03878698754776282
+668 668  1
+669 669  1
+679 669 -0.556212841507715
+1058 669 -0.6261914624421282
+651 670 -0.4222026744971679
+670 670  1
+1121 670 -0.3392857142857142
+658 671 -0.441176332525945
+671 671  1
+679 671 -0.4437868247645969
+532 672 -0.02457061657989077
+649 672 -0.4469026573390634
+658 672 -0.5588236674740549
+662 672 -0.6431534686761595
+672 672  1
+673 672 -0.4253834601646409
+682 672 -0.5663261081789878
+1049 672 -0.6044303840505328
+1050 672 -0.003153612989491784
+1052 672 -0.08403365077678537
+1058 672 -0.03727330536135791
+1121 672 -0.6607145995535949
+532 673 -0.06211127975430462
+672 673 -0.03915097063496544
+673 673  1
+674 674  0.9999999999999999
+675 674 -0.9126760388043622
+1046 674 -0.2530282823179095
+674 675 -0.9470990191973778
+675 675  1
+1050 675 -0.005791358577418576
+652 676 -0.004533543872149961
+676 676  1
+1043 676 -0.8552336511981358
+532 677 -0.05351396888495191
+654 677 -0.2156249830869165
+659 677 -0.9499062024619441
+661 677 -1
+677 677  1
+678 677 -0.05929664922337102
+1041 677 -0.7800585909778973
+1059 677 -0.8240341259614482
+648 678 -0.7416107903768294
+656 678 -1
+677 678 -0.004598410882974645
+678 678  1
+669 679 -0.166666675
+671 679 -0.287878848560159
+679 679  1
+680 680  1
+1047 680 -0.434678379342025
+1057 680 -0.2839505965450388
+666 681 -0.8442622687852741
+681 681  0.9999999999999999
+1053 681 -0.9842378233775207
+665 682 -0.2649006836717682
+672 682 -0.1897669992176851
+682 682  1
+683 683  0.9999999999999999
+689 683 -0.1603582326493311
+684 684  1
+689 684 -0.1952977674083521
+685 685  0.9999999999999999
+691 685 -0.0001757727410346504
+112 686 -0.9468287959406042
+686 686  1
+687 686 -1
+688 686 -0.3433610223642173
+686 687 -0.05585362275077257
+687 687  1
+686 688 -0.07442595844058435
+688 688  1
+689 688 -0.4477990376054506
+694 688 -0.0442271064609753
+699 688 -0.1411416778758453
+700 688 -0.4806586950457414
+702 688 -0.4494028708495627
+683 689 -0.9999999999999999
+684 689 -1
+688 689 -0.06192368781378366
+689 689  1
+693 689 -0.05366300615138479
+113 690 -0.9959673282877629
+690 690  1
+691 690 -0.0004969703850658508
+694 690 -0.8581485117171185
+697 690 -0.5390444053308227
+685 691 -0.9999999999999999
+690 691 -0.0004902307906929223
+691 691  1
+692 691 -1
+695 691 -0.3940708255506621
+691 692 -0.9959286437045359
+692 692  1
+689 693 -0.1965450638917011
+693 693  1
+695 693 -0.3940708255506621
+688 694 -0.1167370150616157
+690 694 -0.01584552192177559
+694 694  1
+695 694 -0.2118583488986758
+691 695 -0.003399073587175626
+693 695 -0.9463369661209398
+694 695 -0.09762428073199735
+695 695  1
+300 696 -0.5794516115854407
+696 696  1
+690 697 -0.001242001116031956
+697 697  0.9999999999999999
+698 697 -1
+697 698 -0.4609555946691771
+698 698  1
+366 699 -0.3663246583698724
+688 699 -0.05378576694870625
+699 699  1
+700 699 -0.5193412397769371
+688 700 -0.103565368816487
+699 700 -0.2936422030181865
+700 700  0.9999999999999999
+111 701 -0.994337249364867
+701 701  1
+237 702 -0.2265728743020855
+688 702 -0.3206270406909385
+702 702  1
+145 703 -0.99780383376189
+703 703  1
+96 704 -0.001083295253868641
+318 704 -0.0590021492713087
+704 704  1
+96 705 -0.9977152321184488
+705 705  1
+706 706  1
+707 706 -0.001097550364435686
+471 707 -0.07133650891245298
+492 707 -0.996349375886751
+493 707 -1
+706 707 -1
+707 707  1
+726 707 -0.8887397157593007
+471 708 -0.3614628831984525
+478 708 -0.002027765000348178
+708 708  1
+709 708 -0.7308255150688829
+412 709 -0.00578783131494704
+478 709 -5.912608266239568e-05
+484 709 -0.479753514815638
+708 709 -0.4113744001354463
+709 709  1
+35 710 -0.9981733427827078
+475 710 -0.3845400923020677
+553 710 -0.07405872811734446
+710 710  1
+36 711 -1
+711 711  1
+724 711 -0.07467328524575109
+734 711 -0.2790363921690038
+712 712  1
+714 712 -0.1055072126411206
+713 713  1
+714 713 -0.1068908621622177
+362 714 -0.5087028901322437
+401 714 -0.4003811088984153
+479 714 -0.07069197270975155
+712 714 -1
+713 714 -1
+714 714  0.9999999999999999
+715 714 -0.7569936966261785
+734 714 -0.5373540493761518
+360 715 -1
+714 715 -0.2453100961326864
+715 715  1
+396 716 -0.05499465273268188
+397 716 -0.5172413593341292
+406 716 -0.0003323413444016625
+716 716  1
+717 716 -0.01310206761808757
+396 717 -0.05425351746305068
+397 717 -0.4827584958383075
+406 717 -0.0003278636563631247
+716 717 -0.01225595704434848
+717 717  1
+718 718  1
+724 718 -0.09255588411789725
+719 719  1
+724 719 -0.09255588411789725
+720 720  1
+724 720 -0.09255588411789725
+721 721  1
+724 721 -0.09255588411789725
+722 722  1
+724 722 -0.09255588411789725
+723 723  1
+724 723 -0.09255588411789725
+8 724 -0.859538077353734
+412 724 -0.1126751018741988
+475 724 -0.3426554623742616
+711 724 -0.001168138944449155
+718 724 -1
+719 724 -1
+720 724 -1
+721 724 -1
+722 724 -1
+723 724 -1
+724 724  0.9999999999999999
+725 724 -0.05977453412358324
+734 724 -0.1394315090697187
+100 725 -0.8735200655474057
+724 725 -0.01442783068290446
+725 725  0.9999999999999999
+479 726 -0.042343953959263
+707 726 -0.003997116498238908
+726 726  1
+727 727  1
+728 727 -0.06309665658939982
+731 727 -0.09986143978152316
+727 728 -0.06309665658939982
+728 728  1
+731 728 -0.09986143978152316
+729 729  1
+730 729 -0.02220540286408267
+731 729 -0.1031410258510035
+729 730 -0.02229187607625067
+730 730  0.9999999999999999
+731 730 -0.1035518398185848
+410 731 -0.007414827841886507
+486 731 -0.005102946155445339
+727 731 -0.9369030799909034
+728 731 -0.9369030799909034
+729 731 -0.9777080263348169
+730 731 -0.9777944999255457
+731 731  1
+100 732 -0.05815189403627591
+732 732  1
+481 733 -0.335306257606751
+484 733 -0.3142284164151389
+733 733  1
+491 734 -0.04737235494396129
+711 734 -0.001240784505419412
+714 734 -0.2342638841752238
+724 734 -0.03963393535279071
+734 734  1
+735 735  1
+743 735 -0.05755251786922217
+736 736  1
+742 736 -0.05209091210399883
+737 737  0.9999999999999999
+743 737 -0.1598881530284162
+738 738  1
+742 738 -0.2599284254568054
+739 739  1
+744 739 -0.05525845205611164
+135 740 -0.9969900869275656
+740 740  0.9999999999999999
+741 740 -0.4378658607581571
+742 740 -0.09692244698145769
+744 740 -0.6684489726901692
+740 741 -0.002872753882991917
+741 741  1
+749 741 -0.2751090554374274
+759 741 -0.2107954651925419
+132 742 -0.2119908445071003
+736 742 -1
+738 742 -1
+740 742 -0.0009305341467102409
+742 742  1
+743 742 -0.552713265273701
+143 743 -0.0001044606959223247
+161 743 -0.5661488680856371
+222 743 -0.02926873510750779
+735 743 -1
+737 743 -0.9999999999999999
+742 743 -0.3706477673201522
+743 743  1
+746 743 -0.4714804198026962
+739 744 -1
+740 744 -0.007969574459464401
+744 744  1
+756 744 -0.5451099963613916
+143 745 -0.4996355158911574
+745 745  1
+746 745 -0.5285195801973038
+743 746 -0.06204226786511624
+745 746 -0.0004528986369174763
+746 746  1
+747 747  1
+748 747 -0.4349736477921984
+758 747 -1
+747 748 -0.1871049295551823
+748 748  1
+757 748 -0.2004651915094611
+741 749 -0.453692268691801
+749 749  1
+753 749 -1
+750 750  1
+751 750 -1
+759 750 -0.7892046519254193
+750 751 -0.2544581608411395
+751 751  1
+752 752  1
+757 752 -0.1012553621557578
+762 752 -0.4736618169146121
+749 753 -0.7248905792177237
+753 753  1
+754 754  1
+757 754 -0.4222564566656446
+765 754 -1
+172 755 -0.1408499194273259
+755 755  1
+764 755 -1
+744 756 -0.2762923017243952
+756 756  1
+757 756 -0.2760229896691365
+748 757 -0.5650263522078015
+752 757 -0.7165992841086014
+754 757 -0.4183167723111768
+756 757 -0.4548900036386085
+757 757  0.9999999999999999
+747 758 -0.8128950704448178
+758 758  1
+741 759 -0.108441930800379
+750 759 -0.7455418944780647
+759 759  1
+760 760  1
+761 760 -0.2298681841111243
+137 761 -0.07562582962297858
+760 761 -1
+761 761  0.9999999999999999
+766 761 -0.6704084254553354
+767 761 -1
+752 762 -0.2834008563448807
+762 762  1
+763 762 -1
+762 763 -0.5263381830853878
+763 763  1
+755 764 -0.7264023124000653
+764 764  1
+754 765 -0.5816831293843782
+765 765  1
+761 766 -0.1743040266247976
+766 766  0.9999999999999999
+768 766 -0.2619779337887494
+761 767 -0.2947829678822902
+767 767  1
+766 768 -0.3295914328203243
+768 768  1
+769 768 -0.9999999999999999
+772 768 -0.4285714807346967
+768 769 -0.3413652410704459
+769 769  0.9999999999999999
+523 770 -0.01485008296068023
+770 770  0.9999999999999999
+823 770 -1
+771 771  1
+772 771 -0.5714286409795956
+768 772 -0.3966568251408047
+771 772 -1
+772 772  1
+568 773 -0.04007574293727283
+773 773  1
+774 773 -1
+795 773 -0.04745458626342001
+801 773 -0.9999999999999999
+802 773 -0.001442548396825602
+773 774 -0.2189780868862489
+774 774  1
+775 775  1
+776 775 -0.5457870631438515
+781 775 -0.2341867991589159
+571 776 -0.9999999999999999
+581 776 -0.02258200065941929
+775 776 -0.9021739340484409
+776 776  1
+781 776 -0.1923145352063157
+796 776 -0.03129039631630315
+797 776 -0.03199330563572712
+1127 776 -1
+777 777  1
+793 777 -0.0003904723506218138
+795 777 -0.04533632129182412
+800 777 -0.2885448902985526
+778 778  1
+779 778 -0.5226632078696118
+523 779 -0.04686735057886637
+778 779 -1
+779 779  0.9999999999999999
+780 779 -0.06573845393809882
+787 779 -0.5308989923599799
+530 780 -0.02059790377188857
+578 780 -0.3221816949736401
+583 780 -0.4348332597928126
+779 780 -0.06732392423501758
+780 780  1
+797 780 -0.0236557021771585
+949 780 -0.1427242945176912
+508 781 -0.07951433024592491
+775 781 -0.0978260886863422
+776 781 -0.04860012103373816
+781 781  1
+579 782 -0.2426158151962665
+600 782 -0.8024083417850998
+782 782  1
+783 782 -0.3593149383581041
+572 783 -1
+579 783 -0.09300592586506133
+782 783 -0.04095982616546692
+783 783  1
+576 784 -0.4391813035444809
+784 784  1
+802 784 -0.001450960172714632
+569 785 -1
+785 785  1
+786 785 -0.02057776968385605
+788 785 -0.08347258078454704
+796 785 -0.07401488949968267
+797 785 -0.1025123423848812
+802 785 -0.0004520248083188094
+577 786 -0.8573540185300089
+785 786 -0.3135633249482591
+786 786  1
+1128 786 -1
+779 787 -0.1896089227540602
+787 787  1
+949 787 -0.08633938344831096
+785 788 -0.1976238879723042
+788 788  1
+789 788 -0.4641734010869621
+796 788 -0.1144917695434805
+1132 788 -1
+788 789 -0.01456486454539043
+789 789  1
+1094 789 -0.5275111451229092
+514 790 -0.3984775023469074
+790 790  1
+791 790 -1
+795 790 -0.1862027328011212
+790 791 -0.4838106371231226
+791 791  1
+792 792  1
+799 792 -0.2687641777966632
+1110 792 -0.7539132836750267
+777 793 -0.160508901400097
+793 793  1
+794 793 -0.160508901400097
+825 793 -0.5
+793 794 -0.0003904723506218138
+794 794  1
+795 794 -0.04533632129182412
+800 794 -0.2885448902985526
+514 795 -0.2703301133894285
+568 795 -0.4799622565372934
+773 795 -0.144568060831429
+777 795 -0.2381114154710987
+790 795 -0.3501866572222711
+794 795 -0.2381114154710987
+795 795  1
+573 796 -1
+581 796 -0.01096794801344093
+630 796 -0.6311687789769125
+776 796 -0.01727434772442819
+785 796 -0.06986704642316205
+788 796 -0.04564907258316853
+796 796  1
+799 796 -0.5762585000798377
+530 797 -0.01834330654747463
+581 797 -0.0173727051857949
+776 797 -0.02362390141251874
+780 797 -0.04712062912965587
+785 797 -0.1294289295290424
+797 797  1
+798 797 -0.4452878035256336
+803 797 -0.01554255422889035
+805 797 -0.1379870515257248
+1119 797 -0.6217239036054392
+797 798 -0.04487448066988346
+798 798  1
+1114 798 -0.3934527465453918
+581 799 -0.01373086339498046
+792 799 -0.558453245793901
+796 799 -0.2719804479555745
+799 799  1
+540 800 -0.2246256540912959
+777 800 -0.6013796009803595
+794 800 -0.6013796009803595
+800 800  1
+773 801 -0.1345274728730633
+801 801  0.9999999999999999
+523 802 -0.01650987024036579
+577 802 -0.009960322772169745
+773 802 -0.3452871176355108
+784 802 -0.5829787012808346
+785 802 -0.0533997413338065
+802 802  1
+1137 802 -1
+530 803 -0.8885837679301019
+583 803 -0.2563544179509576
+797 803 -0.04200834653486555
+803 803  1
+804 803 -1
+803 804 -0.1566265359152365
+804 804  1
+797 805 -0.1753907606969664
+805 805  1
+1138 805 -1
+806 806  1
+821 806 -0.03344117591703837
+807 807  1
+887 807 -0.1321743617489629
+808 808  1
+897 808 -0.05494562236441237
+528 809 -0.1670566454489821
+538 809 -0.3189842366904131
+809 809  0.9999999999999999
+920 809 -1
+75 810 -0.006648777579010137
+810 810  1
+811 810 -0.9999999999999999
+918 810 -0.0373798275151182
+810 811 -0.6640821695946558
+811 811  0.9999999999999999
+812 812  1
+909 812 -0.5310963272777022
+813 813  1
+885 813 -0.1660328780840257
+940 813 -0.178088831887964
+1122 813 -0.9724576188156695
+814 814  1
+815 814 -1
+909 814 -0.02458779489654669
+918 814 -0.05811829989004948
+814 815 -0.7538914180468935
+815 815  1
+816 816  1
+817 816 -0.3406804300872179
+927 816 -0.1473626935769674
+816 817 -0.700427957958775
+817 817  1
+818 817 -0.9999999999999999
+907 817 -0.6772958127999048
+817 818 -0.3765415166948903
+818 818  0.9999999999999999
+499 819 -0.2745945315853772
+528 819 -0.2498830479958179
+819 819  1
+820 819 -1
+819 820 -0.6828444214243894
+820 820  1
+806 821 -1
+821 821  1
+888 821 -0.6583849294302128
+897 821 -0.4736691583138997
+921 821 -0.04963528406352395
+822 822  1
+880 822 -1
+885 822 -0.3823013227482067
+935 822 -0.3217162155339475
+770 823 -0.9996321353741822
+823 823  1
+824 824  1
+825 824 -0.5
+793 825 -0.9992186110461619
+824 825 -1
+825 825  1
+143 826 -0.4996355158911574
+826 826  1
+144 827 -0.9983118546537804
+827 827  1
+828 828  1
+837 828 -1
+838 828 -0.385702574026371
+874 828 -0.07033360555409447
+140 829 -0.4091571791409437
+829 829  1
+833 829 -0.3904693520223962
+839 829 -1
+140 830 -0.3085346440014443
+830 830  1
+833 830 -0.4848368165175393
+840 830 -1
+831 831  1
+841 831 -1
+869 831 -0.04597108605825639
+832 832  1
+842 832 -1
+869 832 -0.07456617864612559
+829 833 -0.256377491938501
+830 833 -0.3398024612087239
+833 833  1
+843 833 -1
+834 834  1
+844 834 -1
+874 834 -0.8521398635691807
+835 835  0.9999999999999999
+845 835 -1
+862 835 -0.9471698688264778
+836 836  1
+846 836 -1
+869 836 -0.02545062694485538
+828 837 -0.270004471018193
+837 837  1
+828 838 -0.3627874693957614
+838 838  1
+859 838 -1
+860 838 -0.4283176749291381
+829 839 -0.2555594737939714
+839 839  1
+830 840 -0.267346840039331
+840 840  1
+831 841 -0.477681250005796
+841 841  1
+832 842 -0.2641549858743126
+842 842  1
+833 843 -0.1246939511662575
+843 843  1
+834 844 -0.1490953414141997
+844 844  1
+835 845 -0.01659790580587195
+845 845  1
+836 846 -0.5218397520090229
+846 846  1
+847 847  1
+848 847 -0.455984542371944
+847 848 -1
+848 848  1
+873 848 -0.410560708120102
+849 849  0.9999999999999999
+854 849 -1
+863 849 -0.2225319250998413
+864 849 -0.0898036324146653
+850 850  1
+855 850 -1
+873 850 -0.1311546132828865
+851 851  0.9999999999999999
+856 851 -1
+858 851 -0.3568092151689177
+852 852  0.9999999999999999
+857 852 -1
+869 852 -0.005099027699841861
+853 853  1
+876 853 -0.02202335032195767
+849 854 -0.4010628451505971
+854 854  1
+850 855 -0.7212950926812342
+855 855  1
+851 856 -0.4096284587312885
+856 856  1
+852 857 -0.575232508844974
+857 857  1
+851 858 -0.5903715412687115
+858 858  1
+873 858 -0.1091327605195705
+838 859 -0.5628686543246464
+859 859  1
+838 860 -0.05142857745930671
+860 860  1
+861 860 -1
+860 861 -0.5716821844370392
+861 861  1
+835 862 -0.9834020533632732
+862 862  0.9999999999999999
+863 862 -0.5084036078177585
+849 863 -0.2994790821118928
+862 863 -0.05282978706669576
+863 863  1
+869 863 -0.01160564403407642
+849 864 -0.29945807273751
+864 864  1
+865 864 -0.9999999999999999
+869 864 -0.0119014518185992
+864 865 -0.7988387331300901
+865 865  0.9999999999999999
+866 866  1
+869 866 -0.273893179615365
+877 866 -0.03287389424901925
+867 867  1
+878 867 -0.07939993664601402
+868 868  1
+869 868 -0.2776195888384431
+871 868 -0.4992913970779654
+877 868 -0.03325193322086428
+831 869 -0.5223182863146845
+832 869 -0.7358450141256875
+836 869 -0.4781599412991924
+852 869 -0.4247671511925373
+863 869 -0.2690647509241414
+864 869 -0.1113575199017589
+866 869 -0.93407082053895
+868 869 -0.8514479652337646
+869 869  1
+870 869 -0.8501006099932081
+869 870 -0.273893179615365
+870 870  1
+871 870 -0.5007088145716529
+877 870 -0.03287389424901925
+868 871 -0.08857951552213374
+870 871 -0.08989707457096673
+871 871  1
+872 872  0.9999999999999999
+882 872 -0.01641700798105282
+848 873 -0.5440151119744769
+850 873 -0.2787049073187659
+858 873 -0.6431910923149631
+873 873  1
+882 873 -0.03194293124820115
+828 874 -0.3672077419293357
+834 874 -0.8509045978312132
+874 874  1
+878 874 -0.07879578573861272
+875 875  1
+876 875 -0.9779763226427206
+882 875 -0.1982995932865795
+853 876 -1
+875 876 -0.8454513124483387
+876 876  0.9999999999999999
+138 877 -0.2748086260543924
+866 877 -0.06592920729635793
+868 877 -0.05997261937438349
+870 877 -0.06000236610181685
+877 877  1
+878 877 -0.8418041075583099
+883 877 -0.9999999999999999
+867 878 -1
+874 878 -0.07752648524463591
+877 878 -0.3778751954386883
+878 878  1
+879 879  1
+882 879 -0.06319436853934944
+822 880 -0.5804282114926005
+880 880  1
+881 881  1
+882 881 -0.07426593740498603
+138 882 -0.5921683540004956
+872 882 -0.9999999999999999
+873 882 -0.3491518137334439
+875 882 -0.1545486503517991
+879 882 -1
+881 882 -1
+882 882  0.9999999999999999
+877 883 -0.1075752139676292
+883 883  0.9999999999999999
+884 884  1
+937 884 -0.5719764534423979
+1123 884 -0.6490067418822234
+535 885 -0.4275228912358506
+813 885 -0.01382382716026394
+822 885 -0.191987825286117
+885 885  1
+914 885 -0.05206272337087715
+918 885 -0.02079875188977461
+886 886  1
+895 886 -0.02728800900368913
+927 886 -0.04447147147798995
+937 886 -0.07492207534818357
+807 887 -1
+887 887  1
+896 887 -0.3265305453927528
+897 887 -0.04426340773109826
+821 888 -0.1527653650811575
+888 888  1
+924 888 -0.1478255789917181
+889 889  1
+917 889 -0.6971406303562184
+890 890  1
+891 890 -1
+908 890 -0.05869772868567801
+917 890 -0.03024125931369406
+890 891 -0.6160583177513715
+891 891  1
+892 892  1
+893 892 -1
+899 892 -0.3265907006889268
+927 892 -0.2826370242382277
+892 893 -0.4237216938716536
+893 893  1
+894 894  1
+921 894 -0.2020068935070616
+936 894 -0.001221476590241877
+1126 894 -1
+579 895 -0.2156585813277059
+886 895 -0.385776873865221
+895 895  1
+1125 895 -0.9999999999999999
+887 896 -0.5185786831079293
+896 896  1
+897 896 -0.135556681025335
+808 897 -1
+821 897 -0.7726063436497629
+887 897 -0.349246987657998
+896 897 -0.6734694546072473
+897 897  1
+913 897 -0.8327304641897268
+917 897 -0.005223057111627633
+944 897 -0.1028792245282151
+898 898  0.9999999999999999
+914 898 -0.1206081764098259
+926 898 -0.4623422888106051
+892 899 -0.3660724673478272
+899 899  0.9999999999999999
+900 899 -1
+908 899 -0.5184968937483233
+899 900 -0.1733170743416789
+900 900  1
+901 901  1
+913 901 -0.1672693817551257
+1129 901 -0.9984204987709444
+902 902  1
+906 902 -0.6517682227470042
+556 903 -0.1095520499454286
+903 903  1
+987 903 -0.1487510776119439
+904 904  0.9999999999999999
+905 904 -1
+939 904 -0.4951456810632513
+944 904 -0.2466584880139182
+904 905 -0.5349694032326376
+905 905  1
+902 906 -1
+906 906  1
+922 906 -0.001757723379530428
+939 906 -0.5048543189367488
+817 907 -0.2827781963036652
+907 907  0.9999999999999999
+923 907 -0.4503104906876705
+532 908 -0.02605769718940079
+890 908 -0.1307670539036446
+899 908 -0.5000921769605607
+908 908  1
+912 908 -0.2246456703084351
+917 908 -0.01015772416047608
+927 908 -0.1450001555634181
+554 909 -0.6319389031296334
+812 909 -1
+814 909 -0.1553155371562572
+909 909  1
+910 909 -1
+909 910 -0.2082730361969764
+910 910  1
+911 911  1
+912 911 -0.679848507942741
+532 912 -0.0350165342737612
+908 912 -0.2164160212004601
+911 912 -1
+912 912  1
+897 913 -0.2227525620806703
+901 913 -0.001084598126434692
+913 913  1
+499 914 -0.2745945315853772
+511 914 -0.2275925446622348
+538 914 -0.06611998810316257
+885 914 -0.1405254434122094
+898 914 -0.1821428124528147
+914 914  1
+935 914 -0.3094872452465444
+938 914 -1
+963 914 -0.3702866376914646
+915 915  1
+926 915 -0.4861744219496437
+916 916  0.9999999999999999
+991 916 -0.8299655993563372
+889 917 -1
+890 917 -0.2531746283449839
+897 917 -0.01624969659706565
+908 917 -0.03817153995519193
+917 917  1
+918 917 -0.5205989726498077
+930 917 -0.3938572626448919
+75 918 -0.004345736809232553
+532 918 -0.01217477038046258
+810 918 -0.2196268179697284
+814 918 -0.09079338027838257
+885 918 -0.07220217440798232
+917 918 -0.1605535684104505
+918 918  1
+919 918 -0.7532467495294317
+1130 918 -0.9999999999999999
+918 919 -0.226049597993403
+919 919  1
+940 919 -0.1218987246907173
+809 920 -0.5553473627084599
+920 920  1
+527 921 -0.03041136545924579
+821 921 -0.04118674565981324
+894 921 -0.002484520100705276
+921 921  1
+923 921 -0.5496895093123295
+924 921 -0.3644073607610019
+927 921 -0.03768493415326107
+933 921 -0.1279470579573126
+937 921 -0.03294795914486078
+906 922 -0.02873756894689408
+922 922  1
+930 922 -0.08731140174328945
+1133 922 -1
+907 923 -0.3227040158442471
+921 923 -0.1862179271685993
+923 923  1
+888 924 -0.3416147808803835
+921 924 -0.2354790420374323
+924 924  1
+925 924 -1
+924 925 -0.48776706024728
+925 925  1
+511 926 -0.3664836099996677
+898 926 -0.8178570207043603
+915 926 -1
+926 926  1
+579 927 -0.04043596919937775
+816 927 -0.2995720420412248
+886 927 -0.1787804394201799
+892 927 -0.210205677342559
+908 927 -0.09975087269221256
+921 927 -0.04218998861882794
+927 927  1
+928 927 -0.03555708593258307
+937 927 -0.03399996196356109
+927 928 -0.2689779772848467
+928 928  1
+929 928 -1
+937 928 -0.04163722846486935
+928 929 -0.9564855465454609
+929 929  1
+917 930 -0.09668373415618978
+922 930 -0.001612668953598928
+930 930  1
+931 930 -1
+930 931 -0.5188312276949276
+931 931  1
+932 932  1
+949 932 -0.7709359499803203
+527 933 -0.03753982792599602
+921 933 -0.1942560318186789
+933 933  1
+934 933 -1
+942 933 -1
+933 934 -0.7020252086672401
+934 934  1
+822 935 -0.2275843375974782
+914 935 -0.1615162428949418
+935 935  1
+947 935 -1
+894 936 -0.001222445281108993
+936 936  1
+937 936 -0.1646314168973877
+1134 936 -1
+502 937 -0.1320904409251641
+884 937 -0.547803646058268
+886 937 -0.4354427448897494
+921 937 -0.05332770890121061
+927 937 -0.04915426319738538
+928 937 -0.007957451692692642
+936 937 -0.003274682260230539
+937 937  1
+914 938 -0.4349492081554193
+938 938  1
+904 939 -0.4012270002649703
+906 939 -0.3194941757176905
+939 939  1
+813 940 -0.03126900629441422
+919 940 -0.2467532504705683
+940 940  1
+954 940 -0.1874301111765894
+941 941  1
+954 941 -0.7572176567255978
+933 942 -0.1400365794578813
+942 942  1
+943 943  1
+944 943 -0.6504622874578666
+897 944 -0.05256278122721902
+904 944 -0.06380367995760475
+943 944 -1
+944 944  0.9999999999999999
+945 945  1
+961 945 -0.1874605786331821
+946 946  1
+965 946 -0.3382024089200985
+935 947 -0.3687965392195082
+947 947  1
+538 948 -0.1054964333092564
+948 948  1
+952 948 -0.1648707494119165
+955 948 -0.1574397892112519
+957 948 -0.1207575826800948
+958 948 -0.6627219337250112
+964 948 -0.4999999999999999
+780 949 -0.2704284685213483
+787 949 -0.4691011672282398
+932 949 -1
+949 949  1
+950 950  1
+951 950 -0.1643688506062763
+955 950 -0.517514669206253
+956 950 -1
+950 951 -0.0743971039819234
+951 951  1
+965 951 -0.159450198430038
+969 951 -0.3870782893572819
+975 951 -1
+948 952 -0.1163800435639457
+952 952  1
+960 952 -0.1404494159891439
+538 953 -0.09902398723450459
+953 953  1
+955 953 -0.3250456862278263
+940 954 -0.7000124434213186
+941 954 -1
+954 954  1
+959 954 -0.9999999999999999
+948 955 -0.09817256914536197
+950 955 -0.6050255525535728
+953 955 -0.6818018670741961
+955 955  1
+950 956 -0.3205775463900476
+956 956  1
+519 957 -0.1764802927316111
+948 957 -0.1668723702664797
+957 957  1
+958 957 -0.3372780662749887
+967 957 -0.2405271512639779
+973 957 -0.1644736748658249
+948 958 -0.316471951763979
+957 958 -0.1165525973983591
+958 958  0.9999999999999999
+954 959 -0.05535217530648858
+959 959  0.9999999999999999
+952 960 -0.8351292505880835
+960 960  1
+973 960 -0.835526241581551
+945 961 -1
+961 961  1
+962 961 -0.1410723405285448
+963 961 -0.184400059898008
+968 961 -0.08117048756716724
+961 962 -0.3418599495835197
+962 962  1
+966 962 -0.2715053237319026
+914 963 -0.02226463131031359
+961 963 -0.1132213806940643
+963 963  1
+972 963 -0.2978216846032288
+538 964 -0.2107574791923369
+948 964 -0.2013270317439558
+964 964  0.9999999999999999
+946 965 -1
+951 965 -0.185414586123736
+965 965  1
+968 965 -0.0761516273906966
+969 965 -0.3686995531051487
+962 966 -0.8589275663637097
+966 966  1
+971 966 -0.8965161605780593
+519 967 -0.736140447719182
+957 967 -0.1415823276396603
+967 967  1
+465 968 -0.709923579278136
+961 968 -0.357458091089234
+965 968 -0.1910179242470197
+968 968  1
+951 969 -0.3800716211142183
+965 969 -0.3113295969197078
+969 969  1
+970 969 -1
+969 970 -0.2442220433881821
+970 970  1
+966 971 -0.7284944555342752
+971 971  1
+972 971 -0.7021783153967711
+963 972 -0.4453132105424169
+971 972 -0.1034835677775413
+972 972  1
+957 973 -0.5139328116127432
+960 973 -0.8595505195445672
+973 973  1
+974 974  1
+983 974 -0.8223473743679232
+951 975 -0.2701449047947291
+975 975  1
+976 976  1
+977 976 -1
+989 976 -0.2306466504301426
+1001 976 -0.7233766521914379
+976 977 -0.5562293631603693
+977 977  1
+978 978  1
+989 978 -0.2450275815194624
+558 979 -0.0445242729911732
+979 979  1
+983 979 -0.1713223593806408
+558 980 -0.1200940274106261
+560 980 -0.2728718776998083
+980 980  0.9999999999999999
+981 981  1
+982 981 -1
+984 981 -1
+998 981 -0.3215656741501349
+981 982 -0.8970161297845942
+982 982  1
+974 983 -1
+979 983 -0.9783974504680341
+983 983  0.9999999999999999
+986 983 -0.3889934022666854
+981 984 -0.05619858572854571
+984 984  1
+524 985 -0.1988711050130337
+985 985  1
+987 985 -0.3238585319179285
+989 985 -0.1085252173600295
+558 986 -0.1170444766881908
+983 986 -0.006330619860842363
+986 986  1
+903 987 -0.4031935716269703
+985 987 -0.3304891454051535
+987 987  1
+999 987 -1
+988 988  1
+993 988 -0.4224845603067158
+995 988 -0.4527417648780621
+558 989 -0.1397779311452972
+976 989 -0.08600836561183348
+978 989 -1
+985 989 -0.07231097610541735
+989 989  1
+990 989 -1
+989 990 -0.1207046335399614
+990 990  1
+558 991 -0.1341590335135993
+916 991 -0.9999999999999999
+991 991  1
+992 991 -0.9999999999999999
+991 992 -0.02969377817569055
+992 992  0.9999999999999999
+517 993 -0.470516292700504
+988 993 -0.4914718923176806
+993 993  1
+994 993 -1
+993 994 -0.1369714292127601
+994 994  1
+517 995 -0.153849420198677
+988 995 -0.5085281076823195
+995 995  0.9999999999999999
+997 995 -0.07644002373735428
+998 995 -0.1293267507538932
+996 996  1
+997 996 -0.8282005827474199
+995 997 -0.255983366143847
+996 997 -1
+997 997  1
+998 997 -0.2906627207097336
+558 998 -0.07802964467240925
+981 998 -0.04678519119718638
+995 998 -0.1420867838968887
+997 998 -0.09535944610596107
+998 998  1
+987 999 -0.527390257040387
+999 999  1
+558 1000 -0.1769575734462101
+1000 1000  1
+524 1001 -0.08140504055669844
+976 1001 -0.3577625760414913
+1001 1001  1
+1002 1002  1
+1027 1002 -0.2145724100524318
+1003 1003  1
+1035 1003 -0.1179423697455311
+1004 1004  1
+1032 1004 -0.162505704441258
+1005 1005  0.9999999999999999
+1039 1005 -0.1130121128694464
+1006 1006  1
+1007 1006 -1
+1013 1006 -0.02575059378740406
+1019 1006 -0.1739559461890909
+1021 1006 -0.584900570771349
+1030 1006 -0.2546360643290361
+1035 1006 -0.6222461605622936
+1006 1007 -0.03318804123612391
+1007 1007  1
+1008 1008  0.9999999999999999
+1023 1008 -0.3678773718994644
+114 1009 -0.9992375817251438
+1009 1009  1
+1010 1009 -1
+1018 1009 -1
+1019 1009 -0.1045050198588827
+1021 1009 -0.0449350580055885
+1022 1009 -1
+1040 1009 -0.06444798577102936
+1009 1010 -0.0001552850892835913
+1010 1010  1
+1011 1011  1
+1040 1011 -0.8187280470081262
+1012 1012  1
+1040 1012 -0.1168237881650059
+1006 1013 -0.1541101352275481
+1013 1013  0.9999999999999999
+1023 1013 -0.2270471899509603
+1029 1013 -0.04302149139955865
+1030 1013 -0.5789444404552598
+1037 1013 -1
+1038 1013 -1
+1039 1013 -0.06508193719117868
+1014 1014  1
+1039 1014 -0.8219060584309685
+1015 1015  1
+1029 1015 -0.003275834222457429
+1016 1016  1
+1024 1016 -0.2074952206497009
+1034 1016 -0.2015505937972701
+1017 1017  1
+1029 1017 -0.006986834020806925
+1009 1018 -0.002098980748349462
+1018 1018  1
+1006 1019 -0.08383980010605768
+1009 1019 -0.0006238784070824515
+1019 1019  1
+1033 1019 -1
+1020 1020  1
+1021 1020 -0.3701642865294616
+1006 1021 -0.4418959850847382
+1009 1021 -0.0004205088728788707
+1020 1021 -1
+1021 1021  0.9999999999999999
+1009 1022 -0.005661251396913712
+1022 1022  1
+1008 1023 -0.9999999999999999
+1013 1023 -0.02014251699571104
+1023 1023  1
+1024 1023 -0.6336274541872923
+1025 1023 -0.6236855676661436
+1027 1023 -0.3240149993642371
+1016 1024 -0.5566915211795602
+1023 1024 -0.1050528714070786
+1024 1024  0.9999999999999999
+1034 1024 -0.1937972923128213
+1023 1025 -0.1107689760619287
+1025 1025  1
+1028 1025 -0.5681164323437917
+224 1026 -0.3879003999395794
+1026 1026  1
+1027 1026 -0.2479961601462487
+1002 1027 -1
+1023 1027 -0.1892534701639192
+1026 1027 -0.5695362685047473
+1027 1027  1
+1028 1027 -0.4318834396027577
+1035 1027 -0.141530861018014
+1025 1028 -0.3763142626913779
+1027 1028 -0.08698684705657209
+1028 1028  1
+109 1029 -0.964285748724491
+1013 1029 -0.6073239660240807
+1015 1029 -1
+1017 1029 -1
+1029 1029  1
+1006 1030 -0.1145890904553682
+1013 1030 -0.04353276909942644
+1030 1030  0.9999999999999999
+1031 1030 -0.9999999999999999
+1030 1031 -0.1664194418949211
+1031 1031  0.9999999999999999
+1004 1032 -1
+1032 1032  1
+1034 1032 -0.6046518922245098
+1036 1032 -0.7254086850559581
+1019 1033 -0.7215390173566286
+1033 1033  1
+1016 1034 -0.4433082350451801
+1024 1034 -0.1588776886116987
+1032 1034 -0.2806835657041016
+1034 1034  1
+1003 1035 -1
+1006 1035 -0.1723770918598727
+1027 1035 -0.1264294286308967
+1035 1035  0.9999999999999999
+1036 1035 -0.2745914489995771
+1032 1036 -0.556810575506737
+1035 1036 -0.1182806086741611
+1036 1036  1
+1013 1037 -0.1581604805681969
+1037 1037  1
+1013 1038 -0.1370724075827306
+1038 1038  1
+1005 1039 -0.9999999999999999
+1013 1039 -0.008017132296849331
+1014 1039 -1
+1039 1039  1
+1009 1040 -0.0003209326033716147
+1011 1040 -1
+1012 1040 -1
+1040 1040  1
+647 1041 -0.1419354181044963
+677 1041 -0.07050897180790565
+1041 1041  0.9999999999999999
+648 1042 -0.2583893238312233
+1042 1042  1
+676 1043 -0.8923839465426134
+1043 1043  1
+1050 1043 -0.004886459034246927
+1044 1044  0.9999999999999999
+1050 1044 -0.006360677110208854
+1053 1044 -0.01576181343868616
+651 1045 -0.5777970464268083
+1045 1045  1
+1047 1045 -0.4578893603053118
+674 1046 -0.05290103952276962
+1046 1046  1
+1058 1046 -0.09992418073014558
+532 1047 -0.0215762421733098
+680 1047 -0.4450128531725415
+1045 1047 -0.701015904804251
+1047 1047  0.9999999999999999
+653 1048 -0.3314501736257623
+1048 1048  1
+672 1049 -0.06452081199441895
+1049 1049  1
+1054 1049 -0.07281552128805931
+77 1050 -0.9940298566807754
+660 1050 -0.4653917129373519
+672 1050 -0.02710958053407102
+675 1050 -0.08732396119563791
+1043 1050 -0.1447661820312439
+1044 1050 -0.6557758565655354
+1050 1050  1
+1051 1050 -0.2536903963910759
+1052 1050 -0.9159664866181876
+646 1051 -0.1347576096587197
+667 1051 -1
+1050 1051 -0.0008834275223601209
+1051 1051  1
+1056 1051 -0.09668500039665608
+672 1052 -0.02959671266462888
+1050 1052 -0.03752800057943233
+1052 1052  1
+681 1053 -0.6785713195153128
+1044 1053 -0.3442239499805578
+1053 1053  1
+652 1054 -0.08145264452734696
+1049 1054 -0.3955696159494672
+1054 1054  1
+649 1055 -0.5530973426609367
+657 1055 -0.5714287991837255
+1055 1055  1
+1056 1055 -0.1103833841839284
+1058 1055 -0.1998483990317826
+664 1056 -0.492492542481223
+1051 1056 -0.194386112194775
+1055 1056 -0.01539477313860625
+1056 1056  1
+654 1057 -0.7843746921818798
+680 1057 -0.5549872433952462
+1057 1057  1
+660 1058 -0.5346082870626481
+669 1058 -0.83333325
+672 1058 -0.03200437128501262
+1046 1058 -0.7469717176820906
+1055 1058 -0.3976438082123578
+1058 1058  1
+677 1059 -0.257959598784228
+1059 1059  1
+1061 1059 -0.9205298434616072
+645 1060 -0.1682453872844421
+1060 1060  1
+621 1061 -0.03275812377468568
+1059 1061 -0.1759656375407199
+1061 1061  1
+1062 1062  1
+1072 1062 -0.5991411089611631
+1085 1062 -0.4668527181241949
+70 1063 -0.02050133543111832
+1063 1063  1
+1074 1063 -0.0328038604222115
+1064 1064  1
+1075 1064 -0.458892414147185
+1087 1064 -0.008485362269695942
+70 1065 -0.0730549777235988
+1065 1065  1
+1077 1065 -0.08843521871124733
+1080 1065 -0.7779392017109086
+1066 1066  1
+1068 1066 -0.1453309616502498
+1067 1067  1
+1071 1067 -0.3817494717681219
+527 1068 -0.2355674147961204
+584 1068 -0.1261520731594827
+609 1068 -0.4571572410404319
+1066 1068 -1
+1068 1068  0.9999999999999999
+1069 1068 -0.4372715412347995
+1092 1068 -0.2625264532483245
+1068 1069 -0.224602414144263
+1069 1069  1
+1086 1069 -0.5616605382717522
+1070 1070  1
+1077 1070 -0.2570964897400501
+1113 1070 -0.06934306577087218
+1067 1071 -1
+1071 1071  1
+1088 1071 -0.2713032536378535
+1062 1072 -0.6581951455734653
+1072 1072  1
+1075 1072 -0.1683121598255496
+527 1073 -0.2048569533832116
+1073 1073  1
+1074 1073 -0.5194838447173812
+70 1074 -0.01417206153049319
+527 1074 -0.1725667983857126
+1063 1074 -0.6912751633035462
+1073 1074 -0.8423770087041061
+1074 1074  1
+1075 1074 -0.2012280514743605
+1081 1074 -1
+1086 1074 -0.4383394617282477
+1064 1075 -0.7287183863302658
+1072 1075 -0.4008587008714421
+1074 1075 -0.01597203424768444
+1075 1075  1
+1082 1075 -1
+1076 1076  1
+1077 1076 -0.2277826604506423
+1065 1077 -0.1286430330200971
+1070 1077 -0.7634732976778843
+1076 1077 -1
+1077 1077  1
+1078 1077 -0.3055127419781863
+1080 1077 -0.2220610125335382
+1088 1077 -0.7286964939415912
+1095 1077 -0.03461953194256959
+1077 1078 -0.1147492629682491
+1078 1078  1
+1084 1078 -1
+1085 1078 -0.533147281875805
+1079 1079  1
+1117 1079 -0.4030520800304267
+1065 1080 -0.5793140002588144
+1077 1080 -0.1136788486511149
+1080 1080  1
+1074 1081 -0.2708420622001276
+1081 1081  1
+1075 1082 -0.171567055163705
+1082 1082  1
+1083 1083  1
+1087 1083 -0.9915146179000117
+1078 1084 -0.5732410671179614
+1084 1084  1
+1062 1085 -0.3418048544265347
+1078 1085 -0.1212460935674943
+1085 1085  1
+1069 1086 -0.562728256745733
+1074 1086 -0.06888808470468043
+1086 1086  1
+1064 1087 -0.2712816136697344
+1083 1087 -1
+1087 1087  1
+1071 1088 -0.6182503844268341
+1077 1088 -0.140720175561743
+1088 1088  1
+1089 1089  1
+1106 1089 -0.6056747050442844
+1112 1089 -0.3093713235769833
+577 1090 -0.1326857144258304
+1090 1090  1
+1091 1090 -0.3211576989270636
+1090 1091 -0.038043484057704
+1091 1091  1
+1093 1091 -0.3791887956199022
+1108 1091 -1
+603 1092 -0.5466200686606856
+1068 1092 -0.09929791705708746
+1092 1092  1
+1091 1093 -0.3875789320023438
+1093 1093  1
+1098 1093 -0.5865385221662452
+636 1094 -0.3771481651195848
+789 1094 -0.535826440629891
+1094 1094  1
+65 1095 -0.001589825997012756
+1077 1095 -0.0575371733077517
+1095 1095  1
+1096 1095 -0.2689990840515936
+1100 1095 -0.06618670378289888
+1105 1095 -0.7288735255861399
+1113 1095 -0.9306569130101505
+1095 1096 -0.03255957037909323
+1096 1096  1
+1103 1096 -0.9999999999999999
+1107 1096 -1
+1112 1096 -0.2222319526933478
+1097 1097  1
+1100 1097 -0.8925375098710187
+1093 1098 -0.6208114045917884
+1098 1098  1
+1099 1098 -1
+1098 1099 -0.4134616039395449
+1099 1099  1
+1095 1100 -0.05383565269865898
+1097 1100 -1
+1100 1100  1
+1101 1100 -0.3511540620714831
+608 1101 -0.06703448200813029
+1100 1101 -0.04127576831682454
+1101 1101  1
+1120 1101 -0.5526833917123255
+1102 1102  1
+1112 1102 -0.4683966202140186
+1096 1103 -0.4592767036519202
+1103 1103  0.9999999999999999
+1104 1104  1
+1118 1104 -1
+1119 1104 -0.2865086534641524
+599 1105 -1
+1095 1105 -0.1190327517016704
+1105 1105  1
+1089 1106 -0.7553044358239164
+1106 1106  1
+1109 1106 -1
+1096 1107 -0.1416711696758076
+1107 1107  1
+1091 1108 -0.2912634372844867
+1108 1108  1
+1106 1109 -0.394325163645436
+1109 1109  1
+792 1110 -0.441546754206099
+1110 1110  1
+1114 1110 -0.1722713003852328
+1111 1111  1
+1119 1111 -0.09176736832354015
+1089 1112 -0.2446951548010683
+1096 1112 -0.1300528003062854
+1102 1112 -1
+1112 1112  1
+1070 1113 -0.236526991831141
+1095 1113 -0.6431943816097146
+1113 1113  0.9999999999999999
+798 1114 -0.5547121040593305
+1110 1114 -0.2460868099610006
+1114 1114  0.9999999999999999
+1115 1114 -1
+1116 1114 -1
+1114 1115 -0.287370564857355
+1115 1115  1
+1114 1116 -0.1469055193104745
+1116 1116  1
+1079 1117 -1
+1117 1117  1
+1120 1117 -0.4473166082876745
+1104 1118 -0.5011548049107137
+1118 1118  1
+797 1119 -0.4656625120257343
+1104 1119 -0.4988451950892863
+1111 1119 -1
+1119 1119  1
+1101 1120 -0.4220806401639369
+1117 1120 -0.5969479199695733
+1120 1120  1
+670 1121 -0.7813237856601329
+672 1121 -0.169791678428273
+1121 1121  1
+813 1122 -0.9549072658556735
+1122 1122  1
+1136 1122 -0.4718065848736832
+617 1123 -1
+884 1123 -0.4521964498073819
+1123 1123  1
+527 1124 -0.1757456862958543
+1124 1124  1
+1135 1124 -0.4043507132983384
+895 1125 -0.9352345465132308
+1125 1125  0.9999999999999999
+894 1126 -0.9962927945116223
+1126 1126  1
+776 1127 -0.2197097941051068
+1127 1127  1
+786 1128 -0.3858331333431592
+1128 1128  1
+901 1129 -0.9989151781165653
+1129 1129  1
+1135 1129 -0.5956496631521251
+918 1130 -0.09651555628092359
+1130 1130  0.9999999999999999
+1131 1131  1
+1136 1131 -0.5281931985670707
+788 1132 -0.8563137155180035
+1132 1132  1
+922 1133 -0.9966293993713262
+1133 1133  1
+936 1134 -0.9955033115417659
+1134 1134  1
+1124 1135 -0.2898550959294253
+1129 1135 -0.001579779788374791
+1135 1135  0.9999999999999999
+1122 1136 -0.02754236854238207
+1131 1136 -1
+1136 1136  1
+802 1137 -0.9953586426493263
+1137 1137  1
+805 1138 -0.8620128752031443
+1138 1138  1
diff --git a/matrices/test/nontrivial_mc64_example.mtx b/matrices/test/nontrivial_mc64_example.mtx
new file mode 100644
index 00000000000..185d4af98d0
--- /dev/null
+++ b/matrices/test/nontrivial_mc64_example.mtx
@@ -0,0 +1,7696 @@
+%%MatrixMarket matrix coordinate real general
+%-------------------------------------------------------------------------------
+% UF Sparse Matrix Collection, Tim Davis
+% http://www.cise.ufl.edu/research/sparse/matrices/HB/mahindas
+% name: HB/mahindas
+% [UNSYMMETRIC MATRIX             MAHINDAS ST ERTAR             24OCT84]
+% id: 208
+% date: 1984
+% author: K. Pearson
+% ed: I. Duff, R. Grimes, J. Lewis
+% fields: title A b name id date author ed kind
+% kind: economic problem
+%-------------------------------------------------------------------------------
+1258 1258 7682
+1 1 -1
+53 1 -1
+105 1 -1
+157 1 -1
+158 1 -1
+159 1 -1
+160 1 -1
+161 1 -1
+162 1 -1
+163 1 -1
+164 1 -1
+165 1 -1
+166 1 -1
+986 1 -.00593589153
+987 1 -.00827388279
+988 1 -.00860892702
+989 1 -.00137353456
+991 1 -.00492890272
+992 1 -.000155171787
+996 1 -.000130801986
+999 1 -.000856874278
+1003 1 -.00018323437
+1016 1 -.00137678441
+1017 1 -.000981827034
+1018 1 -.0039323396
+1029 1 -.000373784045
+1032 1 -6.89376247e-5
+1033 1 -.000166634141
+1035 1 -.000105326253
+1200 1 -.0100132599
+1201 1 -.00998593494
+1202 1 -.00546094589
+1229 1 -.000706019695
+1231 1 -.0027809015
+1236 1 -.000238090535
+1242 1 -.00164178631
+2 2 -1
+54 2 -1
+106 2 -1
+167 2 -1
+168 2 -1
+169 2 -1
+170 2 -1
+171 2 -1
+172 2 -1
+173 2 -1
+174 2 -1
+175 2 -1
+176 2 -1
+986 2 -.0619863123
+987 2 -.0716687143
+988 2 -.0668680817
+989 2 -.0115005681
+990 2 -.000190558087
+991 2 -.073800005
+992 2 -.00259988802
+996 2 -.00193114602
+999 2 -.0036379823
+1003 2 -.00155267026
+1016 2 -.00903802924
+1017 2 -.00642052805
+1018 2 -.0330437534
+1029 2 -.00313421409
+1032 2 -.000557913794
+1033 2 -.00134995126
+1035 2 -.000688492961
+1200 2 -.0700013712
+1201 2 -.0700070336
+1202 2 -.0382313095
+1229 2 -.00813334715
+1231 2 -.0148213431
+1236 2 -.00286629447
+1242 2 -.00533711677
+3 3 -1
+55 3 -1
+107 3 -1
+177 3 -1
+178 3 -1
+179 3 -1
+180 3 -1
+181 3 -1
+182 3 -1
+183 3 -1
+184 3 -1
+185 3 -1
+186 3 -1
+986 3 -.00643365784
+987 3 -.146014765
+988 3 -.132799476
+989 3 -.048029162
+990 3 -.0127418283
+991 3 -.209978819
+992 3 -.00617644563
+996 3 -.00345171918
+999 3 -.00997103006
+1003 3 -.00230328832
+1016 3 -.0180562958
+1017 3 -.0128339566
+1018 3 -.0490272008
+1029 3 -.00465024495
+1032 3 -.000843684305
+1033 3 -.00204204884
+1035 3 -.00137659872
+1200 3 -.169996798
+1201 3 -.170007035
+1202 3 -.0928501487
+1229 3 -.00388781494
+1231 3 -.0444640294
+1236 3 -.00644817576
+1242 3 -.00985071808
+4 4 -1
+56 4 -1
+108 4 -1
+187 4 -1
+188 4 -1
+189 4 -1
+190 4 -1
+191 4 -1
+192 4 -1
+193 4 -1
+194 4 -1
+195 4 -1
+196 4 -1
+986 4 -.0952748433
+987 4 -.184948742
+988 4 -.201589674
+989 4 -.0273518991
+990 4 -.000488014601
+991 4 -.138668075
+992 4 -.00640007528
+996 4 -.00422320887
+999 4 -.0026694655
+1003 4 -.00267779361
+1016 4 -.014202618
+1017 4 -.0100969886
+1018 4 -.0569765754
+1029 4 -.00540477782
+1032 4 -.000967531581
+1033 4 -.00234273169
+1035 4 -.00108307914
+1200 4 -.180010065
+1201 4 -.179992974
+1202 4 -.0983157754
+1229 4 -.0123726027
+1231 4 -.0111134751
+1236 4 -.00453819009
+1242 4 -.00738803856
+5 5 -1
+57 5 -1
+109 5 -1
+197 5 -1
+198 5 -1
+199 5 -1
+200 5 -1
+201 5 -1
+202 5 -1
+203 5 -1
+204 5 -1
+205 5 -1
+206 5 -1
+986 5 -.00871037971
+987 5 -.118917592
+988 5 -.102624908
+989 5 -.0330613479
+990 5 -.00681361323
+991 5 -.136338934
+992 5 -.00395535911
+996 5 -.00264995149
+999 5 -.00138095417
+1003 5 -.00148837746
+1016 5 -.00673240982
+1017 5 -.00478634099
+1018 5 -.0316886082
+1029 5 -.00300652371
+1032 5 -.000541481015
+1033 5 -.00131053664
+1035 5 -.000513465493
+1200 5 -.12998949
+1201 5 -.129992962
+1202 5 -.0710063577
+1229 5 -.00318179536
+1231 5 -.00648370478
+1236 5 -.00238879793
+1242 5 -.00410446571
+6 6 -1
+58 6 -1
+110 6 -1
+207 6 -1
+208 6 -1
+209 6 -1
+210 6 -1
+211 6 -1
+212 6 -1
+213 6 -1
+214 6 -1
+215 6 -1
+216 6 -1
+986 6 -.00251121563
+987 6 -.203185737
+988 6 -.192379743
+989 6 -.0552012399
+990 6 -.0222557895
+991 6 -.292612076
+992 6 -.0219613705
+996 6 -.00513518928
+999 6 -.00195196667
+1003 6 -.00293014268
+1016 6 -.0119892228
+1017 6 -.00852459297
+1018 6 -.0623729564
+1029 6 -.00591553887
+1032 6 -.00105771166
+1033 6 -.0025602926
+1035 6 -.000914247415
+1200 6 -.239998177
+1201 6 -.240014061
+1202 6 -.131086141
+1229 6 -.00388781494
+1231 6 -.00833763927
+1236 6 -.00429878384
+1242 6 -.00738803856
+7 7 -1
+59 7 -1
+111 7 -1
+217 7 -1
+218 7 -1
+219 7 -1
+220 7 -1
+221 7 -1
+222 7 -1
+223 7 -1
+224 7 -1
+225 7 -1
+226 7 -1
+986 7 -.00912127737
+987 7 -.0966502652
+988 7 -.0896248892
+989 7 -.0180861093
+990 7 -.00258647744
+991 7 -.0319746435
+992 7 -.00887065381
+996 7 -.00236170273
+999 7 -.000852607656
+1003 7 -.000657393481
+1016 7 -.00491426373
+1017 7 -.00349397887
+1018 7 -.0139749302
+1029 7 -.00132565643
+1032 7 -.000253706501
+1033 7 -.000614048797
+1035 7 -.000374643947
+1200 7 -.110008687
+1201 7 -.109985933
+1202 7 -.0600797832
+1229 7 -.00176975597
+1231 7 -.00370280328
+1236 7 -.0016718955
+1242 7 -.00246267952
+8 8 -1
+60 8 -1
+112 8 -1
+227 8 -1
+228 8 -1
+229 8 -1
+230 8 -1
+231 8 -1
+232 8 -1
+233 8 -1
+234 8 -1
+235 8 -1
+236 8 -1
+986 8 -.00172779313
+987 8 -.0732349306
+988 8 -.0692287683
+989 8 -.0376125723
+990 8 -.00632327469
+991 8 -.111695595
+992 8 -.00223021419
+996 8 -.00186090055
+999 8 -.000239640358
+1003 8 -.0010817257
+1016 8 -.00417646533
+1017 8 -.00296730525
+1018 8 -.0230374597
+1029 8 -.0021846632
+1032 8 -.000389978552
+1033 8 -.000944000029
+1035 8 -.000318302133
+1200 8 -.0899821669
+1201 8 -.0900140628
+1202 8 -.0491578877
+1229 8 -.00141517725
+1231 8 -.000926967128
+1236 8 -.00143248949
+1242 8 -.00287443749
+9 9 -1
+61 9 -1
+113 9 -1
+237 9 -1
+986 9 -.539141834
+987 9 -.0724108219
+989 9 -.270749658
+994 9 -.0121773174
+1016 9 -.0046902895
+1017 9 -.0154549666
+1018 9 -.132671088
+1020 9 -.0541632026
+1024 9 -.022907
+1029 9 -.0102059292
+1032 9 -.00230139447
+1033 9 -.00557160983
+1035 9 -.000713856425
+1199 9 -1
+1231 9 -.473746061
+1237 9 -.0332121551
+1242 9 -.0478609577
+10 10 -1
+62 10 -1
+114 10 -1
+238 10 -1
+988 10 -.128765732
+989 10 -.311814636
+990 10 -.0233154781
+992 10 -.0709880516
+1021 10 -.0469889343
+1024 10 -.00315410877
+1025 10 -.0107982764
+1032 10 -.000553905789
+1033 10 -.00134165853
+1035 10 -.000259249733
+1202 10 -.131625205
+1203 10 -.144391209
+1234 10 -.0564676113
+1236 10 -.0239616428
+1237 10 -.00595942466
+1238 10 -.150985867
+11 11 -1
+63 11 -1
+115 11 -1
+239 11 -1
+993 11 -.00688045193
+999 11 -.0194378905
+1016 11 -.0044465526
+1017 11 -.000245325267
+1024 11 -.00537030539
+1032 11 -.000153506466
+1033 11 -.000371707312
+1035 11 -7.68649334e-5
+1206 11 -.0963710174
+1237 11 -.00738798082
+12 12 -1
+64 12 -1
+116 12 -1
+240 12 -1
+993 12 -.0901386291
+995 12 -.122320458
+1001 12 -.0304286312
+1016 12 -.0837532878
+1024 12 -.0046857805
+1032 12 -.00173666712
+1033 12 -.00420468301
+1035 12 -.000642606348
+1206 12 -.69760108
+1212 12 -.387774557
+1237 12 -.00643205643
+13 13 -1
+65 13 -1
+117 13 -1
+241 13 -1
+1024 13 -.00567285763
+1025 13 -.0843898356
+1029 13 -.0712952465
+1031 13 -.00218436681
+1032 13 -.000209217687
+1033 13 -.000506146345
+1035 13 -.000128947213
+1237 13 -.00380237587
+14 14 -1
+66 14 -1
+118 14 -1
+242 14 -1
+995 14 -.00241041323
+999 14 -.0125423633
+1016 14 -.0131881451
+1017 14 -.0133477459
+1022 14 -.536979139
+1024 14 -.00527575752
+1029 14 -.130613148
+1032 14 -.00069779309
+1033 14 -.00168956094
+1035 14 -.0002896472
+1214 14 -.343984067
+1237 14 -.0051491987
+15 15 -1
+67 15 -1
+119 15 -1
+243 15 -1
+992 15 -.101745531
+1021 15 -.026953999
+1024 15 -.000438700983
+1029 15 -.0381027572
+1032 15 -.000216832879
+1033 15 -.000524975592
+1035 15 -9.08051734e-5
+1205 15 -.0447068959
+1237 15 -.000604115834
+16 16 -1
+68 16 -1
+120 16 -1
+244 16 -1
+1016 16 -.0170154739
+1023 16 -.0544804186
+1024 16 -.00294610416
+1029 16 -.00352424942
+1032 16 -.000530659396
+1033 16 -.00128507311
+1035 16 -.000373675866
+1229 16 -.0134332012
+1236 16 -.133385956
+1237 16 -.00307743694
+1242 16 -.0323741399
+17 17 -1
+69 17 -1
+121 17 -1
+245 17 -1
+1017 17 -.000522729533
+1023 17 -.00491144974
+1024 17 -.000117239055
+1028 17 -.00491660368
+1032 17 -4.84968186e-5
+1033 17 -.000117170734
+1035 17 -3.02038534e-5
+1237 17 -.000120823162
+18 18 -1
+70 18 -1
+122 18 -1
+246 18 -1
+1003 18 -.0184873845
+1016 18 -.000948597852
+1017 18 -.00118245208
+1032 18 -3.76752141e-5
+1033 18 -9.16097561e-5
+1035 18 -2.0716745e-5
+19 19 -1
+71 19 -1
+123 19 -1
+247 19 -1
+1017 19 -.000771998952
+1023 19 -.016720295
+1024 19 -.000654269592
+1032 19 -8.01600327e-5
+1033 19 -.00019453658
+1035 19 -4.97589099e-5
+1219 19 -.203145415
+1237 19 -.000749814324
+20 20 -1
+72 20 -1
+124 20 -1
+248 20 -1
+1016 20 -.0393997468
+1024 20 -3.02552417e-5
+1032 20 -1.28256052e-5
+1033 20 -3.05365866e-5
+1035 20 -9.0998783e-6
+1237 20 -3.55362245e-5
+21 21 -1
+73 21 -1
+125 21 -1
+249 21 -1
+1025 21 -.049051892
+1027 21 -.03672003
+1028 21 -.0271796845
+1029 21 -.0359273776
+1032 21 -.000238876892
+1033 21 -.000577658531
+1035 21 -.000104939027
+1221 21 -.0505565293
+22 22 -1
+74 22 -1
+126 22 -1
+250 22 -1
+1028 22 -.00121411122
+1032 22 -6.01200236e-6
+1033 22 -1.42439021e-5
+1035 22 -3.67867437e-6
+23 23 -1
+75 23 -1
+127 23 -1
+251 23 -1
+1024 23 -.000347935274
+1028 23 -.00235384447
+1032 23 -3.32664131e-5
+1033 23 -7.99024419e-5
+1035 23 -1.47146975e-5
+1223 23 -.136343017
+1237 23 -.000337594131
+24 24 -1
+76 24 -1
+128 24 -1
+252 24 -1
+1017 24 -.0018250834
+1024 24 -.00935265142
+1028 24 -.0397599563
+1029 24 -.00592482509
+1032 24 -.000505008211
+1033 24 -.00122273166
+1035 24 -.00755212503
+1231 24 -.00374332629
+1237 24 -.00920388196
+1241 24 -.0966170877
+25 25 -1
+77 25 -1
+129 25 -1
+253 25 -1
+999 25 -.0553583428
+1017 25 -.0156889856
+1023 25 -.0150773702
+1024 25 -.000801763905
+1028 25 -.0358140953
+1029 25 -.00303902687
+1032 25 -.000537873828
+1033 25 -.00130273169
+1035 25 -.000315010693
+1225 25 -.746915162
+1237 25 -.000920388207
+26 26 -1
+78 26 -1
+130 26 -1
+254 26 -1
+1017 26 -.00196155044
+1018 26 -.0213435292
+1023 26 -.00322694657
+1024 26 -.00308981654
+1028 26 -.000841127534
+1029 26 -.00117707148
+1032 26 -4.40880176e-5
+1033 26 -.000106243904
+1035 26 -6.73778268e-5
+1226 26 -1
+1237 26 -.00356072956
+27 27 -1
+79 27 -1
+131 27 -1
+255 27 -1
+1017 27 -.00653464487
+1023 27 -.00478093931
+1024 27 -.000298770494
+1025 27 -.125943303
+1028 27 -.00790266134
+1029 27 -.0120934229
+1031 27 -.000896510552
+1032 27 -.000188376071
+1033 27 -.000456000009
+1035 27 -.000134755654
+1237 27 -.000302057917
+28 28 -1
+80 28 -1
+132 28 -1
+256 28 -1
+999 28 -.0342344381
+1017 28 -.00749491062
+1023 28 -.0508867763
+1024 28 -.00153167162
+1028 28 -.00446431991
+1032 28 -.000415228977
+1033 28 -.00100536586
+1035 28 -.000218397094
+1228 28 -.0115931267
+1229 28 -.0134739932
+1236 28 -.0205113031
+1237 28 -.00180524017
+29 29 -1
+81 29 -1
+133 29 -1
+257 29 -1
+999 29 -.000225418378
+1016 29 -.00357041694
+1017 29 -2.39277597e-5
+1021 29 -.0142107746
+1024 29 -.00245445641
+1028 29 -.000650260481
+1029 29 -.00939567667
+1032 29 -.000157514456
+1033 29 -.000381170743
+1035 29 -7.84138465e-5
+1229 29 -.152073503
+1236 29 -.013414626
+1237 29 -.0029779356
+30 30 -1
+82 30 -1
+134 30 -1
+258 30 -1
+993 30 -.13983576
+995 30 -.0519003533
+998 30 -.434840858
+999 30 -.0361280963
+1016 30 -.0804134309
+1017 30 -.193728089
+1022 30 -.111547887
+1024 30 -.0011648268
+1032 30 -.00334107014
+1033 30 -.00808946323
+1035 30 -.00176595734
+1206 30 -.206027895
+1211 30 -.00305439346
+1212 30 -.612225473
+1230 30 -.0151701225
+1231 30 -.0947329998
+1236 30 -.0337812342
+1237 30 -.00157425471
+31 31 -1
+83 31 -1
+135 31 -1
+259 31 -1
+1017 31 -.0022081905
+1018 31 -.0220815986
+1024 31 -.0012631563
+1029 31 -.00698581478
+1032 31 -.000108616841
+1033 31 -.000263512193
+1035 31 -4.74355365e-5
+1231 31 -.168003932
+1237 31 -.00138235907
+32 32 -1
+84 32 -1
+136 32 -1
+260 32 -1
+994 32 -.0379240662
+995 32 -.0379112288
+1017 32 -3.78637087e-5
+1021 32 -.020616198
+1024 32 -.00118751824
+1029 32 -.00924244896
+1032 32 -.000309818512
+1033 32 -.000749756116
+1035 32 -.00011365168
+1237 32 -.00190118793
+33 33 -1
+85 33 -1
+137 33 -1
+261 33 -1
+990 33 -.103008263
+992 33 -.386701792
+994 33 -.0299895387
+996 33 -.0525375903
+1003 33 -.0123860007
+1017 33 -.000547971984
+1020 33 -.0851425901
+1021 33 -.0635736883
+1024 33 -.00141821441
+1032 33 -.00084248191
+1033 33 -.00203999993
+1035 33 -.000350635761
+1237 33 -.00242357049
+34 34 -1
+86 34 -1
+138 34 -1
+262 34 -1
+992 34 -.166872039
+994 34 -.0337420292
+1020 34 -.0880064443
+1024 34 -.00389158027
+1026 34 -.662715316
+1032 34 -.000541481015
+1033 34 -.00131102442
+1035 34 -.000228077814
+1203 34 -.12712808
+1234 34 -.00809310284
+1237 34 -.00743773161
+35 35 -1
+87 35 -1
+139 35 -1
+263 35 -1
+995 35 -.0297079161
+999 35 -.00542924087
+1017 35 -.00206804206
+1024 35 -.00353608117
+1032 35 -.000203606483
+1033 35 -.000492780469
+1035 35 -9.89369801e-5
+1235 35 -.320892364
+1237 35 -.00484358752
+36 36 -1
+88 36 -1
+140 36 -1
+264 36 -1
+996 36 -.1226741
+1017 36 -.0109305168
+1023 36 -.205520347
+1024 36 -.0430607721
+1029 36 -.00488937367
+1032 36 -.00213786797
+1033 36 -.00517551228
+1035 36 -.000938836427
+1236 36 -.467726886
+1237 36 -.0498075709
+37 37 -1
+89 37 -1
+141 37 -1
+265 37 -1
+1023 37 -.00256573432
+1024 37 -.000937912439
+1029 37 -.0942864418
+1031 37 -.0170647688
+1032 37 -.000370740134
+1033 37 -.00089843903
+1035 37 -.00022846504
+1237 37 -.000678741897
+38 38 -1
+90 38 -1
+142 38 -1
+266 38 -1
+1024 38 -.002072484
+1025 38 -.202045768
+1029 38 -.0614770278
+1032 38 -.00021001928
+1033 38 -.000507902412
+1035 38 -.000128366373
+1237 38 -.00147830695
+39 39 -1
+91 39 -1
+143 39 -1
+267 39 -1
+1024 39 -.000340371451
+1026 39 -.259070516
+1032 39 -6.81360279e-6
+1033 39 -1.66829268e-5
+1035 39 -6.38927668e-6
+1237 39 -.000604115834
+40 40 -1
+92 40 -1
+144 40 -1
+268 40 -1
+1025 40 -.015785791
+1027 40 -.0408238396
+1029 40 -.0586399846
+1031 40 -.0160451792
+1032 40 -.000332263327
+1033 40 -.000804975629
+1035 40 -.000156246853
+1240 40 -.0734207183
+41 41 -1
+93 41 -1
+145 41 -1
+269 41 -1
+1016 41 -.046553757
+1024 41 -.0124538131
+1028 41 -.412854701
+1029 41 -.00376570015
+1031 41 -.00429942692
+1032 41 -.00245209527
+1033 41 -.00593648758
+1035 41 -.00122170709
+1237 41 -.0208562091
+1241 41 -.158014163
+42 42 -1
+94 42 -1
+146 42 -1
+270 42 -1
+994 42 -.0314314887
+996 42 -.0347981751
+1003 42 -.00416617095
+1016 42 -.0112184873
+1017 42 -.0180052444
+1018 42 -.0245015007
+1020 42 -.0156204402
+1021 42 -.0299912523
+1022 42 -.0206400584
+1024 42 -.00367979356
+1027 42 -.0441914909
+1028 42 -.00477933232
+1032 42 -.000408415362
+1033 42 -.000988682965
+1035 42 -.000165540347
+1237 42 -.0060127289
+1242 42 -.0315532461
+43 43 -1
+95 43 -1
+147 43 -1
+271 43 -1
+1010 43 -.0185339358
+1024 43 -.00116104481
+1029 43 -.000473614549
+1032 43 -9.61920341e-6
+1033 43 -2.3121951e-5
+1035 43 -4.22079465e-5
+1223 43 -.032330662
+1237 43 -.0010412113
+44 44 -1
+96 44 -1
+148 44 -1
+272 44 -1
+1017 44 -.00453286432
+1024 44 -.178381115
+1032 44 -.000397593743
+1033 44 -.000962439051
+1035 44 -.00175821269
+1039 44 -1
+1223 44 -.471103936
+1237 44 -.314317912
+45 45 -1
+97 45 -1
+149 45 -1
+273 45 -1
+1017 45 -.0437515154
+1024 45 -.510670662
+1028 45 -.160897091
+1029 45 -.0078657167
+1032 45 -.00205049361
+1033 45 -.00496419519
+1035 45 -.00157118123
+1237 45 -.500001788
+1241 45 -.0621112846
+1242 45 -.0410499051
+46 46 -1
+98 46 -1
+150 46 -1
+274 46 -1
+1003 46 -.698079586
+1017 46 -.194451973
+1018 46 -.0852652192
+1023 46 -.0307986289
+1032 46 -.0070488723
+1033 46 -.017065756
+1035 46 -.27732712
+1236 46 -.0143419951
+47 47 -1
+99 47 -1
+151 47 -1
+275 47 -1
+1000 47 -.7457304
+1010 47 -.641808689
+1014 47 -.949445844
+1016 47 -.116051726
+1017 47 -.0741642267
+1018 47 -.146283031
+1023 47 -.169224694
+1025 47 -.178926215
+1028 47 -.140577674
+1029 47 -.0135398041
+1030 47 -.927194357
+1032 47 -.00480118487
+1033 47 -.011623024
+1035 47 -.0306480043
+1213 47 -1
+1227 47 -1
+1229 47 -.175491393
+1231 47 -.019035751
+1236 47 -.00282683177
+1238 47 -.279957622
+1241 47 -.0448854752
+48 48 -1
+100 48 -1
+152 48 -1
+276 48 -1
+1003 48 -.133129418
+1017 48 -.0100396676
+1032 48 -.000390780158
+1033 48 -.000946048764
+1216 48 -.0660542548
+1236 48 -.011331005
+49 49 -1
+101 49 -1
+153 49 -1
+277 49 -1
+1014 49 -.0376148261
+1016 49 -.040249534
+1017 49 -.0278500728
+1025 49 -.0421503261
+1030 49 -.0728056729
+1032 49 -.000298596104
+1033 49 -.000722439028
+1035 49 -.00263993279
+50 50 -1
+102 50 -1
+154 50 -1
+278 50 -1
+1032 50 -.00809095241
+1033 50 -.00952975638
+1250 50 -1
+51 51 -1
+103 51 -1
+155 51 -1
+279 51 -1
+52 52 -1
+104 52 -1
+156 52 -1
+280 52 -1
+1 53 1
+1094 53 -.00356158568
+2 54 1
+1094 54 -.0336924791
+3 55 1
+1094 55 -.0901731327
+4 56 1
+1094 56 -.0566755012
+5 57 1
+1094 57 -.0280031972
+6 58 1
+1094 58 -.0498667546
+7 59 1
+1094 59 -.023759827
+8 60 1
+1094 60 -.01643911
+9 61 1
+1094 61 -.00231136009
+10 62 1
+1094 62 -.00248041283
+11 63 1
+1094 63 -.00127226813
+12 64 1
+1094 64 -.00537730055
+13 65 1
+1094 65 -.00257951277
+14 66 1
+1094 66 -.00156978657
+15 67 1
+1094 67 -.00218602782
+16 68 1
+1094 68 -.00861218385
+17 69 1
+1094 69 -.000697780051
+18 70 1
+1094 70 -.000519145164
+19 71 1
+1094 71 -.00132793898
+20 72 1
+1094 72 -.000291506789
+21 73 1
+1094 73 -.00151601026
+22 74 1
+1094 74 -9.0428679e-5
+23 75 1
+1094 75 -.000142273973
+24 76 1
+1094 76 -.00739766378
+25 77 1
+1094 77 -.00819738582
+26 78 1
+1094 78 -.00208459608
+27 79 1
+1094 79 -.0036535447
+28 80 1
+1094 80 -.00488446048
+29 81 1
+1094 81 -.00122504996
+30 82 1
+1094 82 -.0436096154
+31 83 1
+1094 83 -.000693262264
+32 84 1
+1094 84 -.00165846641
+33 85 1
+1094 85 -.00810870528
+34 86 1
+1094 86 -.00322366226
+35 87 1
+1094 87 -.0012720495
+36 88 1
+1094 88 -.0130607868
+37 89 1
+1094 89 -.00165249128
+38 90 1
+1094 90 -.00177341502
+39 91 1
+1094 91 -.000316500373
+40 92 1
+1094 92 -.00204896391
+41 93 1
+1094 93 -.0236979984
+42 94 1
+1094 94 -.00135482708
+43 95 1
+1094 95 -.0015658153
+44 96 1
+1094 96 -.053842701
+45 97 1
+1094 97 -.022164572
+46 98 1
+1094 98 -.142507151
+47 99 1
+1094 99 -.0482241735
+48 100 1
+1094 100 -.179205015
+49 101 1
+50 102 1
+51 103 1
+1094 103 -.0576346964
+52 104 1
+1094 104 -.0317968652
+53 105 1
+1095 105 -1
+54 106 1
+1096 106 -1
+55 107 1
+1097 107 -1
+56 108 1
+1098 108 -1
+57 109 1
+1099 109 -1
+58 110 1
+1100 110 -1
+59 111 1
+1101 111 -1
+60 112 1
+1102 112 -1
+61 113 1
+1103 113 -1
+62 114 1
+1104 114 -1
+63 115 1
+1105 115 -1
+64 116 1
+1106 116 -1
+65 117 1
+1107 117 -1
+66 118 1
+1108 118 -1
+67 119 1
+1109 119 -1
+68 120 1
+1110 120 -1
+69 121 1
+1111 121 -1
+70 122 1
+1112 122 -1
+71 123 1
+1113 123 -1
+72 124 1
+1114 124 -1
+73 125 1
+1115 125 -1
+74 126 1
+1116 126 -1
+75 127 1
+1117 127 -1
+76 128 1
+1118 128 -1
+77 129 1
+1119 129 -1
+78 130 1
+1120 130 -1
+79 131 1
+1121 131 -1
+80 132 1
+1122 132 -1
+81 133 1
+1123 133 -1
+82 134 1
+1124 134 -1
+83 135 1
+1125 135 -1
+84 136 1
+1126 136 -1
+85 137 1
+1127 137 -1
+86 138 1
+1128 138 -1
+87 139 1
+1129 139 -1
+88 140 1
+1130 140 -1
+89 141 1
+1131 141 -1
+90 142 1
+1132 142 -1
+91 143 1
+1133 143 -1
+92 144 1
+1134 144 -1
+93 145 1
+1135 145 -1
+94 146 1
+1136 146 -1
+95 147 1
+1137 147 -1
+96 148 1
+1138 148 -1
+97 149 1
+1139 149 -1
+98 150 1
+1140 150 -1
+99 151 1
+1141 151 -1
+100 152 1
+1142 152 -1
+101 153 1
+1143 153 -1
+102 154 1
+1144 154 -1
+103 155 1
+1145 155 -1
+104 156 1
+1146 156 -1
+105 157 1
+1147 157 -1
+106 158 1
+1148 158 -1
+107 159 1
+1149 159 -1
+108 160 1
+1150 160 -1
+109 161 1
+1151 161 -1
+110 162 1
+1152 162 -1
+111 163 1
+1153 163 -1
+112 164 1
+1154 164 -1
+113 165 1
+1155 165 -1
+114 166 1
+1156 166 -1
+115 167 1
+1157 167 -1
+116 168 1
+1158 168 -1
+117 169 1
+1159 169 -1
+118 170 1
+1160 170 -1
+119 171 1
+1161 171 -1
+120 172 1
+1162 172 -1
+121 173 1
+1163 173 -1
+122 174 1
+1164 174 -1
+123 175 1
+1165 175 -1
+124 176 1
+1166 176 -1
+125 177 1
+1167 177 -1
+126 178 1
+1168 178 -1
+127 179 1
+1169 179 -1
+128 180 1
+1170 180 -1
+129 181 1
+1171 181 -1
+130 182 1
+1172 182 -1
+131 183 1
+1173 183 -1
+132 184 1
+1174 184 -1
+133 185 1
+1175 185 -1
+134 186 1
+1176 186 -1
+135 187 1
+1177 187 -1
+136 188 1
+1178 188 -1
+137 189 1
+1179 189 -1
+138 190 1
+1180 190 -1
+139 191 1
+1181 191 -1
+140 192 1
+1182 192 -1
+141 193 1
+1183 193 -1
+142 194 1
+1184 194 -1
+143 195 1
+1185 195 -1
+144 196 1
+1186 196 -1
+145 197 1
+1187 197 -1
+146 198 1
+1188 198 -1
+147 199 1
+1189 199 -1
+148 200 1
+1190 200 -1
+149 201 1
+1191 201 -1
+150 202 1
+1192 202 -1
+151 203 1
+1193 203 -1
+152 204 1
+1194 204 -1
+153 205 1
+1195 205 -1
+154 206 1
+1196 206 -1
+155 207 1
+1197 207 -1
+156 208 1
+1198 208 -1
+157 209 1
+1040 209 -.0178465787
+158 210 1
+1041 210 -.000569905678
+159 211 1
+1042 211 -.00449396856
+160 212 1
+1043 212 -5.19715795e-5
+161 213 1
+1044 213 -1.39432741e-5
+162 214 1
+1045 214 -.00359829981
+163 215 1
+1046 215 -.00216023461
+164 216 1
+1047 216 -.0296454262
+165 217 1
+1048 217 -.00490016444
+166 218 1
+1049 218 -.0130853141
+167 219 1
+1040 219 -.292256594
+168 220 1
+1041 220 -.0458589569
+169 221 1
+1042 221 -.0251878798
+170 222 1
+1043 222 -.00707555935
+171 223 1
+1044 223 -.0032789933
+172 224 1
+1045 224 -.0671901256
+173 225 1
+1046 225 -.0452964678
+174 226 1
+1047 226 -.156934559
+175 227 1
+1048 227 -.052540075
+176 228 1
+1049 228 -.0818570703
+177 229 1
+1040 229 -.0368810818
+178 230 1
+1041 230 -.144300938
+179 231 1
+1042 231 -.0779947191
+180 232 1
+1043 232 -.254118741
+181 233 1
+1044 233 -.248348296
+182 234 1
+1045 234 -.214205191
+183 235 1
+1046 235 -.120531969
+184 236 1
+1047 236 -.481847942
+185 237 1
+1048 237 -.16768302
+186 238 1
+1049 238 -.287884384
+187 239 1
+1040 239 -.531182885
+188 240 1
+1041 240 -.348290294
+189 241 1
+1042 241 -.613069355
+190 242 1
+1043 242 -.0259709395
+191 243 1
+1044 243 -.00881912094
+192 244 1
+1045 244 -.131283581
+193 245 1
+1046 245 -.115888983
+194 246 1
+1047 246 -.119731724
+195 247 1
+1048 247 -.163664266
+196 248 1
+1049 248 -.143314779
+197 249 1
+1040 249 -.0488956943
+198 250 1
+1041 250 -.182706028
+199 251 1
+1042 251 -.0637331307
+200 252 1
+1043 252 -.164185643
+201 253 1
+1044 253 -.131180644
+202 254 1
+1045 254 -.137433782
+203 255 1
+1046 255 -.0762486756
+204 256 1
+1047 256 -.0659873635
+205 257 1
+1048 257 -.153722018
+206 258 1
+1049 258 -.100952752
+207 259 1
+1040 259 -.015941849
+208 260 1
+1041 260 -.175022557
+209 261 1
+1042 261 -.148241401
+210 262 1
+1043 262 -.295332223
+211 263 1
+1044 263 -.435067326
+212 264 1
+1045 264 -.299447656
+213 265 1
+1046 265 -.429889739
+214 266 1
+1047 266 -.094635874
+215 267 1
+1048 267 -.262589782
+216 268 1
+1049 268 -.157239005
+217 269 1
+1040 269 -.0479257405
+218 270 1
+1041 270 -.0681549832
+219 271 1
+1042 271 -.041436553
+220 272 1
+1043 272 -.0313165896
+221 273 1
+1044 273 -.0482902043
+222 274 1
+1045 274 -.0312532112
+223 275 1
+1046 275 -.165843651
+224 276 1
+1047 276 -.0394780114
+225 277 1
+1048 277 -.106069766
+226 278 1
+1049 278 -.106182404
+227 279 1
+1040 279 -.00906958152
+228 280 1
+1041 280 -.0350963511
+229 281 1
+1042 281 -.0258430243
+230 282 1
+1043 282 -.221948341
+231 283 1
+1044 283 -.125001445
+232 284 1
+1045 284 -.115588143
+233 285 1
+1046 285 -.0441402867
+234 286 1
+1047 286 -.0117391003
+235 287 1
+1048 287 -.0888309106
+236 288 1
+1049 288 -.109484315
+237 289 1
+1050 289 -1
+238 290 1
+1051 290 -1
+239 291 1
+1052 291 -1
+240 292 1
+1053 292 -1
+241 293 1
+1054 293 -1
+242 294 1
+1055 294 -1
+243 295 1
+1056 295 -1
+244 296 1
+1057 296 -1
+245 297 1
+1058 297 -1
+246 298 1
+1059 298 -1
+247 299 1
+1060 299 -1
+248 300 1
+1061 300 -1
+249 301 1
+1062 301 -1
+250 302 1
+1063 302 -1
+251 303 1
+1064 303 -1
+252 304 1
+1065 304 -1
+253 305 1
+1066 305 -1
+254 306 1
+1067 306 -1
+255 307 1
+1068 307 -1
+256 308 1
+1069 308 -1
+257 309 1
+1070 309 -1
+258 310 1
+1071 310 -1
+259 311 1
+1072 311 -1
+260 312 1
+1073 312 -1
+261 313 1
+1074 313 -1
+262 314 1
+1075 314 -1
+263 315 1
+1076 315 -1
+264 316 1
+1077 316 -1
+265 317 1
+1078 317 -1
+266 318 1
+1079 318 -1
+267 319 1
+1080 319 -1
+268 320 1
+1081 320 -1
+269 321 1
+1082 321 -1
+270 322 1
+1083 322 -1
+271 323 1
+1084 323 -1
+272 324 1
+1085 324 -1
+273 325 1
+1086 325 -1
+274 326 1
+1087 326 -1
+275 327 1
+1088 327 -1
+276 328 1
+1089 328 -1
+277 329 1
+1090 329 -1
+278 330 1
+1091 330 -1
+279 331 1
+1092 331 -1
+280 332 1
+1093 332 -1
+285 333 1
+286 334 1
+287 335 1
+288 336 1
+989 336 -.0303959493
+289 337 1
+990 337 -.35601595
+290 338 1
+291 339 1
+992 339 -.209159389
+292 340 1
+293 341 1
+994 341 -.819865763
+294 342 1
+995 342 -.645634174
+295 343 1
+996 343 -.639332294
+296 344 1
+997 344 -.997142851
+297 345 1
+998 345 -.5
+298 346 1
+299 347 1
+1000 347 -.251026601
+300 348 1
+1001 348 -.573770761
+301 349 1
+1002 349 -.999197245
+302 350 1
+1003 350 -.0612838604
+303 351 1
+1004 351 -.979579866
+304 352 1
+1005 352 -.941222906
+305 353 1
+1006 353 -.982795656
+306 354 1
+1007 354 -.999544799
+307 355 1
+1008 355 -.998440027
+308 356 1
+309 357 1
+310 358 1
+1011 358 -.068720825
+311 359 1
+312 360 1
+313 361 1
+314 362 1
+1015 362 -.937243104
+315 363 1
+1016 363 -.305369467
+316 364 1
+1017 364 -.24439393
+317 365 1
+1018 365 -.059916757
+318 366 1
+1019 366 -.409031749
+319 367 1
+1020 367 -.654813766
+320 368 1
+1021 368 -.775436103
+321 369 1
+1022 369 -.0567775071
+322 370 1
+1023 370 -.349151224
+323 371 1
+1024 371 -.100001134
+324 372 1
+1025 372 -.060165748
+325 373 1
+1026 373 -.0782141462
+326 374 1
+1027 374 -.87296319
+327 375 1
+1028 375 -.0758721083
+328 376 1
+1029 376 -.0622733496
+329 377 1
+330 378 1
+331 379 1
+1032 379 -.742725551
+332 380 1
+1033 380 -.790000021
+333 381 1
+334 382 1
+1035 382 -.662945509
+335 383 1
+1036 383 -1
+336 384 1
+337 385 1
+338 386 1
+339 387 1
+340 388 1
+341 389 1
+342 390 1
+1202 390 -.32218644
+343 391 1
+1203 391 -.728480697
+344 392 1
+1204 392 -1
+345 393 1
+1205 393 -.955293119
+346 394 1
+347 395 1
+1207 395 -1
+348 396 1
+1208 396 -1
+349 397 1
+1209 397 -1
+350 398 1
+1210 398 -1
+351 399 1
+1211 399 -.99694562
+352 400 1
+353 401 1
+354 402 1
+1214 402 -.656015933
+355 403 1
+1215 403 -1
+356 404 1
+1216 404 -.86789149
+357 405 1
+1217 405 -1
+358 406 1
+1218 406 -1
+359 407 1
+1219 407 -.796854556
+360 408 1
+1220 408 -1
+361 409 1
+1221 409 -.80319649
+362 410 1
+363 411 1
+1223 411 -.23457785
+364 412 1
+1224 412 -.101583153
+365 413 1
+366 414 1
+367 415 1
+368 416 1
+1228 416 -.988406897
+369 417 1
+1229 417 -.610173583
+370 418 1
+1230 418 -.984829903
+371 419 1
+1231 419 -.145812437
+372 420 1
+1232 420 -1
+373 421 1
+1233 421 -1
+374 422 1
+1234 422 -.935439289
+375 423 1
+1235 423 -.679107606
+376 424 1
+1236 424 -.254835814
+377 425 1
+378 426 1
+1238 426 -.569056511
+379 427 1
+1239 427 -1
+380 428 1
+1240 428 -.926579297
+381 429 1
+1241 429 -.0386048704
+382 430 1
+1242 430 -.491169512
+383 431 1
+1243 431 -1
+384 432 1
+1244 432 -1
+385 433 1
+1245 433 -1
+386 434 1
+1246 434 -1
+387 435 1
+1247 435 -1
+388 436 1
+1248 436 -1
+389 437 1
+1249 437 -1
+390 438 1
+391 439 1
+1251 439 -1
+392 440 1
+1252 440 -1
+285 441 -1
+339 441 -1
+447 441 1
+286 442 -1
+340 442 -1
+448 442 1
+287 443 -1
+341 443 -1
+449 443 1
+288 444 -1
+342 444 -1
+450 444 1
+289 445 -1
+343 445 -1
+451 445 1
+290 446 -1
+344 446 -1
+452 446 1
+291 447 -1
+345 447 -1
+453 447 1
+292 448 -1
+346 448 -1
+454 448 1
+293 449 -1
+347 449 -1
+455 449 1
+294 450 -1
+348 450 -1
+456 450 1
+295 451 -1
+349 451 -1
+457 451 1
+296 452 -1
+350 452 -1
+458 452 1
+297 453 -1
+351 453 -1
+459 453 1
+298 454 -1
+352 454 -1
+460 454 1
+299 455 -1
+353 455 -1
+461 455 1
+300 456 -1
+354 456 -1
+462 456 1
+301 457 -1
+355 457 -1
+463 457 1
+302 458 -1
+356 458 -1
+464 458 1
+303 459 -1
+357 459 -1
+465 459 1
+304 460 -1
+358 460 -1
+466 460 1
+305 461 -1
+359 461 -1
+467 461 1
+306 462 -1
+360 462 -1
+468 462 1
+307 463 -1
+361 463 -1
+469 463 1
+308 464 -1
+362 464 -1
+470 464 1
+309 465 -1
+363 465 -1
+471 465 1
+310 466 -1
+364 466 -1
+472 466 1
+311 467 -1
+365 467 -1
+473 467 1
+312 468 -1
+366 468 -1
+474 468 1
+313 469 -1
+367 469 -1
+475 469 1
+314 470 -1
+368 470 -1
+476 470 1
+315 471 -1
+369 471 -1
+477 471 1
+316 472 -1
+370 472 -1
+478 472 1
+317 473 -1
+371 473 -1
+479 473 1
+318 474 -1
+372 474 -1
+480 474 1
+319 475 -1
+373 475 -1
+481 475 1
+320 476 -1
+374 476 -1
+482 476 1
+321 477 -1
+375 477 -1
+483 477 1
+322 478 -1
+376 478 -1
+484 478 1
+323 479 -1
+377 479 -1
+485 479 1
+324 480 -1
+378 480 -1
+486 480 1
+325 481 -1
+379 481 -1
+487 481 1
+326 482 -1
+380 482 -1
+488 482 1
+327 483 -1
+381 483 -1
+489 483 1
+328 484 -1
+382 484 -1
+490 484 1
+329 485 -1
+383 485 -1
+491 485 1
+330 486 -1
+384 486 -1
+492 486 1
+331 487 -1
+385 487 -1
+493 487 1
+332 488 -1
+386 488 -1
+494 488 1
+333 489 -1
+387 489 -1
+495 489 1
+334 490 -1
+388 490 -1
+496 490 1
+335 491 -1
+389 491 -1
+497 491 1
+336 492 -1
+390 492 -1
+498 492 1
+337 493 -1
+391 493 -1
+499 493 1
+338 494 -1
+392 494 -1
+500 494 1
+501 495 .0500000007
+986 495 -.269156814
+1032 495 -.00465328991
+1033 495 -.00622985372
+1254 495 -.0380239636
+505 496 .0500000007
+990 496 -.466260761
+1032 496 -.00184849033
+1033 496 -.00247492688
+1254 496 -.0151058435
+508 497 .560000002
+993 497 -.763145924
+1032 497 -.106123865
+1033 497 -.00968712196
+1254 497 -.216793314
+514 498 .239999995
+999 498 -.815083683
+1032 498 -.0105610844
+1033 498 -.0141391223
+1254 498 -.086298421
+516 499 .0500000007
+1001 499 -.39580062
+1032 499 -.00164448307
+1033 499 -.00220156088
+1254 499 -.0134372637
+534 500 .0500000007
+1019 500 -.511013329
+1032 500 -.00136712939
+1033 500 -.00183034141
+1254 500 -.0111717703
+546 501 .0500000007
+1031 501 -.959510028
+1032 501 -.0295934808
+1033 501 -.0396195129
+1254 501 -.241819128
+555 502 1
+556 503 1
+557 504 1
+558 505 1
+559 506 1
+560 507 1
+561 508 1
+562 509 1
+563 510 1
+564 511 1
+565 512 1
+566 513 1
+567 514 1
+568 515 1
+569 516 1
+570 517 1
+571 518 1
+572 519 1
+1003 519 -.027629815
+573 520 1
+574 521 1
+575 522 1
+576 523 1
+577 524 1
+578 525 1
+579 526 1
+580 527 1
+581 528 1
+582 529 1
+583 530 1
+584 531 1
+585 532 1
+586 533 1
+1017 533 -.00340405246
+587 534 1
+588 535 1
+1019 535 -.079954952
+589 536 1
+1020 536 -.0364257284
+590 537 1
+591 538 1
+592 539 1
+593 540 1
+1024 540 -.0636040792
+594 541 1
+595 542 1
+596 543 1
+597 544 1
+598 545 1
+1029 545 -.125452146
+599 546 1
+600 547 1
+601 548 1
+1032 548 -.0137514537
+602 549 1
+1033 549 -.00257521961
+603 550 1
+604 551 1
+1035 551 -.00106875168
+605 552 1
+606 553 1
+607 554 1
+1038 554 -1
+608 555 1
+609 556 1
+610 557 1
+611 558 1
+612 559 1
+613 560 1
+614 561 1
+615 562 1
+616 563 1
+617 564 1
+618 565 1
+619 566 1
+620 567 1
+621 568 1
+622 569 1
+623 570 1
+624 571 1
+625 572 1
+626 573 1
+1216 573 -.0660542548
+627 574 1
+628 575 1
+629 576 1
+630 577 1
+631 578 1
+632 579 1
+633 580 1
+1223 580 -.12564452
+634 581 1
+635 582 1
+636 583 1
+637 584 1
+638 585 1
+639 586 1
+640 587 1
+641 588 1
+642 589 1
+643 590 1
+644 591 1
+645 592 1
+646 593 1
+647 594 1
+648 595 1
+649 596 1
+650 597 1
+651 598 1
+652 599 1
+1242 599 -.0715435967
+653 600 1
+654 601 1
+655 602 1
+656 603 1
+657 604 1
+658 605 1
+659 606 1
+660 607 1
+661 608 1
+662 609 1
+986 610 1
+1040 610 1
+987 611 1
+1041 611 1
+988 612 1
+1042 612 1
+989 613 1
+1043 613 1
+990 614 1
+1044 614 1
+991 615 1
+1045 615 1
+992 616 1
+1046 616 1
+993 617 1
+1047 617 1
+994 618 1
+1048 618 1
+995 619 1
+1049 619 1
+996 620 1
+1050 620 1
+997 621 1
+1051 621 1
+998 622 1
+1052 622 1
+999 623 1
+1053 623 1
+1000 624 1
+1054 624 1
+1001 625 1
+1055 625 1
+1002 626 1
+1056 626 1
+1003 627 1
+1057 627 1
+1004 628 1
+1058 628 1
+1005 629 1
+1059 629 1
+1006 630 1
+1060 630 1
+1007 631 1
+1061 631 1
+1008 632 1
+1062 632 1
+1009 633 1
+1063 633 1
+1010 634 1
+1064 634 1
+1011 635 1
+1065 635 1
+1012 636 1
+1066 636 1
+1013 637 1
+1067 637 1
+1014 638 1
+1068 638 1
+1015 639 1
+1069 639 1
+1016 640 1
+1070 640 1
+1017 641 1
+1071 641 1
+1018 642 1
+1072 642 1
+1019 643 1
+1073 643 1
+1020 644 1
+1074 644 1
+1021 645 1
+1075 645 1
+1022 646 1
+1076 646 1
+1023 647 1
+1077 647 1
+1024 648 1
+1078 648 1
+1025 649 1
+1079 649 1
+1026 650 1
+1080 650 1
+1027 651 1
+1081 651 1
+1028 652 1
+1082 652 1
+1029 653 1
+1083 653 1
+1030 654 1
+1084 654 1
+1031 655 1
+1085 655 1
+1032 656 1
+1086 656 1
+1033 657 1
+1087 657 1
+1034 658 1
+1088 658 1
+1035 659 1
+1089 659 1
+1036 660 1
+1090 660 1
+1037 661 1
+1091 661 1
+1038 662 1
+1092 662 1
+1039 663 1
+1093 663 1
+1199 664 1
+1253 664 -2.12379946e-5
+1200 665 1
+1253 665 -.00138877379
+1201 666 1
+1253 666 -.0016952916
+1202 667 1
+1253 667 -.0130305747
+1203 668 1
+1253 668 -.0715932772
+1204 669 1
+1205 670 1
+1253 670 -.0156391002
+1206 671 1
+1253 671 -.0196913686
+1207 672 1
+1253 672 -.000465031248
+1208 673 1
+1253 673 -.0646045059
+1209 674 1
+1253 674 -1.41096707e-5
+1210 675 1
+1253 675 -.0213194918
+1211 676 1
+1253 676 -.0482018702
+1212 677 1
+1253 677 -.011754605
+1213 678 1
+1253 678 -7.12832334e-5
+1214 679 1
+1253 679 -.00290225632
+1215 680 1
+1253 680 -.00432887627
+1216 681 1
+1253 681 -.0161347762
+1217 682 1
+1253 682 -.00365646253
+1218 683 1
+1253 683 -.00112061657
+1219 684 1
+1253 684 -.00128508243
+1220 685 1
+1253 685 -.000721944845
+1221 686 1
+1253 686 -.00312227919
+1222 687 1
+1253 687 -.000479361363
+1223 688 1
+1253 688 -.00376875186
+1224 689 1
+1253 689 -.00984223001
+1225 690 1
+1253 690 -.00119829318
+1226 691 1
+1253 691 -8.72301025e-5
+1227 692 1
+1253 692 -.00248197932
+1228 693 1
+1253 693 -.0026970047
+1229 694 1
+1253 694 -.0204223525
+1230 695 1
+1253 695 -.121343024
+1231 696 1
+1253 696 -.0130022084
+1232 697 1
+1253 697 -.00675853249
+1233 698 1
+1253 698 -.0129593657
+1234 699 1
+1253 699 -.0472033173
+1235 700 1
+1253 700 -.00285191718
+1236 701 1
+1253 701 -.0533207394
+1237 702 1
+1253 702 -.0206797067
+1238 703 1
+1253 703 -.00944649801
+1239 704 1
+1240 705 1
+1253 705 -.00370004075
+1241 706 1
+1253 706 -.0503387488
+1242 707 1
+1253 707 -.0259307083
+1243 708 1
+1244 709 1
+1245 710 1
+1246 711 1
+1247 712 1
+1248 713 1
+1249 714 1
+1250 715 1
+1253 715 -.288725168
+1251 716 1
+1252 717 1
+281 718 1
+1011 718 -.548067629
+1032 718 -.00743604545
+1033 718 -.00482097548
+1034 718 -.131497085
+1224 718 -.247637033
+1231 718 -.00229462353
+1241 718 -.101192757
+1242 718 -.0262266193
+282 719 1
+1011 719 -.213327646
+1032 719 -.00371461594
+1033 719 -.00346439029
+1034 719 -.128691241
+1224 719 -.302950412
+1241 719 -.00893040746
+1242 719 -.0201682709
+283 720 1
+1011 720 -.0083140498
+1032 720 -.000252504105
+1033 720 -.000163707315
+1034 720 -.000393955124
+1224 720 -.062551029
+1241 720 -.0197172053
+284 721 1
+1009 721 -1
+1011 721 -.0402082577
+1012 721 -.967077792
+1013 721 -.997612417
+1028 721 -.0500000529
+1032 721 -.00488695642
+1033 721 -.00211219513
+1034 721 -.739417732
+1221 721 -.146247
+1222 721 -1
+1224 721 -.28527838
+1225 721 -.253084838
+1241 721 -.469926745
+1242 721 -.197006494
+281 722 -.0399474278
+925 722 -.0500000007
+977 722 .00667069061
+281 723 -.131836802
+926 723 -.0900000036
+977 723 .0220149979
+281 724 -.242864177
+927 724 -.0500000007
+977 724 .0405550972
+281 725 -.181239873
+928 725 -.0900000036
+977 725 .0302646551
+281 726 -.0938324183
+929 726 -.0700000003
+977 726 .0156687703
+281 727 -.179161012
+930 727 -.0700000003
+977 727 .0299175121
+281 728 -.0624370202
+931 728 -.0700000003
+977 728 .0104261544
+281 729 -.0686812699
+932 729 -.0700000003
+977 729 .0114688613
+282 730 -.0683486238
+933 730 -.129999995
+977 730 .00817558262
+282 731 -.0505733937
+934 731 -.0799999982
+977 731 .0060493825
+282 732 -.0338348635
+935 732 -.100000001
+977 732 .00404718798
+282 733 -.0425351672
+936 733 -.0799999982
+977 733 .00508788275
+282 734 -.0193547402
+937 734 -.109999999
+977 734 .00231513497
+282 735 -.023237003
+938 735 -.180000007
+977 735 .00277951523
+282 736 -.00978440326
+939 736 -.140000001
+977 736 .00117037038
+282 737 -.061770644
+940 737 -.129999995
+977 737 .00738875149
+282 738 -.000721712539
+941 738 -.129999995
+977 738 8.63283058e-5
+282 739 -.00198470941
+942 739 -.129999995
+977 739 .000237402841
+282 740 -.00272782869
+943 740 -.129999995
+977 740 .000326291716
+282 741 -.00198165141
+944 741 -.100000001
+977 741 .000237037035
+282 742 -.00797706377
+945 742 -.140000001
+977 742 .000954183808
+282 743 -.000946483167
+946 743 -.140000001
+977 743 .000113214446
+282 744 -.00112079515
+947 744 -.129999995
+977 744 .000134064932
+282 745 -.0594709478
+948 745 -.0900000036
+977 745 .00711367186
+282 746 -.0249847099
+949 746 -.109999999
+977 746 .00298856874
+282 747 -.0744204894
+950 747 -.109999999
+977 747 .00890187453
+282 748 -.0019556575
+951 748 -.150000006
+977 748 .000233927756
+282 749 -.0174281336
+952 749 -.129999995
+977 749 .00208468223
+282 750 -.0133516816
+953 750 -.140000001
+977 750 .00159707363
+282 751 -.0467324145
+954 751 -.109999999
+977 751 .00558994059
+282 752 -.00600917451
+955 752 -.140000001
+977 752 .000718792842
+282 753 -.0112645263
+956 753 -.100000001
+977 753 .00134741655
+282 754 -.0153486235
+957 754 -.119999997
+977 754 .00183593959
+282 755 -.0424143746
+958 755 -.140000001
+977 755 .00507343374
+282 756 -.0116284406
+959 756 -.0799999982
+977 756 .00139094645
+282 757 -.0841406733
+960 757 -.109999999
+977 757 .0100645637
+282 758 -.131145254
+961 758 -.200000003
+977 758 .0156870596
+282 759 -.00849541277
+962 759 -.119999997
+977 759 .00101618655
+282 760 -.00309327221
+963 760 -.0599999987
+977 760 .000370004564
+282 761 -.00779969431
+964 761 -.150000006
+977 761 .000932967523
+282 762 -.0735229328
+965 762 -.119999997
+977 762 .0087945126
+282 763 -.0398944952
+966 763 -.129999995
+977 763 .00477201631
+283 764 -.0177741945
+967 764 -.200000003
+977 764 .000100777317
+283 765 -.982225835
+968 765 -.200000003
+977 765 .00556909014
+284 766 -.0799999982
+969 766 -.159999996
+978 766 1
+284 767 -.360000014
+970 767 -.109999999
+979 767 1
+284 768 -.0799999982
+971 768 -.200000003
+980 768 1
+284 769 -.239999995
+972 769 -.150000006
+981 769 1
+284 770 -.239999995
+973 770 -.150000006
+982 770 1
+974 771 -.159999996
+983 771 1
+975 772 -.119999997
+984 772 1
+976 773 -.200000003
+157 774 -2.0382719
+158 774 2.5072279
+159 774 2.5072279
+160 774 2.5072279
+161 774 2.5072279
+162 774 2.5072279
+163 774 2.5072279
+164 774 .0574199595
+165 774 .889650881
+166 774 .889650881
+167 774 -.684651732
+168 774 .757262707
+169 774 .757262707
+170 774 .757262707
+171 774 .757262707
+172 774 .757262707
+173 774 .757262707
+174 774 .171273291
+175 774 .757262707
+176 774 .298174381
+177 774 -.173215106
+178 774 -.173215106
+179 774 -.173215106
+180 774 -.173215106
+181 774 -.173215106
+182 774 -.173215106
+183 774 -.173215106
+184 774 .00630014949
+185 774 .097612977
+186 774 .097612977
+187 774 -.74012363
+188 774 .712008893
+189 774 .281816572
+190 774 .712008893
+191 774 .712008893
+192 774 .712008893
+193 774 .712008893
+194 774 .161038041
+195 774 .712008893
+196 774 .280355543
+197 774 -2.82574105
+198 774 1.02045894
+199 774 1.02045894
+200 774 1.02045894
+201 774 1.02045894
+202 774 .0305909496
+203 774 1.02045894
+204 774 .016741449
+205 774 .0993608907
+206 774 .0993608907
+207 774 -.0301978122
+208 774 -.0301978122
+209 774 -.0301978122
+210 774 -.0301978122
+211 774 -.0301978122
+212 774 .00475290697
+213 774 -.0301978122
+214 774 .00260111387
+215 774 .0154376719
+216 774 .0154376719
+217 774 -.169090942
+218 774 -.169090942
+219 774 -.169090942
+220 774 -.169090942
+221 774 -.169090942
+222 774 -.169090942
+223 774 -.169090942
+224 774 .016070785
+225 774 .0953804851
+226 774 .0953804851
+227 774 -.12439438
+228 774 -.12439438
+229 774 -.12439438
+230 774 -.12439438
+231 774 -.12439438
+232 774 .00998765975
+233 774 -.12439438
+234 774 .00546592707
+235 774 .0324404053
+236 774 .0324404053
+339 774 -.5
+393 774 -1
+664 774 .110741682
+665 774 .262503922
+666 774 .0154968342
+667 774 .303868353
+668 774 .0555011146
+669 774 .0102592101
+670 774 .0757173747
+671 774 .0159322936
+672 774 -.612610757
+774 774 -.869999826
+986 774 8.81728047e-5
+1199 774 -.499836445
+157 775 .0104071647
+158 775 -.0639951527
+159 775 -.0639951527
+160 775 -.0639951527
+161 775 -.0639951527
+162 775 -.0639951527
+163 775 -.0639951527
+164 775 .000238342502
+165 775 .00369282067
+166 775 .00369282067
+167 775 .0154453237
+168 775 -.0784192756
+169 775 -.0784192756
+170 775 -.0784192756
+171 775 -.0784192756
+172 775 -.0784192756
+173 775 -.0784192756
+174 775 .00284985313
+175 775 .012600258
+176 775 .00496138772
+177 775 -.0880930871
+178 775 -.0880930871
+179 775 -.0880930871
+180 775 -.0880930871
+181 775 -.0880930871
+182 775 -.0880930871
+183 775 -.0880930871
+184 775 .00320410589
+185 775 .0496436357
+186 775 .0496436357
+187 775 .0606838651
+188 775 -.236646697
+189 775 .0195946172
+190 775 -.236646697
+191 775 -.236646697
+192 775 -.236646697
+193 775 -.236646697
+194 775 .0111969244
+195 775 .0495057516
+196 775 .0194930322
+197 775 .495642304
+198 775 -.430728585
+199 775 -.430728585
+200 775 -.430728585
+201 775 -.430728585
+202 775 .0148581862
+203 775 -.430728585
+204 775 .00813141
+205 775 .0482601114
+206 775 .0482601114
+207 775 -.043094445
+208 775 -.043094445
+209 775 -.043094445
+210 775 -.043094445
+211 775 -.043094445
+212 775 .00678273896
+213 775 -.043094445
+214 775 .00371197611
+215 775 .0220306665
+216 775 .0220306665
+217 775 -.0312564522
+218 775 -.0312564522
+219 775 -.0312564522
+220 775 -.0312564522
+221 775 -.0312564522
+222 775 -.0312564522
+223 775 -.0312564522
+224 775 .00297068362
+225 775 .0176310781
+226 775 .0176310781
+227 775 -.0625699833
+228 775 -.0625699833
+229 775 -.0625699833
+230 775 -.0625699833
+231 775 -.0625699833
+232 775 .00502376119
+233 775 -.0625699833
+234 775 .00274934387
+235 775 .0163174197
+236 775 .0163174197
+340 775 -.5
+394 775 -1
+664 775 -.00931064598
+665 775 -.00382446544
+666 775 -.000113382936
+667 775 .0148005467
+668 775 .0114515871
+669 775 -.00279633701
+670 775 -.00722729228
+671 775 -.0107579976
+672 775 -.0106948726
+775 775 -.869891703
+987 775 .0407628454
+1200 775 -.454577208
+157 776 .0621434972
+158 776 -.382129312
+159 776 -.382129312
+160 776 -.382129312
+161 776 -.382129312
+162 776 -.382129312
+163 776 -.382129312
+164 776 .00142319617
+165 776 .0220506545
+166 776 .0220506545
+167 776 .00642392877
+168 776 -.0326156914
+169 776 -.0326156914
+170 776 -.0326156914
+171 776 -.0326156914
+172 776 -.0326156914
+173 776 -.0326156914
+174 776 .00118529436
+175 776 .0052406257
+176 776 .00206351141
+177 776 -.0360557176
+178 776 -.0360557176
+179 776 -.0360557176
+180 776 -.0360557176
+181 776 -.0360557176
+182 776 -.0360557176
+183 776 -.0360557176
+184 776 .00131141208
+185 776 .0203187
+186 776 .0203187
+187 776 .0320153944
+188 776 .0261180811
+189 776 -.510462344
+190 776 .0261180811
+191 776 .0261180811
+192 776 .0261180811
+193 776 .0261180811
+194 776 .00590723613
+195 776 .0261180811
+196 776 .0102840699
+197 776 .130923346
+198 776 -.11377646
+199 776 -.11377646
+200 776 -.11377646
+201 776 -.11377646
+202 776 .00392477307
+203 776 -.11377646
+204 776 .00214790273
+205 776 .0127478531
+206 776 .0127478531
+207 776 -.0276396852
+208 776 -.0276396852
+209 776 -.0276396852
+210 776 -.0276396852
+211 776 -.0276396852
+212 776 .00435027713
+213 776 -.0276396852
+214 776 .0023807676
+215 776 .0141299125
+216 776 .0141299125
+217 776 -.0143900374
+218 776 -.0143900374
+219 776 -.0143900374
+220 776 -.0143900374
+221 776 -.0143900374
+222 776 -.0143900374
+223 776 -.0143900374
+224 776 .00136766164
+225 776 .00811710395
+226 776 .00811710395
+227 776 -.0348886102
+228 776 -.0348886102
+229 776 -.0348886102
+230 776 -.0348886102
+231 776 -.0348886102
+232 776 .00280121644
+233 776 -.0348886102
+234 776 .0015330161
+235 776 .00909848697
+236 776 .00909848697
+341 776 -.5
+395 776 -1
+664 776 -.0037658806
+665 776 -.00467685144
+666 776 -.00274560135
+667 776 .0282335691
+668 776 -.00528713875
+669 776 -.00331864133
+670 776 -.00925511029
+671 776 -.00926723704
+673 776 -.0304701719
+776 776 -.87013799
+988 776 .0651841611
+1201 776 -.423311949
+157 777 .000524101779
+158 777 -.00322277728
+159 777 -.00322277728
+160 777 -.00322277728
+161 777 -.00322277728
+162 777 -.00322277728
+163 777 -.00322277728
+164 777 1.20028599e-5
+165 777 .000185969388
+166 777 .000185969388
+167 777 .00131599407
+168 777 -.00668158941
+169 777 -.00668158941
+170 777 -.00668158941
+171 777 -.00668158941
+172 777 -.00668158941
+173 777 -.00668158941
+174 777 .000242817172
+175 777 .00107358478
+176 777 .000422727084
+177 777 -.0856701881
+178 777 -.0856701881
+179 777 -.0856701881
+180 777 -.0856701881
+181 777 -.0856701881
+182 777 -.0856701881
+183 777 -.0856701881
+184 777 .00311598089
+185 777 .0482782498
+186 777 .0482782498
+187 777 .00249884813
+188 777 -.00974466838
+189 777 .000806869706
+190 777 -.00974466838
+191 777 -.00974466838
+192 777 -.00974466838
+193 777 -.00974466838
+194 777 .000461068412
+195 777 .00203855429
+196 777 .000802686671
+197 777 .245963693
+198 777 -.213750109
+199 777 -.213750109
+200 777 -.213750109
+201 777 -.213750109
+202 777 .00737341121
+203 777 -.213750109
+204 777 .00403523212
+205 777 .0239491984
+206 777 .0239491984
+207 777 -.040156737
+208 777 -.040156737
+209 777 -.040156737
+210 777 -.040156737
+211 777 -.040156737
+212 777 .00632036664
+213 777 -.040156737
+214 777 .00345893437
+215 777 .0205288567
+216 777 .0205288567
+217 777 -.00793116167
+218 777 -.00793116167
+219 777 -.00793116167
+220 777 -.00793116167
+221 777 -.00793116167
+222 777 -.00793116167
+223 777 -.00793116167
+224 777 .000753795495
+225 777 .00447379425
+226 777 .00447379425
+227 777 -.218512505
+228 777 -.218512505
+229 777 -.218512505
+230 777 -.218512505
+231 777 -.218512505
+232 777 .0175444297
+233 777 -.218512505
+234 777 .00960150547
+235 777 .0569851622
+236 777 .0569851622
+288 777 .471892297
+342 777 -.0281077065
+396 777 -.0562154129
+664 777 -.000882008986
+665 777 -.000362094084
+666 777 .00752938213
+667 777 -6.91006426e-5
+668 777 .0129995495
+669 777 .0131666847
+670 777 .00185310841
+671 777 .0287142098
+672 777 -.0220831167
+673 777 -.0538090281
+777 777 -.870029628
+989 777 .152490258
+1202 777 -.0962755829
+1256 777 -.000112710521
+157 778 .000449230138
+158 778 -.00276238052
+159 778 -.00276238052
+160 778 -.00276238052
+161 778 -.00276238052
+162 778 -.00276238052
+163 778 -.00276238052
+164 778 1.02881659e-5
+165 778 .000159402334
+166 778 .000159402334
+167 778 .00194844441
+168 778 -.00989267789
+169 778 -.00989267789
+170 778 -.00989267789
+171 778 -.00989267789
+172 778 -.00989267789
+173 778 -.00989267789
+174 778 .000359512109
+175 778 .00158953632
+176 778 .00062588451
+177 778 -.267490625
+178 778 -.267490625
+179 778 -.267490625
+180 778 -.267490625
+181 778 -.267490625
+182 778 -.267490625
+183 778 -.267490625
+184 778 .00972912088
+185 778 .150740623
+186 778 .150740623
+187 778 .00271101436
+188 778 -.0105720451
+189 778 .000875377445
+190 778 -.0105720451
+191 778 -.0105720451
+192 778 -.0105720451
+193 778 -.0105720451
+194 778 .000500215683
+195 778 .00221163896
+196 778 .000870839227
+197 778 .627855837
+198 778 -.545626283
+199 778 -.545626283
+200 778 -.545626283
+201 778 -.545626283
+202 778 .0188216381
+203 778 -.545626283
+204 778 .0103004798
+205 778 .0611335933
+206 778 .0611335933
+207 778 -.188998535
+208 778 -.188998535
+209 778 -.188998535
+210 778 -.188998535
+211 778 -.188998535
+212 778 .0297469385
+213 778 -.188998535
+214 778 .0162795484
+215 778 .0966195017
+216 778 .0966195017
+217 778 -.0390729122
+218 778 -.0390729122
+219 778 -.0390729122
+220 778 -.0390729122
+221 778 -.0390729122
+222 778 -.0390729122
+223 778 -.0390729122
+224 778 .00371357752
+225 778 .0220401697
+226 778 .0220401697
+227 778 -.393182158
+228 778 -.393182158
+229 778 -.393182158
+230 778 -.393182158
+231 778 -.393182158
+232 778 .031568706
+233 778 -.393182158
+234 778 .0172765423
+235 778 .102536693
+236 778 .102536693
+289 778 1.67131245
+343 778 -.328687608
+397 778 -.164343804
+664 778 2.97306397e-5
+665 778 .000807423727
+666 778 .027500771
+667 778 .00133185962
+668 778 .0393917151
+669 778 .0723160058
+670 778 .0204517543
+671 778 .059062589
+673 778 -.012854564
+696 778 -.0463634431
+778 778 -.870002031
+990 778 .0437857695
+1203 778 -.0176200569
+1256 778 -.00421766937
+157 779 .455968589
+158 779 -2.80381632
+159 779 -2.80381632
+160 779 -2.80381632
+161 779 -2.80381632
+162 779 -2.80381632
+163 779 -2.80381632
+164 779 .0104424879
+165 779 .161793381
+166 779 .161793381
+167 779 .157031372
+168 779 -.797282577
+169 779 -.797282577
+170 779 -.797282577
+171 779 -.797282577
+172 779 -.797282577
+173 779 -.797282577
+174 779 .0289742295
+175 779 .128105819
+176 779 .0504420325
+177 779 -.907425106
+178 779 -.907425106
+179 779 -.907425106
+180 779 -.907425106
+181 779 -.907425106
+182 779 -.907425106
+183 779 -.907425106
+184 779 .0330047049
+185 779 .511366904
+186 779 .511366904
+187 779 .158726871
+188 779 -.618981421
+189 779 .0512523726
+190 779 -.618981421
+191 779 -.618981421
+192 779 -.618981421
+193 779 -.618981421
+194 779 .02928707
+195 779 .12948899
+196 779 .0509866662
+197 779 .0775558501
+198 779 .0775558501
+199 779 .0775558501
+200 779 .0775558501
+201 779 .0775558501
+202 779 -.112975061
+203 779 .0775558501
+204 779 .00127236603
+205 779 .00755152293
+206 779 .00755152293
+207 779 .0805267543
+208 779 .0805267543
+209 779 .0805267543
+210 779 .0805267543
+211 779 .0805267543
+212 779 -.112885997
+213 779 .0805267543
+214 779 .0013211061
+215 779 .00784079637
+216 779 .00784079637
+217 779 -.0994591713
+218 779 -.0994591713
+219 779 -.0994591713
+220 779 -.0994591713
+221 779 -.0994591713
+222 779 -.0994591713
+223 779 -.0994591713
+224 779 .00945282448
+225 779 .0561027341
+226 779 .0561027341
+227 779 .114812307
+228 779 .114812307
+229 779 .114812307
+230 779 .114812307
+231 779 .114812307
+232 779 -.111858197
+233 779 .114812307
+234 779 .00188358827
+235 779 .0111791408
+236 779 .0111791408
+344 779 -2
+398 779 -1
+664 779 -.0111589003
+665 779 -.00679647923
+666 779 .00194025785
+667 779 -.00464311987
+668 779 .00136367977
+669 779 .00470969081
+670 779 -.00126966089
+671 779 .00761918724
+779 779 -1
+157 780 .106317788
+158 780 -.653763413
+159 780 -.653763413
+160 780 -.653763413
+161 780 -.653763413
+162 780 -.653763413
+163 780 -.653763413
+164 780 .00243486557
+165 780 .0377252176
+166 780 .0377252176
+167 780 .0411161855
+168 780 -.208755851
+169 780 -.208755851
+170 780 -.208755851
+171 780 -.208755851
+172 780 -.208755851
+173 780 -.208755851
+174 780 .00758644473
+175 780 .033542484
+176 780 .0132074496
+177 780 -.198312685
+178 780 -.198312685
+179 780 -.198312685
+180 780 -.198312685
+181 780 -.198312685
+182 780 -.198312685
+183 780 -.198312685
+184 780 .00721299369
+185 780 .111756369
+186 780 .111756369
+187 780 .0544188842
+188 780 -.212215364
+189 780 .0175716747
+190 780 -.212215364
+191 780 -.212215364
+192 780 -.212215364
+193 780 -.212215364
+194 780 .0100409575
+195 780 .0443947949
+196 780 .0174805783
+197 780 .557472408
+198 780 -.48446089
+199 780 -.48446089
+200 780 -.48446089
+201 780 -.48446089
+202 780 .0167117082
+203 780 -.48446089
+204 780 .00914578326
+205 780 .0542804413
+206 780 .0542804413
+207 780 -.285272539
+208 780 -.285272539
+209 780 -.285272539
+210 780 -.285272539
+211 780 -.285272539
+212 780 .0448997393
+213 780 -.285272539
+214 780 .0245721899
+215 780 .145836532
+216 780 .145836532
+217 780 -.20498237
+218 780 -.20498237
+219 780 -.20498237
+220 780 -.20498237
+221 780 -.20498237
+222 780 -.20498237
+223 780 -.20498237
+224 780 .0194819868
+225 780 .115626052
+226 780 .115626052
+227 780 -.212087378
+228 780 -.212087378
+229 780 -.212087378
+230 780 -.212087378
+231 780 -.212087378
+232 780 .0170285534
+233 780 -.212087378
+234 780 .00931918249
+235 780 .0553095713
+236 780 .0553095713
+291 780 1.46780527
+345 780 -.532194793
+399 780 -.266097397
+664 780 .00653083064
+665 780 .0170512833
+666 780 .0203899294
+667 780 .0267378725
+668 780 .0349783003
+669 780 .109160982
+670 780 .107301265
+671 780 .0318611152
+673 780 -.0597859584
+678 780 -.335545868
+696 780 -.265876949
+697 780 -.157820597
+780 780 -.87005651
+992 780 .0426713973
+1205 780 -.0706640184
+1256 780 -.00378513522
+157 781 .196210727
+158 781 .196210727
+159 781 .196210727
+160 781 .196210727
+161 781 .196210727
+162 781 .196210727
+163 781 .196210727
+164 781 -.0996064246
+165 781 .0696223304
+166 781 .0696223304
+167 781 .189191252
+168 781 .154341772
+169 781 .154341772
+170 781 .154341772
+171 781 .154341772
+172 781 .154341772
+173 781 .154341772
+174 781 -.262691885
+175 781 .154341772
+176 781 .0607725158
+177 781 .169322357
+178 781 .169322357
+179 781 .169322357
+180 781 .169322357
+181 781 .169322357
+182 781 .169322357
+183 781 .169322357
+184 781 -.100222215
+185 781 .0600814112
+186 781 .0600814112
+187 781 .0746707991
+188 781 .0609162562
+189 781 .0241109487
+190 781 .0609162562
+191 781 .0609162562
+192 781 .0609162562
+193 781 .0609162562
+194 781 -.283822298
+195 781 .0609162562
+196 781 .0239859503
+197 781 .0464772396
+198 781 .0464772396
+199 781 .0464772396
+200 781 .0464772396
+201 781 .0464772396
+202 781 .00139327801
+203 781 .0464772396
+204 781 -.0623375066
+205 781 .00452543469
+206 781 .00452543469
+207 781 .0317638852
+208 781 .0317638852
+209 781 .0317638852
+210 781 .0317638852
+211 781 .0317638852
+212 781 .000952206377
+213 781 .0317638852
+214 781 -.0625788867
+215 781 .00309281261
+216 781 .00309281261
+217 781 .0272320695
+218 781 .0272320695
+219 781 .0272320695
+220 781 .0272320695
+221 781 .0272320695
+222 781 .0272320695
+223 781 .0272320695
+224 781 -.0626532361
+225 781 .00265155477
+226 781 .00265155477
+227 781 .0145535357
+228 781 .0145535357
+229 781 .0145535357
+230 781 .0145535357
+231 781 .0145535357
+232 781 .000436280679
+233 781 .0145535357
+234 781 -.0628612414
+235 781 .00141706085
+236 781 .00141706085
+346 781 -.5
+400 781 -1
+664 781 .567007899
+665 781 .368021756
+666 781 .504498124
+667 781 .171694726
+668 781 .187478885
+669 781 .148706645
+670 781 .158462361
+671 781 .0524044745
+674 781 -.157417119
+675 781 -.247411415
+693 781 -.141924486
+781 781 -.890000045
+993 781 .0239348169
+1206 781 -.344784945
+157 782 .274278283
+158 782 .274278283
+159 782 .274278283
+160 782 .274278283
+161 782 .274278283
+162 782 .274278283
+163 782 .274278283
+164 782 .00628145831
+165 782 -.269419193
+166 782 -.269419193
+167 782 .152858287
+168 782 .124701418
+169 782 .124701418
+170 782 .124701418
+171 782 .124701418
+172 782 .124701418
+173 782 .124701418
+174 782 .0282042436
+175 782 -1.19109857
+176 782 .0491015427
+177 782 .498320878
+178 782 .498320878
+179 782 .498320878
+180 782 .498320878
+181 782 .498320878
+182 782 .498320878
+183 782 .498320878
+184 782 .0114124306
+185 782 -.329651117
+186 782 -.329651117
+187 782 .246326566
+188 782 .200952619
+189 782 .0795380175
+190 782 .200952619
+191 782 .200952619
+192 782 .200952619
+193 782 .200952619
+194 782 .0454503
+195 782 -1.1148473
+196 782 .0791256726
+197 782 .350749463
+198 782 .350749463
+199 782 .350749463
+200 782 .350749463
+201 782 .350749463
+202 782 .0105146412
+203 782 .350749463
+204 782 .00575432694
+205 782 -.340347946
+206 782 .0341520645
+207 782 .285520315
+208 782 .285520315
+209 782 .285520315
+210 782 .285520315
+211 782 .285520315
+212 782 .00855922513
+213 782 .285520315
+214 782 .00468419027
+215 782 -.346699238
+216 782 .0278007798
+217 782 .237027302
+218 782 .237027302
+219 782 .237027302
+220 782 .237027302
+221 782 .237027302
+222 782 .237027302
+223 782 .237027302
+224 782 .00388862356
+225 782 -.351420939
+226 782 .0230790731
+227 782 .356762469
+228 782 .356762469
+229 782 .356762469
+230 782 .356762469
+231 782 .356762469
+232 782 .0106948968
+233 782 .356762469
+234 782 .00585297495
+235 782 -.339762479
+236 782 .034737546
+293 782 .00739657879
+347 782 -1.99260342
+401 782 -.996301711
+664 782 .0511565208
+665 782 .0672518909
+666 782 .0958291367
+667 782 .128103361
+668 782 .238389209
+669 782 .225222245
+670 782 .232392117
+671 782 .216449484
+672 782 -.0155363604
+695 782 -.275162965
+696 782 -.0660882518
+697 782 -.102282472
+705 782 -.153743282
+782 782 -.890000701
+1256 782 -.0475550555
+157 783 .931972265
+158 783 .931972265
+159 783 .931972265
+160 783 .931972265
+161 783 .931972265
+162 783 .931972265
+163 783 .931972265
+164 783 .0213438161
+165 783 -.9154616
+166 783 -.9154616
+167 783 .119320869
+168 783 .097341679
+169 783 .097341679
+170 783 .097341679
+171 783 .097341679
+172 783 .097341679
+173 783 .097341679
+174 783 .022016177
+175 783 .097341679
+176 783 -.479771465
+177 783 1.08861959
+178 783 1.08861959
+179 783 1.08861959
+180 783 1.08861959
+181 783 1.08861959
+182 783 1.08861959
+183 783 1.08861959
+184 783 .0249313172
+185 783 -.720147789
+186 783 -.720147789
+187 783 .10807126
+188 783 .0881642774
+189 783 .0348958485
+190 783 .0881642774
+191 783 .0881642774
+192 783 .0881642774
+193 783 .0881642774
+194 783 .0199404843
+195 783 .0881642774
+196 783 -.483385086
+197 783 .293100953
+198 783 .293100953
+199 783 .293100953
+200 783 .293100953
+201 783 .293100953
+202 783 .00878647529
+203 783 .293100953
+204 783 .00480855675
+205 783 .0285388995
+206 783 -.345961124
+207 783 .21754922
+208 783 .21754922
+209 783 .21754922
+210 783 .21754922
+211 783 .21754922
+212 783 .00652161241
+213 783 .21754922
+214 783 .00356906978
+215 783 .0211825129
+216 783 -.353317499
+217 783 .301923811
+218 783 .301923811
+219 783 .301923811
+220 783 .301923811
+221 783 .301923811
+222 783 .301923811
+223 783 .301923811
+224 783 .00495330291
+225 783 .0293979701
+226 783 -.345102042
+227 783 .559506476
+228 783 .559506476
+229 783 .559506476
+230 783 .559506476
+231 783 .559506476
+232 783 .0167726837
+233 783 .559506476
+234 783 .00917915348
+235 783 .0544784926
+236 783 -.32002154
+294 783 .695431352
+348 783 -1.30456865
+402 783 -.652284324
+664 783 .173825145
+665 783 .133323938
+666 783 .209345996
+667 783 .142736718
+668 783 .199208006
+669 783 .171605751
+670 783 .296019554
+671 783 .339455217
+675 783 -.233187303
+677 783 -.0143305426
+693 783 -.0365852825
+695 783 -.350010514
+698 783 -.368325382
+783 783 -.890000045
+1256 783 -.0476517156
+295 784 .000457370246
+349 784 -2.09954262
+403 784 -.999782205
+664 784 -.00107030303
+665 784 -.00193744991
+666 784 -.00154613005
+667 784 -.00259090355
+668 784 -.00322100311
+669 784 -.00345216854
+670 784 -.00405561412
+671 784 -.00355400215
+672 784 1
+696 784 -.0907458216
+699 784 -.116984315
+705 784 -.133410409
+784 784 -.870000362
+1256 784 -.029065825
+296 785 .900660157
+350 785 -1.19933975
+404 785 -.571114182
+673 785 1
+785 785 -.870074153
+1256 785 -.0214263145
+297 786 2.60216546
+351 786 -.297834694
+405 786 -.102701619
+674 786 1
+693 786 -.0192899816
+786 786 -.869974673
+998 786 .0376543887
+1211 786 -.00859325007
+1256 786 -.00232234877
+352 787 -2
+406 787 -1
+664 787 -.00597090367
+665 787 -.00310818246
+666 787 -.00380347972
+667 787 -.00139464473
+668 787 -.00142943056
+669 787 -.00111747673
+670 787 -.00124684128
+671 787 -.00038974901
+674 787 -.162022635
+675 787 1
+677 787 -.0391151533
+688 787 -.134885684
+691 787 -.117574222
+692 787 -.00208823266
+693 787 -.0133590531
+698 787 -.0353096239
+787 787 -.869999945
+999 787 .0484563895
+1212 787 -.403310657
+353 788 -1.39999998
+407 788 -1
+676 788 1
+710 788 -.0296438169
+788 788 -.869649827
+1000 788 .0107462266
+1213 788 -1.3855896
+1256 788 -.000952560804
+300 789 .124517657
+354 789 -1.07548237
+408 789 -.896235287
+675 789 -.0097569963
+677 789 1
+789 789 -.86999929
+1256 789 -.00712292502
+301 790 .74846828
+355 790 -1.25153172
+409 790 -.62576586
+678 790 1
+790 790 -.869565189
+1256 790 -.00548300613
+302 791 1.67540276
+356 791 -.32459721
+410 791 -.162298605
+664 791 -.000564882183
+665 791 -.000586885144
+666 791 -.000388702523
+667 791 -.000618933933
+668 791 -.000681592501
+669 791 -.000742134813
+670 791 -.000425319507
+671 791 -.000778341491
+679 791 1
+681 791 -.324237466
+696 791 -.00806020573
+705 791 -.00601767702
+709 791 -.0423720069
+711 791 -.0160365067
+791 791 -.870012224
+1003 791 .0407752655
+1216 791 -.111877225
+1256 791 -.00104968902
+303 792 .66875571
+357 792 -.531244338
+411 792 -.442703605
+680 792 1
+792 792 -.87057811
+1256 792 -.00133256649
+304 793 .627053499
+358 793 -1.3729465
+412 793 -.68647325
+681 793 1
+793 793 -.869837284
+1256 793 -.00091919367
+305 794 .18695505
+359 794 -1.01304495
+413 794 -.844204128
+682 794 1
+794 794 -.869710445
+1256 794 -.00245653326
+306 795 .721822023
+360 795 -1.27817798
+414 795 -.639088988
+683 795 1
+795 795 -.888888896
+1256 795 -.000483603566
+307 796 .105930507
+361 796 -.394069493
+415 796 -.788138986
+684 796 1
+796 796 -.870722413
+1256 796 -.00403519627
+362 797 -.5
+416 797 -1
+685 797 1
+719 797 -.00144721544
+797 797 -1
+1009 797 .269034058
+1222 797 -.230965927
+309 798 1.20000005
+686 798 1
+706 798 -.00698827021
+710 798 -.00422448991
+798 798 -.870033681
+1010 798 .0182339847
+1223 798 -.00698937196
+310 799 .151235133
+364 799 -.348764867
+418 799 -.697529733
+687 799 1
+716 799 -.295740426
+717 799 -.160700306
+718 799 -.132129028
+719 799 -.00511926599
+799 799 -.870001733
+1011 799 .0798794478
+1224 799 -.272300512
+1256 799 -.000932077994
+365 800 -.5
+419 800 -1
+688 800 1
+719 800 -.144242674
+800 800 -.869969308
+1012 800 .00420977129
+1225 800 -.125440717
+366 801 -.5
+420 801 -1
+689 801 1
+719 801 -.0319625288
+801 801 -.868035197
+367 802 -1.29999995
+421 802 -1
+690 802 1
+710 802 -.059168525
+712 802 -.00159905537
+802 802 -.869984448
+1014 802 .175757363
+1227 802 -1.11488414
+314 803 .200484842
+368 803 -1.6995151
+422 803 -.894481659
+691 803 1
+803 803 -.869993567
+1256 803 -.010565496
+315 804 1.61500096
+369 804 -.384999037
+423 804 -.192499518
+664 804 -.00103561732
+665 804 -.000833546976
+666 804 -.000743498676
+667 804 -.000800973328
+668 804 -.000752254389
+669 804 -.000740913558
+670 804 -.000775766151
+671 804 -.000733237015
+672 804 -.000431161869
+674 804 -.00400092453
+675 804 -.00904091913
+677 804 -.00443976279
+679 804 -.00415170519
+681 804 -.004059311
+683 804 -.340312958
+692 804 .996429563
+693 804 -.00320973806
+704 804 -.0038649186
+705 804 -.00395374373
+710 804 -.00508075068
+712 804 -.00120204582
+804 804 -.869974256
+1016 804 .271766007
+1229 804 -.129452288
+1256 804 -.00127621123
+316 805 2.23066592
+370 805 -1.16933417
+424 805 -.34392181
+664 805 -.0185023677
+665 805 -.0148349488
+666 805 -.0132394843
+667 805 -.0142659442
+668 805 -.0133985188
+669 805 -.0131980311
+670 805 -.0138182044
+671 805 -.0130513879
+672 805 -.0355932601
+674 805 -.00553016691
+677 805 -.112575263
+680 805 -.0402331427
+681 805 -.126768902
+682 805 -.0323383622
+687 805 -.0140888244
+688 805 -.103382498
+689 805 -.0601734221
+690 805 -.115011364
+691 805 -.0696121082
+692 805 -.000599461142
+693 805 .806271911
+694 805 -.101612881
+695 805 -.000495907036
+696 805 -.00217979099
+698 805 -.0363733061
+699 805 -.0240057558
+705 805 -.158976257
+707 805 -.00514993165
+708 805 -.0666899681
+709 805 -.0721485987
+710 805 -.0813447908
+711 805 -.00739258807
+712 805 -.0208374932
+805 805 -.869999588
+1017 805 .0235442594
+1230 805 -.0497347564
+1256 805 -.0255885925
+317 806 2.04773259
+371 806 -.352267474
+425 806 -.146778107
+664 806 -.0016104097
+665 806 -.00165919599
+666 806 -.0010991086
+667 806 -.00174943579
+668 806 -.00192774378
+669 806 -.0020985764
+670 806 -.00120108563
+671 806 -.00220202422
+672 806 -.00664001377
+689 806 -.0142286755
+694 806 .977918148
+705 806 -.0047013103
+709 806 -.000687512336
+710 806 -.00348676136
+806 806 -.870021164
+1018 806 .668438077
+1231 806 -.279838085
+1256 806 -.00013633183
+318 807 .233849168
+372 807 -.266150832
+426 807 -.532301664
+695 807 1
+807 807 -.870003164
+1256 807 -.00326989894
+319 808 .291807085
+373 808 -.808192909
+427 808 -.734720826
+672 808 -.0313579403
+696 808 .914857388
+697 808 -.121057041
+705 808 -.0346712917
+808 808 -.86999917
+1256 808 -.0172352381
+320 809 1.1448102
+374 809 -.8551898
+428 809 -.4275949
+673 809 -.0418436378
+678 809 -.093989566
+692 809 -.0650645867
+695 809 -.0493461937
+696 809 -.0462170057
+697 809 1
+705 809 -.0483944751
+809 809 -.869988739
+1021 809 .0537051633
+1234 809 -.0483965762
+1256 809 -.0148378145
+321 810 1.42642903
+375 810 -.573570967
+429 810 -.286785483
+677 810 -.257495135
+693 810 -.00634217123
+698 810 1
+705 810 -.0103614554
+810 810 -.870010138
+1256 810 -.000337993901
+322 811 .485329628
+376 811 -1.51467037
+430 811 -.757335186
+679 811 -.151637703
+680 811 -.172124177
+682 811 -.318911761
+688 811 -.0452379622
+689 811 -.0450736023
+690 811 -.0383139811
+691 811 -.215203106
+699 811 .794479668
+700 811 -.0168030038
+709 811 -.00520322053
+710 811 -.0845131576
+811 811 -.870004654
+1023 811 .290093929
+1236 811 -.660795033
+1256 811 -.0166454222
+377 812 -.5
+431 812 -1
+672 812 -.0036679036
+673 812 -.00106854446
+674 812 -.00841675978
+675 812 -.000881052285
+676 812 -.0108826561
+677 812 -.00309364079
+678 812 -.000581978704
+679 812 -.00125210162
+680 812 -.000627377944
+682 812 -.00190549623
+683 812 -.000455192028
+686 812 -.00403084466
+687 812 -.00501968898
+688 812 -.000367323461
+689 812 -.00659003854
+690 812 -.000365600252
+691 812 -.000989085878
+692 812 -.00427527772
+693 812 -8.09862686e-5
+694 812 -.00404128386
+695 812 -.00108135282
+696 812 -.000392236834
+697 812 -.00148049882
+698 812 -.00432409951
+699 812 -.00657516345
+700 812 .999062061
+701 812 -.00375950173
+702 812 -.00498532085
+704 812 -.00180093071
+705 812 -.00225895038
+706 812 -.00507186539
+707 812 -.0140905408
+708 812 -.0541200675
+812 812 -.870759308
+1024 812 .229801148
+1237 812 -.215929925
+1256 812 -.000727965671
+324 813 1.27107942
+378 813 -.128920481
+432 813 -.0920860618
+673 813 -.00201665331
+676 813 -.0892450362
+684 813 -.0487057231
+690 813 -.0849580914
+701 813 .797954261
+703 813 -.0126410509
+710 813 -.00752177229
+712 813 -.00120873482
+813 813 -.869994819
+1025 813 .169435248
+1238 813 -.162539542
+1256 813 -.000241443879
+379 814 -2
+433 814 -1
+697 814 -.0172134973
+702 814 .740929484
+814 814 -1
+1256 814 -3.88732915e-5
+326 815 .333289057
+380 815 -.966710925
+434 815 -.743623793
+684 815 -.0455313362
+703 815 .959176183
+705 815 -.0186752286
+815 815 -.870153308
+1027 815 .0195702668
+1240 815 -.0602502711
+1256 815 -.00437467685
+327 816 .0834249258
+381 816 -.416575074
+435 816 -.833150148
+680 816 -.181939617
+684 816 -.338542223
+685 816 -.396428585
+686 816 -.188573435
+687 816 -.1475683
+688 816 -.113464832
+689 816 -.0124057271
+690 816 -.0668724552
+691 816 -.0199355744
+692 816 -.00783251971
+704 816 .587145507
+705 816 -.0202887654
+708 816 -.117915452
+710 816 -.0741321817
+719 816 -.0236270837
+816 816 -.869995236
+1028 816 .074881725
+1241 816 -.190253779
+1256 816 -.00381938345
+328 817 1.04966235
+382 817 -.150337696
+436 817 -.125281408
+664 817 -.000797772198
+665 817 -.0008201811
+666 817 -.000543315487
+667 817 -.000864872884
+668 817 -.000953199051
+669 817 -.00103727891
+670 817 -.00059378345
+671 817 -.00108829024
+672 817 -.00266206125
+676 817 -.222796991
+677 817 -.124763541
+678 817 -.0823399574
+679 817 -.00243991031
+684 817 -.105415531
+687 817 -.00518004317
+688 817 -.00226804917
+689 817 -.00408953428
+690 817 -.0241064783
+692 817 -.026659552
+694 817 -.0364078544
+695 817 -.0137097631
+699 817 -.00121616852
+700 817 -.153590724
+701 817 -.181663513
+703 817 -.138760403
+704 817 -.000887066359
+705 817 1
+706 817 -.00337022962
+708 817 -.0013579115
+710 817 -.00168195146
+817 817 -.869997799
+1029 817 .0393045694
+1242 817 -.0444007665
+1256 817 -.000738454866
+383 818 -2
+437 818 -1
+706 818 1
+710 818 -.016186187
+712 818 -.000867007475
+818 818 -1
+384 819 -.5
+438 819 -1
+676 819 -.0530493185
+690 819 -.0138881821
+700 819 -.216033772
+703 819 -.295067728
+704 819 -.00787093677
+707 819 1
+819 819 -.870000243
+385 820 -2
+439 820 -1
+664 820 -.000852278376
+665 820 -.000845697825
+666 820 -.000570983102
+667 820 -.000896822661
+668 820 -.000994418398
+669 820 -.00107432459
+670 820 -.000658257341
+671 820 -.00112529914
+672 820 -.00347715081
+673 820 -.0017706576
+674 820 -.00227015419
+675 820 -.00308119413
+676 820 -.00378716434
+677 820 -.00386095257
+678 820 -.00271422835
+679 820 -.00212809048
+680 820 -.00244879792
+681 820 -.00264982809
+682 820 -.00220288569
+683 820 -.00182076811
+684 820 -.0040599457
+685 820 -.0026785715
+686 820 -.00363652292
+687 820 -.00255754474
+688 820 -.00232522679
+689 820 -.000887275673
+690 820 -.00217509014
+691 820 -.00253010611
+692 820 -.00258888165
+693 820 -.00219188794
+694 820 -.00327900588
+695 820 -.00266205659
+696 820 -.00219861837
+697 820 -.00194378418
+698 820 -.00234935014
+699 820 -.00308026723
+700 820 -.00349826226
+701 820 -.00359485205
+702 820 -.000941671722
+703 820 -.00455429452
+704 820 -.00334591372
+705 820 -.0023657456
+706 820 -.000396497606
+707 820 -.000296347367
+708 820 .997949481
+709 820 -.00171580527
+710 820 -.00345474901
+711 820 -.000188774109
+712 820 -.000146566963
+713 820 -.00498844869
+716 820 -.0203209203
+717 820 -.0141712539
+718 820 -.0203225799
+719 820 -.0031510531
+774 820 -.0199997593
+775 820 -.0200722013
+776 820 -.0200752821
+777 820 -.0199858136
+778 820 -.019998353
+780 820 -.0199913085
+781 820 -.0799998567
+782 820 -.08000388
+783 820 -.0799991563
+784 820 -.0200005732
+785 820 -.0198985543
+786 820 -.0200221576
+787 820 -.0199998934
+788 820 -.0194552504
+789 820 -.0200004857
+790 820 -.0217391308
+791 820 -.0199947488
+792 820 -.0198446941
+793 820 -.0200250316
+794 820 -.0200445447
+796 820 -.0190114081
+798 820 -.0199775547
+799 820 -.0199930165
+800 820 -.0200082418
+801 820 -.0205278583
+802 820 -.0199066866
+803 820 -.0200088024
+804 820 -.020014802
+805 820 -.0200016722
+806 820 -.0199828986
+807 820 -.0199990626
+808 820 -.0200033169
+809 820 -.0199887399
+810 820 -.019996183
+811 820 -.0199976135
+812 820 -.0193861071
+813 820 -.0199948251
+815 820 -.0198376905
+816 820 -.0200031791
+817 820 -.019998813
+819 820 -.0199998822
+820 820 -1
+1256 820 -.0510172583
+386 821 -2
+440 821 -1
+664 821 -.00846332218
+665 821 -.00840655249
+666 821 -.00567755196
+667 821 -.00892104488
+668 821 -.00988750812
+669 821 -.0106834034
+670 821 -.00654513668
+671 821 -.0111905383
+672 821 -.0345831774
+673 821 -.0176194515
+674 821 -.0225829966
+675 821 -.0306469649
+676 821 -.0376394801
+677 821 -.0384055004
+678 821 -.0269967895
+679 821 -.0211716071
+680 821 -.0243058372
+681 821 -.0264700912
+682 821 -.0219627712
+683 821 -.0178093892
+684 821 -.0403337888
+685 821 -.0260714293
+686 821 -.0358832814
+687 821 -.0254394505
+688 821 -.0231361799
+689 821 -.0087840287
+690 821 -.0216305777
+691 821 -.0251667406
+692 821 -.0257373042
+693 821 -.0218023974
+694 821 -.0326811634
+695 821 -.0264655948
+696 821 -.0218711272
+697 821 -.0193342511
+698 821 -.0233593863
+699 821 -.0306346249
+700 821 -.0348275639
+701 821 -.0357152671
+702 821 -.00947211031
+703 821 -.0453286879
+704 821 -.0332781151
+705 821 -.0235274453
+706 821 -.00391541375
+707 821 -.00294704316
+708 821 -.0203939229
+709 821 .982934237
+710 821 -.034358874
+711 821 -.0018774796
+712 821 -.00145681656
+713 821 -.0241378937
+716 821 -.0541237667
+717 821 -.0542966351
+718 821 -.0541290306
+719 821 -.00559503818
+774 821 -.110000402
+775 821 -.110036097
+776 821 -.109786704
+777 821 -.109984562
+778 821 -.109999612
+780 821 -.109952196
+781 821 -.0300000962
+782 821 -.0299953986
+783 821 -.0300008152
+784 821 -.109999068
+785 821 -.110027306
+786 821 -.110003166
+787 821 -.110000178
+788 821 -.110894933
+789 821 -.110000238
+790 821 -.108695649
+791 821 -.109992996
+792 821 -.109577231
+793 821 -.110137671
+794 821 -.110244997
+795 821 -.111111112
+796 821 -.110266164
+798 821 -.109988779
+799 821 -.110005237
+800 821 -.110022433
+801 821 -.111436948
+802 821 -.110108867
+803 821 -.10999763
+804 821 -.110010929
+805 821 -.10999874
+806 821 -.109995946
+807 821 -.109997772
+808 821 -.109997511
+809 821 -.110022523
+810 821 -.109993681
+811 821 -.10999772
+812 821 -.109854601
+813 821 -.110010341
+815 821 -.110009015
+816 821 -.110001586
+817 821 -.110003375
+819 821 -.109999895
+821 821 -1
+1256 821 -.222929522
+387 822 -2
+441 822 -1
+710 822 1
+716 822 -.499400884
+717 822 -.682299674
+718 822 -.0440645143
+719 822 -.662580967
+822 822 -1
+388 823 -2
+442 823 -1
+664 823 -.00269557815
+665 823 -.00216041761
+666 823 -.00192859373
+667 823 -.00207822118
+668 823 -.00195203384
+669 823 -.00192230428
+670 823 -.00201220834
+671 823 -.00190132763
+672 823 -.00223271595
+673 823 -.00171556475
+674 823 -.00235313643
+675 823 -.00236013927
+676 823 -.00483189942
+677 823 -.0033176248
+678 823 -.00235300022
+679 823 -.00310212583
+680 823 -.00315712788
+681 823 -.00301629351
+682 823 -.00283070817
+683 823 -.00267425319
+684 823 -.00369209819
+685 823 -.00339285703
+686 823 -.00332982815
+687 823 -.0791742802
+688 823 -.00281903427
+689 823 -.00280701765
+690 823 -.00322098448
+691 823 -.00275478722
+692 823 -.00266793137
+693 823 -.00239829789
+694 823 -.00296441489
+695 823 -.00202150992
+696 823 -.00189424248
+697 823 -.00169487623
+698 823 -.00236322428
+699 823 -.00280019036
+700 823 -.00446264818
+701 823 -.00454844814
+702 823 -.00182795106
+703 823 -.00443343259
+704 823 -.00345091801
+705 823 -.00198499765
+706 823 -.00360151986
+707 823 -.00271283323
+708 823 -.00325249461
+709 823 -.139743254
+710 823 -.0456520617
+711 823 1
+712 823 -.00268247048
+823 823 -1
+1256 823 -.094266355
+389 824 -2
+443 824 -1
+712 824 1
+824 824 -1
+1256 824 -.13993834
+390 825 -2
+444 825 -1
+713 825 1
+825 825 -1
+391 826 -2
+445 826 -1
+714 826 1
+826 826 -1
+392 827 -2
+446 827 -1
+707 827 -.260716975
+715 827 1
+827 827 -1
+339 828 .5
+672 828 -.00020044182
+720 828 1
+986 828 -8.81728047e-5
+1199 828 .499836445
+340 829 .5
+664 829 -.00108516833
+665 829 -.000930146081
+666 829 -.00100851082
+667 829 -.00146263081
+668 829 -.00209262152
+669 829 -.00213684351
+670 829 -.00250200182
+671 829 -.00227604178
+721 829 1
+987 829 -.0407628454
+1200 829 .454577208
+341 830 .5
+664 830 -.0014072503
+665 830 -.00120961515
+666 830 -.001311498
+667 830 -.00190175441
+668 830 -.00272121769
+669 830 -.0027788328
+670 830 -.00325281033
+671 830 -.00296070473
+722 830 1
+988 830 -.0651841611
+1201 830 .423311949
+288 831 -.471892297
+342 831 .0281077027
+396 831 -.943784595
+664 831 -.00577269914
+665 831 -.00495510874
+666 831 -.0053729373
+667 831 -.00779202906
+668 831 -.0111498525
+669 831 -.011384421
+670 831 -.0133284107
+671 831 -.0121284807
+673 831 -.0359768905
+723 831 1
+989 831 -.152490258
+1202 831 .0962755829
+1256 831 -.00189226482
+289 832 -1.67131233
+343 832 .328687668
+397 832 -.835656166
+673 832 -.197824225
+697 832 -.19558987
+724 832 1
+990 832 -.0437857695
+1203 832 .0176200569
+1256 832 -.021446025
+344 833 2
+725 833 1
+291 834 -1.46780527
+345 834 .532194734
+399 834 -.733902633
+678 834 -.0890327096
+726 834 1
+992 834 -.0426713973
+1205 834 .0706640184
+1256 834 -.0104394881
+346 835 .5
+674 835 -.153060555
+675 835 -.132922277
+693 835 -.0145159997
+727 835 1
+993 835 -.0239348169
+1206 835 .344784945
+293 836 -.00739663653
+347 836 1.99260342
+401 836 -.00369831827
+728 836 1
+1256 836 -.000176526592
+294 837 -.695431292
+348 837 1.30456877
+402 837 -.347715646
+729 837 1
+1256 837 -.0254018791
+295 838 -.000457389804
+349 838 2.09954262
+403 838 -.000217804685
+730 838 1
+1256 838 -6.33205173e-6
+296 839 -.900660157
+350 839 1.19933975
+404 839 -.428885818
+731 839 1
+1256 839 -.0160903763
+297 840 -2.60216546
+351 840 .297834665
+405 840 -.897298396
+693 840 -.000593723962
+732 840 1
+998 840 -.0376543887
+1211 840 .00859325007
+1256 840 -.0202902332
+352 841 2
+675 841 -.0465898141
+693 841 -.0271990262
+733 841 1
+999 841 -.0484563895
+1212 841 .403310657
+353 842 1.39999998
+710 842 -.000308300077
+734 842 1
+1000 842 -.0107462266
+1213 842 1.3855896
+300 843 -.124517642
+354 843 1.07548237
+408 843 -.103764698
+677 843 -.0348328426
+735 843 1
+1256 843 -.000824680901
+301 844 -.74846828
+355 844 1.25153172
+409 844 -.37423414
+736 844 1
+1256 844 -.00327906664
+302 845 -1.67540276
+356 845 .324597239
+410 845 -.83770138
+711 845 -.00289995759
+737 845 1
+1003 845 -.0407752655
+1216 845 .111877225
+1256 845 -.00541795138
+303 846 -.66875571
+357 846 .531244338
+411 846 -.557296395
+738 846 1
+1256 846 -.0016774981
+304 847 -.627053499
+358 847 1.3729465
+412 847 -.31352675
+739 847 1
+1256 847 -.000419815013
+305 848 -.18695505
+359 848 1.01304495
+413 848 -.155795872
+682 848 -.0462385714
+740 848 1
+1256 848 -.000453347369
+306 849 -.721822023
+360 849 1.27817798
+414 849 -.360911012
+741 849 1
+1256 849 -.000273104146
+307 850 -.105930492
+361 850 .394069523
+415 850 -.211860985
+684 850 -.0168937333
+719 850 -.00185398629
+742 850 1
+1256 850 -.00108470803
+362 851 .5
+719 851 -.00168574753
+743 851 1
+1009 851 -.269034058
+1222 851 .230965927
+309 852 -1.20000005
+417 852 -1
+686 852 -.355678231
+706 852 -.0318024121
+707 852 -.00837958045
+744 852 1
+1010 852 -.0182339847
+1223 852 .00698937196
+1256 852 -.00038452071
+310 853 -.151235133
+364 853 .348764867
+418 853 -.302470267
+716 853 -.0391993411
+717 853 -.0669464841
+718 853 -.291612893
+719 853 -.0106548648
+745 853 1
+1011 853 -.0798794478
+1224 853 .272300512
+1256 853 -.000404177612
+365 854 .5
+688 854 -.0250663608
+719 854 -.00126683037
+746 854 1
+1012 854 -.00420977129
+1225 854 .125440717
+366 855 .5
+689 855 -.00976003241
+747 855 1
+367 856 1.29999995
+710 856 -.00982436165
+748 856 1
+1014 856 -.175757363
+1227 856 1.11488414
+314 857 -.200484797
+368 857 1.6995151
+422 857 -.105518319
+691 857 -.00129680149
+749 857 1
+1256 857 -.00124636805
+315 858 -1.61500096
+369 858 .384999037
+423 858 -.807500482
+664 858 -.00111489906
+665 858 -.00157474761
+666 858 -.000336079829
+667 858 -.00146485993
+668 858 -.000746365869
+669 858 -.000504391151
+670 858 -.000586504175
+671 858 -.000521592912
+679 858 -.00688093295
+691 858 -.0104867527
+692 858 -.319255888
+710 858 -.0161293726
+750 858 1
+1016 858 -.271766007
+1229 858 .129452288
+1256 858 -.00535347452
+316 859 -2.23066592
+370 859 1.16933429
+424 859 -.65607816
+693 859 -.00718148332
+751 859 1
+1017 859 -.0235442594
+1230 859 .0497347564
+1256 859 -.0488137603
+317 860 -2.04773259
+371 860 .352267474
+425 860 -.853221893
+664 860 -.00272035366
+665 860 -.0017776665
+666 860 -.00238104025
+667 860 -.000815090723
+668 860 -.000942158105
+669 860 -.000670078967
+670 860 -.000760167604
+671 860 -.000211644132
+672 860 -.0566360168
+687 860 -.00150002027
+693 860 -.00491754897
+694 860 -.401309192
+710 860 -.00108380883
+716 860 -.000496166467
+752 860 1
+1018 860 -.668438077
+1231 860 .279838085
+1256 860 -.000792497536
+318 861 -.233849168
+372 861 .266150832
+426 861 -.467698336
+753 861 1
+1256 861 -.00287304446
+319 862 -.291807055
+373 862 .808192968
+427 862 -.265279144
+754 862 1
+1256 862 -.00622297544
+320 863 -1.1448102
+374 863 .8551898
+428 863 -.5724051
+673 863 -.0558000579
+697 863 -.0089808302
+755 863 1
+1021 863 -.0537051633
+1234 863 .0483965762
+1256 863 -.0198628195
+321 864 -1.42642903
+375 864 .573570967
+429 864 -.713214517
+698 864 -.0667206198
+756 864 1
+1256 864 -.000840566121
+322 865 -.485329628
+376 865 1.51467037
+430 865 -.242664814
+664 865 -.000896874291
+665 865 -.00132383301
+666 865 -.00132967182
+667 865 -.00128170592
+668 865 -.00133668678
+669 865 -.00133038755
+670 865 -.00132171414
+671 865 -.00125945604
+673 865 -.0233388562
+679 865 -.16298537
+691 865 -.0380810276
+692 865 -.0671791732
+693 865 -.00675262418
+699 865 -.205335543
+709 865 -.00106370752
+710 865 -.000619772589
+711 865 -.00166779512
+757 865 1
+1023 865 -.290093929
+1236 865 .660795033
+1256 865 -.00533351488
+377 866 .5
+672 866 -.00565960491
+673 866 -.00214861985
+674 866 -.0123228477
+675 866 -.00128709001
+676 866 -.00776296109
+677 866 -.00321339467
+678 866 -.000852899859
+679 866 -.00139193831
+680 866 -.000688091968
+682 866 -.00232404447
+683 866 -.000568990072
+686 866 -.00416228548
+687 866 -.0052571753
+688 866 -.000448758365
+689 866 -.00808227435
+690 866 -.000393367372
+691 866 -.00124063122
+692 866 -.00552031258
+693 866 -.00011648349
+694 866 -.00470676506
+695 866 -.00184243242
+696 866 -.000713348098
+697 866 -.00301135471
+698 866 -.00630347338
+699 866 -.00809392985
+700 866 -.000722343859
+701 866 -.00285392837
+702 866 -.0094167171
+704 866 -.00320973643
+705 866 -.00392820593
+706 866 -.00484057469
+707 866 -.0264233109
+708 866 -.0563934073
+758 866 1
+1024 866 -.229801148
+1237 866 .215929925
+324 867 -1.27107942
+378 867 .12892051
+432 867 -.907913923
+673 867 -.0293939412
+710 867 -.0122682666
+759 867 1
+1025 867 -.169435248
+1238 867 .162539542
+1256 867 -.00238049356
+379 868 2
+760 868 1
+326 869 -.333289027
+380 869 .966710985
+434 869 -.256376177
+703 869 -.0238482412
+761 869 1
+1027 869 -.0195702668
+1240 869 .0602502711
+1256 869 -.00150823966
+327 870 -.0834249184
+381 870 .416575074
+435 870 -.166849837
+687 870 -.141137898
+704 870 -.0621925406
+708 870 -.0179157741
+710 870 -.00931619946
+716 870 -.0797656104
+717 870 -.00982721709
+718 870 -.457741946
+719 870 -.0874001831
+762 870 1
+1028 870 -.074881725
+1241 870 .190253779
+1256 870 -.000764884287
+328 871 -1.04966235
+382 871 .150337681
+436 871 -.874718606
+664 871 -.00310189673
+665 871 -.00123634702
+666 871 -.00101881835
+667 871 -.0010465408
+668 871 -.00115193555
+669 871 -.00114678754
+670 871 -.000976467039
+671 871 -.00126755168
+672 871 -.0110509451
+679 871 -.0198407471
+705 871 -.0279315859
+708 871 -.00627332646
+716 871 -.010952903
+717 871 -.0117584094
+719 871 -.0194125865
+763 871 1
+1029 871 -.0393045694
+1242 871 .0444007665
+1256 871 -.00515591446
+383 872 2
+764 872 1
+384 873 .5
+765 873 1
+385 874 2
+766 874 1
+386 875 2
+767 875 1
+387 876 2
+768 876 1
+388 877 2
+769 877 1
+389 878 2
+770 878 1
+390 879 2
+713 879 -.970873654
+771 879 1
+391 880 2
+772 880 1
+392 881 2
+773 881 1
+1 882 .20696941
+2 882 .142192304
+3 882 .0991375148
+4 882 .121215612
+5 882 .107199252
+6 882 .0979955196
+7 882 .0866832137
+8 882 .113363981
+9 882 .514388204
+10 882 .399175406
+11 882 .279817909
+12 882 .377357244
+13 882 .0494234264
+14 882 .367496192
+15 882 .0883079469
+16 882 .273117274
+17 882 .162650615
+18 882 .108350277
+19 882 .178367764
+20 882 .1423015
+21 882 .199999988
+22 882 .157062501
+23 882 .346346229
+24 882 .168499485
+25 882 .143396467
+26 882 .221011877
+27 882 .131137341
+28 882 .203243569
+29 882 .262830466
+30 882 .220662773
+31 882 .205521852
+32 882 .304139465
+33 882 .250172228
+34 882 .327778846
+35 882 .386875868
+36 882 .28614679
+37 882 .349301606
+38 882 .204914153
+39 882 .16316402
+40 882 .144504279
+41 882 .119339019
+42 882 .449773788
+43 882 .122339576
+44 882 .175039321
+45 882 .313483953
+46 882 .235000014
+47 882 .118269891
+48 882 .00908070803
+49 882 .5
+50 882 .25
+53 882 -.29303059
+54 882 -.357807696
+55 882 -.400862485
+56 882 -.378784388
+57 882 -.392800748
+58 882 -.40200448
+59 882 -.413316786
+60 882 -.386636019
+61 882 -.185611829
+62 882 -.100824602
+63 882 -.220182091
+64 882 -.222642779
+65 882 -.450576574
+66 882 -.132503808
+67 882 -.411692053
+68 882 -.426882714
+69 882 -.337349385
+70 882 -.391649723
+71 882 -.42163226
+72 882 -.3576985
+73 882 -.5
+74 882 -.542937458
+75 882 -.253653795
+76 882 -.431500524
+77 882 -.356603533
+78 882 -.278988123
+79 882 -.368862659
+80 882 -.39675644
+81 882 -.237169534
+82 882 -.379337251
+83 882 -.294478148
+84 882 -.395860523
+85 882 -.44982776
+86 882 -.172221169
+87 882 -.213124171
+88 882 -.313853234
+89 882 -.150698408
+90 882 -.295085847
+91 882 -.33683598
+92 882 -.355495721
+93 882 -.380660981
+94 882 -.150226235
+95 882 -.377660424
+96 882 -.324960679
+97 882 -.186516061
+98 882 -.264999986
+99 882 -.381730109
+100 882 -.490919292
+102 882 -.25
+103 882 -.5
+104 882 -.5
+105 882 -.29303059
+106 882 -.357807696
+107 882 -.400862485
+108 882 -.378784388
+109 882 -.392800748
+110 882 -.40200448
+111 882 -.413316786
+112 882 -.386636019
+113 882 -.185611829
+114 882 -.100824602
+115 882 -.220182091
+116 882 -.222642779
+117 882 -.450576574
+118 882 -.132503808
+119 882 -.411692053
+120 882 -.426882714
+121 882 -.337349385
+122 882 -.391649723
+123 882 -.42163226
+124 882 -.3576985
+125 882 -.5
+126 882 -.542937458
+127 882 -.253653795
+128 882 -.431500524
+129 882 -.356603533
+130 882 -.278988123
+131 882 -.368862659
+132 882 -.39675644
+133 882 -.237169534
+134 882 -.379337251
+135 882 -.294478148
+136 882 -.395860523
+137 882 -.44982776
+138 882 -.172221169
+139 882 -.213124171
+140 882 -.313853234
+141 882 -.150698408
+142 882 -.295085847
+143 882 -.33683598
+144 882 -.355495721
+145 882 -.380660981
+146 882 -.150226235
+147 882 -.377660424
+148 882 -.324960679
+149 882 -.186516061
+150 882 -.264999986
+151 882 -.381730109
+152 882 -.490919292
+154 882 -.25
+155 882 -.5
+156 882 -.5
+664 882 -.484386444
+665 882 -.561829507
+666 882 -.671342134
+667 882 -.577910066
+668 882 -.565740168
+669 882 -.557189345
+670 882 -.678157926
+671 882 -.521830022
+672 882 -.0384170078
+673 882 -.0872260258
+674 882 -.206981167
+675 882 -.104951881
+676 882 -.513661385
+677 882 -.0955502614
+678 882 -.301023483
+679 882 -.379936486
+680 882 -.387598157
+681 882 -.401674479
+682 882 -.401453912
+683 882 -.455248922
+684 882 -.283446878
+685 882 -.443214297
+686 882 -.17109184
+687 882 -.412138194
+688 882 -.389837623
+689 882 -.461512387
+690 882 -.46407631
+691 882 -.327409387
+692 882 -.221497595
+693 882 -.314730257
+694 882 -.230232194
+695 882 -.156761721
+696 882 -.232789949
+697 882 -.127302766
+698 882 -.161466956
+699 882 -.207014278
+700 882 -.171532094
+701 882 -.333930194
+702 882 -.481194258
+703 882 -.308955878
+704 882 -.35572347
+705 882 -.0863323063
+706 882 -.710011542
+707 882 -.441480815
+708 882 -.243827671
+709 882 -.381600082
+710 882 -.38172999
+711 882 -.95232147
+714 882 -1
+715 882 -1
+1258 882 1
+1 883 -.175656468
+53 883 .324343532
+105 883 -.175656468
+664 883 -.290364295
+828 883 -2.5
+2 884 -.0748264492
+54 884 .425173551
+106 884 -.0748264492
+665 884 -.11749246
+829 884 -2.70000005
+3 885 -.0577010401
+55 885 .442298949
+107 885 -.0577010401
+666 885 -.0966344848
+830 885 -2.5
+4 886 -.0647369847
+56 886 .435263008
+108 886 -.0647369847
+667 886 -.0987690017
+831 886 -2.70000005
+5 887 -.0703428686
+57 887 .429657131
+109 887 -.0703428686
+668 887 -.101312913
+832 887 -2.9000001
+6 888 -.0771906078
+58 888 .422809392
+110 888 -.0771906078
+669 888 -.106988318
+833 888 -2.9000001
+7 889 -.0580480322
+59 889 .44195196
+111 889 -.0580480322
+670 889 -.0952434912
+834 889 -2.9000001
+8 890 -.086330615
+60 890 .413669378
+112 890 -.086330615
+671 890 -.116517611
+835 890 -2.9000001
+9 891 -.514388204
+61 891 .185611799
+113 891 -.514388204
+672 891 -.106465489
+836 891 -2.4000001
+10 892 -.399175406
+62 892 .100824594
+114 892 -.399175406
+673 892 -.345337152
+837 892 -1.5
+11 893 -.279817909
+63 893 .220182091
+115 893 -.279817909
+674 893 -.263041526
+838 893 -2.4000001
+12 894 -.377357244
+64 894 .222642779
+116 894 -.377357244
+675 894 -.177882954
+839 894 -1.89999998
+13 895 -.0494234152
+65 895 .450576574
+117 895 -.0494234152
+676 895 -.0563431382
+840 895 -1.60000002
+14 896 -.367496192
+66 896 .132503808
+118 896 -.367496192
+677 896 -.265006363
+841 896 -2.0999999
+15 897 -.0883079469
+67 897 .411692053
+119 897 -.0883079469
+678 897 -.0645695329
+842 897 -1.79999995
+16 898 -.273117244
+68 898 .426882774
+120 898 -.273117244
+679 898 -.243081301
+843 898 -1.79999995
+17 899 -.1626506
+69 899 .337349415
+121 899 -.1626506
+680 899 -.186877683
+844 899 -1.79999995
+18 900 -.108350284
+70 900 .391649723
+122 900 -.108350284
+681 900 -.111123636
+845 900 -1.79999995
+19 901 -.178367749
+71 901 .42163229
+123 901 -.178367749
+682 901 -.169831485
+846 901 -1.79999995
+20 902 -.1423015
+72 902 .3576985
+124 902 -.1423015
+683 902 -.181109533
+847 902 -1.79999995
+21 903 -.200000003
+73 903 .5
+125 903 -.200000003
+684 903 -.113378748
+848 903 -1.89999998
+22 904 -.157062501
+74 904 .542937458
+126 904 -.157062501
+685 904 -.128214285
+849 904 -1.89999998
+23 905 -.346346229
+75 905 .253653795
+127 905 -.346346229
+686 905 -.233613744
+850 905 -1.79999995
+24 906 -.1684995
+76 906 .431500524
+128 906 -.1684995
+687 906 -.160938576
+851 906 -2.20000005
+25 907 -.143396482
+77 907 .356603533
+129 907 -.143396482
+688 907 -.156760484
+852 907 -2.5999999
+26 908 -.221011877
+78 908 .278988123
+130 908 -.221011877
+689 908 -.36560598
+853 908 -2.5999999
+27 909 -.131137356
+79 909 .368862659
+131 909 -.131137356
+690 909 -.164987534
+854 909 -2.20000005
+28 910 -.203243569
+80 910 .39675644
+132 910 -.203243569
+691 910 -.167719662
+855 910 -1.5
+29 911 -.262830466
+81 911 .237169534
+133 911 -.262830466
+692 911 -.245462865
+856 911 -3
+30 912 -.220662788
+82 912 .379337251
+134 912 -.220662788
+693 912 -.183080494
+857 912 -1.89999998
+31 913 -.205521852
+83 913 .294478148
+135 913 -.205521852
+694 913 -.160683393
+858 913 -2.4000001
+32 914 -.304139495
+84 914 .395860523
+136 914 -.304139495
+695 914 -.120439976
+859 914 -2.4000001
+33 915 -.250172228
+85 915 .44982776
+137 915 -.250172228
+696 915 -.1294664
+860 915 -2.4000001
+34 916 -.327778846
+86 916 .172221154
+138 916 -.327778846
+697 916 -.242288172
+861 916 -2.4000001
+35 917 -.386875868
+87 917 .213124156
+139 917 -.386875868
+698 917 -.293104559
+862 917 -1.89999998
+36 918 -.28614679
+88 918 .313853234
+140 918 -.28614679
+699 918 -.188739419
+863 918 -2.20000005
+37 919 -.349301606
+89 919 .150698394
+141 919 -.349301606
+700 919 -.39759168
+864 919 -1.70000005
+38 920 -.204914153
+90 920 .295085847
+142 920 -.204914153
+701 920 -.231888533
+865 920 -2.20000005
+39 921 -.16316402
+91 921 .33683598
+143 921 -.16316402
+702 921 -.233091459
+866 921 -2.9000001
+40 922 -.144504279
+92 922 .355495721
+144 922 -.144504279
+703 922 -.12558645
+867 922 -2.20000005
+41 923 -.119339012
+93 923 .380660981
+145 923 -.119339012
+704 923 -.111520983
+868 923 -2.5999999
+42 924 -.449773788
+94 924 .150226235
+146 924 -.449773788
+705 924 -.258476883
+869 924 -1.79999995
+43 925 -.122339584
+95 925 .377660424
+147 925 -.122339584
+706 925 -.230001658
+870 925 -1.70000005
+44 926 -.175039321
+96 926 .324960679
+148 926 -.175039321
+707 926 -.237802625
+871 926 -1.70000005
+45 927 -.313483924
+97 927 .186516076
+149 927 -.313483924
+708 927 -.4098095
+872 927 -1.60000002
+46 928 -.234999999
+98 928 .265000015
+150 928 -.234999999
+709 928 -.338400066
+873 928 -1.60000002
+47 929 -.118269883
+99 929 .381730109
+151 929 -.118269883
+710 929 -.118269853
+874 929 -1.60000002
+48 930 -.00908071082
+100 930 .490919292
+152 930 -.00908071082
+711 930 -.0176154319
+875 930 -1.5
+49 931 -.5
+153 931 -.5
+712 931 -.96999979
+876 931 -1.5
+50 932 -.25
+102 932 .25
+154 932 -.25
+877 932 -1.60000002
+103 933 .5
+878 933 -1.70000005
+104 934 .5
+879 934 -1.70000005
+1 935 -.03131295
+53 935 -.03131295
+105 935 .468687057
+664 935 -.0517610461
+2 936 -.067365855
+54 936 -.067365855
+106 936 .432634145
+665 936 -.105777845
+3 937 -.0414364599
+55 937 -.0414364599
+107 937 .458563536
+666 937 -.0693954676
+4 938 -.0564786419
+56 938 -.0564786419
+108 938 .443521351
+667 938 -.0861692801
+5 939 -.0368563868
+57 939 -.0368563868
+109 939 .463143617
+668 939 -.0530832484
+6 940 -.0208049174
+58 940 -.0208049174
+110 940 .479195088
+669 940 -.0288361926
+7 941 -.0286351871
+59 941 -.0286351871
+111 941 .471364826
+670 941 -.0469837673
+8 942 -.0270333719
+60 942 -.0270333719
+112 942 .472966641
+671 942 -.0364860594
+113 943 .699999988
+114 944 .5
+115 945 .5
+116 946 .600000024
+117 947 .5
+118 948 .5
+119 949 .5
+120 950 .699999988
+121 951 .5
+122 952 .5
+123 953 .600000024
+124 954 .5
+125 955 .699999988
+126 956 .699999988
+127 957 .600000024
+128 958 .600000024
+129 959 .5
+130 960 .5
+131 961 .5
+132 962 .600000024
+133 963 .5
+134 964 .600000024
+135 965 .5
+136 966 .699999988
+137 967 .699999988
+138 968 .5
+139 969 .600000024
+140 970 .600000024
+141 971 .5
+142 972 .5
+143 973 .5
+144 974 .5
+145 975 .5
+146 976 .600000024
+147 977 .5
+148 978 .5
+149 979 .5
+150 980 .5
+151 981 .5
+152 982 .5
+153 983 .5
+154 984 .5
+155 985 .5
+156 986 .5
+393 987 1
+447 987 .21751985
+394 988 1
+448 988 .21751985
+395 989 1
+449 989 .21751985
+396 990 1
+447 990 .000778065412
+448 990 .000778065412
+449 990 .000778065412
+450 990 .218297914
+451 990 .000778065412
+452 990 .00194401259
+453 990 .000778065412
+454 990 .00230427063
+455 990 .00110379385
+456 990 .00124306313
+457 990 .000778065412
+458 990 .000778065412
+459 990 .00230427063
+460 990 .00164245476
+461 990 .00148016575
+462 990 .00194401259
+463 990 .000778065412
+464 990 .00148016575
+465 990 .00148016575
+466 990 .00148016575
+467 990 .00194401259
+468 990 .00194401259
+469 990 .00148016575
+470 990 .00194401259
+471 990 .00194401259
+472 990 .00148016575
+473 990 .00194401259
+474 990 .00194401259
+475 990 .00194401259
+476 990 .00148016575
+477 990 .00194401259
+478 990 .00230427063
+479 990 .00148016575
+480 990 .00124306313
+481 990 .000778065412
+482 990 .000778065412
+483 990 .00164245476
+484 990 .000962222868
+485 990 .000962222868
+486 990 .00148016575
+487 990 .000778065412
+488 990 .00194401259
+489 990 .00148016575
+490 990 .00194401259
+491 990 .00194401259
+492 990 .00194401259
+493 990 .00194401259
+494 990 .00194401259
+495 990 .00194401259
+496 990 .00194401259
+497 990 .00144678715
+498 990 .00194401259
+499 990 .00194401259
+500 990 .00194401259
+397 991 1
+447 991 .00995924044
+448 991 .00995924044
+449 991 .00995924044
+450 991 .00995924044
+451 991 .227479085
+452 991 .024883369
+453 991 .00995924044
+454 991 .029494673
+455 991 .0141285667
+456 991 .0159112141
+457 991 .00995924044
+458 991 .00995924044
+459 991 .029494673
+460 991 .0210234281
+461 991 .0189461298
+462 991 .024883369
+463 991 .00995924044
+464 991 .0189461298
+465 991 .0189461298
+466 991 .0189461298
+467 991 .024883369
+468 991 .024883369
+469 991 .0189461298
+470 991 .024883369
+471 991 .024883369
+472 991 .0189461298
+473 991 .024883369
+474 991 .024883369
+475 991 .024883369
+476 991 .0189461298
+477 991 .024883369
+478 991 .029494673
+479 991 .0189461298
+480 991 .0159112141
+481 991 .00995924044
+482 991 .00995924044
+483 991 .0210234281
+484 991 .0123164579
+485 991 .0123164579
+486 991 .0189461298
+487 991 .00995924044
+488 991 .024883369
+489 991 .0189461298
+490 991 .024883369
+491 991 .024883369
+492 991 .024883369
+493 991 .024883369
+494 991 .024883369
+495 991 .024883369
+496 991 .024883369
+497 991 .0185188837
+498 991 .024883369
+499 991 .024883369
+500 991 .024883369
+398 992 1
+452 992 .543477893
+399 993 1
+447 993 .00552011142
+448 993 .00552011142
+449 993 .00552011142
+450 993 .00552011142
+451 993 .00552011142
+452 993 .0137921125
+453 993 .223039955
+454 993 .0163480211
+455 993 .0078310445
+456 993 .00881911349
+457 993 .00552011142
+458 993 .00552011142
+459 993 .0163480211
+460 993 .0116526615
+461 993 .0105012767
+462 993 .0137921125
+463 993 .00552011142
+464 993 .0105012767
+465 993 .0105012767
+466 993 .0105012767
+467 993 .0137921125
+468 993 .0137921125
+469 993 .0105012767
+470 993 .0137921125
+471 993 .0137921125
+472 993 .0105012767
+473 993 .0137921125
+474 993 .0137921125
+475 993 .0137921125
+476 993 .0105012767
+477 993 .0137921125
+478 993 .0163480211
+479 993 .0105012767
+480 993 .00881911349
+481 993 .00552011142
+482 993 .00552011142
+483 993 .0116526615
+484 993 .00682664663
+485 993 .00682664663
+486 993 .0105012767
+487 993 .00552011142
+488 993 .0137921125
+489 993 .0105012767
+490 993 .0137921125
+491 993 .0137921125
+492 993 .0137921125
+493 993 .0137921125
+494 993 .0137921125
+495 993 .0137921125
+496 993 .0137921125
+497 993 .0102644665
+498 993 .0137921125
+499 993 .0137921125
+500 993 .0137921125
+400 994 1
+454 994 .644193411
+401 995 1
+447 995 .0163674168
+448 995 .0163674168
+449 995 .0163674168
+450 995 .0163674168
+451 995 .0163674168
+452 995 .0408943295
+453 995 .0163674168
+454 995 .048472736
+455 995 .331801593
+456 995 .0261491276
+457 995 .0163674168
+458 995 .0163674168
+459 995 .048472736
+460 995 .034550745
+461 995 .0311368294
+462 995 .0408943295
+463 995 .0163674168
+464 995 .0311368294
+465 995 .0311368294
+466 995 .0311368294
+467 995 .0408943295
+468 995 .0408943295
+469 995 .0311368294
+470 995 .0408943295
+471 995 .0408943295
+472 995 .0311368294
+473 995 .0408943295
+474 995 .0408943295
+475 995 .0408943295
+476 995 .0311368294
+477 995 .0408943295
+478 995 .048472736
+479 995 .0311368294
+480 995 .0261491276
+481 995 .0163674168
+482 995 .0163674168
+483 995 .034550745
+484 995 .0202413611
+485 995 .0202413611
+486 995 .0311368294
+487 995 .0163674168
+488 995 .0408943295
+489 995 .0311368294
+490 995 .0408943295
+491 995 .0408943295
+492 995 .0408943295
+493 995 .0408943295
+494 995 .0408943295
+495 995 .0408943295
+496 995 .0408943295
+497 995 .0304346774
+498 995 .0408943295
+499 995 .0408943295
+500 995 .0408943295
+402 996 1
+447 996 .0236398429
+448 996 .0236398429
+449 996 .0236398429
+450 996 .0236398429
+451 996 .0236398429
+452 996 .0590646379
+453 996 .0236398429
+454 996 .0700103045
+455 996 .0335364044
+456 996 .385284722
+457 996 .0236398429
+458 996 .0236398429
+459 996 .0700103045
+460 996 .049902454
+461 996 .0449716561
+462 996 .0590646379
+463 996 .0236398429
+464 996 .0449716561
+465 996 .0449716561
+466 996 .0449716561
+467 996 .0590646379
+468 996 .0590646379
+469 996 .0449716561
+470 996 .0590646379
+471 996 .0590646379
+472 996 .0449716561
+473 996 .0590646379
+474 996 .0590646379
+475 996 .0590646379
+476 996 .0449716561
+477 996 .0590646379
+478 996 .0700103045
+479 996 .0449716561
+480 996 .0377677977
+481 996 .0236398429
+482 996 .0236398429
+483 996 .049902454
+484 996 .0292350724
+485 996 .0292350724
+486 996 .0449716561
+487 996 .0236398429
+488 996 .0590646379
+489 996 .0449716561
+490 996 .0590646379
+491 996 .0590646379
+492 996 .0590646379
+493 996 .0590646379
+494 996 .0590646379
+495 996 .0590646379
+496 996 .0590646379
+497 996 .0439575128
+498 996 .0590646379
+499 996 .0590646379
+500 996 .0590646379
+403 997 1
+447 997 .0112819523
+448 997 .0112819523
+449 997 .0112819523
+450 997 .0112819523
+451 997 .0112819523
+452 997 .0281881951
+453 997 .0112819523
+454 997 .033411935
+455 997 .0160050169
+456 997 .0180244222
+457 997 .228801802
+458 997 .0112819523
+459 997 .033411935
+460 997 .0238156039
+461 997 .0214624126
+462 997 .0281881951
+463 997 .0112819523
+464 997 .0214624126
+465 997 .0214624126
+466 997 .0214624126
+467 997 .0281881951
+468 997 .0281881951
+469 997 .0214624126
+470 997 .0281881951
+471 997 .0281881951
+472 997 .0214624126
+473 997 .0281881951
+474 997 .0281881951
+475 997 .0281881951
+476 997 .0214624126
+477 997 .0281881951
+478 997 .033411935
+479 997 .0214624126
+480 997 .0180244222
+481 997 .0112819523
+482 997 .0112819523
+483 997 .0238156039
+484 997 .0139522376
+485 997 .0139522376
+486 997 .0214624126
+487 997 .0112819523
+488 997 .0281881951
+489 997 .0214624126
+490 997 .0281881951
+491 997 .0281881951
+492 997 .0281881951
+493 997 .0281881951
+494 997 .0281881951
+495 997 .0281881951
+496 997 .0281881951
+497 997 .0209784228
+498 997 .0281881951
+499 997 .0281881951
+500 997 .0281881951
+404 998 1
+447 998 .0145590007
+448 998 .0145590007
+449 998 .0145590007
+450 998 .0145590007
+451 998 .0145590007
+452 998 .0363759659
+453 998 .0145590007
+454 998 .0431170389
+455 998 .020653965
+456 998 .0232599434
+457 998 .0145590007
+458 998 .23207885
+459 998 .0431170389
+460 998 .030733278
+461 998 .0276965592
+462 998 .0363759659
+463 998 .0145590007
+464 998 .0276965592
+465 998 .0276965592
+466 998 .0276965592
+467 998 .0363759659
+468 998 .0363759659
+469 998 .0276965592
+470 998 .0363759659
+471 998 .0363759659
+472 998 .0276965592
+473 998 .0363759659
+474 998 .0363759659
+475 998 .0363759659
+476 998 .0276965592
+477 998 .0363759659
+478 998 .0431170389
+479 998 .0276965592
+480 998 .0232599434
+481 998 .0145590007
+482 998 .0145590007
+483 998 .030733278
+484 998 .0180049166
+485 998 .0180049166
+486 998 .0276965592
+487 998 .0145590007
+488 998 .0363759659
+489 998 .0276965592
+490 998 .0363759659
+491 998 .0363759659
+492 998 .0363759659
+493 998 .0363759659
+494 998 .0363759659
+495 998 .0363759659
+496 998 .0363759659
+497 998 .0270719863
+498 998 .0363759659
+499 998 .0363759659
+500 998 .0363759659
+405 999 1
+447 999 .00399022875
+448 999 .00399022875
+449 999 .00399022875
+450 999 .00399022875
+451 999 .00399022875
+452 999 .00996967033
+453 999 .00399022875
+454 999 .0118172178
+455 999 .00566069456
+456 999 .00637492212
+457 999 .00399022875
+458 999 .00399022875
+459 999 .656010628
+460 999 .00842316169
+461 999 .00759087969
+462 999 .00996967033
+463 999 .00399022875
+464 999 .00759087969
+465 999 .00759087969
+466 999 .00759087969
+467 999 .00996967033
+468 999 .00996967033
+469 999 .00759087969
+470 999 .00996967033
+471 999 .00996967033
+472 999 .00759087969
+473 999 .00996967033
+474 999 .00996967033
+475 999 .00996967033
+476 999 .00759087969
+477 999 .00996967033
+478 999 .0118172178
+479 999 .00759087969
+480 999 .00637492212
+481 999 .00399022875
+482 999 .00399022875
+483 999 .00842316169
+484 999 .00493466202
+485 999 .00493466202
+486 999 .00759087969
+487 999 .00399022875
+488 999 .00996967033
+489 999 .00759087969
+490 999 .00996967033
+491 999 .00996967033
+492 999 .00996967033
+493 999 .00996967033
+494 999 .00996967033
+495 999 .00996967033
+496 999 .00996967033
+497 999 .00741970027
+498 999 .00996967033
+499 999 .00996967033
+500 999 .00996967033
+406 1000 1
+460 1000 .459172845
+407 1001 1
+447 1001 .000276930194
+448 1001 .000276930194
+449 1001 .000276930194
+450 1001 .000276930194
+451 1001 .000276930194
+452 1001 .000691915862
+453 1001 .000276930194
+454 1001 .000820139423
+455 1001 .000392863934
+456 1001 .00044243288
+457 1001 .000276930194
+458 1001 .000276930194
+459 1001 .000820139423
+460 1001 .000584584952
+461 1001 .41432938
+462 1001 .000691915862
+463 1001 .000276930194
+464 1001 .000526822812
+465 1001 .000526822812
+466 1001 .000526822812
+467 1001 .000691915862
+468 1001 .000691915862
+469 1001 .000526822812
+470 1001 .000691915862
+471 1001 .000691915862
+472 1001 .000526822812
+473 1001 .000691915862
+474 1001 .000691915862
+475 1001 .000691915862
+476 1001 .000526822812
+477 1001 .000691915862
+478 1001 .000820139423
+479 1001 .000526822812
+480 1001 .00044243288
+481 1001 .000276930194
+482 1001 .000276930194
+483 1001 .000584584952
+484 1001 .000342475803
+485 1001 .000342475803
+486 1001 .000526822812
+487 1001 .000276930194
+488 1001 .000691915862
+489 1001 .000526822812
+490 1001 .000691915862
+491 1001 .000691915862
+492 1001 .000691915862
+493 1001 .000691915862
+494 1001 .000691915862
+495 1001 .000691915862
+496 1001 .000691915862
+497 1001 .000514942629
+498 1001 .000691915862
+499 1001 .000691915862
+500 1001 .000691915862
+408 1002 1
+447 1002 .00179941696
+448 1002 .00179941696
+449 1002 .00179941696
+450 1002 .00179941696
+451 1002 .00179941696
+452 1002 .00449588103
+453 1002 .00179941696
+454 1002 .00532904267
+455 1002 .00255272305
+456 1002 .00287480839
+457 1002 .00179941696
+458 1002 .00179941696
+459 1002 .00532904267
+460 1002 .00379847386
+461 1002 .00342315156
+462 1002 .547973752
+463 1002 .00179941696
+464 1002 .00342315156
+465 1002 .00342315156
+466 1002 .00342315156
+467 1002 .00449588103
+468 1002 .00449588103
+469 1002 .00342315156
+470 1002 .00449588103
+471 1002 .00449588103
+472 1002 .00342315156
+473 1002 .00449588103
+474 1002 .00449588103
+475 1002 .00449588103
+476 1002 .00342315156
+477 1002 .00449588103
+478 1002 .00532904267
+479 1002 .00342315156
+480 1002 .00287480839
+481 1002 .00179941696
+482 1002 .00179941696
+483 1002 .00379847386
+484 1002 .00222531473
+485 1002 .00222531473
+486 1002 .00342315156
+487 1002 .00179941696
+488 1002 .00449588103
+489 1002 .00342315156
+490 1002 .00449588103
+491 1002 .00449588103
+492 1002 .00449588103
+493 1002 .00449588103
+494 1002 .00449588103
+495 1002 .00449588103
+496 1002 .00449588103
+497 1002 .00334595726
+498 1002 .00449588103
+499 1002 .00449588103
+500 1002 .00449588103
+409 1003 1
+447 1003 .00340027385
+448 1003 .00340027385
+449 1003 .00340027385
+450 1003 .00340027385
+451 1003 .00340027385
+452 1003 .00849565491
+453 1003 .00340027385
+454 1003 .0100700418
+455 1003 .00482376106
+456 1003 .00543239061
+457 1003 .00340027385
+458 1003 .00340027385
+459 1003 .0100700418
+460 1003 .00717779715
+461 1003 .00646856846
+462 1003 .00849565491
+463 1003 .220920131
+464 1003 .00646856846
+465 1003 .00646856846
+466 1003 .00646856846
+467 1003 .00849565491
+468 1003 .00849565491
+469 1003 .00646856846
+470 1003 .00849565491
+471 1003 .00849565491
+472 1003 .00646856846
+473 1003 .00849565491
+474 1003 .00849565491
+475 1003 .00849565491
+476 1003 .00646856846
+477 1003 .00849565491
+478 1003 .0100700418
+479 1003 .00646856846
+480 1003 .00543239061
+481 1003 .00340027385
+482 1003 .00340027385
+483 1003 .00717779715
+484 1003 .0042050723
+485 1003 .0042050723
+486 1003 .00646856846
+487 1003 .00340027385
+488 1003 .00849565491
+489 1003 .00646856846
+490 1003 .00849565491
+491 1003 .00849565491
+492 1003 .00849565491
+493 1003 .00849565491
+494 1003 .00849565491
+495 1003 .00849565491
+496 1003 .00849565491
+497 1003 .00632269774
+498 1003 .00849565491
+499 1003 .00849565491
+500 1003 .00849565491
+410 1004 1
+447 1004 .00188028405
+448 1004 .00188028405
+449 1004 .00188028405
+450 1004 .00188028405
+451 1004 .00188028405
+452 1004 .00469792867
+453 1004 .00188028405
+454 1004 .00556853367
+455 1004 .00266744406
+456 1004 .00300400425
+457 1004 .00188028405
+458 1004 .00188028405
+459 1004 .00556853367
+460 1004 .00396917993
+461 1004 .00357698998
+462 1004 .00469792867
+463 1004 .00188028405
+464 1004 .417379558
+465 1004 .00357698998
+466 1004 .00357698998
+467 1004 .00469792867
+468 1004 .00469792867
+469 1004 .00357698998
+470 1004 .00469792867
+471 1004 .00469792867
+472 1004 .00357698998
+473 1004 .00469792867
+474 1004 .00469792867
+475 1004 .00469792867
+476 1004 .00357698998
+477 1004 .00469792867
+478 1004 .00556853367
+479 1004 .00357698998
+480 1004 .00300400425
+481 1004 .00188028405
+482 1004 .00188028405
+483 1004 .00396917993
+484 1004 .00232532178
+485 1004 .00232532178
+486 1004 .00357698998
+487 1004 .00188028405
+488 1004 .00469792867
+489 1004 .00357698998
+490 1004 .00469792867
+491 1004 .00469792867
+492 1004 .00469792867
+493 1004 .00469792867
+494 1004 .00469792867
+495 1004 .00469792867
+496 1004 .00469792867
+497 1004 .00349632674
+498 1004 .00469792867
+499 1004 .00469792867
+500 1004 .00469792867
+411 1005 1
+447 1005 .000875091413
+448 1005 .000875091413
+449 1005 .000875091413
+450 1005 .000875091413
+451 1005 .000875091413
+452 1005 .00218643411
+453 1005 .000875091413
+454 1005 .00259161694
+455 1005 .00124143891
+456 1005 .00139807514
+457 1005 .000875091413
+458 1005 .000875091413
+459 1005 .00259161694
+460 1005 .00184727146
+461 1005 .00166474504
+462 1005 .00218643411
+463 1005 .000875091413
+464 1005 .00166474504
+465 1005 .415467322
+466 1005 .00166474504
+467 1005 .00218643411
+468 1005 .00218643411
+469 1005 .00166474504
+470 1005 .00218643411
+471 1005 .00218643411
+472 1005 .00166474504
+473 1005 .00218643411
+474 1005 .00218643411
+475 1005 .00218643411
+476 1005 .00166474504
+477 1005 .00218643411
+478 1005 .00259161694
+479 1005 .00166474504
+480 1005 .00139807514
+481 1005 .000875091413
+482 1005 .000875091413
+483 1005 .00184727146
+484 1005 .00108221371
+485 1005 .00108221371
+486 1005 .00166474504
+487 1005 .000875091413
+488 1005 .00218643411
+489 1005 .00166474504
+490 1005 .00218643411
+491 1005 .00218643411
+492 1005 .00218643411
+493 1005 .00218643411
+494 1005 .00218643411
+495 1005 .00218643411
+496 1005 .00218643411
+497 1005 .00162720389
+498 1005 .00218643411
+499 1005 .00218643411
+500 1005 .00218643411
+412 1006 1
+447 1006 .000389279012
+448 1006 .000389279012
+449 1006 .000389279012
+450 1006 .000389279012
+451 1006 .000389279012
+452 1006 .000972621783
+453 1006 .000389279012
+454 1006 .00115286477
+455 1006 .000552246405
+456 1006 .000621925166
+457 1006 .000389279012
+458 1006 .000389279012
+459 1006 .00115286477
+460 1006 .000821747351
+461 1006 .000740551506
+462 1006 .000972621783
+463 1006 .000389279012
+464 1006 .000740551506
+465 1006 .000740551506
+466 1006 .414543122
+467 1006 .000972621783
+468 1006 .000972621783
+469 1006 .000740551506
+470 1006 .000972621783
+471 1006 .000972621783
+472 1006 .000740551506
+473 1006 .000972621783
+474 1006 .000972621783
+475 1006 .000972621783
+476 1006 .000740551506
+477 1006 .000972621783
+478 1006 .00115286477
+479 1006 .000740551506
+480 1006 .000621925166
+481 1006 .000389279012
+482 1006 .000389279012
+483 1006 .000821747351
+484 1006 .000481416093
+485 1006 .000481416093
+486 1006 .000740551506
+487 1006 .000389279012
+488 1006 .000972621783
+489 1006 .000740551506
+490 1006 .000972621783
+491 1006 .000972621783
+492 1006 .000972621783
+493 1006 .000972621783
+494 1006 .000972621783
+495 1006 .000972621783
+496 1006 .000972621783
+497 1006 .00072385167
+498 1006 .000972621783
+499 1006 .000972621783
+500 1006 .000972621783
+413 1007 1
+447 1007 .000658825913
+448 1007 .000658825913
+449 1007 .000658825913
+450 1007 .000658825913
+451 1007 .000658825913
+452 1007 .0016460903
+453 1007 .000658825913
+454 1007 .00195113849
+455 1007 .000934636162
+456 1007 .00105256215
+457 1007 .000658825913
+458 1007 .000658825913
+459 1007 .00195113849
+460 1007 .00139074656
+461 1007 .00125332864
+462 1007 .0016460903
+463 1007 .000658825913
+464 1007 .00125332864
+465 1007 .00125332864
+466 1007 .00125332864
+467 1007 .545123994
+468 1007 .0016460903
+469 1007 .00125332864
+470 1007 .0016460903
+471 1007 .0016460903
+472 1007 .00125332864
+473 1007 .0016460903
+474 1007 .0016460903
+475 1007 .0016460903
+476 1007 .00125332864
+477 1007 .0016460903
+478 1007 .00195113849
+479 1007 .00125332864
+480 1007 .00105256215
+481 1007 .000658825913
+482 1007 .000658825913
+483 1007 .00139074656
+484 1007 .000814761093
+485 1007 .000814761093
+486 1007 .00125332864
+487 1007 .000658825913
+488 1007 .0016460903
+489 1007 .00125332864
+490 1007 .0016460903
+491 1007 .0016460903
+492 1007 .0016460903
+493 1007 .0016460903
+494 1007 .0016460903
+495 1007 .0016460903
+496 1007 .0016460903
+497 1007 .00122506532
+498 1007 .0016460903
+499 1007 .0016460903
+500 1007 .0016460903
+414 1008 1
+447 1008 .000171326144
+448 1008 .000171326144
+449 1008 .000171326144
+450 1008 .000171326144
+451 1008 .000171326144
+452 1008 .000428061961
+453 1008 .000171326144
+454 1008 .000507389021
+455 1008 .000243049974
+456 1008 .000273716345
+457 1008 .000171326144
+458 1008 .000171326144
+459 1008 .000507389021
+460 1008 .000361660408
+461 1008 .000325925212
+462 1008 .000428061961
+463 1008 .000171326144
+464 1008 .000325925212
+465 1008 .000325925212
+466 1008 .000325925212
+467 1008 .000428061961
+468 1008 .543905973
+469 1008 .000325925212
+470 1008 .000428061961
+471 1008 .000428061961
+472 1008 .000325925212
+473 1008 .000428061961
+474 1008 .000428061961
+475 1008 .000428061961
+476 1008 .000325925212
+477 1008 .000428061961
+478 1008 .000507389021
+479 1008 .000325925212
+480 1008 .000273716345
+481 1008 .000171326144
+482 1008 .000171326144
+483 1008 .000361660408
+484 1008 .00021187673
+485 1008 .00021187673
+486 1008 .000325925212
+487 1008 .000171326144
+488 1008 .000428061961
+489 1008 .000325925212
+490 1008 .000428061961
+491 1008 .000428061961
+492 1008 .000428061961
+493 1008 .000428061961
+494 1008 .000428061961
+495 1008 .000428061961
+496 1008 .000428061961
+497 1008 .000318575389
+498 1008 .000428061961
+499 1008 .000428061961
+500 1008 .000428061961
+415 1009 1
+447 1009 .00148846791
+448 1009 .00148846791
+449 1009 .00148846791
+450 1009 .00148846791
+451 1009 .00148846791
+452 1009 .00371896802
+453 1009 .00148846791
+454 1009 .0044081551
+455 1009 .00211159862
+456 1009 .00237802579
+457 1009 .00148846791
+458 1009 .00148846791
+459 1009 .0044081551
+460 1009 .00314207654
+461 1009 .00283161202
+462 1009 .00371896802
+463 1009 .00148846791
+464 1009 .00283161202
+465 1009 .00283161202
+466 1009 .00283161202
+467 1009 .00371896802
+468 1009 .00371896802
+469 1009 .416634172
+470 1009 .00371896802
+471 1009 .00371896802
+472 1009 .00283161202
+473 1009 .00371896802
+474 1009 .00371896802
+475 1009 .00371896802
+476 1009 .00283161202
+477 1009 .00371896802
+478 1009 .0044081551
+479 1009 .00283161202
+480 1009 .00237802579
+481 1009 .00148846791
+482 1009 .00148846791
+483 1009 .00314207654
+484 1009 .00184076803
+485 1009 .00184076803
+486 1009 .00283161202
+487 1009 .00148846791
+488 1009 .00371896802
+489 1009 .00283161202
+490 1009 .00371896802
+491 1009 .00371896802
+492 1009 .00371896802
+493 1009 .00371896802
+494 1009 .00371896802
+495 1009 .00371896802
+496 1009 .00371896802
+497 1009 .00276775728
+498 1009 .00371896802
+499 1009 .00371896802
+500 1009 .00371896802
+416 1010 1
+470 1010 .543477893
+417 1011 1
+447 1011 8.7059314e-5
+448 1011 8.7059314e-5
+449 1011 8.7059314e-5
+450 1011 8.7059314e-5
+451 1011 8.7059314e-5
+452 1011 .000217519526
+453 1011 8.7059314e-5
+454 1011 .000257829524
+455 1011 .000123505743
+456 1011 .00013908885
+457 1011 8.7059314e-5
+458 1011 8.7059314e-5
+459 1011 .000257829524
+460 1011 .000183777607
+461 1011 .000165618767
+462 1011 .000217519526
+463 1011 8.7059314e-5
+464 1011 .000165618767
+465 1011 .000165618767
+466 1011 .000165618767
+467 1011 .000217519526
+468 1011 .000217519526
+469 1011 .000165618767
+470 1011 .000217519526
+471 1011 .54369539
+472 1011 .000165618767
+473 1011 .000217519526
+474 1011 .000217519526
+475 1011 .000217519526
+476 1011 .000165618767
+477 1011 .000217519526
+478 1011 .000257829524
+479 1011 .000165618767
+480 1011 .00013908885
+481 1011 8.7059314e-5
+482 1011 8.7059314e-5
+483 1011 .000183777607
+484 1011 .000107665073
+485 1011 .000107665073
+486 1011 .000165618767
+487 1011 8.7059314e-5
+488 1011 .000217519526
+489 1011 .000165618767
+490 1011 .000217519526
+491 1011 .000217519526
+492 1011 .000217519526
+493 1011 .000217519526
+494 1011 .000217519526
+495 1011 .000217519526
+496 1011 .000217519526
+497 1011 .00016188396
+498 1011 .000217519526
+499 1011 .000217519526
+500 1011 .000217519526
+418 1012 1
+447 1012 .000388478627
+448 1012 .000388478627
+449 1012 .000388478627
+450 1012 .000388478627
+451 1012 .000388478627
+452 1012 .000970622001
+453 1012 .000388478627
+454 1012 .00115049444
+455 1012 .000551110948
+456 1012 .000620646402
+457 1012 .000388478627
+458 1012 .000388478627
+459 1012 .00115049444
+460 1012 .000820057758
+461 1012 .00073902891
+462 1012 .000970622001
+463 1012 .000388478627
+464 1012 .00073902891
+465 1012 .00073902891
+466 1012 .00073902891
+467 1012 .000970622001
+468 1012 .000970622001
+469 1012 .00073902891
+470 1012 .000970622001
+471 1012 .000970622001
+472 1012 .414541602
+473 1012 .000970622001
+474 1012 .000970622001
+475 1012 .000970622001
+476 1012 .00073902891
+477 1012 .000970622001
+478 1012 .00115049444
+479 1012 .00073902891
+480 1012 .000620646402
+481 1012 .000388478627
+482 1012 .000388478627
+483 1012 .000820057758
+484 1012 .000480426272
+485 1012 .000480426272
+486 1012 .00073902891
+487 1012 .000388478627
+488 1012 .000970622001
+489 1012 .00073902891
+490 1012 .000970622001
+491 1012 .000970622001
+492 1012 .000970622001
+493 1012 .000970622001
+494 1012 .000970622001
+495 1012 .000970622001
+496 1012 .000970622001
+497 1012 .000722363358
+498 1012 .000970622001
+499 1012 .000970622001
+500 1012 .000970622001
+419 1013 1
+473 1013 .543477893
+420 1014 1
+474 1014 .543477893
+421 1015 1
+475 1015 .543477893
+422 1016 1
+447 1016 .00343396631
+448 1016 .00343396631
+449 1016 .00343396631
+450 1016 .00343396631
+451 1016 .00343396631
+452 1016 .00857983623
+453 1016 .00343396631
+454 1016 .0101698246
+455 1016 .0048715584
+456 1016 .00548621872
+457 1016 .00343396631
+458 1016 .00343396631
+459 1016 .0101698246
+460 1016 .00724892039
+461 1016 .00653266395
+462 1016 .00857983623
+463 1016 .00343396631
+464 1016 .00653266395
+465 1016 .00653266395
+466 1016 .00653266395
+467 1016 .00857983623
+468 1016 .00857983623
+469 1016 .00653266395
+470 1016 .00857983623
+471 1016 .00857983623
+472 1016 .00653266395
+473 1016 .00857983623
+474 1016 .00857983623
+475 1016 .00857983623
+476 1016 .420335233
+477 1016 .00857983623
+478 1016 .0101698246
+479 1016 .00653266395
+480 1016 .00548621872
+481 1016 .00343396631
+482 1016 .00343396631
+483 1016 .00724892039
+484 1016 .00424673967
+485 1016 .00424673967
+486 1016 .00653266395
+487 1016 .00343396631
+488 1016 .00857983623
+489 1016 .00653266395
+490 1016 .00857983623
+491 1016 .00857983623
+492 1016 .00857983623
+493 1016 .00857983623
+494 1016 .00857983623
+495 1016 .00857983623
+496 1016 .00857983623
+497 1016 .00638534827
+498 1016 .00857983623
+499 1016 .00857983623
+500 1016 .00857983623
+423 1017 1
+447 1017 .0015010268
+448 1017 .0015010268
+449 1017 .0015010268
+450 1017 .0015010268
+451 1017 .0015010268
+452 1017 .0037503466
+453 1017 .0015010268
+454 1017 .00444534887
+455 1017 .00212941528
+456 1017 .0023980902
+457 1017 .0015010268
+458 1017 .0015010268
+459 1017 .00444534887
+460 1017 .00316858804
+461 1017 .0028555037
+462 1017 .0037503466
+463 1017 .0015010268
+464 1017 .0028555037
+465 1017 .0028555037
+466 1017 .0028555037
+467 1017 .0037503466
+468 1017 .0037503466
+469 1017 .0028555037
+470 1017 .0037503466
+471 1017 .0037503466
+472 1017 .0028555037
+473 1017 .0037503466
+474 1017 .0037503466
+475 1017 .0037503466
+476 1017 .0028555037
+477 1017 .547228217
+478 1017 .00444534887
+479 1017 .0028555037
+480 1017 .0023980902
+481 1017 .0015010268
+482 1017 .0015010268
+483 1017 .00316858804
+484 1017 .00185629935
+485 1017 .00185629935
+486 1017 .0028555037
+487 1017 .0015010268
+488 1017 .0037503466
+489 1017 .0028555037
+490 1017 .0037503466
+491 1017 .0037503466
+492 1017 .0037503466
+493 1017 .0037503466
+494 1017 .0037503466
+495 1017 .0037503466
+496 1017 .0037503466
+497 1017 .00279111019
+498 1017 .0037503466
+499 1017 .0037503466
+500 1017 .0037503466
+424 1018 1
+447 1018 .0131290816
+448 1018 .0131290816
+449 1018 .0131290816
+450 1018 .0131290816
+451 1018 .0131290816
+452 1018 .0328032821
+453 1018 .0131290816
+454 1018 .0388822779
+455 1018 .0186254252
+456 1018 .0209754556
+457 1018 .0131290816
+458 1018 .0131290816
+459 1018 .0388822779
+460 1018 .0277147945
+461 1018 .024976328
+462 1018 .0328032821
+463 1018 .0131290816
+464 1018 .024976328
+465 1018 .024976328
+466 1018 .024976328
+467 1018 .0328032821
+468 1018 .0328032821
+469 1018 .024976328
+470 1018 .0328032821
+471 1018 .0328032821
+472 1018 .024976328
+473 1018 .0328032821
+474 1018 .0328032821
+475 1018 .0328032821
+476 1018 .024976328
+477 1018 .0328032821
+478 1018 .683075666
+479 1018 .024976328
+480 1018 .0209754556
+481 1018 .0131290816
+482 1018 .0131290816
+483 1018 .0277147945
+484 1018 .0162365567
+485 1018 .0162365567
+486 1018 .024976328
+487 1018 .0131290816
+488 1018 .0328032821
+489 1018 .024976328
+490 1018 .0328032821
+491 1018 .0328032821
+492 1018 .0328032821
+493 1018 .0328032821
+494 1018 .0328032821
+495 1018 .0328032821
+496 1018 .0328032821
+497 1018 .0244130976
+498 1018 .0328032821
+499 1018 .0328032821
+500 1018 .0328032821
+425 1019 1
+447 1019 .000270030956
+448 1019 .000270030956
+449 1019 .000270030956
+450 1019 .000270030956
+451 1019 .000270030956
+452 1019 .000674678013
+453 1019 .000270030956
+454 1019 .000799707079
+455 1019 .000383076462
+456 1019 .000431410444
+457 1019 .000270030956
+458 1019 .000270030956
+459 1019 .000799707079
+460 1019 .000570021046
+461 1019 .000513697974
+462 1019 .000674678013
+463 1019 .000270030956
+464 1019 .000513697974
+465 1019 .000513697974
+466 1019 .000513697974
+467 1019 .000674678013
+468 1019 .000674678013
+469 1019 .000513697974
+470 1019 .000674678013
+471 1019 .000674678013
+472 1019 .000513697974
+473 1019 .000674678013
+474 1019 .000674678013
+475 1019 .000674678013
+476 1019 .000513697974
+477 1019 .000674678013
+478 1019 .000799707079
+479 1019 .414316267
+480 1019 .000431410444
+481 1019 .000270030956
+482 1019 .000270030956
+483 1019 .000570021046
+484 1019 .000333943637
+485 1019 .000333943637
+486 1019 .000513697974
+487 1019 .000270030956
+488 1019 .000674678013
+489 1019 .000513697974
+490 1019 .000674678013
+491 1019 .000674678013
+492 1019 .000674678013
+493 1019 .000674678013
+494 1019 .000674678013
+495 1019 .000674678013
+496 1019 .000674678013
+497 1019 .000502113777
+498 1019 .000674678013
+499 1019 .000674678013
+500 1019 .000674678013
+426 1020 1
+447 1020 .00198783097
+448 1020 .00198783097
+449 1020 .00198783097
+450 1020 .00198783097
+451 1020 .00198783097
+452 1020 .00496663759
+453 1020 .00198783097
+454 1020 .00588703807
+455 1020 .00282001449
+456 1020 .00317582488
+457 1020 .00198783097
+458 1020 .00198783097
+459 1020 .00588703807
+460 1020 .00419620611
+461 1020 .00378158386
+462 1020 .00496663759
+463 1020 .00198783097
+464 1020 .00378158386
+465 1020 .00378158386
+466 1020 .00378158386
+467 1020 .00496663759
+468 1020 .00496663759
+469 1020 .00378158386
+470 1020 .00496663759
+471 1020 .00496663759
+472 1020 .00378158386
+473 1020 .00496663759
+474 1020 .00496663759
+475 1020 .00496663759
+476 1020 .00378158386
+477 1020 .00496663759
+478 1020 .00588703807
+479 1020 .00378158386
+480 1020 .350692749
+481 1020 .00198783097
+482 1020 .00198783097
+483 1020 .00419620611
+484 1020 .00245832372
+485 1020 .00245832372
+486 1020 .00378158386
+487 1020 .00198783097
+488 1020 .00496663759
+489 1020 .00378158386
+490 1020 .00496663759
+491 1020 .00496663759
+492 1020 .00496663759
+493 1020 .00496663759
+494 1020 .00496663759
+495 1020 .00496663759
+496 1020 .00496663759
+497 1020 .00369630684
+498 1020 .00496663759
+499 1020 .00496663759
+500 1020 .00496663759
+427 1021 1
+447 1021 .00910336524
+448 1021 .00910336524
+449 1021 .00910336524
+450 1021 .00910336524
+451 1021 .00910336524
+452 1021 .022744948
+453 1021 .00910336524
+454 1021 .0269599687
+455 1021 .0129143884
+456 1021 .0145438388
+457 1021 .00910336524
+458 1021 .00910336524
+459 1021 .0269599687
+460 1021 .0192167219
+461 1021 .0173179395
+462 1021 .022744948
+463 1021 .00910336524
+464 1021 .0173179395
+465 1021 .0173179395
+466 1021 .0173179395
+467 1021 .022744948
+468 1021 .022744948
+469 1021 .0173179395
+470 1021 .022744948
+471 1021 .022744948
+472 1021 .0173179395
+473 1021 .022744948
+474 1021 .022744948
+475 1021 .022744948
+476 1021 .0173179395
+477 1021 .022744948
+478 1021 .0269599687
+479 1021 .0173179395
+480 1021 .0145438388
+481 1021 .226623222
+482 1021 .00910336524
+483 1021 .0192167219
+484 1021 .011258008
+485 1021 .011258008
+486 1021 .0173179395
+487 1021 .00910336524
+488 1021 .022744948
+489 1021 .0173179395
+490 1021 .022744948
+491 1021 .022744948
+492 1021 .022744948
+493 1021 .022744948
+494 1021 .022744948
+495 1021 .022744948
+496 1021 .022744948
+497 1021 .0169274099
+498 1021 .022744948
+499 1021 .022744948
+500 1021 .022744948
+428 1022 1
+447 1022 .0134661812
+448 1022 .0134661812
+449 1022 .0134661812
+450 1022 .0134661812
+451 1022 .0134661812
+452 1022 .0336455368
+453 1022 .0134661812
+454 1022 .0398806147
+455 1022 .01910365
+456 1022 .021514019
+457 1022 .0134661812
+458 1022 .0134661812
+459 1022 .0398806147
+460 1022 .0284263939
+461 1022 .0256176181
+462 1022 .0336455368
+463 1022 .0134661812
+464 1022 .0256176181
+465 1022 .0256176181
+466 1022 .0256176181
+467 1022 .0336455368
+468 1022 .0336455368
+469 1022 .0256176181
+470 1022 .0336455368
+471 1022 .0336455368
+472 1022 .0256176181
+473 1022 .0336455368
+474 1022 .0336455368
+475 1022 .0336455368
+476 1022 .0256176181
+477 1022 .0336455368
+478 1022 .0398806147
+479 1022 .0256176181
+480 1022 .021514019
+481 1022 .0134661812
+482 1022 .230986029
+483 1022 .0284263939
+484 1022 .0166534446
+485 1022 .0166534446
+486 1022 .0256176181
+487 1022 .0134661812
+488 1022 .0336455368
+489 1022 .0256176181
+490 1022 .0336455368
+491 1022 .0336455368
+492 1022 .0336455368
+493 1022 .0336455368
+494 1022 .0336455368
+495 1022 .0336455368
+496 1022 .0336455368
+497 1022 .0250399243
+498 1022 .0336455368
+499 1022 .0336455368
+500 1022 .0336455368
+429 1023 1
+447 1023 .000316114078
+448 1023 .000316114078
+449 1023 .000316114078
+450 1023 .000316114078
+451 1023 .000316114078
+452 1023 .000789817655
+453 1023 .000316114078
+454 1023 .000936184078
+455 1023 .000448451785
+456 1023 .000505034404
+457 1023 .000316114078
+458 1023 .000316114078
+459 1023 .000936184078
+460 1023 .000667300075
+461 1023 .000601364998
+462 1023 .000789817655
+463 1023 .000316114078
+464 1023 .000601364998
+465 1023 .000601364998
+466 1023 .000601364998
+467 1023 .000789817655
+468 1023 .000789817655
+469 1023 .000601364998
+470 1023 .000789817655
+471 1023 .000789817655
+472 1023 .000601364998
+473 1023 .000789817655
+474 1023 .000789817655
+475 1023 .000789817655
+476 1023 .000601364998
+477 1023 .000789817655
+478 1023 .000936184078
+479 1023 .000601364998
+480 1023 .000505034404
+481 1023 .000316114078
+482 1023 .000316114078
+483 1023 .459840149
+484 1023 .000390934001
+485 1023 .000390934001
+486 1023 .000601364998
+487 1023 .000316114078
+488 1023 .000789817655
+489 1023 .000601364998
+490 1023 .000789817655
+491 1023 .000789817655
+492 1023 .000789817655
+493 1023 .000789817655
+494 1023 .000789817655
+495 1023 .000789817655
+496 1023 .000789817655
+497 1023 .000587803836
+498 1023 .000789817655
+499 1023 .000789817655
+500 1023 .000789817655
+430 1024 1
+447 1024 .00796811283
+448 1024 .00796811283
+449 1024 .00796811283
+450 1024 .00796811283
+451 1024 .00796811283
+452 1024 .0199084971
+453 1024 .00796811283
+454 1024 .0235978737
+455 1024 .0113038765
+456 1024 .0127301235
+457 1024 .00796811283
+458 1024 .00796811283
+459 1024 .0235978737
+460 1024 .0168202631
+461 1024 .0151582742
+462 1024 .0199084971
+463 1024 .00796811283
+464 1024 .0151582742
+465 1024 .0151582742
+466 1024 .0151582742
+467 1024 .0199084971
+468 1024 .0199084971
+469 1024 .0151582742
+470 1024 .0199084971
+471 1024 .0199084971
+472 1024 .0151582742
+473 1024 .0199084971
+474 1024 .0199084971
+475 1024 .0199084971
+476 1024 .0151582742
+477 1024 .0199084971
+478 1024 .0235978737
+479 1024 .0151582742
+480 1024 .0127301235
+481 1024 .00796811283
+482 1024 .00796811283
+483 1024 .0168202631
+484 1024 .278857887
+485 1024 .0098540578
+486 1024 .0151582742
+487 1024 .00796811283
+488 1024 .0199084971
+489 1024 .0151582742
+490 1024 .0199084971
+491 1024 .0199084971
+492 1024 .0199084971
+493 1024 .0199084971
+494 1024 .0199084971
+495 1024 .0199084971
+496 1024 .0199084971
+497 1024 .0148164472
+498 1024 .0199084971
+499 1024 .0199084971
+500 1024 .0199084971
+431 1025 1
+447 1025 .00026391237
+448 1025 .00026391237
+449 1025 .00026391237
+450 1025 .00026391237
+451 1025 .00026391237
+452 1025 .000659390527
+453 1025 .00026391237
+454 1025 .000781586568
+455 1025 .000374396332
+456 1025 .000421635137
+457 1025 .00026391237
+458 1025 .00026391237
+459 1025 .000781586568
+460 1025 .000557104941
+461 1025 .00050205813
+462 1025 .000659390527
+463 1025 .00026391237
+464 1025 .00050205813
+465 1025 .00050205813
+466 1025 .00050205813
+467 1025 .000659390527
+468 1025 .000659390527
+469 1025 .00050205813
+470 1025 .000659390527
+471 1025 .000659390527
+472 1025 .00050205813
+473 1025 .000659390527
+474 1025 .000659390527
+475 1025 .000659390527
+476 1025 .00050205813
+477 1025 .000659390527
+478 1025 .000781586568
+479 1025 .00050205813
+480 1025 .000421635137
+481 1025 .00026391237
+482 1025 .00026391237
+483 1025 .000557104941
+484 1025 .000326376816
+485 1025 .269330204
+486 1025 .00050205813
+487 1025 .00026391237
+488 1025 .000659390527
+489 1025 .00050205813
+490 1025 .000659390527
+491 1025 .000659390527
+492 1025 .000659390527
+493 1025 .000659390527
+494 1025 .000659390527
+495 1025 .000659390527
+496 1025 .000659390527
+497 1025 .000490736391
+498 1025 .000659390527
+499 1025 .000659390527
+500 1025 .000659390527
+432 1026 1
+447 1026 .000762254349
+448 1026 .000762254349
+449 1026 .000762254349
+450 1026 .000762254349
+451 1026 .000762254349
+452 1026 .00190450845
+453 1026 .000762254349
+454 1026 .00225744559
+455 1026 .00108136376
+456 1026 .00121780287
+457 1026 .000762254349
+458 1026 .000762254349
+459 1026 .00225744559
+460 1026 .00160907849
+461 1026 .00145008741
+462 1026 .00190450845
+463 1026 .000762254349
+464 1026 .00145008741
+465 1026 .00145008741
+466 1026 .00145008741
+467 1026 .00190450845
+468 1026 .00190450845
+469 1026 .00145008741
+470 1026 .00190450845
+471 1026 .00190450845
+472 1026 .00145008741
+473 1026 .00190450845
+474 1026 .00190450845
+475 1026 .00190450845
+476 1026 .00145008741
+477 1026 .00190450845
+478 1026 .00225744559
+479 1026 .00145008741
+480 1026 .00121780287
+481 1026 .000762254349
+482 1026 .000762254349
+483 1026 .00160907849
+484 1026 .000942669634
+485 1026 .000942669634
+486 1026 .415252656
+487 1026 .000762254349
+488 1026 .00190450845
+489 1026 .00145008741
+490 1026 .00190450845
+491 1026 .00190450845
+492 1026 .00190450845
+493 1026 .00190450845
+494 1026 .00190450845
+495 1026 .00190450845
+496 1026 .00190450845
+497 1026 .00141738704
+498 1026 .00190450845
+499 1026 .00190450845
+500 1026 .00190450845
+433 1027 1
+447 1027 1.5085453e-5
+448 1027 1.5085453e-5
+449 1027 1.5085453e-5
+450 1027 1.5085453e-5
+451 1027 1.5085453e-5
+452 1027 3.76913194e-5
+453 1027 1.5085453e-5
+454 1027 4.46761514e-5
+455 1027 2.14008123e-5
+456 1027 2.41010202e-5
+457 1027 1.5085453e-5
+458 1027 1.5085453e-5
+459 1027 4.46761514e-5
+460 1027 3.18445891e-5
+461 1027 2.86980667e-5
+462 1027 3.76913194e-5
+463 1027 1.5085453e-5
+464 1027 2.86980667e-5
+465 1027 2.86980667e-5
+466 1027 2.86980667e-5
+467 1027 3.76913194e-5
+468 1027 3.76913194e-5
+469 1027 2.86980667e-5
+470 1027 3.76913194e-5
+471 1027 3.76913194e-5
+472 1027 2.86980667e-5
+473 1027 3.76913194e-5
+474 1027 3.76913194e-5
+475 1027 3.76913194e-5
+476 1027 2.86980667e-5
+477 1027 3.76913194e-5
+478 1027 4.46761514e-5
+479 1027 2.86980667e-5
+480 1027 2.41010202e-5
+481 1027 1.5085453e-5
+482 1027 1.5085453e-5
+483 1027 3.18445891e-5
+484 1027 1.86559755e-5
+485 1027 1.86559755e-5
+486 1027 2.86980667e-5
+487 1027 .21753493
+488 1027 3.76913194e-5
+489 1027 2.86980667e-5
+490 1027 3.76913194e-5
+491 1027 3.76913194e-5
+492 1027 3.76913194e-5
+493 1027 3.76913194e-5
+494 1027 3.76913194e-5
+495 1027 3.76913194e-5
+496 1027 3.76913194e-5
+497 1027 2.80509066e-5
+498 1027 3.76913194e-5
+499 1027 3.76913194e-5
+500 1027 3.76913194e-5
+434 1028 1
+447 1028 .00133195089
+448 1028 .00133195089
+449 1028 .00133195089
+450 1028 .00133195089
+451 1028 .00133195089
+452 1028 .00332790706
+453 1028 .00133195089
+454 1028 .00394462375
+455 1028 .00188955734
+456 1028 .00212796894
+457 1028 .00133195089
+458 1028 .00133195089
+459 1028 .00394462375
+460 1028 .00281167752
+461 1028 .00253385911
+462 1028 .00332790706
+463 1028 .00133195089
+464 1028 .00253385911
+465 1028 .00253385911
+466 1028 .00253385911
+467 1028 .00332790706
+468 1028 .00332790706
+469 1028 .00253385911
+470 1028 .00332790706
+471 1028 .00332790706
+472 1028 .00253385911
+473 1028 .00332790706
+474 1028 .00332790706
+475 1028 .00332790706
+476 1028 .00253385911
+477 1028 .00332790706
+478 1028 .00394462375
+479 1028 .00253385911
+480 1028 .00212796894
+481 1028 .00133195089
+482 1028 .00133195089
+483 1028 .00281167752
+484 1028 .00164720556
+485 1028 .00164720556
+486 1028 .00253385911
+487 1028 .00133195089
+488 1028 .546805799
+489 1028 .00253385911
+490 1028 .00332790706
+491 1028 .00332790706
+492 1028 .00332790706
+493 1028 .00332790706
+494 1028 .00332790706
+495 1028 .00332790706
+496 1028 .00332790706
+497 1028 .00247671921
+498 1028 .00332790706
+499 1028 .00332790706
+500 1028 .00332790706
+435 1029 1
+447 1029 .00133274659
+448 1029 .00133274659
+449 1029 .00133274659
+450 1029 .00133274659
+451 1029 .00133274659
+452 1029 .0033298952
+453 1029 .00133274659
+454 1029 .00394698046
+455 1029 .00189068634
+456 1029 .00212924019
+457 1029 .00133274659
+458 1029 .00133274659
+459 1029 .00394698046
+460 1029 .00281335716
+461 1029 .00253537297
+462 1029 .0033298952
+463 1029 .00133274659
+464 1029 .00253537297
+465 1029 .00253537297
+466 1029 .00253537297
+467 1029 .0033298952
+468 1029 .0033298952
+469 1029 .00253537297
+470 1029 .0033298952
+471 1029 .0033298952
+472 1029 .00253537297
+473 1029 .0033298952
+474 1029 .0033298952
+475 1029 .0033298952
+476 1029 .00253537297
+477 1029 .0033298952
+478 1029 .00394698046
+479 1029 .00253537297
+480 1029 .00212924019
+481 1029 .00133274659
+482 1029 .00133274659
+483 1029 .00281335716
+484 1029 .0016481895
+485 1029 .0016481895
+486 1029 .00253537297
+487 1029 .00133274659
+488 1029 .0033298952
+489 1029 .416337937
+490 1029 .0033298952
+491 1029 .0033298952
+492 1029 .0033298952
+493 1029 .0033298952
+494 1029 .0033298952
+495 1029 .0033298952
+496 1029 .0033298952
+497 1029 .00247819885
+498 1029 .0033298952
+499 1029 .0033298952
+500 1029 .0033298952
+436 1030 1
+447 1030 .00133454381
+448 1030 .00133454381
+449 1030 .00133454381
+450 1030 .00133454381
+451 1030 .00133454381
+452 1030 .00333438558
+453 1030 .00133454381
+454 1030 .00395230297
+455 1030 .00189323595
+456 1030 .00213211169
+457 1030 .00133454381
+458 1030 .00133454381
+459 1030 .00395230297
+460 1030 .00281715114
+461 1030 .00253879209
+462 1030 .00333438558
+463 1030 .00133454381
+464 1030 .00253879209
+465 1030 .00253879209
+466 1030 .00253879209
+467 1030 .00333438558
+468 1030 .00333438558
+469 1030 .00253879209
+470 1030 .00333438558
+471 1030 .00333438558
+472 1030 .00253879209
+473 1030 .00333438558
+474 1030 .00333438558
+475 1030 .00333438558
+476 1030 .00253879209
+477 1030 .00333438558
+478 1030 .00395230297
+479 1030 .00253879209
+480 1030 .00213211169
+481 1030 .00133454381
+482 1030 .00133454381
+483 1030 .00281715114
+484 1030 .0016504121
+485 1030 .0016504121
+486 1030 .00253879209
+487 1030 .00133454381
+488 1030 .00333438558
+489 1030 .00253879209
+490 1030 .546812296
+491 1030 .00333438558
+492 1030 .00333438558
+493 1030 .00333438558
+494 1030 .00333438558
+495 1030 .00333438558
+496 1030 .00333438558
+497 1030 .00248154067
+498 1030 .00333438558
+499 1030 .00333438558
+500 1030 .00333438558
+437 1031 1
+491 1031 .543477893
+438 1032 1
+492 1032 .543477893
+439 1033 1
+447 1033 .0115508148
+448 1033 .0115508148
+449 1033 .0115508148
+450 1033 .0115508148
+451 1033 .0115508148
+452 1033 .0288599506
+453 1033 .0115508148
+454 1033 .0342081785
+455 1033 .0163864344
+456 1033 .018453965
+457 1033 .0115508148
+458 1033 .0115508148
+459 1033 .0342081785
+460 1033 .0243831556
+461 1033 .0219738856
+462 1033 .0288599506
+463 1033 .0115508148
+464 1033 .0219738856
+465 1033 .0219738856
+466 1033 .0219738856
+467 1033 .0288599506
+468 1033 .0288599506
+469 1033 .0219738856
+470 1033 .0288599506
+471 1033 .0288599506
+472 1033 .0219738856
+473 1033 .0288599506
+474 1033 .0288599506
+475 1033 .0288599506
+476 1033 .0219738856
+477 1033 .0288599506
+478 1033 .0342081785
+479 1033 .0219738856
+480 1033 .018453965
+481 1033 .0115508148
+482 1033 .0115508148
+483 1033 .0243831556
+484 1033 .0142847355
+485 1033 .0142847355
+486 1033 .0219738856
+487 1033 .0115508148
+488 1033 .0288599506
+489 1033 .0219738856
+490 1033 .0288599506
+491 1033 .0288599506
+492 1033 .0288599506
+493 1033 .572337866
+494 1033 .0288599506
+495 1033 .0288599506
+496 1033 .0288599506
+497 1033 .0214783605
+498 1033 .0288599506
+499 1033 .0288599506
+500 1033 .0288599506
+440 1034 1
+447 1034 .0504734591
+448 1034 .0504734591
+449 1034 .0504734591
+450 1034 .0504734591
+451 1034 .0504734591
+452 1034 .126108989
+453 1034 .0504734591
+454 1034 .149479091
+455 1034 .0716036186
+456 1034 .0806380734
+457 1034 .0504734591
+458 1034 .0504734591
+459 1034 .149479091
+460 1034 .106546797
+461 1034 .0960190371
+462 1034 .126108989
+463 1034 .0504734591
+464 1034 .0960190371
+465 1034 .0960190371
+466 1034 .0960190371
+467 1034 .126108989
+468 1034 .126108989
+469 1034 .0960190371
+470 1034 .126108989
+471 1034 .126108989
+472 1034 .0960190371
+473 1034 .126108989
+474 1034 .126108989
+475 1034 .126108989
+476 1034 .0960190371
+477 1034 .126108989
+478 1034 .149479091
+479 1034 .0960190371
+480 1034 .0806380734
+481 1034 .0504734591
+482 1034 .0504734591
+483 1034 .106546797
+484 1034 .0624198392
+485 1034 .0624198392
+486 1034 .0960190371
+487 1034 .0504734591
+488 1034 .126108989
+489 1034 .0960190371
+490 1034 .126108989
+491 1034 .126108989
+492 1034 .126108989
+493 1034 .126108989
+494 1034 .669586897
+495 1034 .126108989
+496 1034 .126108989
+497 1034 .0938537493
+498 1034 .126108989
+499 1034 .126108989
+500 1034 .126108989
+441 1035 1
+495 1035 .543477893
+442 1036 1
+447 1036 .0213428419
+448 1036 .0213428419
+449 1036 .0213428419
+450 1036 .0213428419
+451 1036 .0213428419
+452 1036 .0533255301
+453 1036 .0213428419
+454 1036 .0632076487
+455 1036 .0302777849
+456 1036 .0340980291
+457 1036 .0213428419
+458 1036 .0213428419
+459 1036 .0632076487
+460 1036 .045053605
+461 1036 .0406019129
+462 1036 .0533255301
+463 1036 .0213428419
+464 1036 .0406019129
+465 1036 .0406019129
+466 1036 .0406019129
+467 1036 .0533255301
+468 1036 .0533255301
+469 1036 .0406019129
+470 1036 .0533255301
+471 1036 .0533255301
+472 1036 .0406019129
+473 1036 .0533255301
+474 1036 .0533255301
+475 1036 .0533255301
+476 1036 .0406019129
+477 1036 .0533255301
+478 1036 .0632076487
+479 1036 .0406019129
+480 1036 .0340980291
+481 1036 .0213428419
+482 1036 .0213428419
+483 1036 .045053605
+484 1036 .0263944026
+485 1036 .0263944026
+486 1036 .0406019129
+487 1036 .0213428419
+488 1036 .0533255301
+489 1036 .0406019129
+490 1036 .0533255301
+491 1036 .0533255301
+492 1036 .0533255301
+493 1036 .0533255301
+494 1036 .0533255301
+495 1036 .0533255301
+496 1036 .596803427
+497 1036 .0396863185
+498 1036 .0533255301
+499 1036 .0533255301
+500 1036 .0533255301
+443 1037 1
+447 1037 .041330751
+448 1037 .041330751
+449 1037 .041330751
+450 1037 .041330751
+451 1037 .041330751
+452 1037 .10326574
+453 1037 .041330751
+454 1037 .122402608
+455 1037 .0586334132
+456 1037 .0660313815
+457 1037 .041330751
+458 1037 .041330751
+459 1037 .122402608
+460 1037 .0872470215
+461 1037 .0786262453
+462 1037 .10326574
+463 1037 .041330751
+464 1037 .0786262453
+465 1037 .0786262453
+466 1037 .0786262453
+467 1037 .10326574
+468 1037 .10326574
+469 1037 .0786262453
+470 1037 .10326574
+471 1037 .10326574
+472 1037 .0786262453
+473 1037 .10326574
+474 1037 .10326574
+475 1037 .10326574
+476 1037 .0786262453
+477 1037 .10326574
+478 1037 .122402608
+479 1037 .0786262453
+480 1037 .0660313815
+481 1037 .041330751
+482 1037 .041330751
+483 1037 .0872470215
+484 1037 .0511131771
+485 1037 .0511131771
+486 1037 .0786262453
+487 1037 .041330751
+488 1037 .10326574
+489 1037 .0786262453
+490 1037 .10326574
+491 1037 .10326574
+492 1037 .10326574
+493 1037 .10326574
+494 1037 .10326574
+495 1037 .10326574
+496 1037 .10326574
+497 1037 .481324255
+498 1037 .10326574
+499 1037 .10326574
+500 1037 .10326574
+444 1038 1
+498 1038 .543477893
+445 1039 1
+499 1039 .543477893
+446 1040 1
+500 1040 .543477893
+501 1041 1
+774 1041 1
+1254 1041 -.0380239636
+502 1042 1
+775 1042 1
+1254 1042 -.000453596498
+503 1043 1
+776 1043 1
+1254 1043 -.000104409068
+504 1044 1
+777 1044 1
+1254 1044 -.00156986958
+505 1045 1
+778 1045 1
+1254 1045 -.0151058435
+506 1046 1
+779 1046 1
+507 1047 1
+780 1047 1
+1254 1047 -.000602873974
+508 1048 1
+781 1048 1
+1254 1048 -.216793314
+509 1049 1
+782 1049 1
+1254 1049 -.00540687004
+510 1050 1
+783 1050 1
+1254 1050 -.021726191
+511 1051 1
+784 1051 1
+1254 1051 -.0160277095
+512 1052 1
+785 1052 1
+1254 1052 -.000167879829
+513 1053 1
+786 1053 1
+1254 1053 -.000827674405
+514 1054 1
+787 1054 1
+1254 1054 -.086298421
+515 1055 1
+788 1055 1
+1254 1055 -3.36676676e-5
+516 1056 1
+789 1056 1
+1254 1056 -.0134372637
+517 1057 1
+790 1057 1
+1254 1057 -1.20522391e-5
+518 1058 1
+791 1058 1
+1254 1058 -.00149709766
+519 1059 1
+792 1059 1
+1254 1059 -7.5916003e-5
+520 1060 1
+793 1060 1
+1254 1060 -.000157006609
+521 1061 1
+794 1061 1
+1254 1061 -.000117640331
+522 1062 1
+795 1062 1
+1254 1062 -5.89511671e-7
+523 1063 1
+796 1063 1
+1254 1063 -1.7226841e-5
+524 1064 1
+797 1064 1
+525 1065 1
+798 1065 1
+1254 1065 -.000583616551
+526 1066 1
+799 1066 1
+1254 1066 -.00450151134
+527 1067 1
+800 1067 1
+1254 1067 -.0014306138
+528 1068 1
+801 1068 1
+1254 1068 -2.2335942e-5
+529 1069 1
+802 1069 1
+1254 1069 -.000210586673
+530 1070 1
+803 1070 1
+1254 1070 -.00193471182
+531 1071 1
+804 1071 1
+1254 1071 -.0018588613
+532 1072 1
+805 1072 1
+1254 1072 -.0156698748
+533 1073 1
+806 1073 1
+1254 1073 -.00145537336
+534 1074 1
+807 1074 1
+1254 1074 -.0111717703
+535 1075 1
+808 1075 1
+1254 1075 -.00473823305
+536 1076 1
+809 1076 1
+1254 1076 -.00116330304
+537 1077 1
+810 1077 1
+1254 1077 -.00446149008
+538 1078 1
+811 1078 1
+1254 1078 -.0120798806
+539 1079 1
+812 1079 1
+1254 1079 -4.05453029e-5
+540 1080 1
+813 1080 1
+1254 1080 -.00253228028
+541 1081 1
+814 1081 1
+542 1082 1
+815 1082 1
+1254 1082 -7.26409417e-5
+543 1083 1
+816 1083 1
+1254 1083 -.00411937665
+544 1084 1
+817 1084 1
+1254 1084 -.00660947384
+545 1085 1
+818 1085 1
+546 1086 1
+819 1086 1
+1254 1086 -.241819128
+547 1087 1
+820 1087 1
+548 1088 1
+821 1088 1
+549 1089 1
+822 1089 1
+550 1090 1
+823 1090 1
+551 1091 1
+824 1091 1
+552 1092 1
+825 1092 1
+1254 1092 -.265067309
+553 1093 1
+826 1093 1
+554 1094 1
+827 1094 1
+716 1095 1
+828 1095 2.5
+829 1095 2.70000005
+830 1095 2.5
+831 1095 2.70000005
+832 1095 2.9000001
+833 1095 2.9000001
+834 1095 2.9000001
+835 1095 2.9000001
+977 1095 .166986734
+1257 1095 -.57133919
+717 1096 1
+836 1096 2.4000001
+837 1096 1.5
+838 1096 2.4000001
+839 1096 1.89999998
+840 1096 1.60000002
+841 1096 2.0999999
+842 1096 1.79999995
+843 1096 1.79999995
+844 1096 1.79999995
+845 1096 1.79999995
+846 1096 1.79999995
+847 1096 1.79999995
+848 1096 1.89999998
+849 1096 1.89999998
+850 1096 1.79999995
+851 1096 2.20000005
+852 1096 2.5999999
+853 1096 2.5999999
+854 1096 2.20000005
+855 1096 1.5
+856 1096 3
+857 1096 1.89999998
+858 1096 2.4000001
+859 1096 2.4000001
+860 1096 2.4000001
+861 1096 2.4000001
+862 1096 1.89999998
+863 1096 2.20000005
+864 1096 1.70000005
+865 1096 2.20000005
+866 1096 2.9000001
+867 1096 2.20000005
+868 1096 2.5999999
+869 1096 1.79999995
+977 1096 .119615935
+1257 1096 -.409261674
+718 1097 1
+870 1097 1.70000005
+871 1097 1.70000005
+879 1097 1.70000005
+977 1097 .00566986762
+1257 1097 -.0193992499
+719 1098 1
+872 1098 1.60000002
+873 1098 1.60000002
+874 1098 1.60000002
+875 1098 1.5
+876 1098 1.5
+877 1098 1.60000002
+878 1098 1.70000005
+880 1099 -60
+925 1099 1
+881 1100 -36
+926 1100 1
+882 1101 -60
+927 1101 1
+883 1102 -36
+928 1102 1
+884 1103 -41
+929 1103 1
+885 1104 -41
+930 1104 1
+886 1105 -41
+931 1105 1
+887 1106 -41
+932 1106 1
+888 1107 -19
+933 1107 1
+889 1108 -33
+934 1108 1
+890 1109 -60
+935 1109 1
+891 1110 -60
+936 1110 1
+892 1111 -32
+937 1111 1
+893 1112 -26
+938 1112 1
+894 1113 -60
+939 1113 1
+895 1114 -31
+940 1114 1
+896 1115 -30
+941 1115 1
+897 1116 -31
+942 1116 1
+898 1117 -30
+943 1117 1
+899 1118 -41
+944 1118 1
+900 1119 -30
+945 1119 1
+901 1120 -30
+946 1120 1
+902 1121 -30
+947 1121 1
+903 1122 -60
+948 1122 1
+904 1123 -28
+949 1123 1
+905 1124 -28
+950 1124 1
+906 1125 -31
+951 1125 1
+907 1126 -23
+952 1126 1
+908 1127 -15
+953 1127 1
+909 1128 -45
+954 1128 1
+910 1129 -15
+955 1129 1
+911 1130 -49
+956 1130 1
+912 1131 -26
+957 1131 1
+913 1132 -15
+958 1132 1
+914 1133 -60
+959 1133 1
+915 1134 -60
+960 1134 1
+916 1135 -8
+961 1135 1
+917 1136 -25
+962 1136 1
+918 1137 -9
+963 1137 1
+919 1138 -31
+964 1138 1
+920 1139 -21
+965 1139 1
+921 1140 -30
+966 1140 1
+922 1141 -8
+967 1141 1
+923 1142 -8
+968 1142 1
+969 1143 1
+970 1144 1
+971 1145 1
+972 1146 1
+973 1147 1
+974 1148 1
+975 1149 1
+924 1150 -8
+976 1150 1
+828 1151 1
+880 1151 1
+829 1152 1
+881 1152 1
+830 1153 1
+882 1153 1
+831 1154 1
+883 1154 1
+832 1155 1
+884 1155 1
+833 1156 1
+885 1156 1
+834 1157 1
+886 1157 1
+835 1158 1
+887 1158 1
+836 1159 1
+888 1159 1
+837 1160 1
+889 1160 1
+838 1161 1
+890 1161 1
+839 1162 1
+891 1162 1
+840 1163 1
+892 1163 1
+841 1164 1
+893 1164 1
+842 1165 1
+894 1165 1
+843 1166 1
+895 1166 1
+844 1167 1
+896 1167 1
+845 1168 1
+897 1168 1
+846 1169 1
+898 1169 1
+847 1170 1
+899 1170 1
+848 1171 1
+900 1171 1
+849 1172 1
+901 1172 1
+850 1173 1
+902 1173 1
+851 1174 1
+903 1174 1
+852 1175 1
+904 1175 1
+853 1176 1
+905 1176 1
+854 1177 1
+906 1177 1
+855 1178 1
+907 1178 1
+856 1179 1
+908 1179 1
+857 1180 1
+909 1180 1
+858 1181 1
+910 1181 1
+859 1182 1
+911 1182 1
+860 1183 1
+912 1183 1
+861 1184 1
+913 1184 1
+862 1185 1
+914 1185 1
+863 1186 1
+915 1186 1
+864 1187 1
+916 1187 1
+865 1188 1
+917 1188 1
+866 1189 1
+918 1189 1
+867 1190 1
+919 1190 1
+868 1191 1
+920 1191 1
+869 1192 1
+921 1192 1
+870 1193 1
+922 1193 1
+871 1194 1
+923 1194 1
+872 1195 1
+873 1196 1
+874 1197 1
+875 1198 1
+876 1199 1
+877 1200 1
+878 1201 1
+879 1202 1
+924 1202 1
+880 1203 -1
+881 1203 -1
+882 1203 -1
+883 1203 -1
+884 1203 -1
+885 1203 -1
+886 1203 -1
+887 1203 -1
+888 1203 -1
+889 1203 -1
+890 1203 -1
+891 1203 -1
+892 1203 -1
+893 1203 -1
+894 1203 -1
+895 1203 -1
+896 1203 -1
+897 1203 -1
+898 1203 -1
+899 1203 -1
+900 1203 -1
+901 1203 -1
+902 1203 -1
+903 1203 -1
+904 1203 -1
+905 1203 -1
+906 1203 -1
+907 1203 -1
+908 1203 -1
+909 1203 -1
+910 1203 -1
+911 1203 -1
+912 1203 -1
+913 1203 -1
+914 1203 -1
+915 1203 -1
+916 1203 -1
+917 1203 -1
+918 1203 -1
+919 1203 -1
+920 1203 -1
+921 1203 -1
+922 1203 -1
+923 1203 -1
+924 1203 -1
+1094 1204 1
+1253 1205 1
+1255 1205 13607688
+1254 1206 1
+1255 1206 -15266873
+1255 1207 100
+663 1208 1
+1256 1208 1
+1258 1208 -1
+985 1209 1
+1257 1209 1
+447 1210 -.495945245
+448 1210 -.495945245
+449 1210 -.495945245
+450 1210 -.495945245
+451 1210 -.495945245
+452 1210 -1.23912954
+453 1210 -.495945245
+454 1210 -1.46876097
+455 1210 -.703567266
+456 1210 -.79233861
+457 1210 -.495945245
+458 1210 -.495945245
+459 1210 -1.46876097
+460 1210 -1.0469141
+461 1210 -.943469822
+462 1210 -1.23912954
+463 1210 -.495945245
+464 1210 -.943469822
+465 1210 -.943469822
+466 1210 -.943469822
+467 1210 -1.23912954
+468 1210 -1.23912954
+469 1210 -.943469822
+470 1210 -1.23912954
+471 1210 -1.23912954
+472 1210 -.943469822
+473 1210 -1.23912954
+474 1210 -1.23912954
+475 1210 -1.23912954
+476 1210 -.943469822
+477 1210 -1.23912954
+478 1210 -1.46876097
+479 1210 -.943469822
+480 1210 -.79233861
+481 1210 -.495945245
+482 1210 -.495945245
+483 1210 -1.0469141
+484 1210 -.613328755
+485 1210 -.613328755
+486 1210 -.943469822
+487 1210 -.495945245
+488 1210 -1.23912954
+489 1210 -.943469822
+490 1210 -1.23912954
+491 1210 -1.23912954
+492 1210 -1.23912954
+493 1210 -1.23912954
+494 1210 -1.23912954
+495 1210 -1.23912954
+496 1210 -1.23912954
+497 1210 -.922194004
+498 1210 -1.23912954
+499 1210 -1.23912954
+500 1210 -1.23912954
+663 1210 -1
+977 1211 -.292272508
+985 1211 -1
+775 1212 1
+776 1213 1
+777 1214 1
+779 1215 1
+780 1216 1
+782 1217 1
+783 1218 1
+784 1219 1
+785 1220 1
+786 1221 1
+788 1222 1
+790 1223 1
+791 1224 1
+792 1225 1
+793 1226 1
+794 1227 1
+795 1228 1
+796 1229 1
+797 1230 1
+798 1231 1
+799 1232 1
+800 1233 1
+801 1234 1
+802 1235 1
+803 1236 1
+804 1237 1
+805 1238 1
+806 1239 1
+808 1240 1
+809 1241 1
+810 1242 1
+811 1243 1
+812 1244 1
+813 1245 1
+814 1246 1
+815 1247 1
+816 1248 1
+817 1249 1
+818 1250 1
+820 1251 1
+821 1252 1
+822 1253 1
+823 1254 1
+824 1255 1
+825 1256 1
+826 1257 1
+827 1258 1
diff --git a/matrices/test/nontrivial_mc64_result.mtx b/matrices/test/nontrivial_mc64_result.mtx
new file mode 100644
index 00000000000..d03fb9b6965
--- /dev/null
+++ b/matrices/test/nontrivial_mc64_result.mtx
@@ -0,0 +1,7684 @@
+%%MatrixMarket matrix coordinate real general
+1258 1258 7682
+1 1 -1
+1 105 1
+1 882 -0.0272284
+1 883 0.324344
+1 935 -0.0668099
+2 2 -1
+2 106 1
+2 882 -0.0691867
+2 884 0.88477
+2 936 -0.155711
+3 3 -1
+3 107 1
+3 882 -0.118567
+3 885 0.917368
+3 937 -0.119677
+4 4 -1
+4 108 1
+4 882 -0.148731
+4 886 1
+4 938 -0.148731
+5 5 -1
+5 109 1
+5 882 -0.0809082
+5 887 0.95243
+5 939 -0.155931
+6 6 -1
+6 110 1
+6 882 -0.150521
+6 888 1
+6 940 -0.174949
+7 7 -1
+7 111 1
+7 882 -0.0539074
+7 889 0.620342
+7 941 -0.0814787
+8 8 -1
+8 112 1
+8 882 -0.0558806
+8 890 0.643433
+8 942 -0.0889036
+9 9 -1
+9 289 1
+10 10 -1
+10 114 1
+10 882 -0.0302074
+10 892 0.325091
+11 11 -1
+11 115 1
+11 882 -0.0597808
+11 893 0.643358
+12 12 -1
+12 116 1
+12 882 -0.0393616
+12 894 0.423608
+13 13 -1
+13 293 1
+14 14 -1
+14 118 1
+14 882 -0.0310215
+14 896 0.333851
+15 15 -1
+15 295 1
+16 16 -1
+16 296 1
+17 17 -1
+17 297 1
+18 18 -1
+18 298 1
+19 19 -1
+19 299 1
+20 20 -1
+20 300 1
+21 21 -1
+21 301 1
+22 22 -1
+22 302 1
+23 23 -1
+23 303 1
+24 24 -1
+24 304 1
+25 25 -1
+25 305 1
+26 26 -1
+26 306 1
+27 27 -1
+27 307 1
+28 28 -1
+28 308 1
+29 29 -1
+29 309 1
+30 30 -1
+30 310 1
+31 31 -1
+31 311 1
+32 32 -1
+32 136 1
+32 882 -0.0367834
+32 914 0.395861
+33 33 -1
+33 313 1
+34 34 -1
+34 138 1
+34 882 -0.0744019
+34 916 0.800709
+35 35 -1
+35 315 1
+36 36 -1
+36 316 1
+37 37 -1
+37 317 1
+38 38 -1
+38 318 1
+39 39 -1
+39 319 1
+40 40 -1
+40 320 1
+41 41 -1
+41 321 1
+42 42 -1
+42 322 1
+43 43 -1
+43 323 1
+44 44 -1
+44 148 1
+44 882 -0.0603907
+44 926 0.62276
+45 45 -1
+45 325 1
+46 46 -1
+46 326 1
+47 47 -1
+47 327 1
+48 48 -1
+48 328 1
+49 49 -1
+49 153 1
+50 50 -1
+50 330 1
+51 51 -1
+51 331 1
+52 52 -1
+52 332 1
+53 1 -1
+53 53 1
+53 882 0.0192316
+53 883 -0.175656
+53 935 -0.0668099
+54 2 -0.480547
+54 54 1
+54 882 0.0132125
+54 884 -0.0748264
+54 936 -0.0748264
+55 3 -0.314153
+55 55 1
+55 882 0.00921186
+55 885 -0.0375969
+55 937 -0.0375969
+56 4 -0.236647
+56 56 1
+56 882 0.0112634
+56 886 -0.0351966
+56 938 -0.0351966
+57 5 -0.451117
+57 57 1
+57 882 0.00996095
+57 887 -0.0703429
+57 939 -0.0703429
+58 6 -0.248166
+58 58 1
+58 882 0.00910574
+58 888 -0.0453067
+58 940 -0.0434164
+59 7 -0.712432
+59 59 1
+59 882 0.0080546
+59 889 -0.058048
+59 941 -0.058048
+60 8 -0.64291
+60 60 1
+60 882 0.0105338
+60 890 -0.0863306
+60 942 -0.057157
+61 9 -0.185612
+61 61 1
+61 882 0.0477969
+61 891 -0.514388
+62 10 -0.310143
+62 62 1
+62 882 0.0370914
+62 892 -0.399175
+63 11 -0.342239
+63 63 1
+63 882 0.0260007
+63 893 -0.279818
+64 12 -0.525587
+64 64 1
+64 882 0.035064
+64 894 -0.377357
+65 13 -0.450577
+65 65 1
+65 882 0.00459242
+65 895 -0.0494234
+66 14 -0.396895
+66 66 1
+66 882 0.0341477
+66 896 -0.367496
+67 15 -0.411692
+67 67 1
+67 882 0.00820557
+67 897 -0.0883079
+68 16 -0.426883
+68 68 1
+68 882 0.0253781
+68 898 -0.273117
+69 17 -0.337349
+69 69 1
+69 882 0.0151135
+69 899 -0.162651
+70 18 -0.39165
+70 70 1
+70 882 0.0100679
+70 900 -0.10835
+71 19 -0.421632
+71 71 1
+71 882 0.0165739
+71 901 -0.178368
+72 20 -0.357698
+72 72 1
+72 882 0.0132227
+72 902 -0.142301
+73 21 -0.5
+73 73 1
+73 882 0.018584
+73 903 -0.2
+74 22 -0.542937
+74 74 1
+74 882 0.0145942
+74 904 -0.157063
+75 23 -0.253654
+75 75 1
+75 882 0.0321825
+75 905 -0.346346
+76 24 -0.431501
+76 76 1
+76 882 0.015657
+76 906 -0.168499
+77 25 -0.356604
+77 77 1
+77 882 0.0133244
+77 907 -0.143396
+78 26 -0.278988
+78 78 1
+78 882 0.0205364
+78 908 -0.221012
+79 27 -0.368863
+79 79 1
+79 882 0.0121853
+79 909 -0.131137
+80 28 -0.396756
+80 80 1
+80 882 0.0188854
+80 910 -0.203244
+81 29 -0.23717
+81 81 1
+81 882 0.0244222
+81 911 -0.26283
+82 30 -0.379337
+82 82 1
+82 882 0.020504
+82 912 -0.220663
+83 31 -0.294478
+83 83 1
+83 882 0.0190971
+83 913 -0.205522
+84 32 -1
+84 84 1
+84 882 0.0282606
+84 914 -0.304139
+85 33 -0.449828
+85 85 1
+85 882 0.023246
+85 915 -0.250172
+86 34 -0.215086
+86 86 1
+86 882 0.0304572
+86 916 -0.327779
+87 35 -0.213124
+87 87 1
+87 882 0.0359485
+87 917 -0.386876
+88 36 -0.313853
+88 88 1
+88 882 0.0265888
+88 918 -0.286147
+89 37 -0.150698
+89 89 1
+89 882 0.0324571
+89 919 -0.349302
+90 38 -0.295086
+90 90 1
+90 882 0.0190406
+90 920 -0.204914
+91 39 -0.336836
+91 91 1
+91 882 0.0151612
+91 921 -0.163164
+92 40 -0.355496
+92 92 1
+92 882 0.0134273
+92 922 -0.144504
+93 41 -0.380661
+93 93 1
+93 882 0.011089
+93 923 -0.119339
+94 42 -0.150226
+94 94 1
+94 882 0.041793
+94 924 -0.449774
+95 43 -0.37766
+95 95 1
+95 882 0.0113678
+95 925 -0.12234
+96 44 -0.5
+96 96 1
+96 882 0.0162646
+96 926 -0.167724
+97 45 -0.186516
+97 97 1
+97 882 0.0291289
+97 927 -0.313484
+98 46 -0.265
+98 98 1
+98 882 0.0218362
+98 928 -0.235
+99 47 -0.38173
+99 99 1
+99 882 0.0109896
+99 929 -0.11827
+100 48 -0.490919
+100 100 1
+100 882 0.000843779
+100 930 -0.00908071
+101 49 -0.240662
+101 101 1
+101 882 0.04646
+101 931 -0.5
+102 50 -0.25
+102 102 1
+102 882 0.02323
+102 932 -0.25
+103 51 -0.5
+103 103 1
+104 52 -0.5
+104 104 1
+105 105 -1
+106 106 -1
+107 107 -1
+108 108 -1
+109 109 -1
+110 110 -1
+111 111 -1
+112 112 -1
+113 113 -1
+114 114 -1
+115 115 -1
+116 116 -1
+117 117 -1
+118 118 -1
+119 119 -1
+120 120 -1
+121 121 -1
+122 122 -1
+123 123 -1
+124 124 -1
+125 125 -1
+126 126 -1
+127 127 -1
+128 128 -1
+129 129 -1
+130 130 -1
+131 131 -1
+132 132 -1
+133 133 -1
+134 134 -1
+135 135 -1
+136 136 -1
+137 137 -1
+138 138 -1
+139 139 -1
+140 140 -1
+141 141 -1
+142 142 -1
+143 143 -1
+144 144 -1
+145 145 -1
+146 146 -1
+147 147 -1
+148 148 -1
+149 149 -1
+150 150 -1
+151 151 -1
+152 152 -1
+153 153 -1
+154 154 -1
+155 155 -1
+156 156 -1
+157 157 -1
+158 158 -1
+159 159 -1
+160 160 -1
+161 161 -1
+162 162 -1
+163 163 -1
+164 164 -1
+165 165 -1
+166 166 -1
+167 167 -1
+168 168 -1
+169 169 -1
+170 170 -1
+171 171 -1
+172 172 -1
+173 173 -1
+174 174 -1
+175 175 -1
+176 176 -1
+177 177 -1
+178 178 -1
+179 179 -1
+180 180 -1
+181 181 -1
+182 182 -1
+183 183 -1
+184 184 -1
+185 185 -1
+186 186 -1
+187 187 -1
+188 188 -1
+189 189 -1
+190 190 -1
+191 191 -1
+192 192 -1
+193 193 -1
+194 194 -1
+195 195 -1
+196 196 -1
+197 197 -1
+198 198 -1
+199 199 -1
+200 200 -1
+201 201 -1
+202 202 -1
+203 203 -1
+204 204 -1
+205 205 -1
+206 206 -1
+207 207 -1
+208 208 -1
+209 1 -0.490612
+209 209 1
+209 774 -0.176719
+209 775 0.00510588
+209 776 0.0304883
+209 777 0.000295542
+209 778 0.000182222
+209 779 0.0855255
+209 780 0.0300062
+209 781 0.0120144
+209 782 0.0375191
+209 783 0.137524
+210 1 -0.356657
+210 210 1
+210 774 0.158026
+210 775 -0.0228243
+210 776 -0.136289
+210 777 -0.00132113
+210 778 -0.000814568
+210 779 -0.382316
+210 780 -0.134134
+210 781 0.00873402
+210 782 0.027275
+210 783 0.0999746
+211 1 -0.356657
+211 211 1
+211 774 0.158026
+211 775 -0.0228243
+211 776 -0.136289
+211 777 -0.00132113
+211 778 -0.000814568
+211 779 -0.382316
+211 780 -0.134134
+211 781 0.00873402
+211 782 0.027275
+211 783 0.0999746
+212 1 -0.356657
+212 212 1
+212 774 0.158026
+212 775 -0.0228243
+212 776 -0.136289
+212 777 -0.00132113
+212 778 -0.000814568
+212 779 -0.382316
+212 780 -0.134134
+212 781 0.00873402
+212 782 0.027275
+212 783 0.0999746
+213 1 -0.356657
+213 213 1
+213 774 0.158026
+213 775 -0.0228243
+213 776 -0.136289
+213 777 -0.00132113
+213 778 -0.000814568
+213 779 -0.382316
+213 780 -0.134134
+213 781 0.00873402
+213 782 0.027275
+213 783 0.0999746
+214 1 -0.356657
+214 214 1
+214 774 0.158026
+214 775 -0.0228243
+214 776 -0.136289
+214 777 -0.00132113
+214 778 -0.000814568
+214 779 -0.382316
+214 780 -0.134134
+214 781 0.00873402
+214 782 0.027275
+214 783 0.0999746
+215 1 -0.356657
+215 215 1
+215 774 0.158026
+215 775 -0.0228243
+215 776 -0.136289
+215 777 -0.00132113
+215 778 -0.000814568
+215 779 -0.382316
+215 780 -0.134134
+215 781 0.00873402
+215 782 0.027275
+215 783 0.0999746
+216 1 -1
+216 216 1
+216 774 0.0101472
+216 775 0.000238343
+216 776 0.0014232
+216 777 1.37959e-05
+216 778 8.50612e-06
+216 779 0.00399233
+216 780 0.00140069
+216 781 -0.0124316
+216 782 0.00175139
+216 783 0.0064196
+217 1 -1
+217 217 1
+217 774 0.157218
+217 775 0.00369282
+217 776 0.0220507
+217 777 0.000213751
+217 778 0.000131792
+217 779 0.0618562
+217 780 0.0217019
+217 781 0.0086894
+217 782 -0.0751193
+217 783 -0.275344
+218 1 -1
+218 218 1
+218 774 0.157218
+218 775 0.00369282
+218 776 0.0220507
+218 777 0.000213751
+218 778 0.000131792
+218 779 0.0618562
+218 780 0.0217019
+218 781 0.0086894
+218 782 -0.0751193
+218 783 -0.275344
+219 2 -0.480547
+219 219 1
+219 774 -0.120991
+219 775 0.0154453
+219 776 0.00642393
+219 777 0.00151259
+219 778 0.00161095
+219 779 0.0600357
+219 780 0.0236527
+219 781 0.0236125
+219 782 0.0426199
+219 783 0.0358883
+220 2 -0.480547
+220 220 1
+220 774 0.133823
+220 775 -0.0784193
+220 776 -0.0326157
+220 777 -0.00767973
+220 778 -0.00817914
+220 779 -0.304814
+220 780 -0.12009
+220 781 0.019263
+220 782 0.0347692
+220 783 0.0292776
+221 2 -0.480547
+221 221 1
+221 774 0.133823
+221 775 -0.0784193
+221 776 -0.0326157
+221 777 -0.00767973
+221 778 -0.00817914
+221 779 -0.304814
+221 780 -0.12009
+221 781 0.019263
+221 782 0.0347692
+221 783 0.0292776
+222 2 -0.480547
+222 222 1
+222 774 0.133823
+222 775 -0.0784193
+222 776 -0.0326157
+222 777 -0.00767973
+222 778 -0.00817914
+222 779 -0.304814
+222 780 -0.12009
+222 781 0.019263
+222 782 0.0347692
+222 783 0.0292776
+223 2 -0.480547
+223 223 1
+223 774 0.133823
+223 775 -0.0784193
+223 776 -0.0326157
+223 777 -0.00767973
+223 778 -0.00817914
+223 779 -0.304814
+223 780 -0.12009
+223 781 0.019263
+223 782 0.0347692
+223 783 0.0292776
+224 2 -0.480547
+224 224 1
+224 774 0.133823
+224 775 -0.0784193
+224 776 -0.0326157
+224 777 -0.00767973
+224 778 -0.00817914
+224 779 -0.304814
+224 780 -0.12009
+224 781 0.019263
+224 782 0.0347692
+224 783 0.0292776
+225 2 -0.480547
+225 225 1
+225 774 0.133823
+225 775 -0.0784193
+225 776 -0.0326157
+225 777 -0.00767973
+225 778 -0.00817914
+225 779 -0.304814
+225 780 -0.12009
+225 781 0.019263
+225 782 0.0347692
+225 783 0.0292776
+226 2 -0.4982
+226 226 1
+226 774 0.0313792
+226 775 0.00295454
+226 776 0.00122884
+226 777 0.000289343
+226 778 0.000308159
+226 779 0.0114842
+226 780 0.00452453
+226 781 -0.0339904
+226 782 0.00815277
+226 783 0.00686508
+227 2 -0.403449
+227 227 1
+227 774 0.112353
+227 775 0.0105787
+227 776 0.00439983
+227 777 0.00103599
+227 778 0.00110336
+227 779 0.0411191
+227 780 0.0162
+227 781 0.0161725
+227 782 -0.27882
+227 783 0.0245803
+228 2 -0.480547
+228 228 1
+228 774 0.0526932
+228 775 0.00496139
+228 776 0.00206351
+228 777 0.000485877
+228 778 0.000517473
+228 779 0.0192848
+228 780 0.00759777
+228 781 0.00758487
+228 782 0.0136905
+228 783 -0.144301
+229 3 -0.288579
+229 229 1
+229 774 -0.0281186
+229 775 -0.0809218
+229 776 -0.0331206
+229 777 -0.0904523
+229 778 -0.203154
+229 779 -0.318682
+229 780 -0.104795
+229 781 0.0194124
+229 782 0.127631
+229 783 0.300771
+230 3 -0.550007
+230 230 1
+230 774 -0.0535916
+230 775 -0.15423
+230 776 -0.063125
+230 777 -0.172394
+230 778 -0.387195
+230 779 -0.607381
+230 780 -0.199731
+230 781 0.0369984
+230 782 0.243254
+230 783 0.573244
+231 3 -0.288579
+231 231 1
+231 774 -0.0281186
+231 775 -0.0809218
+231 776 -0.0331206
+231 777 -0.0904523
+231 778 -0.203154
+231 779 -0.318682
+231 780 -0.104795
+231 781 0.0194124
+231 782 0.127631
+231 783 0.300771
+232 3 -0.45548
+232 232 1
+232 774 -0.0443811
+232 775 -0.127723
+232 776 -0.0522761
+232 777 -0.142766
+232 778 -0.32065
+232 779 -0.502993
+232 780 -0.165404
+232 781 0.0306397
+232 782 0.201447
+232 783 0.474724
+233 3 -0.288579
+233 233 1
+233 774 -0.0281186
+233 775 -0.0809218
+233 776 -0.0331206
+233 777 -0.0904523
+233 778 -0.203154
+233 779 -0.318682
+233 780 -0.104795
+233 781 0.0194124
+233 782 0.127631
+233 783 0.300771
+234 214 -0.135764
+234 224 -0.904153
+234 234 -1
+234 244 -1
+234 254 -1
+234 264 -1
+234 274 -0.420563
+234 284 -1
+234 615 1
+235 3 -0.288579
+235 235 1
+235 774 -0.0281186
+235 775 -0.0809218
+235 776 -0.0331206
+235 777 -0.0904523
+235 778 -0.203154
+235 779 -0.318682
+235 780 -0.104795
+235 781 0.0194124
+235 782 0.127631
+235 783 0.300771
+236 3 -1
+236 236 1
+236 774 0.003544
+236 775 0.0101992
+236 776 0.00417444
+236 777 0.0114004
+236 778 0.0256051
+236 779 0.0401659
+236 780 0.0132081
+236 781 -0.0398166
+236 782 0.0101289
+236 783 0.0238693
+237 3 -0.697099
+237 237 1
+237 774 0.0382777
+237 775 0.110158
+237 776 0.0450868
+237 777 0.123132
+237 778 0.276553
+237 779 0.43382
+237 780 0.142657
+237 781 0.0166393
+237 782 -0.203954
+237 783 -0.480631
+238 3 -0.669703
+238 238 1
+238 774 0.0367733
+238 775 0.105829
+238 776 0.0433149
+238 777 0.118293
+238 778 0.265684
+238 779 0.41677
+238 780 0.13705
+238 781 0.0159854
+238 782 -0.195938
+238 783 -0.461742
+239 4 -0.236647
+239 239 1
+239 774 -0.130794
+239 775 0.0606839
+239 776 0.0320154
+239 777 0.00287214
+239 778 0.00224143
+239 779 0.0606839
+239 780 0.0313052
+239 781 0.00931949
+239 782 0.0686807
+239 783 0.0325047
+240 210 -0.019387
+240 220 -0.556394
+240 230 -1
+240 240 -1
+240 250 -1
+240 260 -1
+240 270 -0.826905
+240 280 -0.425814
+240 611 1
+241 211 -0.040263
+241 221 -0.0804856
+241 231 -0.271311
+241 241 -1
+241 251 -0.20782
+241 261 -0.473692
+241 271 -0.132407
+241 281 -0.0825791
+241 612 1
+242 4 -0.236647
+242 242 1
+242 774 0.125826
+242 775 -0.236647
+242 776 0.0261181
+242 777 -0.0112004
+242 778 -0.00874083
+242 779 -0.236647
+242 780 -0.12208
+242 781 0.00760281
+242 782 0.0560295
+242 783 0.0265173
+243 4 -0.236647
+243 243 1
+243 774 0.125826
+243 775 -0.236647
+243 776 0.0261181
+243 777 -0.0112004
+243 778 -0.00874083
+243 779 -0.236647
+243 780 -0.12208
+243 781 0.00760281
+243 782 0.0560295
+243 783 0.0265173
+244 4 -0.418068
+244 244 1
+244 774 0.222288
+244 775 -0.418068
+244 776 0.0461411
+244 777 -0.019787
+244 778 -0.0154419
+244 779 -0.418068
+244 780 -0.21567
+244 781 0.0134314
+244 782 0.0989837
+244 783 0.0468463
+245 4 -0.236647
+245 245 1
+245 774 0.125826
+245 775 -0.236647
+245 776 0.0261181
+245 777 -0.0112004
+245 778 -0.00874083
+245 779 -0.236647
+245 780 -0.12208
+245 781 0.00760281
+245 782 0.0560295
+245 783 0.0265173
+246 4 -0.236647
+246 246 1
+246 774 0.0284585
+246 775 0.0111969
+246 776 0.00590724
+246 777 0.000529946
+246 778 0.000413572
+246 779 0.0111969
+246 780 0.0057762
+246 781 -0.0354232
+246 782 0.0126724
+246 783 0.00599752
+247 4 -0.51253
+247 247 1
+247 774 0.272513
+247 775 0.10722
+247 776 0.0565666
+247 777 0.00507466
+247 778 0.00396029
+247 779 0.10722
+247 780 0.0553118
+247 781 0.0164662
+247 782 -0.67322
+247 783 0.0574311
+248 4 -0.251139
+248 248 1
+248 774 0.0525784
+248 775 0.0206868
+248 776 0.0109139
+248 777 0.000979098
+248 778 0.000764092
+248 779 0.0206868
+248 780 0.0106718
+248 781 0.00317696
+248 782 0.0234129
+248 783 -0.154292
+249 5 -0.159645
+249 249 1
+249 774 -0.176719
+249 775 0.175403
+249 776 0.0463324
+249 777 0.100047
+249 778 0.183705
+249 779 0.0104931
+249 780 0.11349
+249 781 0.00205281
+249 782 0.0346089
+249 783 0.0311976
+250 5 -1
+250 250 1
+250 774 0.399752
+250 775 -0.954805
+250 776 -0.252211
+250 777 -0.544607
+250 778 -1
+250 779 0.0657277
+250 780 -0.617784
+250 781 0.0128586
+250 782 0.216786
+250 783 0.195418
+251 5 -0.442072
+251 251 1
+251 774 0.176719
+251 775 -0.422093
+251 776 -0.111495
+251 777 -0.240756
+251 778 -0.442072
+251 779 0.0290564
+251 780 -0.273105
+251 781 0.00568442
+251 782 0.0958351
+251 783 0.0863889
+252 5 -0.442072
+252 252 1
+252 774 0.176719
+252 775 -0.422093
+252 776 -0.111495
+252 777 -0.240756
+252 778 -0.442072
+252 779 0.0290564
+252 780 -0.273105
+252 781 0.00568442
+252 782 0.0958351
+252 783 0.0863889
+253 5 -0.442072
+253 253 1
+253 774 0.176719
+253 775 -0.422093
+253 776 -0.111495
+253 777 -0.240756
+253 778 -0.442072
+253 779 0.0290564
+253 780 -0.273105
+253 781 0.00568442
+253 782 0.0958351
+253 783 0.0863889
+254 5 -0.834293
+254 254 1
+254 774 0.00999785
+254 775 0.0274787
+254 776 0.00725846
+254 777 0.0156734
+254 778 0.0287793
+254 779 -0.0798795
+254 780 0.0177794
+254 781 0.000321595
+254 782 0.00542185
+254 783 0.00488743
+255 5 -0.442072
+255 255 1
+255 774 0.176719
+255 775 -0.422093
+255 776 -0.111495
+255 777 -0.240756
+255 778 -0.442072
+255 779 0.0290564
+255 780 -0.273105
+255 781 0.00568442
+255 782 0.0958351
+255 783 0.0863889
+256 5 -0.451117
+256 256 1
+256 774 0.00295854
+256 775 0.00813141
+256 776 0.0021479
+256 777 0.00463804
+256 778 0.0085163
+256 779 0.000486446
+256 780 0.00526124
+256 781 -0.0077802
+256 782 0.00160442
+256 783 0.00144627
+257 5 -0.917677
+257 257 1
+257 774 0.0357191
+257 775 0.0981723
+257 776 0.0259321
+257 777 0.0559961
+257 778 0.102819
+257 779 0.00587298
+257 780 0.0635201
+257 781 0.00114895
+257 782 -0.19304
+257 783 0.0174612
+258 5 -0.451117
+258 258 1
+258 774 0.017559
+258 775 0.0482601
+258 776 0.0127479
+258 777 0.0275269
+258 778 0.0505445
+258 779 0.00288707
+258 780 0.0312256
+258 781 0.000564809
+258 782 0.00952226
+258 783 -0.104055
+259 6 -0.248166
+259 259 1
+259 774 -0.00533653
+259 775 -0.0430944
+259 776 -0.0276397
+259 777 -0.0461556
+259 778 -0.156262
+259 779 0.0307867
+259 780 -0.164107
+259 781 0.00396438
+259 782 0.0796086
+259 783 0.0654325
+260 6 -0.526981
+260 260 1
+260 774 -0.0113321
+260 775 -0.091511
+260 776 -0.0586929
+260 777 -0.0980114
+260 778 -0.331821
+260 779 0.0653755
+260 780 -0.348481
+260 781 0.00841835
+260 782 0.169049
+260 783 0.138946
+261 6 -0.248166
+261 261 1
+261 774 -0.00533653
+261 775 -0.0430944
+261 776 -0.0276397
+261 777 -0.0461556
+261 778 -0.156262
+261 779 0.0307867
+261 780 -0.164107
+261 781 0.00396438
+261 782 0.0796086
+261 783 0.0654325
+262 6 -0.418163
+262 262 1
+262 774 -0.00899212
+262 775 -0.0726146
+262 776 -0.0465732
+262 777 -0.0777727
+262 778 -0.263302
+262 779 0.0518759
+262 780 -0.276522
+262 781 0.00668002
+262 782 0.134141
+262 783 0.110255
+263 6 -0.248166
+263 263 1
+263 774 -0.00533653
+263 775 -0.0430944
+263 776 -0.0276397
+263 777 -0.0461556
+263 778 -0.156262
+263 779 0.0307867
+263 780 -0.164107
+263 781 0.00396438
+263 782 0.0796086
+263 783 0.0654325
+264 6 -1
+264 264 1
+264 774 0.00338455
+264 775 0.0273314
+264 776 0.0175297
+264 777 0.0292729
+264 778 0.0991045
+264 779 -0.173908
+264 780 0.10408
+264 781 0.000478883
+264 782 0.00961645
+264 783 0.00790403
+265 6 -0.60407
+265 265 1
+265 774 -0.0129898
+265 775 -0.104898
+265 776 -0.0672787
+265 777 -0.112349
+265 778 -0.380361
+265 779 0.0749389
+265 780 -0.399458
+265 781 0.00964982
+265 782 0.193778
+265 783 0.159271
+266 6 -0.248166
+266 266 1
+266 774 0.000459667
+266 775 0.00371198
+266 776 0.00238077
+266 777 0.00397565
+266 778 0.0134597
+266 779 0.00050508
+266 780 0.0141355
+266 781 -0.00781032
+266 782 0.00130604
+266 783 0.00107347
+267 6 -0.862353
+267 267 1
+267 774 0.00948
+267 775 0.0765544
+267 776 0.0491001
+267 777 0.0819923
+267 778 0.277588
+267 779 0.0104166
+267 780 0.291525
+267 781 0.00134134
+267 782 -0.335907
+267 783 0.0221389
+268 6 -0.288952
+268 268 1
+268 774 0.0031765
+268 775 0.0256514
+268 776 0.0164522
+268 777 0.0274735
+268 778 0.0930126
+268 779 0.00349033
+268 780 0.0976825
+268 781 0.000449447
+268 782 0.00902534
+268 783 -0.123733
+269 7 -0.712432
+269 269 1
+269 774 -0.0298816
+269 775 -0.0312565
+269 776 -0.01439
+269 777 -0.00911597
+269 778 -0.032305
+269 779 -0.0380249
+269 780 -0.117919
+269 781 0.00339877
+269 782 0.0660878
+269 783 0.09081
+270 7 -0.712432
+270 270 1
+270 774 -0.0298816
+270 775 -0.0312565
+270 776 -0.01439
+270 777 -0.00911597
+270 778 -0.032305
+270 779 -0.0380249
+270 780 -0.117919
+270 781 0.00339877
+270 782 0.0660878
+270 783 0.09081
+271 7 -0.712432
+271 271 1
+271 774 -0.0298816
+271 775 -0.0312565
+271 776 -0.01439
+271 777 -0.00911597
+271 778 -0.032305
+271 779 -0.0380249
+271 780 -0.117919
+271 781 0.00339877
+271 782 0.0660878
+271 783 0.09081
+272 7 -0.712432
+272 272 1
+272 774 -0.0298816
+272 775 -0.0312565
+272 776 -0.01439
+272 777 -0.00911597
+272 778 -0.032305
+272 779 -0.0380249
+272 780 -0.117919
+272 781 0.00339877
+272 782 0.0660878
+272 783 0.09081
+273 7 -0.712432
+273 273 1
+273 774 -0.0298816
+273 775 -0.0312565
+273 776 -0.01439
+273 777 -0.00911597
+273 778 -0.032305
+273 779 -0.0380249
+273 780 -0.117919
+273 781 0.00339877
+273 782 0.0660878
+273 783 0.09081
+274 7 -0.712432
+274 274 1
+274 774 -0.0298816
+274 775 -0.0312565
+274 776 -0.01439
+274 777 -0.00911597
+274 778 -0.032305
+274 779 -0.0380249
+274 780 -0.117919
+274 781 0.00339877
+274 782 0.0660878
+274 783 0.09081
+275 7 -0.712432
+275 275 1
+275 774 -0.0298816
+275 775 -0.0312565
+275 776 -0.01439
+275 777 -0.00911597
+275 778 -0.032305
+275 779 -0.0380249
+275 780 -0.117919
+275 781 0.00339877
+275 782 0.0660878
+275 783 0.09081
+276 7 -0.712432
+276 276 1
+276 774 0.00284002
+276 775 0.00297068
+276 776 0.00136766
+276 777 0.000866402
+276 778 0.00307034
+276 779 0.00361397
+276 780 0.0112073
+276 781 -0.0078196
+276 782 0.00108422
+276 783 0.00148981
+277 7 -1
+277 277 1
+277 774 0.0236592
+277 775 0.0247477
+277 776 0.0113935
+277 777 0.00721769
+277 778 0.0255779
+277 779 0.0301067
+277 780 0.0933639
+277 781 0.000464513
+277 782 -0.137533
+277 783 0.0124111
+278 7 -0.712432
+278 278 1
+278 774 0.0168556
+278 775 0.0176311
+278 776 0.0081171
+278 777 0.00514212
+278 778 0.0182225
+278 779 0.021449
+278 780 0.0665155
+278 781 0.000330934
+278 782 0.0064349
+278 783 -0.103797
+279 8 -0.64291
+279 279 1
+279 774 -0.0219829
+279 775 -0.06257
+279 776 -0.0348886
+279 777 -0.251155
+279 778 -0.325078
+279 779 0.0438946
+279 780 -0.122006
+279 781 0.00181639
+279 782 0.0994723
+279 783 0.168283
+280 8 -0.64291
+280 280 1
+280 774 -0.0219829
+280 775 -0.06257
+280 776 -0.0348886
+280 777 -0.251155
+280 778 -0.325078
+280 779 0.0438946
+280 780 -0.122006
+280 781 0.00181639
+280 782 0.0994723
+280 783 0.168283
+281 8 -0.64291
+281 281 1
+281 774 -0.0219829
+281 775 -0.06257
+281 776 -0.0348886
+281 777 -0.251155
+281 778 -0.325078
+281 779 0.0438946
+281 780 -0.122006
+281 781 0.00181639
+281 782 0.0994723
+281 783 0.168283
+282 8 -0.814131
+282 282 1
+282 774 -0.0278374
+282 775 -0.0792337
+282 776 -0.0441802
+282 777 -0.318043
+282 778 -0.411653
+282 779 0.0555847
+282 780 -0.154499
+282 781 0.00230014
+282 782 0.125964
+282 783 0.213101
+283 8 -0.64291
+283 283 1
+283 774 -0.0219829
+283 775 -0.06257
+283 776 -0.0348886
+283 777 -0.251155
+283 778 -0.325078
+283 779 0.0438946
+283 780 -0.122006
+283 781 0.00181639
+283 782 0.0994723
+283 783 0.168283
+284 8 -1
+284 284 1
+284 774 0.00274535
+284 775 0.00781409
+284 776 0.00435709
+284 777 0.0313657
+284 778 0.0405976
+284 779 -0.0665182
+284 780 0.0152368
+284 781 8.46948e-05
+284 782 0.0046382
+284 783 0.00784672
+285 8 -0.64291
+285 285 1
+285 774 -0.0219829
+285 775 -0.06257
+285 776 -0.0348886
+285 777 -0.251155
+285 778 -0.325078
+285 779 0.0438946
+285 780 -0.122006
+285 781 0.00181639
+285 782 0.0994723
+285 783 0.168283
+286 8 -0.64291
+286 286 1
+286 774 0.000965935
+286 775 0.00274934
+286 776 0.00153302
+286 777 0.0110358
+286 778 0.014284
+286 779 0.000720127
+286 780 0.00536099
+286 781 -0.00784556
+286 782 0.00163192
+286 783 0.00276082
+287 8 -0.755752
+287 287 1
+287 774 0.00673906
+287 775 0.0191814
+287 776 0.0106954
+287 777 0.076994
+287 778 0.0996556
+287 779 0.00502412
+287 780 0.0374021
+287 781 0.000207902
+287 782 -0.11136
+287 783 0.0192615
+288 8 -0.64291
+288 288 1
+288 774 0.00573284
+288 775 0.0163174
+288 776 0.00909849
+288 777 0.065498
+288 778 0.084776
+288 779 0.00427397
+288 780 0.0318176
+288 781 0.00017686
+288 782 0.00968551
+288 783 -0.0962532
+289 289 -1
+289 620 1
+290 10 -1
+290 290 1
+291 11 -1
+291 291 1
+292 12 -1
+292 292 1
+293 293 -1
+293 624 1
+294 14 -0.396895
+294 294 1
+295 295 -1
+295 626 1
+296 296 -1
+296 627 1
+297 297 -1
+297 628 1
+298 298 -1
+298 629 1
+299 299 -1
+299 630 1
+300 300 -1
+300 631 1
+301 301 -1
+301 632 1
+302 302 -1
+302 633 1
+303 303 -1
+303 634 1
+304 304 -1
+304 635 1
+305 305 -1
+305 636 1
+306 306 -1
+306 637 1
+307 307 -1
+307 638 1
+308 308 -1
+308 639 1
+309 309 -1
+309 640 1
+310 310 -1
+310 641 1
+311 311 -1
+311 642 1
+312 32 -1
+312 312 1
+313 313 -1
+313 644 1
+314 34 -1
+314 314 1
+315 315 -1
+315 646 1
+316 316 -1
+316 647 1
+317 317 -1
+317 648 1
+318 318 -1
+318 649 1
+319 319 -1
+319 650 1
+320 320 -1
+320 651 1
+321 321 -1
+321 652 1
+322 322 -1
+322 653 1
+323 323 -1
+323 654 1
+324 44 -0.5
+324 324 1
+325 325 -1
+325 656 1
+326 326 -1
+326 657 1
+327 327 -1
+327 658 1
+328 328 -1
+328 659 1
+329 49 -1
+329 329 1
+330 330 -1
+330 661 1
+331 331 -1
+331 662 1
+332 332 -1
+332 663 1
+333 333 1
+333 441 -1
+334 334 1
+334 442 -1
+335 335 1
+335 443 -1
+336 336 1
+336 444 -1
+336 777 0.542386
+336 831 -0.471892
+337 337 1
+337 445 -0.598332
+337 778 0.826787
+337 832 -1
+338 338 1
+338 446 -1
+339 1 -0.000878618
+339 2 -0.00707421
+339 3 -0.0109867
+339 4 -0.00857576
+339 5 -0.0101033
+339 6 -0.0308595
+339 7 -0.0357838
+339 8 -0.00811866
+339 10 -0.124662
+339 15 -0.237178
+339 33 -0.98494
+339 34 -0.203228
+339 339 -1
+339 616 1
+339 780 0.138992
+339 834 -0.138992
+340 340 1
+340 448 -1
+341 9 -0.0299104
+341 32 -0.501857
+341 33 -0.178518
+341 34 -0.096039
+341 42 -0.062485
+341 341 -1
+341 618 1
+342 12 -0.476066
+342 14 -0.00708418
+342 30 -0.145787
+342 32 -0.280731
+342 35 -0.0468844
+342 342 -1
+342 619 1
+343 343 1
+343 451 -1
+343 784 0.000750351
+343 838 -0.000750383
+344 344 -1
+344 621 1
+345 30 -0.481977
+345 345 -1
+345 622 1
+345 786 0.0289408
+345 840 -0.0289408
+346 346 1
+346 454 -1
+347 347 1
+347 455 -1
+348 348 1
+348 456 -1
+348 789 0.032998
+348 843 -0.124518
+349 349 1
+349 457 -1
+349 790 1
+349 844 -1
+350 350 1
+350 458 -0.596871
+350 791 1
+350 845 -1
+351 351 1
+351 459 -1
+351 792 1
+351 846 -1
+352 352 1
+352 460 -1
+352 793 1
+352 847 -1
+353 353 1
+353 461 -1
+353 794 0.406251
+353 848 -0.435779
+354 354 1
+354 462 -1
+354 795 1
+354 849 -1
+355 355 1
+355 463 -1
+355 796 0.211531
+355 850 -0.21153
+356 356 1
+356 464 -1
+357 357 1
+357 465 -0.833333
+357 798 1
+357 852 -1
+358 358 1
+358 466 -1
+358 799 0.0674362
+358 853 -0.151235
+359 359 1
+359 467 -1
+360 360 1
+360 468 -1
+361 361 1
+361 469 -1
+362 362 1
+362 470 -1
+362 803 0.281935
+362 857 -0.281935
+363 363 1
+363 471 -1
+363 804 1
+363 858 -1
+364 364 1
+364 472 -0.65419
+364 805 1
+364 859 -1
+365 365 1
+365 473 -0.488345
+365 806 0.440547
+365 860 -0.440547
+366 366 1
+366 474 -1
+366 807 0.0281648
+366 861 -0.233849
+367 367 1
+367 475 -1
+367 808 0.424783
+367 862 -0.424783
+368 10 -0.0677556
+368 15 -0.0515922
+368 29 -0.0156699
+368 32 -0.095851
+368 33 -0.132957
+368 42 -0.0209473
+368 368 -1
+368 645 1
+368 809 0.0604974
+368 863 -0.0604974
+369 369 1
+369 477 -0.701051
+369 810 1
+369 864 -1
+370 370 1
+370 478 -1
+370 811 0.539913
+370 865 -0.539913
+371 371 1
+371 479 -1
+372 372 1
+372 480 -0.786733
+372 813 1
+372 867 -1
+373 373 1
+373 481 -1
+374 374 1
+374 482 -1
+374 815 0.818432
+374 869 -0.818432
+375 375 1
+375 483 -1
+375 816 0.0834249
+375 870 -0.0834249
+376 376 1
+376 484 -0.952687
+376 817 1
+376 871 -1
+377 377 1
+377 485 -1
+378 378 1
+378 486 -1
+379 379 1
+379 487 -1
+380 380 1
+380 488 -1
+381 381 1
+381 489 -1
+382 382 1
+382 490 -1
+383 383 -1
+383 660 1
+384 384 1
+384 492 -1
+385 385 1
+385 493 -1
+386 386 1
+386 494 -1
+387 387 1
+387 441 -1
+387 774 -0.0883596
+387 828 0.5
+388 388 1
+388 442 -1
+388 775 -0.5
+388 829 0.5
+389 389 1
+389 443 -1
+389 776 -0.5
+389 830 0.5
+390 390 1
+390 444 -1
+390 777 -0.0323066
+390 831 0.0281077
+391 391 1
+391 445 -1
+391 778 -0.271755
+391 832 0.328688
+392 392 1
+392 446 -1
+392 779 -0.764633
+392 833 1
+393 393 1
+393 447 -0.844375
+393 780 -0.306152
+393 834 0.306152
+394 394 1
+394 448 -1
+394 781 -0.0624038
+394 835 0.5
+395 395 1
+395 449 -0.0921706
+395 782 -0.555577
+395 836 1
+396 396 1
+396 450 -0.209166
+396 783 -0.392377
+396 837 0.392377
+397 397 1
+397 451 -0.290321
+397 784 -1
+397 838 1
+398 398 1
+398 452 -0.311031
+398 785 -0.414177
+398 839 0.414177
+399 399 1
+399 453 -0.684478
+399 786 -0.0783429
+399 840 0.0783429
+400 400 1
+400 454 -0.5
+400 787 -0.177883
+400 841 1
+401 401 1
+401 455 -0.714286
+401 788 -1
+401 842 1
+402 402 1
+402 456 -0.929815
+402 789 -0.265006
+402 843 1
+403 403 1
+403 457 -0.412023
+403 790 -0.688953
+403 844 0.688953
+404 404 1
+404 458 -1
+404 791 -0.324597
+404 845 0.324597
+405 405 1
+405 459 -0.344382
+405 792 -0.273569
+405 846 0.273569
+406 406 1
+406 460 -0.416107
+406 793 -0.911075
+406 847 0.911075
+407 407 1
+407 461 -0.423489
+407 794 -0.93224
+407 848 1
+408 408 1
+408 462 -0.357861
+408 795 -0.633689
+408 849 0.633689
+409 409 1
+409 463 -0.500781
+409 796 -0.394069
+409 850 0.39407
+410 410 1
+410 464 -1
+410 797 -0.5
+410 851 0.5
+411 411 1
+411 465 -1
+412 412 1
+412 466 -1
+412 799 -0.155515
+412 853 0.348765
+413 413 1
+413 467 -1
+413 800 -0.5
+413 854 0.5
+414 414 1
+414 468 -1
+414 801 -0.5
+414 855 0.5
+415 415 1
+415 469 -0.769231
+415 802 -1
+415 856 1
+416 416 1
+416 470 -0.418415
+416 803 -1
+416 857 1
+417 417 1
+417 471 -0.776664
+417 804 -0.185148
+417 858 0.185148
+418 418 1
+418 472 -1
+418 805 -0.80131
+418 859 0.80131
+419 419 1
+419 473 -1
+419 806 -0.15519
+419 860 0.15519
+420 420 1
+420 474 -1
+420 807 -0.0320552
+420 861 0.266151
+421 421 1
+421 475 -0.686955
+421 808 -0.808193
+421 862 0.808193
+422 422 1
+422 476 -0.277374
+422 809 -0.207202
+422 863 0.207202
+423 423 1
+423 477 -1
+423 810 -0.573571
+423 864 0.573571
+424 424 1
+424 478 -0.593465
+424 811 -1
+424 865 1
+425 425 1
+425 479 -1
+425 812 -0.327889
+425 866 0.327889
+426 426 1
+426 480 -1
+426 813 -0.12892
+426 867 0.128921
+427 427 1
+427 481 -1
+427 814 -1
+427 868 1
+428 428 1
+428 482 -0.407229
+428 815 -0.966711
+428 869 0.966711
+429 429 1
+429 483 -1
+429 816 -0.416575
+429 870 0.416575
+430 430 1
+430 484 -1
+430 817 -0.150338
+430 871 0.150338
+431 431 1
+431 485 -1
+431 818 -1
+431 872 1
+432 432 1
+432 486 -1
+432 819 -0.113932
+432 873 0.5
+433 433 1
+433 487 -0.251124
+433 820 -0.877537
+433 874 1
+434 434 1
+434 488 -0.335443
+434 821 -0.833626
+434 875 1
+435 435 1
+435 489 -1
+435 822 -0.528119
+435 876 1
+436 436 1
+436 490 -0.740512
+436 823 -1
+436 877 1
+437 437 1
+437 491 -0.240662
+437 824 -1
+437 878 1
+438 438 1
+438 492 -0.5
+438 825 -1
+438 879 1
+439 439 1
+439 493 -1
+439 826 -1
+439 880 1
+440 440 1
+440 494 -1
+440 827 -1
+440 881 1
+441 441 1
+441 987 0.21752
+441 990 0.000778065
+441 991 0.00995924
+441 993 0.00552011
+441 995 0.00454668
+441 996 0.0128337
+441 997 0.011282
+441 998 0.014559
+441 999 0.00399023
+441 1001 0.00027693
+441 1002 0.00179942
+441 1003 0.00340027
+441 1004 0.00188028
+441 1005 0.000725365
+441 1006 0.000389279
+441 1007 0.000518497
+441 1008 0.000112724
+441 1009 0.00148847
+441 1011 8.70593e-05
+441 1012 0.000388479
+441 1016 0.00343397
+441 1017 0.00150103
+441 1018 0.0131291
+441 1019 0.000270031
+441 1020 0.00198783
+441 1021 0.00910337
+441 1022 0.0134662
+441 1023 0.000316114
+441 1024 0.00796811
+441 1025 0.000263912
+441 1026 0.000762254
+441 1027 1.50855e-05
+441 1028 0.000991959
+441 1029 0.00133275
+441 1030 0.00133454
+441 1033 0.00506813
+441 1034 0.0252857
+441 1036 0.0213428
+441 1037 0.0206654
+441 1210 -0.0649712
+442 442 1
+442 988 0.21752
+442 990 0.000778065
+442 991 0.00995924
+442 993 0.00552011
+442 995 0.00454668
+442 996 0.0128337
+442 997 0.011282
+442 998 0.014559
+442 999 0.00399023
+442 1001 0.00027693
+442 1002 0.00179942
+442 1003 0.00340027
+442 1004 0.00188028
+442 1005 0.000725365
+442 1006 0.000389279
+442 1007 0.000518497
+442 1008 0.000112724
+442 1009 0.00148847
+442 1011 8.70593e-05
+442 1012 0.000388479
+442 1016 0.00343397
+442 1017 0.00150103
+442 1018 0.0131291
+442 1019 0.000270031
+442 1020 0.00198783
+442 1021 0.00910337
+442 1022 0.0134662
+442 1023 0.000316114
+442 1024 0.00796811
+442 1025 0.000263912
+442 1026 0.000762254
+442 1027 1.50855e-05
+442 1028 0.000991959
+442 1029 0.00133275
+442 1030 0.00133454
+442 1033 0.00506813
+442 1034 0.0252857
+442 1036 0.0213428
+442 1037 0.0206654
+442 1210 -0.0649712
+443 443 1
+443 989 0.21752
+443 990 0.000778065
+443 991 0.00995924
+443 993 0.00552011
+443 995 0.00454668
+443 996 0.0128337
+443 997 0.011282
+443 998 0.014559
+443 999 0.00399023
+443 1001 0.00027693
+443 1002 0.00179942
+443 1003 0.00340027
+443 1004 0.00188028
+443 1005 0.000725365
+443 1006 0.000389279
+443 1007 0.000518497
+443 1008 0.000112724
+443 1009 0.00148847
+443 1011 8.70593e-05
+443 1012 0.000388479
+443 1016 0.00343397
+443 1017 0.00150103
+443 1018 0.0131291
+443 1019 0.000270031
+443 1020 0.00198783
+443 1021 0.00910337
+443 1022 0.0134662
+443 1023 0.000316114
+443 1024 0.00796811
+443 1025 0.000263912
+443 1026 0.000762254
+443 1027 1.50855e-05
+443 1028 0.000991959
+443 1029 0.00133275
+443 1030 0.00133454
+443 1033 0.00506813
+443 1034 0.0252857
+443 1036 0.0213428
+443 1037 0.0206654
+443 1210 -0.0649712
+444 444 1
+444 990 0.218298
+444 991 0.00995924
+444 993 0.00552011
+444 995 0.00454668
+444 996 0.0128337
+444 997 0.011282
+444 998 0.014559
+444 999 0.00399023
+444 1001 0.00027693
+444 1002 0.00179942
+444 1003 0.00340027
+444 1004 0.00188028
+444 1005 0.000725365
+444 1006 0.000389279
+444 1007 0.000518497
+444 1008 0.000112724
+444 1009 0.00148847
+444 1011 8.70593e-05
+444 1012 0.000388479
+444 1016 0.00343397
+444 1017 0.00150103
+444 1018 0.0131291
+444 1019 0.000270031
+444 1020 0.00198783
+444 1021 0.00910337
+444 1022 0.0134662
+444 1023 0.000316114
+444 1024 0.00796811
+444 1025 0.000263912
+444 1026 0.000762254
+444 1027 1.50855e-05
+444 1028 0.000991959
+444 1029 0.00133275
+444 1030 0.00133454
+444 1033 0.00506813
+444 1034 0.0252857
+444 1036 0.0213428
+444 1037 0.0206654
+444 1210 -0.0649712
+445 445 1
+445 990 0.000778065
+445 991 0.227479
+445 993 0.00552011
+445 995 0.00454668
+445 996 0.0128337
+445 997 0.011282
+445 998 0.014559
+445 999 0.00399023
+445 1001 0.00027693
+445 1002 0.00179942
+445 1003 0.00340027
+445 1004 0.00188028
+445 1005 0.000725365
+445 1006 0.000389279
+445 1007 0.000518497
+445 1008 0.000112724
+445 1009 0.00148847
+445 1011 8.70593e-05
+445 1012 0.000388479
+445 1016 0.00343397
+445 1017 0.00150103
+445 1018 0.0131291
+445 1019 0.000270031
+445 1020 0.00198783
+445 1021 0.00910337
+445 1022 0.0134662
+445 1023 0.000316114
+445 1024 0.00796811
+445 1025 0.000263912
+445 1026 0.000762254
+445 1027 1.50855e-05
+445 1028 0.000991959
+445 1029 0.00133275
+445 1030 0.00133454
+445 1033 0.00506813
+445 1034 0.0252857
+445 1036 0.0213428
+445 1037 0.0206654
+445 1210 -0.0649712
+446 446 1
+446 990 0.00194401
+446 991 0.0248834
+446 992 0.543478
+446 993 0.0137921
+446 995 0.01136
+446 996 0.0320654
+446 997 0.0281882
+446 998 0.036376
+446 999 0.00996967
+446 1001 0.000691916
+446 1002 0.00449588
+446 1003 0.00849565
+446 1004 0.00469793
+446 1005 0.00181234
+446 1006 0.000972622
+446 1007 0.00129547
+446 1008 0.000281642
+446 1009 0.00371897
+446 1011 0.00021752
+446 1012 0.000970622
+446 1016 0.00857984
+446 1017 0.00375035
+446 1018 0.0328033
+446 1019 0.000674678
+446 1020 0.00496664
+446 1021 0.0227449
+446 1022 0.0336455
+446 1023 0.000789818
+446 1024 0.0199085
+446 1025 0.000659391
+446 1026 0.00190451
+446 1027 3.76913e-05
+446 1028 0.00247843
+446 1029 0.0033299
+446 1030 0.00333439
+446 1033 0.0126628
+446 1034 0.0631768
+446 1036 0.0533255
+446 1037 0.0516329
+446 1210 -0.162332
+447 447 1
+447 990 0.000921469
+447 991 0.0117948
+447 993 0.264148
+447 995 0.00538467
+447 996 0.0151991
+447 997 0.0133613
+447 998 0.0172423
+447 999 0.00472566
+447 1001 0.000327971
+447 1002 0.00213106
+447 1003 0.00402697
+447 1004 0.00222683
+447 1005 0.000859055
+447 1006 0.000461026
+447 1007 0.00061406
+447 1008 0.000133499
+447 1009 0.0017628
+447 1011 0.000103105
+447 1012 0.000460078
+447 1016 0.00406687
+447 1017 0.00177768
+447 1018 0.0155489
+447 1019 0.0003198
+447 1020 0.0023542
+447 1021 0.0107812
+447 1022 0.0159481
+447 1023 0.000374376
+447 1024 0.0094367
+447 1025 0.000312553
+447 1026 0.000902744
+447 1027 1.78658e-05
+447 1028 0.00117478
+447 1029 0.00157838
+447 1030 0.00158051
+447 1033 0.00600223
+447 1034 0.029946
+447 1036 0.0252765
+447 1037 0.0244742
+447 1210 -0.0769458
+448 448 1
+448 990 0.00230427
+448 991 0.0294947
+448 993 0.016348
+448 994 0.644193
+448 995 0.0134652
+448 996 0.0380076
+448 997 0.0334119
+448 998 0.043117
+448 999 0.0118172
+448 1001 0.000820139
+448 1002 0.00532904
+448 1003 0.01007
+448 1004 0.00556853
+448 1005 0.0021482
+448 1006 0.00115286
+448 1007 0.00153555
+448 1008 0.000333835
+448 1009 0.00440816
+448 1011 0.00025783
+448 1012 0.00115049
+448 1016 0.0101698
+448 1017 0.00444535
+448 1018 0.0388823
+448 1019 0.000799707
+448 1020 0.00588704
+448 1021 0.02696
+448 1022 0.0398806
+448 1023 0.000936184
+448 1024 0.0235979
+448 1025 0.000781587
+448 1026 0.00225745
+448 1027 4.46762e-05
+448 1028 0.00293772
+448 1029 0.00394698
+448 1030 0.0039523
+448 1033 0.0150095
+448 1034 0.0748846
+448 1036 0.0632076
+448 1037 0.0612013
+448 1210 -0.192415
+449 341 1
+449 449 -1
+449 782 0.0223749
+449 836 -0.0402736
+450 450 1
+450 990 0.00594296
+450 991 0.0760699
+450 993 0.0421633
+450 995 0.0347281
+450 996 1
+450 997 0.086173
+450 998 0.111203
+450 999 0.0304779
+450 1001 0.00211523
+450 1002 0.0137442
+450 1003 0.0259717
+450 1004 0.0143618
+450 1005 0.00554043
+450 1006 0.00297336
+450 1007 0.00396034
+450 1008 0.000860997
+450 1009 0.0113691
+450 1011 0.00066497
+450 1012 0.00296725
+450 1016 0.0262291
+450 1017 0.011465
+450 1018 0.100282
+450 1019 0.00206253
+450 1020 0.0151833
+450 1021 0.0695326
+450 1022 0.102856
+450 1023 0.00241452
+450 1024 0.0608614
+450 1025 0.0020158
+450 1026 0.00582219
+450 1027 0.000115225
+450 1028 0.0075767
+450 1029 0.0101797
+450 1030 0.0101934
+450 1033 0.038711
+450 1034 0.193135
+450 1036 0.163019
+450 1037 0.157845
+450 1210 -0.496258
+451 451 1
+451 990 0.00268001
+451 991 0.0343042
+451 993 0.0190138
+451 995 0.0156608
+451 996 0.0442053
+451 997 0.788099
+451 998 0.0501479
+451 999 0.0137442
+451 1001 0.000953875
+451 1002 0.00619802
+451 1003 0.0117121
+451 1004 0.00647656
+451 1005 0.00249849
+451 1006 0.00134086
+451 1007 0.00178594
+451 1008 0.000388272
+451 1009 0.00512697
+451 1011 0.000299872
+451 1012 0.0013381
+451 1016 0.0118282
+451 1017 0.00517023
+451 1018 0.0452226
+451 1019 0.000930111
+451 1020 0.006847
+451 1021 0.0313562
+451 1022 0.0463837
+451 1023 0.00108884
+451 1024 0.0274458
+451 1025 0.000909035
+451 1026 0.00262555
+451 1027 5.19612e-05
+451 1028 0.00341676
+451 1029 0.00459059
+451 1030 0.00459678
+451 1033 0.017457
+451 1034 0.0870956
+451 1036 0.0735146
+451 1037 0.071181
+451 1210 -0.223791
+452 452 1
+452 990 0.00250157
+452 991 0.03202
+452 993 0.0177478
+452 995 0.0146181
+452 996 0.0412619
+452 997 0.0362727
+452 998 0.746159
+452 999 0.012829
+452 1001 0.000890361
+452 1002 0.00578532
+452 1003 0.0109323
+452 1004 0.00604532
+452 1005 0.00233213
+452 1006 0.00125157
+452 1007 0.00166702
+452 1008 0.000362419
+452 1009 0.00478559
+452 1011 0.000279905
+452 1012 0.001249
+452 1016 0.0110406
+452 1017 0.00482597
+452 1018 0.0422114
+452 1019 0.000868179
+452 1020 0.00639109
+452 1021 0.0292683
+452 1022 0.0432952
+452 1023 0.00101634
+452 1024 0.0256184
+452 1025 0.000848507
+452 1026 0.00245073
+452 1027 4.85014e-05
+452 1028 0.00318926
+452 1029 0.00428493
+452 1030 0.0042907
+452 1033 0.0162946
+452 1034 0.0812963
+452 1036 0.0686196
+452 1037 0.0664414
+452 1210 -0.208889
+453 453 1
+453 990 0.00336647
+453 991 0.0430908
+453 993 0.0238839
+453 995 0.0196722
+453 996 0.0555279
+453 997 0.0488138
+453 998 0.0629926
+453 999 0.958411
+453 1001 0.0011982
+453 1002 0.00778556
+453 1003 0.014712
+453 1004 0.00813545
+453 1005 0.00313845
+453 1006 0.0016843
+453 1007 0.00224339
+453 1008 0.000487723
+453 1009 0.00644017
+453 1011 0.000376681
+453 1012 0.00168084
+453 1016 0.0148578
+453 1017 0.00649451
+453 1018 0.0568058
+453 1019 0.00116835
+453 1020 0.00860078
+453 1021 0.0393877
+453 1022 0.0582643
+453 1023 0.00136774
+453 1024 0.0344757
+453 1025 0.00114187
+453 1026 0.00329806
+453 1027 6.52704e-05
+453 1028 0.00429192
+453 1029 0.00576641
+453 1030 0.00577419
+453 1033 0.0219284
+453 1034 0.109404
+453 1036 0.0923444
+453 1037 0.0894132
+453 1210 -0.281112
+454 454 1
+454 990 0.00164245
+454 991 0.0210234
+454 993 0.0116527
+454 995 0.00959779
+454 996 0.0270913
+454 997 0.0238156
+454 998 0.0307333
+454 999 0.00842316
+454 1000 0.459173
+454 1001 0.000584585
+454 1002 0.00379847
+454 1003 0.0071778
+454 1004 0.00396918
+454 1005 0.00153121
+454 1006 0.000821747
+454 1007 0.00109452
+454 1008 0.000237953
+454 1009 0.00314208
+454 1011 0.000183778
+454 1012 0.000820058
+454 1016 0.00724892
+454 1017 0.00316859
+454 1018 0.0277148
+454 1019 0.000570021
+454 1020 0.00419621
+454 1021 0.0192167
+454 1022 0.0284264
+454 1023 0.0006673
+454 1024 0.0168203
+454 1025 0.000557105
+454 1026 0.00160908
+454 1027 3.18446e-05
+454 1028 0.00209397
+454 1029 0.00281336
+454 1030 0.00281715
+454 1033 0.0106986
+454 1034 0.0533768
+454 1036 0.0450536
+454 1037 0.0436235
+454 1210 -0.137151
+455 455 1
+455 990 0.00148017
+455 991 0.0189461
+455 993 0.0105013
+455 995 0.00864945
+455 996 0.0244145
+455 997 0.0214624
+455 998 0.0276966
+455 999 0.00759088
+455 1001 0.414329
+455 1002 0.00342315
+455 1003 0.00646857
+455 1004 0.00357699
+455 1005 0.00137991
+455 1006 0.000740552
+455 1007 0.000986371
+455 1008 0.000214442
+455 1009 0.00283161
+455 1011 0.000165619
+455 1012 0.000739029
+455 1016 0.00653266
+455 1017 0.0028555
+455 1018 0.0249763
+455 1019 0.000513698
+455 1020 0.00378158
+455 1021 0.0173179
+455 1022 0.0256176
+455 1023 0.000601365
+455 1024 0.0151583
+455 1025 0.000502058
+455 1026 0.00145009
+455 1027 2.86981e-05
+455 1028 0.00188707
+455 1029 0.00253537
+455 1030 0.00253879
+455 1033 0.00964145
+455 1034 0.0481027
+455 1036 0.0406019
+455 1037 0.0393131
+455 1210 -0.123599
+456 456 1
+456 990 0.00194401
+456 991 0.0248834
+456 993 0.0137921
+456 995 0.01136
+456 996 0.0320654
+456 997 0.0281882
+456 998 0.036376
+456 999 0.00996967
+456 1001 0.000691916
+456 1002 0.547974
+456 1003 0.00849565
+456 1004 0.00469793
+456 1005 0.00181234
+456 1006 0.000972622
+456 1007 0.00129547
+456 1008 0.000281642
+456 1009 0.00371897
+456 1011 0.00021752
+456 1012 0.000970622
+456 1016 0.00857984
+456 1017 0.00375035
+456 1018 0.0328033
+456 1019 0.000674678
+456 1020 0.00496664
+456 1021 0.0227449
+456 1022 0.0336455
+456 1023 0.000789818
+456 1024 0.0199085
+456 1025 0.000659391
+456 1026 0.00190451
+456 1027 3.76913e-05
+456 1028 0.00247843
+456 1029 0.0033299
+456 1030 0.00333439
+456 1033 0.0126628
+456 1034 0.0631768
+456 1036 0.0533255
+456 1037 0.0516329
+456 1210 -0.162332
+457 457 1
+457 990 0.0018884
+457 991 0.0241716
+457 993 0.0133976
+457 995 0.011035
+457 996 0.0311481
+457 997 0.0273819
+457 998 0.0353354
+457 999 0.00968449
+457 1001 0.000672123
+457 1002 0.00436728
+457 1003 0.536184
+457 1004 0.00456354
+457 1005 0.0017605
+457 1006 0.0009448
+457 1007 0.00125842
+457 1008 0.000273586
+457 1009 0.00361259
+457 1011 0.000211297
+457 1012 0.000942857
+457 1016 0.00833441
+457 1017 0.00364307
+457 1018 0.0318649
+457 1019 0.000655379
+457 1020 0.00482457
+457 1021 0.0220943
+457 1022 0.0326831
+457 1023 0.000767225
+457 1024 0.019339
+457 1025 0.000640529
+457 1026 0.00185003
+457 1027 3.66132e-05
+457 1028 0.00240753
+457 1029 0.00323464
+457 1030 0.003239
+457 1033 0.0123006
+457 1034 0.0613697
+457 1036 0.0518001
+457 1037 0.0501559
+457 1210 -0.157688
+458 458 1
+458 990 0.00148017
+458 991 0.0189461
+458 993 0.0105013
+458 995 0.00864945
+458 996 0.0244145
+458 997 0.0214624
+458 998 0.0276966
+458 999 0.00759088
+458 1001 0.000526823
+458 1002 0.00342315
+458 1003 0.00646857
+458 1004 0.41738
+458 1005 0.00137991
+458 1006 0.000740552
+458 1007 0.000986371
+458 1008 0.000214442
+458 1009 0.00283161
+458 1011 0.000165619
+458 1012 0.000739029
+458 1016 0.00653266
+458 1017 0.0028555
+458 1018 0.0249763
+458 1019 0.000513698
+458 1020 0.00378158
+458 1021 0.0173179
+458 1022 0.0256176
+458 1023 0.000601365
+458 1024 0.0151583
+458 1025 0.000502058
+458 1026 0.00145009
+458 1027 2.86981e-05
+458 1028 0.00188707
+458 1029 0.00253537
+458 1030 0.00253879
+458 1033 0.00964145
+458 1034 0.0481027
+458 1036 0.0406019
+458 1037 0.0393131
+458 1210 -0.123599
+459 459 1
+459 990 0.00429804
+459 991 0.0550149
+459 993 0.0304931
+459 995 0.0251159
+459 996 0.0708937
+459 997 0.0623216
+459 998 0.080424
+459 999 0.0220421
+459 1001 0.00152976
+459 1002 0.00993999
+459 1003 0.0187831
+459 1004 0.0103867
+459 1005 1
+459 1006 0.00215038
+459 1007 0.00286418
+459 1008 0.000622686
+459 1009 0.00822231
+459 1011 0.000480916
+459 1012 0.00214596
+459 1016 0.0189693
+459 1017 0.00829168
+459 1018 0.0725251
+459 1019 0.00149165
+459 1020 0.0109808
+459 1021 0.050287
+459 1022 0.0743873
+459 1023 0.00174622
+459 1024 0.0440159
+459 1025 0.00145785
+459 1026 0.0042107
+459 1027 8.33321e-05
+459 1028 0.00547959
+459 1029 0.0073621
+459 1030 0.00737203
+459 1033 0.0279964
+459 1034 0.139678
+459 1036 0.117898
+459 1037 0.114156
+459 1210 -0.358901
+460 460 1
+460 990 0.00355717
+460 991 0.0455318
+460 993 0.0252369
+460 995 0.0207866
+460 996 0.0586735
+460 997 0.051579
+460 998 0.0665611
+460 999 0.0182426
+460 1001 0.00126607
+460 1002 0.00822661
+460 1003 0.0155454
+460 1004 0.00859632
+460 1005 0.00331624
+460 1006 0.996241
+460 1007 0.00237047
+460 1008 0.000515352
+460 1009 0.006805
+460 1011 0.000398019
+460 1012 0.00177605
+460 1016 0.0156995
+460 1017 0.00686242
+460 1018 0.0600238
+460 1019 0.00123453
+460 1020 0.009088
+460 1021 0.0416189
+460 1022 0.0615649
+460 1023 0.00144522
+460 1024 0.0364288
+460 1025 0.00120656
+460 1026 0.00348489
+460 1027 6.89679e-05
+460 1028 0.00453505
+460 1029 0.00609307
+460 1030 0.00610129
+460 1033 0.0231706
+460 1034 0.115602
+460 1036 0.0975756
+460 1037 0.0944783
+460 1210 -0.297036
+461 461 1
+461 990 0.00453136
+461 991 0.0580014
+461 993 0.0321485
+461 995 0.0264793
+461 996 0.0747421
+461 997 0.0657047
+461 998 0.0847899
+461 999 0.0232386
+461 1001 0.00161281
+461 1002 0.0104796
+461 1003 0.0198028
+461 1004 0.0109505
+461 1005 0.00422444
+461 1006 0.00226711
+461 1007 1
+461 1008 0.000656488
+461 1009 0.00866866
+461 1011 0.000507023
+461 1012 0.00226245
+461 1016 0.019999
+461 1017 0.0087418
+461 1018 0.0764622
+461 1019 0.00157263
+461 1020 0.0115769
+461 1021 0.0530169
+461 1022 0.0784254
+461 1023 0.00184101
+461 1024 0.0464053
+461 1025 0.00153699
+461 1026 0.00443928
+461 1027 8.78559e-05
+461 1028 0.00577705
+461 1029 0.00776176
+461 1030 0.00777222
+461 1033 0.0295162
+461 1034 0.147261
+461 1036 0.124298
+461 1037 0.120353
+461 1210 -0.378384
+462 462 1
+462 990 0.00543231
+462 991 0.0695335
+462 993 0.0385404
+462 995 0.031744
+462 996 0.0896027
+462 997 0.0787685
+462 998 0.101648
+462 999 0.027859
+462 1001 0.00193347
+462 1002 0.0125632
+462 1003 0.0237401
+462 1004 0.0131278
+462 1005 0.00506436
+462 1006 0.00271787
+462 1007 0.00362005
+462 1008 1
+462 1009 0.0103922
+462 1011 0.000607832
+462 1012 0.00271228
+462 1016 0.0239753
+462 1017 0.0104799
+462 1018 0.0916648
+462 1019 0.00188531
+462 1020 0.0138787
+462 1021 0.063558
+462 1022 0.0940183
+462 1023 0.00220705
+462 1024 0.0556319
+462 1025 0.00184259
+462 1026 0.00532192
+462 1027 0.000105324
+462 1028 0.00692567
+462 1029 0.00930499
+462 1030 0.00931753
+462 1033 0.0353848
+462 1034 0.17654
+462 1036 0.149012
+462 1037 0.144282
+462 1210 -0.453616
+463 463 1
+463 990 0.00295571
+463 991 0.0378331
+463 993 0.0209698
+463 995 0.0172719
+463 996 0.0487528
+463 997 0.0428579
+463 998 0.0553067
+463 999 0.0151581
+463 1001 0.001052
+463 1002 0.00683562
+463 1003 0.012917
+463 1004 0.00714282
+463 1005 0.00275552
+463 1006 0.00147879
+463 1007 0.00196966
+463 1008 0.000428214
+463 1009 0.831968
+463 1011 0.000330721
+463 1012 0.00147575
+463 1016 0.0130449
+463 1017 0.0057021
+463 1018 0.0498747
+463 1019 0.00102579
+463 1020 0.00755137
+463 1021 0.0345818
+463 1022 0.0511553
+463 1023 0.00120085
+463 1024 0.0302693
+463 1025 0.00100255
+463 1026 0.00289565
+463 1027 5.73066e-05
+463 1028 0.00376825
+463 1029 0.00506284
+463 1030 0.00506966
+463 1033 0.0192528
+463 1034 0.0960553
+463 1036 0.0810772
+463 1037 0.0785036
+463 1210 -0.246812
+464 464 1
+464 990 0.00194401
+464 991 0.0248834
+464 993 0.0137921
+464 995 0.01136
+464 996 0.0320654
+464 997 0.0281882
+464 998 0.036376
+464 999 0.00996967
+464 1001 0.000691916
+464 1002 0.00449588
+464 1003 0.00849565
+464 1004 0.00469793
+464 1005 0.00181234
+464 1006 0.000972622
+464 1007 0.00129547
+464 1008 0.000281642
+464 1009 0.00371897
+464 1010 0.543478
+464 1011 0.00021752
+464 1012 0.000970622
+464 1016 0.00857984
+464 1017 0.00375035
+464 1018 0.0328033
+464 1019 0.000674678
+464 1020 0.00496664
+464 1021 0.0227449
+464 1022 0.0336455
+464 1023 0.000789818
+464 1024 0.0199085
+464 1025 0.000659391
+464 1026 0.00190451
+464 1027 3.76913e-05
+464 1028 0.00247843
+464 1029 0.0033299
+464 1030 0.00333439
+464 1033 0.0126628
+464 1034 0.0631768
+464 1036 0.0533255
+464 1037 0.0516329
+464 1210 -0.162332
+465 465 1
+465 990 0.00194401
+465 991 0.0248834
+465 993 0.0137921
+465 995 0.01136
+465 996 0.0320654
+465 997 0.0281882
+465 998 0.036376
+465 999 0.00996967
+465 1001 0.000691916
+465 1002 0.00449588
+465 1003 0.00849565
+465 1004 0.00469793
+465 1005 0.00181234
+465 1006 0.000972622
+465 1007 0.00129547
+465 1008 0.000281642
+465 1009 0.00371897
+465 1011 0.543695
+465 1012 0.000970622
+465 1016 0.00857984
+465 1017 0.00375035
+465 1018 0.0328033
+465 1019 0.000674678
+465 1020 0.00496664
+465 1021 0.0227449
+465 1022 0.0336455
+465 1023 0.000789818
+465 1024 0.0199085
+465 1025 0.000659391
+465 1026 0.00190451
+465 1027 3.76913e-05
+465 1028 0.00247843
+465 1029 0.0033299
+465 1030 0.00333439
+465 1033 0.0126628
+465 1034 0.0631768
+465 1036 0.0533255
+465 1037 0.0516329
+465 1210 -0.162332
+466 466 1
+466 990 0.00148017
+466 991 0.0189461
+466 993 0.0105013
+466 995 0.00864945
+466 996 0.0244145
+466 997 0.0214624
+466 998 0.0276966
+466 999 0.00759088
+466 1001 0.000526823
+466 1002 0.00342315
+466 1003 0.00646857
+466 1004 0.00357699
+466 1005 0.00137991
+466 1006 0.000740552
+466 1007 0.000986371
+466 1008 0.000214442
+466 1009 0.00283161
+466 1011 0.000165619
+466 1012 0.414542
+466 1016 0.00653266
+466 1017 0.0028555
+466 1018 0.0249763
+466 1019 0.000513698
+466 1020 0.00378158
+466 1021 0.0173179
+466 1022 0.0256176
+466 1023 0.000601365
+466 1024 0.0151583
+466 1025 0.000502058
+466 1026 0.00145009
+466 1027 2.86981e-05
+466 1028 0.00188707
+466 1029 0.00253537
+466 1030 0.00253879
+466 1033 0.00964145
+466 1034 0.0481027
+466 1036 0.0406019
+466 1037 0.0393131
+466 1210 -0.123599
+467 467 1
+467 990 0.00194401
+467 991 0.0248834
+467 993 0.0137921
+467 995 0.01136
+467 996 0.0320654
+467 997 0.0281882
+467 998 0.036376
+467 999 0.00996967
+467 1001 0.000691916
+467 1002 0.00449588
+467 1003 0.00849565
+467 1004 0.00469793
+467 1005 0.00181234
+467 1006 0.000972622
+467 1007 0.00129547
+467 1008 0.000281642
+467 1009 0.00371897
+467 1011 0.00021752
+467 1012 0.000970622
+467 1013 0.543478
+467 1016 0.00857984
+467 1017 0.00375035
+467 1018 0.0328033
+467 1019 0.000674678
+467 1020 0.00496664
+467 1021 0.0227449
+467 1022 0.0336455
+467 1023 0.000789818
+467 1024 0.0199085
+467 1025 0.000659391
+467 1026 0.00190451
+467 1027 3.76913e-05
+467 1028 0.00247843
+467 1029 0.0033299
+467 1030 0.00333439
+467 1033 0.0126628
+467 1034 0.0631768
+467 1036 0.0533255
+467 1037 0.0516329
+467 1210 -0.162332
+468 468 1
+468 990 0.00194401
+468 991 0.0248834
+468 993 0.0137921
+468 995 0.01136
+468 996 0.0320654
+468 997 0.0281882
+468 998 0.036376
+468 999 0.00996967
+468 1001 0.000691916
+468 1002 0.00449588
+468 1003 0.00849565
+468 1004 0.00469793
+468 1005 0.00181234
+468 1006 0.000972622
+468 1007 0.00129547
+468 1008 0.000281642
+468 1009 0.00371897
+468 1011 0.00021752
+468 1012 0.000970622
+468 1014 0.543478
+468 1016 0.00857984
+468 1017 0.00375035
+468 1018 0.0328033
+468 1019 0.000674678
+468 1020 0.00496664
+468 1021 0.0227449
+468 1022 0.0336455
+468 1023 0.000789818
+468 1024 0.0199085
+468 1025 0.000659391
+468 1026 0.00190451
+468 1027 3.76913e-05
+468 1028 0.00247843
+468 1029 0.0033299
+468 1030 0.00333439
+468 1033 0.0126628
+468 1034 0.0631768
+468 1036 0.0533255
+468 1037 0.0516329
+468 1210 -0.162332
+469 469 1
+469 990 0.00194401
+469 991 0.0248834
+469 993 0.0137921
+469 995 0.01136
+469 996 0.0320654
+469 997 0.0281882
+469 998 0.036376
+469 999 0.00996967
+469 1001 0.000691916
+469 1002 0.00449588
+469 1003 0.00849565
+469 1004 0.00469793
+469 1005 0.00181234
+469 1006 0.000972622
+469 1007 0.00129547
+469 1008 0.000281642
+469 1009 0.00371897
+469 1011 0.00021752
+469 1012 0.000970622
+469 1015 0.543478
+469 1016 0.00857984
+469 1017 0.00375035
+469 1018 0.0328033
+469 1019 0.000674678
+469 1020 0.00496664
+469 1021 0.0227449
+469 1022 0.0336455
+469 1023 0.000789818
+469 1024 0.0199085
+469 1025 0.000659391
+469 1026 0.00190451
+469 1027 3.76913e-05
+469 1028 0.00247843
+469 1029 0.0033299
+469 1030 0.00333439
+469 1033 0.0126628
+469 1034 0.0631768
+469 1036 0.0533255
+469 1037 0.0516329
+469 1210 -0.162332
+470 470 1
+470 990 0.00349654
+470 991 0.0447557
+470 993 0.0248068
+470 995 0.0204323
+470 996 0.0576734
+470 997 0.0506999
+470 998 0.0654266
+470 999 0.0179317
+470 1001 0.00124449
+470 1002 0.00808638
+470 1003 0.0152805
+470 1004 0.00844979
+470 1005 0.00325971
+470 1006 0.00174938
+470 1007 0.00233007
+470 1008 0.000506567
+470 1009 0.00668901
+470 1011 0.000391235
+470 1012 0.00174578
+470 1016 0.992942
+470 1017 0.00674545
+470 1018 0.0590007
+470 1019 0.00121349
+470 1020 0.0089331
+470 1021 0.0409095
+470 1022 0.0605156
+470 1023 0.00142058
+470 1024 0.0358078
+470 1025 0.00118599
+470 1026 0.00342549
+470 1027 6.77924e-05
+470 1028 0.00445775
+470 1029 0.00598922
+470 1030 0.00599729
+470 1033 0.0227756
+470 1034 0.113631
+470 1036 0.0959124
+470 1037 0.0928679
+470 1210 -0.291973
+471 471 1
+471 990 0.00250303
+471 991 0.0320388
+471 993 0.0177581
+471 995 0.0146266
+471 996 0.041286
+471 997 0.0362939
+471 998 0.0468362
+471 999 0.0128365
+471 1001 0.000890882
+471 1002 0.00578871
+471 1003 0.0109386
+471 1004 0.00604885
+471 1005 0.00233349
+471 1006 0.00125231
+471 1007 0.001668
+471 1008 0.000362631
+471 1009 0.00478839
+471 1011 0.000280069
+471 1012 0.00124973
+471 1016 0.011047
+471 1017 0.704588
+471 1018 0.0422361
+471 1019 0.000868687
+471 1020 0.00639483
+471 1021 0.0292854
+471 1022 0.0433206
+471 1023 0.00101694
+471 1024 0.0256333
+471 1025 0.000849003
+471 1026 0.00245216
+471 1027 4.85297e-05
+471 1028 0.00319112
+471 1029 0.00428743
+471 1030 0.00429321
+471 1033 0.0163041
+471 1034 0.0813438
+471 1036 0.0686597
+471 1037 0.0664803
+471 1210 -0.209012
+472 472 1
+472 990 0.00226931
+472 991 0.0290472
+472 993 0.0161
+472 995 0.0132609
+472 996 0.037431
+472 997 0.0329051
+472 998 0.0424629
+472 999 0.0116379
+472 1001 0.000807698
+472 1002 0.0052482
+472 1003 0.00991728
+472 1004 0.00548406
+472 1005 0.00211561
+472 1006 0.00113538
+472 1007 0.00151225
+472 1008 0.000328771
+472 1009 0.00434128
+472 1011 0.000253918
+472 1012 0.00113304
+472 1016 0.0100155
+472 1017 0.00437791
+472 1018 0.672713
+472 1019 0.000787575
+472 1020 0.00579773
+472 1021 0.026551
+472 1022 0.0392756
+472 1023 0.000921982
+472 1024 0.0232399
+472 1025 0.00076973
+472 1026 0.0022232
+472 1027 4.39984e-05
+472 1028 0.00289316
+472 1029 0.0038871
+472 1030 0.00389235
+472 1033 0.0147818
+472 1034 0.0737486
+472 1036 0.0622488
+472 1037 0.0602729
+472 1210 -0.189496
+473 473 1
+473 990 0.00148017
+473 991 0.0189461
+473 993 0.0105013
+473 995 0.00864945
+473 996 0.0244145
+473 997 0.0214624
+473 998 0.0276966
+473 999 0.00759088
+473 1001 0.000526823
+473 1002 0.00342315
+473 1003 0.00646857
+473 1004 0.00357699
+473 1005 0.00137991
+473 1006 0.000740552
+473 1007 0.000986371
+473 1008 0.000214442
+473 1009 0.00283161
+473 1011 0.000165619
+473 1012 0.000739029
+473 1016 0.00653266
+473 1017 0.0028555
+473 1018 0.0249763
+473 1019 0.414316
+473 1020 0.00378158
+473 1021 0.0173179
+473 1022 0.0256176
+473 1023 0.000601365
+473 1024 0.0151583
+473 1025 0.000502058
+473 1026 0.00145009
+473 1027 2.86981e-05
+473 1028 0.00188707
+473 1029 0.00253537
+473 1030 0.00253879
+473 1033 0.00964145
+473 1034 0.0481027
+473 1036 0.0406019
+473 1037 0.0393131
+473 1210 -0.123599
+474 474 1
+474 990 0.00124306
+474 991 0.0159112
+474 993 0.00881911
+474 995 0.00726392
+474 996 0.0205036
+474 997 0.0180244
+474 998 0.0232599
+474 999 0.00637492
+474 1001 0.000442433
+474 1002 0.00287481
+474 1003 0.00543239
+474 1004 0.003004
+474 1005 0.00115887
+474 1006 0.000621925
+474 1007 0.000828368
+474 1008 0.000180091
+474 1009 0.00237803
+474 1011 0.000139089
+474 1012 0.000620646
+474 1016 0.00548622
+474 1017 0.00239809
+474 1018 0.0209755
+474 1019 0.00043141
+474 1020 0.350693
+474 1021 0.0145438
+474 1022 0.021514
+474 1023 0.000505034
+474 1024 0.0127301
+474 1025 0.000421635
+474 1026 0.0012178
+474 1027 2.4101e-05
+474 1028 0.00158479
+474 1029 0.00212924
+474 1030 0.00213211
+474 1033 0.00809702
+474 1034 0.0403973
+474 1036 0.034098
+474 1037 0.0330157
+474 1210 -0.1038
+475 475 1
+475 990 0.00113263
+475 991 0.0144977
+475 993 0.00803562
+475 995 0.00661859
+475 996 0.0186821
+475 997 0.0164231
+475 998 0.0211935
+475 999 0.00580857
+475 1001 0.000403127
+475 1002 0.00261941
+475 1003 0.00494977
+475 1004 0.00273713
+475 1005 0.00105591
+475 1006 0.000566673
+475 1007 0.000754775
+475 1008 0.000164092
+475 1009 0.00216676
+475 1011 0.000126732
+475 1012 0.000565508
+475 1016 0.00499882
+475 1017 0.00218504
+475 1018 0.019112
+475 1019 0.000393084
+475 1020 0.00289368
+475 1021 0.329895
+475 1022 0.0196027
+475 1023 0.000460167
+475 1024 0.0115992
+475 1025 0.000384177
+475 1026 0.00110961
+475 1027 2.19599e-05
+475 1028 0.00144399
+475 1029 0.00194008
+475 1030 0.00194269
+475 1033 0.00737768
+475 1034 0.0368084
+475 1036 0.0310688
+475 1037 0.0300826
+475 1210 -0.0945784
+476 476 1
+476 990 0.00280511
+476 991 0.0359055
+476 993 0.0199013
+476 995 0.0163919
+476 996 0.0462687
+476 997 0.0406742
+476 998 0.0524887
+476 999 0.0143857
+476 1001 0.0009984
+476 1002 0.00648733
+476 1003 0.0122588
+476 1004 0.00677888
+476 1005 0.00261512
+476 1006 0.00140344
+476 1007 0.00186931
+476 1008 0.000406396
+476 1009 0.00536629
+476 1011 0.00031387
+476 1012 0.00140056
+476 1016 0.0123803
+476 1017 0.00541156
+476 1018 0.0473335
+476 1019 0.000973527
+476 1020 0.00716661
+476 1021 0.0328198
+476 1022 0.83276
+476 1023 0.00113967
+476 1024 0.028727
+476 1025 0.000951468
+476 1026 0.00274811
+476 1027 5.43867e-05
+476 1028 0.00357625
+476 1029 0.00480487
+476 1030 0.00481135
+476 1033 0.0182718
+476 1034 0.091161
+476 1036 0.0769461
+476 1037 0.0745037
+476 1210 -0.234237
+477 477 1
+477 990 0.00164245
+477 991 0.0210234
+477 993 0.0116527
+477 995 0.00959779
+477 996 0.0270913
+477 997 0.0238156
+477 998 0.0307333
+477 999 0.00842316
+477 1001 0.000584585
+477 1002 0.00379847
+477 1003 0.0071778
+477 1004 0.00396918
+477 1005 0.00153121
+477 1006 0.000821747
+477 1007 0.00109452
+477 1008 0.000237953
+477 1009 0.00314208
+477 1011 0.000183778
+477 1012 0.000820058
+477 1016 0.00724892
+477 1017 0.00316859
+477 1018 0.0277148
+477 1019 0.000570021
+477 1020 0.00419621
+477 1021 0.0192167
+477 1022 0.0284264
+477 1023 0.45984
+477 1024 0.0168203
+477 1025 0.000557105
+477 1026 0.00160908
+477 1027 3.18446e-05
+477 1028 0.00209397
+477 1029 0.00281336
+477 1030 0.00281715
+477 1033 0.0106986
+477 1034 0.0533768
+477 1036 0.0450536
+477 1037 0.0436235
+477 1210 -0.137151
+478 478 1
+478 990 0.00107044
+478 991 0.0137016
+478 993 0.00759442
+478 995 0.00625519
+478 996 0.0176563
+478 997 0.0155214
+478 998 0.0200299
+478 999 0.00548965
+478 1001 0.000380993
+478 1002 0.00247559
+478 1003 0.004678
+478 1004 0.00258684
+478 1005 0.000997937
+478 1006 0.000535559
+478 1007 0.000713333
+478 1008 0.000155082
+478 1009 0.00204779
+478 1011 0.000119774
+478 1012 0.000534458
+478 1016 0.00472436
+478 1017 0.00206507
+478 1018 0.0180626
+478 1019 0.000371501
+478 1020 0.0027348
+478 1021 0.0125242
+478 1022 0.0185264
+478 1023 0.000434901
+478 1024 0.31022
+478 1025 0.000363083
+478 1026 0.00104869
+478 1027 2.07541e-05
+478 1028 0.00136471
+478 1029 0.00183356
+478 1030 0.00183603
+478 1033 0.0069726
+478 1034 0.0347874
+478 1036 0.0293629
+478 1037 0.0284308
+478 1210 -0.0893855
+479 479 1
+479 990 0.000962223
+479 991 0.0123165
+479 993 0.00682665
+479 995 0.00562281
+479 996 0.0158713
+479 997 0.0139522
+479 998 0.0180049
+479 999 0.00493466
+479 1001 0.000342476
+479 1002 0.00222531
+479 1003 0.00420507
+479 1004 0.00232532
+479 1005 0.000897049
+479 1006 0.000481416
+479 1007 0.000641218
+479 1008 0.000139404
+479 1009 0.00184077
+479 1011 0.000107665
+479 1012 0.000480426
+479 1016 0.00424674
+479 1017 0.0018563
+479 1018 0.0162366
+479 1019 0.000333944
+479 1020 0.00245832
+479 1021 0.011258
+479 1022 0.0166534
+479 1023 0.000390934
+479 1024 0.00985406
+479 1025 0.26933
+479 1026 0.00094267
+479 1027 1.8656e-05
+479 1028 0.00122674
+479 1029 0.00164819
+479 1030 0.00165041
+479 1033 0.00626769
+479 1034 0.0312705
+479 1036 0.0263944
+479 1037 0.0255566
+479 1210 -0.0803489
+480 480 1
+480 990 0.00148017
+480 991 0.0189461
+480 993 0.0105013
+480 995 0.00864945
+480 996 0.0244145
+480 997 0.0214624
+480 998 0.0276966
+480 999 0.00759088
+480 1001 0.000526823
+480 1002 0.00342315
+480 1003 0.00646857
+480 1004 0.00357699
+480 1005 0.00137991
+480 1006 0.000740552
+480 1007 0.000986371
+480 1008 0.000214442
+480 1009 0.00283161
+480 1011 0.000165619
+480 1012 0.000739029
+480 1016 0.00653266
+480 1017 0.0028555
+480 1018 0.0249763
+480 1019 0.000513698
+480 1020 0.00378158
+480 1021 0.0173179
+480 1022 0.0256176
+480 1023 0.000601365
+480 1024 0.0151583
+480 1025 0.000502058
+480 1026 0.415253
+480 1027 2.86981e-05
+480 1028 0.00188707
+480 1029 0.00253537
+480 1030 0.00253879
+480 1033 0.00964145
+480 1034 0.0481027
+480 1036 0.0406019
+480 1037 0.0393131
+480 1210 -0.123599
+481 481 1
+481 990 0.000778065
+481 991 0.00995924
+481 993 0.00552011
+481 995 0.00454668
+481 996 0.0128337
+481 997 0.011282
+481 998 0.014559
+481 999 0.00399023
+481 1001 0.00027693
+481 1002 0.00179942
+481 1003 0.00340027
+481 1004 0.00188028
+481 1005 0.000725365
+481 1006 0.000389279
+481 1007 0.000518497
+481 1008 0.000112724
+481 1009 0.00148847
+481 1011 8.70593e-05
+481 1012 0.000388479
+481 1016 0.00343397
+481 1017 0.00150103
+481 1018 0.0131291
+481 1019 0.000270031
+481 1020 0.00198783
+481 1021 0.00910337
+481 1022 0.0134662
+481 1023 0.000316114
+481 1024 0.00796811
+481 1025 0.000263912
+481 1026 0.000762254
+481 1027 0.217535
+481 1028 0.000991959
+481 1029 0.00133275
+481 1030 0.00133454
+481 1033 0.00506813
+481 1034 0.0252857
+481 1036 0.0213428
+481 1037 0.0206654
+481 1210 -0.0649712
+482 482 1
+482 990 0.00477376
+482 991 0.0611042
+482 993 0.0338682
+482 995 0.0278958
+482 996 0.0787404
+482 997 0.0692196
+482 998 0.0893256
+482 999 0.0244817
+482 1001 0.00169908
+482 1002 0.0110402
+482 1003 0.0208621
+482 1004 0.0115363
+482 1005 0.00445042
+482 1006 0.00238839
+482 1007 0.0031812
+482 1008 0.000691607
+482 1009 0.00913238
+482 1011 0.000534146
+482 1012 0.00238348
+482 1016 0.0210688
+482 1017 0.00920943
+482 1018 0.0805525
+482 1019 0.00165675
+482 1020 0.0121962
+482 1021 0.055853
+482 1022 0.0826207
+482 1023 0.00193949
+482 1024 0.0488877
+482 1025 0.00161921
+482 1026 0.00467675
+482 1027 9.25556e-05
+482 1028 1
+482 1029 0.00817696
+482 1030 0.00818799
+482 1033 0.0310951
+482 1034 0.155138
+482 1036 0.130947
+482 1037 0.126791
+482 1210 -0.398626
+483 483 1
+483 990 0.00148017
+483 991 0.0189461
+483 993 0.0105013
+483 995 0.00864945
+483 996 0.0244145
+483 997 0.0214624
+483 998 0.0276966
+483 999 0.00759088
+483 1001 0.000526823
+483 1002 0.00342315
+483 1003 0.00646857
+483 1004 0.00357699
+483 1005 0.00137991
+483 1006 0.000740552
+483 1007 0.000986371
+483 1008 0.000214442
+483 1009 0.00283161
+483 1011 0.000165619
+483 1012 0.000739029
+483 1016 0.00653266
+483 1017 0.0028555
+483 1018 0.0249763
+483 1019 0.000513698
+483 1020 0.00378158
+483 1021 0.0173179
+483 1022 0.0256176
+483 1023 0.000601365
+483 1024 0.0151583
+483 1025 0.000502058
+483 1026 0.00145009
+483 1027 2.86981e-05
+483 1028 0.00188707
+483 1029 0.416338
+483 1030 0.00253879
+483 1033 0.00964145
+483 1034 0.0481027
+483 1036 0.0406019
+483 1037 0.0393131
+483 1210 -0.123599
+484 484 1
+484 990 0.00194401
+484 991 0.0248834
+484 993 0.0137921
+484 995 0.01136
+484 996 0.0320654
+484 997 0.0281882
+484 998 0.036376
+484 999 0.00996967
+484 1001 0.000691916
+484 1002 0.00449588
+484 1003 0.00849565
+484 1004 0.00469793
+484 1005 0.00181234
+484 1006 0.000972622
+484 1007 0.00129547
+484 1008 0.000281642
+484 1009 0.00371897
+484 1011 0.00021752
+484 1012 0.000970622
+484 1016 0.00857984
+484 1017 0.00375035
+484 1018 0.0328033
+484 1019 0.000674678
+484 1020 0.00496664
+484 1021 0.0227449
+484 1022 0.0336455
+484 1023 0.000789818
+484 1024 0.0199085
+484 1025 0.000659391
+484 1026 0.00190451
+484 1027 3.76913e-05
+484 1028 0.00247843
+484 1029 0.0033299
+484 1030 0.546812
+484 1033 0.0126628
+484 1034 0.0631768
+484 1036 0.0533255
+484 1037 0.0516329
+484 1210 -0.162332
+485 485 1
+485 990 0.00194401
+485 991 0.0248834
+485 993 0.0137921
+485 995 0.01136
+485 996 0.0320654
+485 997 0.0281882
+485 998 0.036376
+485 999 0.00996967
+485 1001 0.000691916
+485 1002 0.00449588
+485 1003 0.00849565
+485 1004 0.00469793
+485 1005 0.00181234
+485 1006 0.000972622
+485 1007 0.00129547
+485 1008 0.000281642
+485 1009 0.00371897
+485 1011 0.00021752
+485 1012 0.000970622
+485 1016 0.00857984
+485 1017 0.00375035
+485 1018 0.0328033
+485 1019 0.000674678
+485 1020 0.00496664
+485 1021 0.0227449
+485 1022 0.0336455
+485 1023 0.000789818
+485 1024 0.0199085
+485 1025 0.000659391
+485 1026 0.00190451
+485 1027 3.76913e-05
+485 1028 0.00247843
+485 1029 0.0033299
+485 1030 0.00333439
+485 1031 0.543478
+485 1033 0.0126628
+485 1034 0.0631768
+485 1036 0.0533255
+485 1037 0.0516329
+485 1210 -0.162332
+486 486 1
+486 990 0.00194401
+486 991 0.0248834
+486 993 0.0137921
+486 995 0.01136
+486 996 0.0320654
+486 997 0.0281882
+486 998 0.036376
+486 999 0.00996967
+486 1001 0.000691916
+486 1002 0.00449588
+486 1003 0.00849565
+486 1004 0.00469793
+486 1005 0.00181234
+486 1006 0.000972622
+486 1007 0.00129547
+486 1008 0.000281642
+486 1009 0.00371897
+486 1011 0.00021752
+486 1012 0.000970622
+486 1016 0.00857984
+486 1017 0.00375035
+486 1018 0.0328033
+486 1019 0.000674678
+486 1020 0.00496664
+486 1021 0.0227449
+486 1022 0.0336455
+486 1023 0.000789818
+486 1024 0.0199085
+486 1025 0.000659391
+486 1026 0.00190451
+486 1027 3.76913e-05
+486 1028 0.00247843
+486 1029 0.0033299
+486 1030 0.00333439
+486 1032 0.543478
+486 1033 0.0126628
+486 1034 0.0631768
+486 1036 0.0533255
+486 1037 0.0516329
+486 1210 -0.162332
+487 487 1
+487 990 0.00774125
+487 991 0.099088
+487 993 0.0549216
+487 995 0.0452365
+487 996 0.127687
+487 997 0.112248
+487 998 0.144853
+487 999 0.0397002
+487 1001 0.00275528
+487 1002 0.017903
+487 1003 0.0338305
+487 1004 0.0187076
+487 1005 0.00721692
+487 1006 0.00387308
+487 1007 0.00515871
+487 1008 0.00112153
+487 1009 0.0148093
+487 1011 0.000866184
+487 1012 0.00386511
+487 1016 0.0341658
+487 1017 0.0149343
+487 1018 0.130626
+487 1019 0.00268663
+487 1020 0.0197776
+487 1021 0.0905726
+487 1022 0.13398
+487 1023 0.00314513
+487 1024 0.0792776
+487 1025 0.00262576
+487 1026 0.00758394
+487 1027 0.000150091
+487 1028 0.00986935
+487 1029 0.01326
+487 1030 0.0132779
+487 1033 1
+487 1034 0.251576
+487 1036 0.212348
+487 1037 0.205607
+487 1210 -0.646421
+488 488 1
+488 990 0.00579536
+488 991 0.0741806
+488 993 0.0411161
+488 995 0.0338656
+488 996 0.0955911
+488 997 0.0840327
+488 998 0.108442
+488 999 0.0297209
+488 1001 0.00206269
+488 1002 0.0134028
+488 1003 0.0253267
+488 1004 0.0140051
+488 1005 0.00540282
+488 1006 0.00289951
+488 1007 0.00386198
+488 1008 0.000839612
+488 1009 0.0110867
+488 1011 0.000648454
+488 1012 0.00289355
+488 1016 0.0255776
+488 1017 0.0111803
+488 1018 0.0977909
+488 1019 0.0020113
+488 1020 0.0148062
+488 1021 0.0678057
+488 1022 0.100302
+488 1023 0.00235455
+488 1024 0.0593499
+488 1025 0.00196573
+488 1026 0.00567759
+488 1027 0.000112363
+488 1028 0.00738852
+488 1029 0.00992686
+488 1030 0.00994024
+488 1033 0.0377496
+488 1034 1
+488 1036 0.15897
+488 1037 0.153924
+488 1210 -0.483932
+489 489 1
+489 990 0.00194401
+489 991 0.0248834
+489 993 0.0137921
+489 995 0.01136
+489 996 0.0320654
+489 997 0.0281882
+489 998 0.036376
+489 999 0.00996967
+489 1001 0.000691916
+489 1002 0.00449588
+489 1003 0.00849565
+489 1004 0.00469793
+489 1005 0.00181234
+489 1006 0.000972622
+489 1007 0.00129547
+489 1008 0.000281642
+489 1009 0.00371897
+489 1011 0.00021752
+489 1012 0.000970622
+489 1016 0.00857984
+489 1017 0.00375035
+489 1018 0.0328033
+489 1019 0.000674678
+489 1020 0.00496664
+489 1021 0.0227449
+489 1022 0.0336455
+489 1023 0.000789818
+489 1024 0.0199085
+489 1025 0.000659391
+489 1026 0.00190451
+489 1027 3.76913e-05
+489 1028 0.00247843
+489 1029 0.0033299
+489 1030 0.00333439
+489 1033 0.0126628
+489 1034 0.0631768
+489 1035 0.543478
+489 1036 0.0533255
+489 1037 0.0516329
+489 1210 -0.162332
+490 490 1
+490 990 0.00262523
+490 991 0.0336029
+490 993 0.0186251
+490 995 0.0153407
+490 996 0.0433016
+490 997 0.0380658
+490 998 0.0491227
+490 999 0.0134632
+490 1001 0.000934375
+490 1002 0.00607131
+490 1003 0.0114727
+490 1004 0.00634416
+490 1005 0.00244741
+490 1006 0.00131344
+490 1007 0.00174943
+490 1008 0.000380334
+490 1009 0.00502216
+490 1011 0.000293742
+490 1012 0.00131074
+490 1016 0.0115864
+490 1017 0.00506453
+490 1018 0.0442981
+490 1019 0.000911096
+490 1020 0.00670703
+490 1021 0.0307152
+490 1022 0.0454355
+490 1023 0.00106658
+490 1024 0.0268848
+490 1025 0.000890452
+490 1026 0.00257188
+490 1027 5.0899e-05
+490 1028 0.00334691
+490 1029 0.00449675
+490 1030 0.00450281
+490 1033 0.0171001
+490 1034 0.085315
+490 1036 0.805933
+490 1037 0.0697259
+490 1210 -0.219216
+491 383 1
+491 491 -1
+492 492 1
+492 990 0.00194401
+492 991 0.0248834
+492 993 0.0137921
+492 995 0.01136
+492 996 0.0320654
+492 997 0.0281882
+492 998 0.036376
+492 999 0.00996967
+492 1001 0.000691916
+492 1002 0.00449588
+492 1003 0.00849565
+492 1004 0.00469793
+492 1005 0.00181234
+492 1006 0.000972622
+492 1007 0.00129547
+492 1008 0.000281642
+492 1009 0.00371897
+492 1011 0.00021752
+492 1012 0.000970622
+492 1016 0.00857984
+492 1017 0.00375035
+492 1018 0.0328033
+492 1019 0.000674678
+492 1020 0.00496664
+492 1021 0.0227449
+492 1022 0.0336455
+492 1023 0.000789818
+492 1024 0.0199085
+492 1025 0.000659391
+492 1026 0.00190451
+492 1027 3.76913e-05
+492 1028 0.00247843
+492 1029 0.0033299
+492 1030 0.00333439
+492 1033 0.0126628
+492 1034 0.0631768
+492 1036 0.0533255
+492 1037 0.0516329
+492 1038 0.543478
+492 1210 -0.162332
+493 493 1
+493 990 0.00194401
+493 991 0.0248834
+493 993 0.0137921
+493 995 0.01136
+493 996 0.0320654
+493 997 0.0281882
+493 998 0.036376
+493 999 0.00996967
+493 1001 0.000691916
+493 1002 0.00449588
+493 1003 0.00849565
+493 1004 0.00469793
+493 1005 0.00181234
+493 1006 0.000972622
+493 1007 0.00129547
+493 1008 0.000281642
+493 1009 0.00371897
+493 1011 0.00021752
+493 1012 0.000970622
+493 1016 0.00857984
+493 1017 0.00375035
+493 1018 0.0328033
+493 1019 0.000674678
+493 1020 0.00496664
+493 1021 0.0227449
+493 1022 0.0336455
+493 1023 0.000789818
+493 1024 0.0199085
+493 1025 0.000659391
+493 1026 0.00190451
+493 1027 3.76913e-05
+493 1028 0.00247843
+493 1029 0.0033299
+493 1030 0.00333439
+493 1033 0.0126628
+493 1034 0.0631768
+493 1036 0.0533255
+493 1037 0.0516329
+493 1039 0.543478
+493 1210 -0.162332
+494 494 1
+494 990 0.00194401
+494 991 0.0248834
+494 993 0.0137921
+494 995 0.01136
+494 996 0.0320654
+494 997 0.0281882
+494 998 0.036376
+494 999 0.00996967
+494 1001 0.000691916
+494 1002 0.00449588
+494 1003 0.00849565
+494 1004 0.00469793
+494 1005 0.00181234
+494 1006 0.000972622
+494 1007 0.00129547
+494 1008 0.000281642
+494 1009 0.00371897
+494 1011 0.00021752
+494 1012 0.000970622
+494 1016 0.00857984
+494 1017 0.00375035
+494 1018 0.0328033
+494 1019 0.000674678
+494 1020 0.00496664
+494 1021 0.0227449
+494 1022 0.0336455
+494 1023 0.000789818
+494 1024 0.0199085
+494 1025 0.000659391
+494 1026 0.00190451
+494 1027 3.76913e-05
+494 1028 0.00247843
+494 1029 0.0033299
+494 1030 0.00333439
+494 1033 0.0126628
+494 1034 0.0631768
+494 1036 0.0533255
+494 1037 0.0516329
+494 1040 0.543478
+494 1210 -0.162332
+495 1 -0.00717212
+495 2 -0.035991
+495 3 -0.00244208
+495 4 -0.0272421
+495 5 -0.00474775
+495 6 -0.000752988
+495 7 -0.00785165
+495 8 -0.00134216
+495 9 -0.120912
+495 495 -1
+495 610 1
+495 774 1.8827e-05
+495 828 -0.000106536
+496 2 -9.15722e-05
+496 3 -0.00400288
+496 4 -0.000115487
+496 5 -0.00307373
+496 6 -0.00552314
+496 7 -0.00184269
+496 8 -0.0040653
+496 10 -0.00723113
+496 33 -0.046336
+496 337 -0.595014
+496 496 -1
+496 614 1
+496 778 0.0362015
+496 832 -0.0437858
+497 11 -0.0155559
+497 12 -0.312971
+497 30 -0.350423
+497 497 -1
+497 617 1
+497 781 0.0197342
+497 835 -0.158117
+498 1 -0.00163032
+498 2 -0.00332623
+498 3 -0.00595986
+498 4 -0.00120193
+498 5 -0.00118529
+498 6 -0.00092166
+498 7 -0.00115571
+498 8 -0.000293134
+498 11 -0.0126571
+498 14 -0.00947131
+498 25 -0.0375599
+498 28 -0.025843
+498 29 -0.000101719
+498 30 -0.0260751
+498 35 -0.00220154
+498 498 -1
+498 623 1
+498 787 0.0163999
+498 841 -0.0921948
+499 12 -0.0159929
+499 348 -0.573771
+499 499 -1
+499 625 1
+500 366 -0.409032
+500 500 -1
+500 535 -0.079955
+500 643 1
+501 13 -0.000984225
+501 27 -0.000330689
+501 37 -0.00257163
+501 40 -0.00570399
+501 41 -0.00163662
+501 501 -1
+501 655 1
+502 502 1
+503 503 1
+504 504 1
+505 505 1
+506 506 1
+507 507 1
+508 508 1
+509 509 1
+510 510 1
+511 511 1
+512 512 1
+513 513 1
+514 514 1
+515 515 1
+516 516 1
+517 517 1
+518 518 1
+519 519 1
+520 520 1
+521 521 1
+522 522 1
+523 523 1
+524 524 1
+525 525 1
+526 526 1
+527 527 1
+528 528 1
+529 529 1
+530 530 1
+531 531 1
+532 532 1
+533 533 1
+534 534 1
+535 535 1
+536 536 1
+537 537 1
+538 538 1
+539 539 1
+540 540 1
+541 541 1
+542 542 1
+543 543 1
+544 544 1
+545 545 1
+546 546 1
+547 547 1
+548 548 1
+549 549 1
+550 550 1
+551 551 1
+552 552 1
+553 553 1
+554 554 1
+555 555 1
+556 556 1
+557 557 1
+558 558 1
+559 559 1
+560 560 1
+561 561 1
+562 562 1
+563 563 1
+564 564 1
+565 565 1
+566 566 1
+567 567 1
+568 568 1
+569 569 1
+570 570 1
+571 571 1
+572 572 1
+573 573 1
+574 574 1
+575 575 1
+576 576 1
+577 577 1
+578 578 1
+579 579 1
+580 580 1
+581 581 1
+582 582 1
+583 583 1
+584 584 1
+585 585 1
+586 586 1
+587 587 1
+588 588 1
+589 589 1
+590 590 1
+591 591 1
+592 592 1
+593 593 1
+594 594 1
+595 595 1
+596 596 1
+597 597 1
+598 598 1
+599 599 1
+600 600 1
+601 601 1
+602 602 1
+603 603 1
+604 604 1
+605 605 1
+606 606 1
+607 607 1
+608 608 1
+609 609 1
+610 209 -0.043952
+610 219 -0.353123
+610 229 -0.0485111
+610 239 -0.641809
+610 249 -0.166942
+610 259 -0.019262
+610 269 -0.0579069
+610 279 -0.0109584
+610 610 1
+611 1 -0.100385
+611 2 -0.417853
+611 3 -0.556539
+611 4 -0.531019
+611 5 -0.650868
+611 6 -0.611778
+611 7 -0.83542
+611 8 -0.571251
+611 9 -0.163067
+611 611 1
+611 775 0.494564
+611 829 -0.494564
+612 1 -0.0275091
+612 2 -0.102679
+612 3 -0.13331
+612 4 -0.152439
+612 5 -0.147934
+612 6 -0.152556
+612 7 -0.204032
+612 8 -0.142221
+612 10 -0.127611
+612 612 1
+612 776 0.20829
+612 830 -0.20829
+613 212 -0.000831395
+613 222 -0.0403694
+613 232 -1
+613 242 -0.148177
+613 252 -0.955922
+613 262 -1
+613 272 -0.178676
+613 282 -1
+613 613 1
+614 213 -3.90944e-05
+614 223 -0.00327899
+614 233 -0.270357
+614 243 -0.00881912
+614 253 -0.133864
+614 263 -0.435067
+614 273 -0.0482902
+614 283 -0.125001
+614 614 1
+615 1 -0.0663264
+615 2 -0.477231
+615 3 -0.887672
+615 4 -0.441584
+615 5 -0.827647
+615 6 -0.977173
+615 7 -0.306539
+615 8 -0.966324
+615 615 1
+616 215 -0.0342955
+616 225 -0.256479
+616 235 -0.74296
+616 245 -0.65619
+616 255 -0.44057
+616 265 -1
+616 275 -0.939044
+616 285 -0.249932
+616 616 1
+617 216 -0.195842
+617 226 -1
+617 236 -1
+617 246 -0.790967
+617 256 -0.435923
+617 266 -0.62518
+617 276 -0.260798
+617 286 -0.0775504
+617 617 1
+618 217 -0.0648448
+618 227 -0.828139
+618 237 -1
+618 247 -1
+618 257 -1
+618 267 -1
+618 277 -1
+618 287 -1
+618 618 1
+619 218 -0.0968963
+619 228 -0.606149
+619 238 -1
+619 248 -1
+619 258 -0.747552
+619 268 -1
+619 278 -0.786277
+619 288 -0.810728
+619 619 1
+620 1 -0.000704707
+620 2 -0.00499972
+620 3 -0.00584212
+620 4 -0.0053844
+620 5 -0.00644053
+620 6 -0.00686584
+620 7 -0.00906491
+620 8 -0.00644567
+620 33 -0.127324
+620 36 -0.207431
+620 42 -0.0281642
+620 343 -1
+620 620 1
+621 290 -1
+621 621 1
+622 291 -1
+622 622 1
+623 292 -1
+623 623 1
+624 47 -0.631786
+624 347 -0.557123
+624 624 1
+624 788 0.0238499
+624 842 -0.0238499
+625 294 -1
+625 625 1
+626 349 -1
+626 626 1
+627 1 -0.000429238
+627 2 -0.00174786
+627 3 -0.00169504
+627 4 -0.00148446
+627 5 -0.00157287
+627 6 -0.00170342
+627 7 -0.00109714
+627 8 -0.00162914
+627 18 -0.0169615
+627 33 -0.0130517
+627 42 -0.00146614
+627 46 -0.433353
+627 48 -0.1531
+627 350 -0.240523
+627 519 -0.0647246
+627 627 1
+627 791 0.0955186
+627 845 -0.0955186
+628 351 -1
+628 628 1
+629 352 -1
+629 629 1
+630 353 -1
+630 630 1
+631 354 -1
+631 631 1
+632 355 -1
+632 632 1
+633 633 1
+633 721 -0.515079
+633 797 0.495516
+633 851 -0.495516
+634 43 -0.0275948
+634 47 -0.965874
+634 634 1
+634 798 0.0718853
+634 852 -0.0718853
+635 358 -0.15926
+635 635 1
+635 718 -1
+635 719 -0.494386
+635 720 -0.0192678
+635 721 -0.026059
+635 799 0.0825456
+635 853 -0.18512
+636 636 1
+636 721 -0.758402
+636 800 0.0118052
+636 854 -0.0118052
+637 637 1
+637 721 -1
+638 47 -0.982566
+638 49 -0.0245416
+638 638 1
+638 802 0.476485
+638 856 -0.476485
+639 362 -1
+639 639 1
+640 1 -0.00580506
+640 2 -0.0183126
+640 3 -0.0239172
+640 4 -0.0141713
+640 5 -0.0128056
+640 6 -0.0125451
+640 7 -0.0147619
+640 8 -0.0113214
+640 9 -0.00367068
+640 11 -0.00641643
+640 12 -0.185604
+640 14 -0.0220699
+640 16 -0.0306262
+640 18 -0.00156647
+640 20 -0.0594226
+640 29 -0.00357042
+640 30 -0.128616
+640 41 -0.0747195
+640 42 -0.00710593
+640 47 -0.186788
+640 49 -0.0408423
+640 363 -1
+640 640 1
+640 804 0.551057
+640 858 -0.551057
+641 1 -0.00258827
+641 2 -0.00813357
+641 3 -0.0106286
+641 4 -0.00629893
+641 5 -0.00569203
+641 6 -0.00557687
+641 7 -0.00656203
+641 8 -0.00502906
+641 9 -0.0075622
+641 11 -0.000221333
+641 14 -0.0139655
+641 17 -0.00046487
+641 18 -0.00122083
+641 19 -0.000858075
+641 24 -0.00207605
+641 25 -0.0147487
+641 26 -0.00144265
+641 27 -0.0063542
+641 28 -0.00783908
+641 29 -1.49601e-05
+641 30 -0.193728
+641 31 -0.00171421
+641 32 -9.98154e-05
+641 33 -0.000649799
+641 35 -0.00116189
+641 36 -0.00904361
+641 42 -0.00713049
+641 44 -0.00597472
+641 45 -0.0215122
+641 46 -0.135842
+641 47 -0.0746321
+641 48 -0.0129928
+641 49 -0.0176689
+641 364 -1
+641 533 -0.00897368
+641 641 1
+641 805 0.0431877
+641 859 -0.0431877
+642 1 -0.0133536
+642 2 -0.0539228
+642 3 -0.0523028
+642 4 -0.0457872
+642 5 -0.0485444
+642 6 -0.0525637
+642 7 -0.0338096
+642 8 -0.0502958
+642 9 -0.0836236
+642 26 -0.0202208
+642 31 -0.0220816
+642 42 -0.0124993
+642 46 -0.0767299
+642 47 -0.189626
+642 365 -0.416647
+642 642 1
+642 806 1
+642 860 -1
+643 312 -1
+643 643 1
+644 9 -0.0223493
+644 33 -0.0851426
+644 34 -0.0420804
+644 42 -0.00521666
+644 367 -1
+644 536 -0.0809771
+644 644 1
+645 314 -1
+645 645 1
+646 14 -1
+646 30 -0.198543
+646 42 -0.0145487
+646 369 -0.380009
+646 646 1
+647 16 -0.0741007
+647 17 -0.00527914
+647 19 -0.0224621
+647 25 -0.0171311
+647 26 -0.00286847
+647 27 -0.0056189
+647 28 -0.0643283
+647 36 -0.20552
+647 37 -0.00123195
+647 46 -0.0260046
+647 47 -0.205823
+647 370 -1
+647 647 1
+647 811 0.924298
+647 865 -0.924298
+648 9 -0.028214
+648 10 -0.00649127
+648 11 -0.0121961
+648 12 -0.0163425
+648 13 -0.0169614
+648 14 -0.0138948
+648 15 -0.00119848
+648 16 -0.00834542
+648 17 -0.000262448
+648 19 -0.00183055
+648 20 -7.1814e-05
+648 23 -0.000585641
+648 24 -0.0267798
+648 25 -0.00189725
+648 26 -0.00572018
+648 27 -0.000731297
+648 28 -0.00403256
+648 29 -0.00386283
+648 30 -0.0029321
+648 31 -0.00246832
+648 32 -0.0078801
+648 33 -0.0042333
+648 34 -0.0055543
+648 35 -0.00500088
+648 36 -0.0896809
+648 37 -0.000937912
+648 38 -0.00405818
+648 39 -0.000760787
+648 41 -0.0314581
+648 42 -0.00366826
+648 43 -0.00290966
+648 44 -0.591848
+648 45 -0.632046
+648 371 -0.663585
+648 540 -0.422062
+648 648 1
+648 812 1
+648 866 -1
+649 10 -0.0113493
+649 13 -0.128858
+649 21 -0.0831146
+649 27 -0.157431
+649 38 -0.202046
+649 40 -0.0190175
+649 47 -0.231463
+649 49 -0.0343764
+649 372 -0.259163
+649 649 1
+649 813 0.57419
+649 867 -0.57419
+650 34 -0.423175
+650 39 -0.259071
+650 373 -0.232202
+650 650 1
+651 21 -0.0516462
+651 40 -0.0408238
+651 42 -0.0186745
+651 374 -1
+651 651 1
+651 815 0.0550506
+651 869 -0.0550506
+652 17 -0.00435719
+652 21 -0.0357006
+652 22 -0.00173169
+652 23 -0.00156849
+652 24 -0.0450701
+652 25 -0.0335507
+652 26 -0.000616466
+652 27 -0.00765772
+652 28 -0.00465308
+652 29 -0.000405143
+652 41 -0.412855
+652 42 -0.00188614
+652 45 -0.0788363
+652 47 -0.140973
+652 375 -0.199317
+652 652 1
+652 721 -0.036733
+652 816 0.196715
+652 870 -0.196715
+653 1 -0.00248814
+653 2 -0.0100258
+653 3 -0.00972458
+653 4 -0.00851398
+653 5 -0.00902834
+653 6 -0.00977217
+653 7 -0.00628679
+653 8 -0.00934951
+653 9 -0.0126099
+653 13 -0.213837
+653 14 -0.345077
+653 15 -0.10442
+653 16 -0.0100145
+653 21 -0.119578
+653 24 -0.0170181
+653 25 -0.00721397
+653 26 -0.00218596
+653 27 -0.029694
+653 29 -0.0148334
+653 31 -0.0136938
+653 32 -0.0615235
+653 36 -0.0102149
+653 37 -0.0945828
+653 38 -0.120758
+653 40 -0.138766
+653 41 -0.00954198
+653 43 -0.00119064
+653 45 -0.00976582
+653 47 -0.0344051
+653 376 -0.435117
+653 545 -0.835088
+653 653 1
+653 817 0.261636
+653 871 -0.261636
+654 47 -0.937186
+654 49 -0.046395
+654 654 1
+655 324 -1
+655 655 1
+656 1 -0.000369607
+656 2 -0.00143743
+656 3 -0.00142103
+656 4 -0.00122758
+656 5 -0.00130965
+656 6 -0.00140732
+656 7 -0.000969078
+656 8 -0.00134423
+656 9 -0.00229024
+656 10 -0.000921046
+656 11 -0.000281669
+656 12 -0.00489378
+656 13 -0.000505418
+656 14 -0.00148486
+656 15 -0.00047861
+656 16 -0.00121453
+656 17 -8.77156e-05
+656 18 -7.91111e-05
+656 19 -0.000181207
+656 20 -2.45968e-05
+656 21 -0.000640365
+656 22 -1.75006e-05
+656 23 -4.52409e-05
+656 24 -0.00116832
+656 25 -0.00102837
+656 26 -6.59462e-05
+656 27 -0.000372541
+656 28 -0.000883274
+656 29 -0.000200292
+656 30 -0.00679508
+656 31 -0.000171488
+656 32 -0.00166108
+656 33 -0.00203184
+656 34 -0.000624423
+656 35 -0.000232653
+656 36 -0.00359742
+656 37 -0.000299545
+656 38 -0.00033227
+656 39 -1.23049e-05
+656 40 -0.000633287
+656 41 -0.00500449
+656 42 -0.000328951
+656 43 -1.94771e-05
+656 44 -0.00106584
+656 45 -0.00205049
+656 46 -0.010015
+656 47 -0.00982627
+656 48 -0.00102855
+656 49 -0.000385279
+656 50 -0.0108448
+656 379 -1
+656 495 -0.0767144
+656 496 -0.0212555
+656 497 -0.11286
+656 498 -0.0365119
+656 499 -0.022276
+656 500 -0.0143437
+656 501 -0.16536
+656 548 -0.073728
+656 656 1
+656 718 -0.0313887
+656 719 -0.0199158
+656 720 -0.00135379
+656 721 -0.00732734
+657 1 -0.000628808
+657 2 -0.00244798
+657 3 -0.00242081
+657 4 -0.00209207
+657 5 -0.00223096
+657 6 -0.00239765
+657 7 -0.00165082
+657 8 -0.00229022
+657 9 -0.00390248
+657 10 -0.00157021
+657 11 -0.000480048
+657 12 -0.00833934
+657 13 -0.000860595
+657 14 -0.00253048
+657 15 -0.000815578
+657 16 -0.0020701
+657 17 -0.00014916
+657 18 -0.000135392
+657 19 -0.00030952
+657 20 -4.12185e-05
+657 21 -0.00108992
+657 22 -2.91832e-05
+657 23 -7.64813e-05
+657 24 -0.00199098
+657 25 -0.00175305
+657 26 -0.000111852
+657 27 -0.000634722
+657 28 -0.00150523
+657 29 -0.00034114
+657 30 -0.0115798
+657 31 -0.000292825
+657 32 -0.00282927
+657 33 -0.00346282
+657 34 -0.00106409
+657 35 -0.000396315
+657 36 -0.00612963
+657 37 -0.000510918
+657 38 -0.000565565
+657 39 -2.12053e-05
+657 40 -0.00107987
+657 41 -0.00852751
+657 42 -0.000560476
+657 43 -3.29519e-05
+657 44 -0.00181592
+657 45 -0.00349397
+657 46 -0.0170658
+657 47 -0.0167429
+657 48 -0.00175258
+657 49 -0.000656089
+657 50 -0.00899034
+657 380 -1
+657 495 -0.0722878
+657 496 -0.0200303
+657 497 -0.0072509
+657 498 -0.0344048
+657 499 -0.0209898
+657 500 -0.0135162
+657 501 -0.155817
+657 549 -0.00971781
+657 657 1
+657 718 -0.0143231
+657 719 -0.0130732
+657 720 -0.000617763
+657 721 -0.00222901
+658 658 1
+658 718 -0.271211
+658 719 -0.337126
+658 720 -0.00103203
+658 721 -0.541698
+659 1 -0.000214549
+659 2 -0.000673947
+659 3 -0.000880923
+659 4 -0.000522096
+659 5 -0.000471835
+659 6 -0.000462164
+659 7 -0.000543691
+659 8 -0.00041685
+659 9 -0.000269902
+659 10 -0.000163783
+659 11 -5.35855e-05
+659 12 -0.000687986
+659 13 -0.000118351
+659 14 -0.000234172
+659 15 -7.61505e-05
+659 16 -0.000324933
+659 17 -2.07555e-05
+659 18 -1.65276e-05
+659 19 -4.27361e-05
+659 20 -6.63044e-06
+659 21 -0.00010688
+659 22 -4.06847e-06
+659 23 -7.60296e-06
+659 24 -0.00663805
+659 25 -0.000228824
+659 26 -3.82906e-05
+659 27 -0.000101252
+659 28 -0.000176507
+659 29 -3.78828e-05
+659 30 -0.00136457
+659 31 -2.84542e-05
+659 32 -0.000231508
+659 33 -0.000321286
+659 34 -9.99274e-05
+659 35 -4.29518e-05
+659 36 -0.000600214
+659 37 -7.01323e-05
+659 38 -7.71595e-05
+659 39 -4.38389e-06
+659 40 -0.000113145
+659 41 -0.000947317
+659 42 -5.0657e-05
+659 43 -3.24702e-05
+659 44 -0.00179073
+659 45 -0.000596942
+659 46 -0.149702
+659 47 -0.0238313
+659 49 -0.00129417
+659 382 -1
+659 551 -0.00217704
+659 659 1
+660 329 -1
+660 660 1
+661 661 1
+662 554 -1
+662 662 1
+663 44 -1
+663 663 1
+664 9 -0.185612
+664 664 1
+664 774 -0.0883307
+664 828 0.499836
+665 1 -0.0100133
+665 2 -0.033639
+665 3 -0.053405
+665 4 -0.0425988
+665 5 -0.0586404
+665 6 -0.0595594
+665 7 -0.0783737
+665 8 -0.0578505
+665 665 1
+665 775 -0.454577
+665 829 0.454577
+666 1 -0.00998593
+666 2 -0.0336417
+666 3 -0.0534082
+666 4 -0.0425947
+666 5 -0.058642
+666 6 -0.0595634
+666 7 -0.0783575
+666 8 -0.057871
+666 666 1
+666 776 -0.423312
+666 830 0.423312
+667 1 -0.00546095
+667 2 -0.0183719
+667 3 -0.0291691
+667 4 -0.0232661
+667 5 -0.0320322
+667 6 -0.0325312
+667 7 -0.0428028
+667 8 -0.0316041
+667 10 -0.0408226
+667 390 -0.322186
+667 667 1
+667 777 -0.110658
+667 831 0.0962756
+668 10 -0.0447819
+668 34 -0.0273434
+668 391 -0.728481
+668 668 1
+668 778 -0.014568
+668 832 0.0176201
+669 392 -1
+669 669 1
+670 15 -0.0184055
+670 393 -0.955293
+670 670 1
+670 780 -0.0406504
+670 834 0.0406504
+671 11 -0.0329819
+671 12 -0.36665
+671 30 -0.0781541
+671 671 1
+671 781 -0.0430318
+671 835 0.344785
+672 395 -1
+672 672 1
+673 396 -1
+673 673 1
+674 397 -1
+674 674 1
+675 398 -1
+675 675 1
+676 30 -0.00115865
+676 399 -0.996946
+676 676 1
+676 786 -0.00226038
+676 840 0.00226038
+677 12 -0.203809
+677 30 -0.23224
+677 677 1
+677 787 -0.0717421
+677 841 0.403311
+678 47 -0.2755
+678 678 1
+678 788 -1
+678 842 1
+679 14 -0.136525
+679 402 -0.705534
+679 679 1
+680 403 -1
+680 680 1
+681 48 -0.0324273
+681 404 -0.867891
+681 573 -0.0660543
+681 681 1
+681 791 -0.111877
+681 845 0.111877
+682 405 -1
+682 682 1
+683 406 -1
+683 683 1
+684 19 -0.0856527
+684 407 -0.807249
+684 684 1
+685 408 -1
+685 685 1
+686 21 -0.0252783
+686 409 -0.803196
+686 686 1
+686 721 -0.0408988
+687 687 1
+687 721 -0.279656
+687 797 -0.230966
+687 851 0.230966
+688 23 -0.0345839
+688 43 -0.01221
+688 44 -0.235552
+688 411 -0.234578
+688 580 -0.125645
+688 688 1
+688 798 -0.00698937
+688 852 0.00698937
+689 412 -0.101583
+689 689 1
+689 718 -0.194968
+689 719 -0.30295
+689 720 -0.062551
+689 721 -0.0797798
+689 799 -0.12142
+689 853 0.272301
+690 25 -0.266353
+690 690 1
+690 721 -0.0707766
+690 800 -0.125441
+690 854 0.125441
+691 26 -0.278988
+691 691 1
+692 47 -0.342394
+692 692 1
+692 802 -1
+692 856 1
+693 28 -0.00459965
+693 416 -1
+693 693 1
+694 1 -0.00070602
+694 2 -0.00390846
+694 3 -0.00122137
+694 4 -0.00292794
+694 5 -0.00143536
+694 6 -0.000964824
+694 7 -0.00126083
+694 8 -0.000909832
+694 16 -0.0057344
+694 28 -0.00534589
+694 29 -0.0360672
+694 47 -0.0669903
+694 417 -0.610174
+694 694 1
+694 804 -0.0622544
+694 858 0.0622544
+695 30 -0.00575459
+695 418 -1
+695 695 1
+695 805 -0.0346067
+695 859 0.0346067
+696 1 -0.0027809
+696 2 -0.00712236
+696 3 -0.0139685
+696 4 -0.00262997
+696 5 -0.00292491
+696 6 -0.00206912
+696 7 -0.002638
+696 8 -0.000595957
+696 9 -0.0879329
+696 24 -0.00161525
+696 30 -0.0359358
+696 31 -0.0494735
+696 47 -0.00726652
+696 419 -0.145812
+696 696 1
+696 718 -0.00180659
+696 806 -0.123282
+696 860 0.123282
+697 420 -1
+697 697 1
+698 421 -1
+698 698 1
+699 10 -0.017513
+699 34 -0.00174071
+699 422 -0.935439
+699 699 1
+699 809 -0.0117259
+699 863 0.0117259
+700 35 -0.0683899
+700 423 -0.679108
+700 700 1
+701 1 -0.000238091
+701 2 -0.00137739
+701 3 -0.00202571
+701 4 -0.00107395
+701 5 -0.00107763
+701 6 -0.00106681
+701 7 -0.00119111
+701 8 -0.000920962
+701 10 -0.00743153
+701 16 -0.0569402
+701 28 -0.00813799
+701 29 -0.00318154
+701 30 -0.0128145
+701 36 -0.146798
+701 46 -0.00380063
+701 47 -0.00107909
+701 48 -0.00556261
+701 424 -0.385992
+701 701 1
+701 811 -0.660795
+701 865 0.660795
+702 9 -0.00616457
+702 10 -0.00184827
+702 11 -0.00252845
+702 12 -0.0033806
+702 13 -0.00171326
+702 14 -0.00204369
+702 15 -0.00024871
+702 16 -0.0013137
+702 17 -4.07596e-05
+702 19 -0.000316146
+702 20 -1.27113e-05
+702 23 -8.5632e-05
+702 24 -0.00397148
+702 25 -0.000328214
+702 26 -0.000993401
+702 27 -0.000111418
+702 28 -0.000716241
+702 29 -0.000706276
+702 30 -0.000597173
+702 31 -0.000407075
+702 32 -0.00190119
+702 33 -0.00109019
+702 34 -0.00159975
+702 35 -0.00103229
+702 36 -0.0156323
+702 37 -0.000102285
+702 38 -0.000436227
+702 39 -0.000203488
+702 41 -0.00793915
+702 42 -0.00090327
+702 43 -0.000393224
+702 44 -0.157159
+702 45 -0.0932584
+702 702 1
+702 812 -0.141602
+702 866 0.141602
+703 10 -0.0468272
+703 47 -0.106868
+703 426 -0.569057
+703 703 1
+703 813 -0.16254
+703 867 0.16254
+704 427 -1
+704 704 1
+705 40 -0.0261008
+705 428 -0.926579
+705 705 1
+705 815 -0.0602503
+705 869 0.0602503
+706 24 -0.0416903
+706 41 -0.0601498
+706 45 -0.0115848
+706 47 -0.0171341
+706 429 -0.0386049
+706 706 1
+706 718 -0.0796703
+706 719 -0.00893041
+706 720 -0.0197172
+706 721 -0.131418
+706 816 -0.190254
+706 870 0.190254
+707 1 -0.00164179
+707 2 -0.00256474
+707 3 -0.00309463
+707 4 -0.00174835
+707 5 -0.00185159
+707 6 -0.00183346
+707 7 -0.00175449
+707 8 -0.00184801
+707 9 -0.00888356
+707 16 -0.01382
+707 42 -0.00474013
+707 45 -0.00765647
+707 430 -0.49117
+707 599 -0.0715436
+707 707 1
+707 718 -0.0206485
+707 719 -0.0201683
+707 721 -0.055094
+707 817 -0.0444008
+707 871 0.0444008
+708 431 -1
+708 708 1
+709 432 -1
+709 709 1
+710 433 -1
+710 710 1
+711 434 -1
+711 711 1
+712 435 -1
+712 712 1
+713 436 -1
+713 713 1
+714 437 -1
+714 714 1
+715 50 -0.25
+715 715 1
+716 439 -1
+716 716 1
+717 440 -1
+717 717 1
+718 718 1
+718 722 -0.167496
+718 723 -0.167496
+718 724 -0.167496
+718 725 -0.167496
+718 726 -0.167496
+718 727 -0.167496
+718 728 -0.167496
+718 729 -0.167496
+719 719 1
+719 730 -0.184096
+719 731 -0.184096
+719 732 -0.184096
+719 733 -0.184096
+719 734 -0.175952
+719 735 -0.129094
+719 736 -0.0698886
+719 737 -0.184096
+719 738 -0.00555164
+719 739 -0.015267
+719 740 -0.0209833
+719 741 -0.0198165
+719 742 -0.056979
+719 743 -0.00676059
+719 744 -0.0086215
+719 745 -0.184096
+719 746 -0.184096
+719 747 -0.184096
+719 748 -0.0130377
+719 749 -0.134063
+719 750 -0.0953692
+719 751 -0.184096
+719 752 -0.0429227
+719 753 -0.112645
+719 754 -0.127905
+719 755 -0.184096
+719 756 -0.145356
+719 757 -0.184096
+719 758 -0.184096
+719 759 -0.0707951
+719 760 -0.0515545
+719 761 -0.051998
+719 762 -0.184096
+719 763 -0.184096
+720 720 1
+720 764 -0.088871
+720 765 -1
+721 721 1
+721 766 -0.286066
+721 767 -1
+721 768 -0.286066
+721 769 -0.858198
+721 770 -0.858198
+722 722 -1
+722 1099 1
+723 723 -1
+723 1100 1
+724 722 1
+724 723 1
+724 724 1
+724 725 1
+724 726 1
+724 727 1
+724 728 1
+724 729 1
+724 730 1
+724 731 1
+724 732 1
+724 733 1
+724 734 0.955764
+724 735 0.701235
+724 736 0.379632
+724 737 1
+724 738 0.0301562
+724 739 0.0829296
+724 740 0.11398
+724 741 0.107642
+724 742 0.309508
+724 743 0.0367232
+724 744 0.0468316
+724 745 1
+724 746 1
+724 747 1
+724 748 0.0708203
+724 749 0.728222
+724 750 0.518041
+724 751 1
+724 752 0.233154
+724 753 0.611884
+724 754 0.694775
+724 755 1
+724 756 0.789564
+724 757 1
+724 758 1
+724 759 0.384556
+724 760 0.280042
+724 761 0.282451
+724 762 1
+724 763 1
+724 764 0.0228823
+724 765 0.257478
+724 1095 1
+724 1096 1
+724 1097 0.246717
+724 1211 -1
+725 725 -1
+725 1102 1
+726 726 -1
+726 1103 1
+727 727 -1
+727 1104 1
+728 728 -1
+728 1105 1
+729 729 -1
+729 1106 1
+730 730 -1
+730 1107 1
+731 731 -1
+731 1108 1
+732 732 -1
+732 1109 1
+733 733 -1
+733 1110 1
+734 734 -1
+734 1111 1
+735 735 -1
+735 1112 1
+736 736 -1
+736 1113 1
+737 737 -1
+737 1114 1
+738 738 -1
+738 1115 1
+739 739 -1
+739 1116 1
+740 740 -1
+740 1117 1
+741 741 -1
+741 1118 1
+742 742 -1
+742 1119 1
+743 743 -1
+743 1120 1
+744 744 -1
+744 1121 1
+745 745 -1
+745 1122 1
+746 746 -1
+746 1123 1
+747 747 -1
+747 1124 1
+748 748 -1
+748 1125 1
+749 749 -1
+749 1126 1
+750 750 -1
+750 1127 1
+751 751 -1
+751 1128 1
+752 752 -1
+752 1129 1
+753 753 -1
+753 1130 1
+754 754 -1
+754 1131 1
+755 755 -1
+755 1132 1
+756 756 -1
+756 1133 1
+757 757 -1
+757 1134 1
+758 758 -1
+758 1135 1
+759 759 -1
+759 1136 1
+760 760 -1
+760 1137 1
+761 761 -1
+761 1138 1
+762 762 -1
+762 1139 1
+763 763 -1
+763 1140 1
+764 764 -1
+764 1141 1
+765 765 -1
+765 1142 1
+766 766 1
+767 767 1
+768 768 1
+769 769 1
+770 770 1
+771 771 1
+772 772 1
+773 773 -1
+773 1150 1
+774 774 -1
+774 820 -0.0570765
+774 821 -0.298217
+774 1041 1
+775 4 -1
+775 240 1
+775 774 0.531703
+775 775 -1
+775 776 0.110367
+775 777 -0.0473296
+775 778 -0.0369362
+775 779 -1
+775 780 -0.515874
+775 781 0.0321273
+775 782 0.236764
+775 783 0.112054
+776 4 -0.463593
+776 241 1
+776 774 0.0975633
+776 775 0.038386
+776 776 -1
+776 777 0.00181679
+776 778 0.00141783
+776 779 0.038386
+776 780 0.0198023
+776 781 0.00589511
+776 782 0.0434444
+776 783 0.0205611
+777 1 -0.00783667
+777 2 -0.0315317
+777 3 -0.086087
+777 4 -0.03693
+777 5 -0.0850945
+777 6 -0.0781598
+777 7 -0.0735158
+777 8 -0.137967
+777 9 -0.286725
+777 10 -0.55176
+777 336 -0.173423
+777 613 1
+777 777 1
+777 831 -0.87003
+778 778 -1
+778 820 -0.0121988
+778 821 -0.063741
+778 1045 1
+779 3 -0.905539
+779 234 1
+779 774 -0.088234
+779 775 -0.253927
+779 776 -0.10393
+779 777 -0.283832
+779 778 -0.637483
+779 779 -1
+779 780 -0.328839
+779 781 0.0609147
+779 782 0.400496
+779 783 0.943797
+780 339 1
+780 447 -1
+780 780 1
+780 834 -1
+781 781 -1
+781 820 -0.316005
+781 821 -0.112573
+781 1048 1
+782 782 -1
+782 836 -0.00668143
+782 995 1
+783 342 1
+783 450 -1
+783 783 1
+783 837 -1
+784 774 -0.227297
+784 775 -0.0224543
+784 777 -0.0532907
+784 782 -0.00909488
+784 784 1
+784 804 -0.000435337
+784 805 -0.0519988
+784 806 -0.00614166
+784 808 -0.0658373
+784 812 -0.00505009
+784 817 -0.00558911
+784 820 -0.0032032
+784 821 -0.0302643
+784 823 -0.00234384
+784 828 -0.000420836
+784 860 -0.0523853
+784 866 -0.00779233
+784 871 -0.0232019
+784 882 -0.00749475
+784 891 -0.223529
+785 344 1
+785 452 -1
+785 785 1
+785 839 -1
+786 345 1
+786 453 -1
+786 786 1
+786 840 -1
+787 787 -1
+787 820 -0.0567035
+787 821 -0.296266
+787 1054 1
+788 788 1
+788 812 -0.0071366
+788 813 -0.089245
+788 817 -0.222797
+788 819 -0.012088
+788 820 -0.00166169
+788 821 -0.0156886
+788 823 -0.00241595
+788 866 -0.00509077
+788 882 -0.0477294
+788 895 -0.0563431
+789 789 -1
+789 820 -0.0380628
+789 821 -0.198866
+789 1056 1
+790 780 -0.350648
+790 790 1
+790 809 -0.041368
+790 812 -0.000693291
+790 817 -0.149576
+790 820 -0.00216339
+790 821 -0.0204412
+790 823 -0.00213719
+790 834 -0.0930398
+790 866 -0.00101603
+790 882 -0.0508115
+790 897 -0.117295
+791 791 1
+791 804 -0.00199658
+791 811 -0.151638
+791 812 -0.0008211
+791 817 -0.00243991
+791 820 -0.000933739
+791 821 -0.00882461
+791 823 -0.00155106
+791 858 -0.00330908
+791 865 -0.162985
+791 866 -0.000912802
+791 871 -0.0198407
+791 882 -0.0353037
+791 898 -0.243081
+792 792 1
+792 805 -0.0543641
+792 811 -0.334248
+792 812 -0.000798938
+792 816 -0.353309
+792 820 -0.00208649
+792 821 -0.0196734
+792 823 -0.00306542
+792 866 -0.000876255
+792 882 -0.0699388
+792 899 -0.362898
+793 791 -0.48861
+793 793 1
+793 804 -0.00294179
+793 805 -0.132927
+793 820 -0.00175207
+793 821 -0.0166263
+793 823 -0.0022727
+793 882 -0.0562448
+793 900 -0.167458
+794 794 1
+794 805 -0.0241374
+794 811 -0.342092
+794 812 -0.00134041
+794 820 -0.00103681
+794 821 -0.00981976
+794 823 -0.00151823
+794 848 -0.0495994
+794 866 -0.00163483
+794 882 -0.0400145
+794 901 -0.182176
+795 795 1
+795 804 -0.330107
+795 812 -0.000602097
+795 820 -0.00161141
+795 821 -0.0149729
+795 823 -0.00269704
+795 866 -0.000752621
+795 882 -0.0853244
+795 902 -0.365306
+796 796 1
+796 813 -0.0487057
+796 815 -0.0455313
+796 816 -0.338542
+796 817 -0.105416
+796 820 -0.00178138
+796 821 -0.0168117
+796 823 -0.00184605
+796 850 -0.0168937
+796 882 -0.0263379
+796 903 -0.113379
+797 797 1
+797 816 -0.396429
+797 820 -0.00117527
+797 821 -0.0108669
+797 823 -0.00169643
+797 882 -0.0411835
+797 904 -0.128214
+798 798 1
+798 812 -0.00264334
+798 816 -0.188573
+798 820 -0.00159559
+798 821 -0.0149566
+798 823 -0.00166491
+798 852 -0.355678
+798 866 -0.00272953
+798 882 -0.0158978
+798 905 -0.233614
+799 799 1
+799 805 -0.0219854
+799 812 -0.00738232
+799 816 -0.330943
+799 817 -0.011617
+799 820 -0.00251663
+799 821 -0.0237798
+799 823 -0.0887798
+799 860 -0.001482
+799 866 -0.00773159
+799 870 -0.316522
+799 882 -0.0858839
+799 906 -0.360928
+800 787 -0.0239939
+800 800 1
+800 805 -0.0719362
+800 811 -0.045238
+800 812 -0.000240882
+800 816 -0.113465
+800 817 -0.00226805
+800 820 -0.00102024
+800 821 -0.00964347
+800 823 -0.00140952
+800 854 -0.0250664
+800 866 -0.000294286
+800 882 -0.0362237
+800 907 -0.15676
+801 801 1
+801 805 -0.0418702
+801 806 -0.0062684
+801 811 -0.0450736
+801 812 -0.0043216
+801 816 -0.0124057
+801 817 -0.00408953
+801 820 -0.000389309
+801 821 -0.0036613
+801 823 -0.00140351
+801 855 -0.00976003
+801 866 -0.00530017
+801 882 -0.0428837
+801 908 -0.365606
+802 802 1
+802 805 -0.0800279
+802 811 -0.038314
+802 812 -0.000239752
+802 813 -0.0849581
+802 816 -0.0668725
+802 817 -0.0241065
+802 819 -0.00316462
+802 820 -0.000954361
+802 821 -0.00901591
+802 823 -0.00161049
+802 866 -0.000257961
+802 882 -0.043122
+802 909 -0.164988
+803 787 -0.0351324
+803 803 1
+803 805 -0.0813666
+803 811 -0.361501
+803 812 -0.00108956
+803 816 -0.033488
+803 820 -0.00186481
+803 821 -0.0176209
+803 823 -0.00231376
+803 857 -0.0012968
+803 858 -0.00847154
+803 865 -0.063969
+803 866 -0.00136666
+803 882 -0.0511047
+803 910 -0.281738
+804 787 -0.000775186
+804 804 1
+804 805 -0.000870471
+804 809 -0.032898
+804 812 -0.00585078
+804 816 -0.0163454
+804 817 -0.0556347
+804 820 -0.0023705
+804 821 -0.0223871
+804 823 -0.0027838
+804 858 -0.3204
+804 865 -0.140193
+804 866 -0.00755463
+804 882 -0.0429508
+804 911 -0.512246
+805 781 -0.031573
+805 783 -0.0196137
+805 786 -0.00904428
+805 787 -0.00423573
+805 804 -0.00275136
+805 805 1
+805 810 -0.0113046
+805 812 -9.46642e-05
+805 820 -0.00171424
+805 821 -0.0161981
+805 823 -0.00213743
+805 835 -0.0258741
+805 840 -0.000278373
+805 841 -0.048481
+805 859 -0.00890702
+805 860 -0.00386152
+805 865 -0.0120362
+805 866 -0.000136157
+805 882 -0.0521273
+805 912 -0.326332
+806 805 -0.164118
+806 806 1
+806 812 -0.00615151
+806 817 -0.0845086
+806 820 -0.00333951
+806 821 -0.0316187
+806 823 -0.00344044
+806 860 -0.410371
+806 866 -0.00716448
+806 882 -0.049657
+806 913 -0.372972
+807 807 -1
+807 820 -0.083744
+807 821 -0.437556
+807 1074 1
+808 778 -0.0419002
+808 780 -0.167184
+808 782 -0.0201416
+808 784 -0.0472442
+808 791 -0.00881034
+808 805 -0.00165791
+808 808 1
+808 809 -0.01224
+808 812 -0.000281159
+808 820 -0.00105446
+808 821 -0.00996459
+808 823 -0.00103527
+808 866 -0.000511334
+808 882 -0.0236439
+808 915 -0.141515
+809 368 1
+809 476 -1
+809 809 1
+809 863 -1
+810 783 -0.110782
+810 787 -0.00628098
+810 805 -0.0253095
+810 810 1
+810 812 -0.00283565
+810 820 -0.00103082
+810 821 -0.0097365
+810 823 -0.00118161
+810 864 -0.0667206
+810 866 -0.00413368
+810 882 -0.0150035
+810 917 -0.293105
+811 784 -0.0701326
+811 805 -0.0210249
+811 811 1
+811 812 -0.00542725
+811 817 -0.00153077
+811 820 -0.00170114
+811 821 -0.0160721
+811 823 -0.00176228
+811 865 -0.258453
+811 866 -0.00668087
+811 882 -0.0242118
+811 918 -0.237564
+812 811 -0.0256471
+812 812 1
+812 817 -0.234432
+812 819 -0.0751362
+812 820 -0.00234282
+812 821 -0.0221572
+812 823 -0.00340576
+812 866 -0.000723022
+812 882 -0.024328
+812 919 -0.60686
+813 812 -0.00308965
+813 813 1
+813 817 -0.227662
+813 820 -0.00197669
+813 821 -0.018656
+813 823 -0.00285007
+813 866 -0.00234542
+813 882 -0.0388854
+813 920 -0.290604
+814 812 -0.00882476
+814 814 1
+814 820 -0.00111529
+814 821 -0.0106572
+814 823 -0.00246711
+814 866 -0.016669
+814 882 -0.120693
+814 921 -0.629187
+815 813 -0.0131791
+815 815 1
+815 817 -0.144666
+815 819 -0.070097
+815 820 -0.00208333
+815 821 -0.0196977
+815 823 -0.00231106
+815 869 -0.0248633
+815 882 -0.02993
+815 922 -0.130932
+816 804 -0.00316559
+816 812 -0.00201144
+816 816 1
+816 817 -0.00151081
+816 819 -0.00305462
+816 820 -0.00250037
+816 821 -0.0236241
+816 823 -0.00293872
+816 866 -0.00358492
+816 870 -0.105924
+816 882 -0.0562958
+816 923 -0.189938
+817 782 -0.0428666
+817 784 -0.0635426
+817 791 -0.00601768
+817 804 -0.00190138
+817 805 -0.11062
+817 806 -0.00207115
+817 808 -0.0346713
+817 809 -0.0117254
+817 810 -0.0103615
+817 812 -0.00148137
+817 815 -0.0186752
+817 816 -0.0202888
+817 817 1
+817 820 -0.00103801
+817 821 -0.00980655
+817 823 -0.000992499
+817 866 -0.00257603
+817 871 -0.0279316
+817 882 -0.008022
+817 924 -0.258477
+818 798 -0.0139765
+818 812 -0.00665203
+818 817 -0.00674046
+818 818 1
+818 820 -0.000347941
+818 821 -0.00326399
+818 823 -0.00360152
+818 852 -0.0636048
+818 866 -0.00634868
+818 882 -0.131949
+818 925 -0.460003
+819 819 -1
+819 820 -0.0442657
+819 821 -0.23128
+819 1086 1
+820 805 -0.105978
+820 812 -0.0810533
+820 816 -0.269294
+820 817 -0.00310118
+820 820 1
+820 821 -0.0194133
+820 823 -0.00371401
+820 866 -0.084458
+820 870 -0.0409159
+820 871 -0.014327
+820 882 -0.0517426
+820 927 -0.935919
+821 791 -0.103422
+821 805 -0.122536
+821 806 -0.000739276
+821 811 -0.0127001
+821 820 -0.00183754
+821 821 1
+821 823 -0.170543
+821 865 -0.00259631
+821 882 -0.0865469
+821 928 -0.82597
+822 788 -0.112262
+822 798 -0.0159983
+822 802 -0.224073
+822 804 -0.00925309
+822 805 -0.214353
+822 806 -0.00581718
+822 811 -0.320054
+822 813 -0.0284852
+822 816 -0.280741
+822 817 -0.0063696
+822 818 -0.0306488
+822 820 -0.00574051
+822 821 -0.0542349
+822 822 1
+822 823 -0.0864428
+822 842 -0.00116754
+822 856 -0.0372051
+822 858 -0.0293749
+822 860 -0.00180819
+822 865 -0.0023471
+822 867 -0.0464603
+822 870 -0.0352807
+822 882 -0.134327
+822 929 -0.447891
+823 791 -0.032073
+823 805 -0.0102879
+823 820 -0.000165656
+823 821 -0.00156512
+823 823 1
+823 845 -0.00579992
+823 865 -0.00333559
+823 882 -0.176979
+823 930 -0.0352309
+824 824 -1
+824 1037 1
+825 820 -0.00218877
+825 821 -0.010061
+825 825 1
+825 879 -0.970874
+826 826 1
+826 882 -0.18584
+827 827 1
+827 882 -0.18584
+828 828 1
+829 829 1
+830 830 1
+831 831 1
+832 832 1
+833 833 1
+834 834 1
+835 835 1
+836 836 1
+837 837 1
+838 838 1
+839 839 1
+840 840 1
+841 841 1
+842 842 1
+843 843 1
+844 844 1
+845 845 1
+846 846 1
+847 847 1
+848 848 1
+849 849 1
+850 850 1
+851 851 1
+852 852 1
+853 853 1
+854 854 1
+855 855 1
+856 856 1
+857 857 1
+858 858 1
+859 859 1
+860 860 1
+861 861 1
+862 862 1
+863 863 1
+864 864 1
+865 865 1
+866 866 1
+867 867 1
+868 868 1
+869 869 1
+870 870 1
+871 871 1
+872 872 1
+873 873 1
+874 874 1
+875 875 1
+876 876 1
+877 877 1
+878 878 1
+879 879 1
+880 880 1
+881 881 1
+882 882 1
+882 1208 -1
+883 774 0.0673987
+883 775 -0.0320654
+883 776 -0.0129695
+883 777 -0.00349137
+883 778 8.46554e-05
+883 779 -0.0146927
+883 780 0.0129387
+883 781 0.243718
+883 782 0.0491226
+883 783 0.180055
+883 784 -0.00175565
+883 787 -0.00365789
+883 791 -0.00194543
+883 804 -0.00171521
+883 805 -0.0443389
+883 806 -0.00244335
+883 817 -0.00274749
+883 820 -0.00128787
+883 821 -0.012149
+883 823 -0.00464172
+883 829 -0.00373727
+883 830 -0.0048465
+883 831 -0.0198809
+883 858 -0.00184652
+883 860 -0.00412738
+883 865 -0.00308879
+883 871 -0.0106828
+883 882 -0.155009
+883 883 -1
+883 935 -0.380344
+884 774 0.39483
+884 775 -0.0325507
+884 776 -0.0398055
+884 777 -0.00354224
+884 778 0.00568179
+884 779 -0.0221155
+884 780 0.0834861
+884 781 0.390935
+884 782 0.159594
+884 783 0.341298
+884 784 -0.00785409
+884 787 -0.00470577
+884 791 -0.00499509
+884 804 -0.00341178
+884 805 -0.087857
+884 806 -0.00622128
+884 817 -0.00698071
+884 820 -0.00315821
+884 821 -0.0298229
+884 823 -0.00919386
+884 829 -0.00791664
+884 830 -0.0102953
+884 831 -0.0421738
+884 858 -0.00644557
+884 860 -0.00666549
+884 865 -0.0112674
+884 871 -0.0105228
+884 882 -0.444328
+884 884 -1
+884 936 -1
+885 774 0.0434937
+885 775 -0.00180072
+885 776 -0.043605
+885 777 0.137444
+885 778 0.361109
+885 779 0.011781
+885 780 0.186287
+885 781 1
+885 782 0.424346
+885 783 1
+885 784 -0.0116955
+885 787 -0.0107452
+885 791 -0.00617329
+885 804 -0.00567858
+885 805 -0.146309
+885 806 -0.0076901
+885 817 -0.00862882
+885 820 -0.00397885
+885 821 -0.0375839
+885 823 -0.0153148
+885 829 -0.0160169
+885 830 -0.0208289
+885 831 -0.0853318
+885 858 -0.00256686
+885 860 -0.0166593
+885 865 -0.0211176
+885 871 -0.0161807
+885 882 -0.990723
+885 885 -1
+885 937 -1
+886 774 1
+886 775 0.275619
+886 776 0.525771
+886 777 -0.00147904
+886 778 0.0205061
+886 779 -0.033057
+886 780 0.286434
+886 781 0.399051
+886 782 0.665142
+886 783 0.799471
+886 784 -0.0229804
+886 787 -0.00461986
+886 791 -0.0115259
+886 804 -0.00717314
+886 805 -0.184855
+886 806 -0.0143523
+886 817 -0.0161058
+886 820 -0.00732778
+886 821 -0.0692449
+886 823 -0.0193505
+886 829 -0.0272374
+886 830 -0.0354148
+886 831 -0.145105
+886 858 -0.0131186
+886 860 -0.00668696
+886 865 -0.0238682
+886 871 -0.0194889
+886 882 -1
+886 886 -1
+886 938 -1
+887 774 0.0968101
+887 775 0.113032
+887 776 -0.0521862
+887 777 0.147479
+887 778 0.321465
+887 779 0.00514601
+887 780 0.19861
+887 781 0.230956
+887 782 0.656062
+887 783 0.591396
+887 784 -0.0151426
+887 787 -0.00250976
+887 791 -0.0067276
+887 804 -0.00357076
+887 805 -0.0920222
+887 806 -0.00838255
+887 817 -0.00940847
+887 820 -0.00430665
+887 821 -0.0406784
+887 823 -0.00963369
+887 829 -0.020655
+887 830 -0.0268595
+887 831 -0.110054
+887 858 -0.00354281
+887 860 -0.00409686
+887 865 -0.0131936
+887 871 -0.0113701
+887 882 -0.518873
+887 887 -1
+887 939 -1
+888 774 0.0288711
+888 775 -0.0445302
+888 776 -0.0528477
+888 777 0.240995
+888 778 0.952124
+888 779 0.0286735
+888 780 1
+888 781 0.295554
+888 782 1
+888 783 0.821927
+888 784 -0.0261838
+888 787 -0.00316547
+888 791 -0.0118181
+888 804 -0.00567406
+888 805 -0.146243
+888 806 -0.0147225
+888 817 -0.0165181
+888 820 -0.00750648
+888 821 -0.0709115
+888 823 -0.0153059
+888 829 -0.0340281
+888 830 -0.0442515
+888 831 -0.181291
+888 858 -0.00386272
+888 860 -0.00470093
+888 865 -0.0211857
+888 871 -0.018262
+888 882 -0.824475
+888 888 -1
+888 940 -0.958277
+889 774 0.14049
+889 775 -0.0758823
+889 776 -0.0971732
+889 777 0.0223631
+889 778 0.177537
+889 779 -0.00509654
+889 780 0.648092
+889 781 0.20765
+889 782 0.680314
+889 783 0.934805
+889 784 -0.0202813
+889 787 -0.00232868
+889 791 -0.0044656
+889 804 -0.00391702
+889 805 -0.100952
+889 806 -0.00555559
+889 817 -0.00623437
+889 820 -0.00303247
+889 821 -0.0286434
+889 823 -0.0105635
+889 829 -0.0262695
+889 830 -0.0341526
+889 831 -0.13994
+889 858 -0.00296139
+889 860 -0.00351614
+889 865 -0.0138772
+889 871 -0.0102523
+889 882 -0.661614
+889 889 -1
+889 941 -1
+890 774 0.0241641
+890 775 -0.0923294
+890 776 -0.0795351
+890 777 0.283251
+890 778 0.419097
+890 779 0.025
+890 780 0.157303
+890 781 0.056133
+890 782 0.51795
+890 783 0.876248
+890 784 -0.0145279
+890 787 -0.000595015
+890 791 -0.00668003
+890 804 -0.00302631
+890 805 -0.0779409
+890 806 -0.00832573
+890 817 -0.00934014
+890 820 -0.00423752
+890 821 -0.0400314
+890 823 -0.00815897
+890 829 -0.0195339
+890 830 -0.0254099
+890 831 -0.104091
+890 858 -0.00215278
+890 860 -0.000800215
+890 865 -0.0108091
+890 871 -0.0108786
+890 882 -0.416147
+890 890 -1
+890 942 -0.662071
+891 9 -1
+891 113 1
+891 882 -0.09292
+891 891 1
+892 776 -0.0882331
+892 777 -0.179093
+892 778 -0.0307757
+892 780 -0.0995916
+892 785 1
+892 809 -0.0293575
+892 812 -0.00202911
+892 813 -0.00583967
+892 820 -0.00224971
+892 821 -0.0212662
+892 823 -0.0024839
+892 831 -0.104179
+892 832 -0.572844
+892 863 -0.0391493
+892 865 -0.0675828
+892 866 -0.00408012
+892 867 -0.0851166
+892 882 -0.0234699
+892 892 -1
+893 781 -0.0746911
+893 786 1
+893 787 -0.109568
+893 804 -0.0073147
+893 805 -0.014629
+893 812 -0.0209835
+893 820 -0.00378675
+893 821 -0.0357848
+893 823 -0.00447294
+893 835 -0.581887
+893 866 -0.0307216
+893 882 -0.0731165
+893 893 -1
+894 781 -0.173591
+894 783 -0.394282
+894 787 1
+894 789 -0.0145358
+894 804 -0.0244421
+894 812 -0.00324806
+894 820 -0.00760011
+894 821 -0.0718116
+894 823 -0.00663397
+894 835 -0.747246
+894 841 -0.261913
+894 866 -0.00474494
+894 882 -0.0548233
+894 894 -1
+895 13 -1
+895 117 1
+895 882 -0.09292
+895 895 1
+896 783 -0.0162646
+896 787 -0.0262557
+896 789 1
+896 804 -0.00805683
+896 805 -0.295588
+896 810 -0.971656
+896 812 -0.00765544
+896 817 -0.470795
+896 820 -0.00639254
+896 821 -0.0604058
+896 823 -0.00625952
+896 843 -0.131442
+896 866 -0.00795178
+896 882 -0.0335031
+896 896 -1
+897 15 -1
+897 119 1
+897 882 -0.09292
+897 897 1
+898 16 -1
+898 120 1
+898 882 -0.09292
+898 898 1
+899 17 -1
+899 121 1
+899 882 -0.09292
+899 899 1
+900 18 -1
+900 122 1
+900 882 -0.09292
+900 900 1
+901 19 -1
+901 123 1
+901 882 -0.09292
+901 901 1
+902 20 -1
+902 124 1
+902 882 -0.09292
+902 902 1
+903 21 -1
+903 125 1
+903 882 -0.09292
+903 903 1
+904 22 -1
+904 126 1
+904 882 -0.09292
+904 904 1
+905 23 -1
+905 127 1
+905 882 -0.09292
+905 905 1
+906 24 -1
+906 128 1
+906 882 -0.09292
+906 906 1
+907 25 -1
+907 129 1
+907 882 -0.09292
+907 907 1
+908 26 -1
+908 130 1
+908 882 -0.09292
+908 908 1
+909 27 -1
+909 131 1
+909 882 -0.09292
+909 909 1
+910 28 -1
+910 132 1
+910 882 -0.09292
+910 910 1
+911 29 -1
+911 133 1
+911 882 -0.09292
+911 911 1
+912 30 -1
+912 134 1
+912 882 -0.09292
+912 912 1
+913 31 -1
+913 135 1
+913 882 -0.09292
+913 913 1
+914 782 -0.637004
+914 783 -0.874071
+914 805 -0.00286504
+914 807 1
+914 809 -0.0992694
+914 812 -0.0058878
+914 817 -0.113831
+914 820 -0.009698
+914 821 -0.0915909
+914 823 -0.00839219
+914 866 -0.0100318
+914 882 -0.120942
+914 914 -1
+915 33 -1
+915 137 1
+915 882 -0.09292
+915 915 1
+916 780 -0.374713
+916 782 -0.117704
+916 808 -0.499641
+916 809 1
+916 812 -0.00400712
+916 814 -0.0355228
+916 820 -0.00352007
+916 821 -0.0332611
+916 823 -0.00349765
+916 832 -0.807261
+916 863 -0.00898083
+916 866 -0.00815054
+916 882 -0.0488219
+916 916 -1
+917 35 -1
+917 139 1
+917 882 -0.09292
+917 917 1
+918 36 -1
+918 140 1
+918 882 -0.09292
+918 918 1
+919 37 -1
+919 141 1
+919 882 -0.09292
+919 919 1
+920 38 -1
+920 142 1
+920 882 -0.09292
+920 920 1
+921 39 -1
+921 143 1
+921 882 -0.09292
+921 921 1
+922 40 -1
+922 144 1
+922 882 -0.09292
+922 922 1
+923 41 -1
+923 145 1
+923 882 -0.09292
+923 923 1
+924 42 -1
+924 146 1
+924 882 -0.09292
+924 924 1
+925 43 -1
+925 147 1
+925 882 -0.09292
+925 925 1
+926 805 -0.0157263
+926 812 -0.0405516
+926 819 1
+926 820 -0.000570637
+926 821 -0.00539078
+926 823 -0.00595274
+926 827 -0.572088
+926 852 -0.0367744
+926 866 -0.0760444
+926 882 -0.18003
+926 926 -1
+927 45 -1
+927 149 1
+927 882 -0.09292
+927 927 1
+928 46 -1
+928 150 1
+928 882 -0.09292
+928 928 1
+929 47 -1
+929 151 1
+929 882 -0.09292
+929 929 1
+930 48 -1
+930 152 1
+930 882 -0.09292
+930 930 1
+931 802 -0.00164851
+931 804 -0.00059595
+931 805 -0.0149477
+931 813 -0.00124612
+931 818 -0.000446911
+931 820 -6.62979e-05
+931 821 -0.000626001
+931 823 -0.00138272
+931 824 0.515464
+931 931 -1
+932 50 -1
+932 154 1
+932 882 -0.09292
+932 932 1
+933 51 -1
+933 155 1
+933 882 -0.09292
+933 933 1
+934 52 -1
+934 156 1
+934 882 -0.09292
+934 934 1
+935 1 -1
+935 157 1
+935 882 -0.0272284
+935 883 -0.175656
+935 935 1
+936 2 -1
+936 158 1
+936 882 -0.0691867
+936 884 -0.155711
+936 936 1
+937 3 -0.755043
+937 159 1
+937 882 -0.0895231
+937 885 -0.0903614
+937 937 1
+938 4 -0.856188
+938 160 1
+938 882 -0.127341
+938 886 -0.127341
+938 938 1
+939 5 -0.510347
+939 161 1
+939 882 -0.0412913
+939 887 -0.0795787
+939 939 1
+940 6 -0.248166
+940 162 1
+940 882 -0.0373542
+940 888 -0.0453067
+940 940 1
+941 7 -0.745588
+941 163 1
+941 882 -0.0401927
+941 889 -0.0607495
+941 941 1
+942 8 -0.64291
+942 164 1
+942 882 -0.0359262
+942 890 -0.0863306
+942 942 1
+943 9 -0.185612
+943 165 1
+943 882 -0.017247
+943 891 -0.514388
+943 943 1
+944 10 -0.310143
+944 166 1
+944 882 -0.00936862
+944 892 -0.399175
+944 944 1
+945 11 -0.342239
+945 167 1
+945 882 -0.0204593
+945 893 -0.279818
+945 945 1
+946 12 -0.525587
+946 168 1
+946 882 -0.020688
+946 894 -0.377357
+946 946 1
+947 13 -0.450577
+947 169 1
+947 882 -0.0418676
+947 895 -0.0494234
+947 947 1
+948 14 -0.396895
+948 170 1
+948 882 -0.0123123
+948 896 -0.367496
+948 948 1
+949 15 -0.411692
+949 171 1
+949 882 -0.0382544
+949 897 -0.0883079
+949 949 1
+950 16 -0.426883
+950 172 1
+950 882 -0.0396659
+950 898 -0.273117
+950 950 1
+951 17 -0.337349
+951 173 1
+951 882 -0.0313465
+951 899 -0.162651
+951 951 1
+952 18 -0.39165
+952 174 1
+952 882 -0.0363921
+952 900 -0.10835
+952 952 1
+953 19 -0.421632
+953 175 1
+953 882 -0.0391781
+953 901 -0.178368
+953 953 1
+954 20 -0.357698
+954 176 1
+954 882 -0.0332373
+954 902 -0.142301
+954 954 1
+955 21 -0.5
+955 177 1
+955 882 -0.04646
+955 903 -0.2
+955 955 1
+956 22 -0.542937
+956 178 1
+956 882 -0.0504497
+956 904 -0.157063
+956 956 1
+957 23 -0.253654
+957 179 1
+957 882 -0.0235695
+957 905 -0.346346
+957 957 1
+958 24 -0.431501
+958 180 1
+958 882 -0.040095
+958 906 -0.168499
+958 958 1
+959 25 -0.356604
+959 181 1
+959 882 -0.0331356
+959 907 -0.143396
+959 959 1
+960 26 -0.278988
+960 182 1
+960 882 -0.0259236
+960 908 -0.221012
+960 960 1
+961 27 -0.368863
+961 183 1
+961 882 -0.0342747
+961 909 -0.131137
+961 961 1
+962 28 -0.396756
+962 184 1
+962 882 -0.0368666
+962 910 -0.203244
+962 962 1
+963 29 -0.23717
+963 185 1
+963 882 -0.0220378
+963 911 -0.26283
+963 963 1
+964 30 -0.379337
+964 186 1
+964 882 -0.035248
+964 912 -0.220663
+964 964 1
+965 31 -0.294478
+965 187 1
+965 882 -0.0273629
+965 913 -0.205522
+965 965 1
+966 32 -1
+966 188 1
+966 882 -0.0367834
+966 914 -0.304139
+966 966 1
+967 33 -0.449828
+967 189 1
+967 882 -0.041798
+967 915 -0.250172
+967 967 1
+968 34 -0.215086
+968 190 1
+968 882 -0.0160028
+968 916 -0.327779
+968 968 1
+969 35 -0.213124
+969 191 1
+969 882 -0.0198035
+969 917 -0.386876
+969 969 1
+970 36 -0.313853
+970 192 1
+970 882 -0.0291632
+970 918 -0.286147
+970 970 1
+971 37 -0.150698
+971 193 1
+971 882 -0.0140029
+971 919 -0.349302
+971 971 1
+972 38 -0.295086
+972 194 1
+972 882 -0.0274194
+972 920 -0.204914
+972 972 1
+973 39 -0.336836
+973 195 1
+973 882 -0.0312988
+973 921 -0.163164
+973 973 1
+974 40 -0.355496
+974 196 1
+974 882 -0.0330327
+974 922 -0.144504
+974 974 1
+975 41 -0.380661
+975 197 1
+975 882 -0.035371
+975 923 -0.119339
+975 975 1
+976 42 -0.150226
+976 198 1
+976 882 -0.013959
+976 924 -0.449774
+976 976 1
+977 43 -0.37766
+977 199 1
+977 882 -0.0350922
+977 925 -0.12234
+977 977 1
+978 44 -0.5
+978 200 1
+978 882 -0.0301953
+978 926 -0.167724
+978 978 1
+979 45 -0.186516
+979 201 1
+979 882 -0.0173311
+979 927 -0.313484
+979 979 1
+980 46 -0.265
+980 202 1
+980 882 -0.0246238
+980 928 -0.235
+980 980 1
+981 47 -0.38173
+981 203 1
+981 882 -0.0354704
+981 929 -0.11827
+981 981 1
+982 48 -0.490919
+982 204 1
+982 882 -0.0456162
+982 930 -0.00908071
+982 982 1
+983 49 -0.240662
+983 205 1
+983 931 -0.5
+983 983 1
+984 50 -0.25
+984 206 1
+984 882 -0.02323
+984 932 -0.25
+984 984 1
+985 51 -0.5
+985 207 1
+985 882 -0.04646
+985 985 1
+986 52 -0.5
+986 208 1
+986 882 -0.04646
+986 986 1
+987 774 -0.176719
+987 987 1
+988 775 -1
+988 988 1
+989 776 -1
+989 989 1
+990 777 -0.0646132
+990 831 -0.943785
+990 990 1
+991 778 -0.135877
+991 832 -0.835656
+991 991 1
+992 779 -0.382316
+992 992 1
+993 780 -0.153076
+993 834 -0.422188
+993 993 1
+994 781 -0.124808
+994 994 1
+995 449 1
+995 990 0.0119756
+995 991 0.153287
+995 993 0.0849625
+995 995 1
+995 996 0.19753
+995 997 0.173646
+995 998 0.224084
+995 999 0.0614154
+995 1001 0.00426236
+995 1002 0.0276956
+995 1003 0.0523351
+995 1004 0.0289403
+995 1005 0.0111644
+995 1006 0.00599157
+995 1007 0.00798041
+995 1008 0.00173498
+995 1009 0.0229097
+995 1011 0.00133997
+995 1012 0.00597925
+995 1016 0.0528537
+995 1017 0.023103
+995 1018 0.202076
+995 1019 0.00415617
+995 1020 0.0305956
+995 1021 0.140114
+995 1022 0.207264
+995 1023 0.00486545
+995 1024 0.122641
+995 1025 0.00406199
+995 1026 0.0117322
+995 1027 0.000232187
+995 1028 0.0152677
+995 1029 0.0205129
+995 1030 0.0205406
+995 1033 0.0780059
+995 1034 0.389183
+995 1036 0.328497
+995 1037 0.31807
+995 1210 -1
+996 783 -0.36138
+996 837 -0.192642
+996 996 1
+997 784 -0.47619
+997 838 -0.000103739
+997 997 1
+998 785 -0.197227
+998 839 -0.14811
+998 998 1
+999 786 -0.0270148
+999 840 -0.236027
+999 999 1
+1000 787 -0.177883
+1000 1000 1
+1001 788 -1
+1001 1001 1
+1002 789 -0.237508
+1002 843 -0.103765
+1002 1002 1
+1003 790 -0.344477
+1003 844 -0.206011
+1003 1003 1
+1004 791 -0.162299
+1004 845 -0.837701
+1004 1004 1
+1005 792 -0.275031
+1005 846 -0.346223
+1005 1005 1
+1006 793 -0.455538
+1006 847 -0.208054
+1006 1006 1
+1007 794 -1
+1007 848 -0.197961
+1007 1007 1
+1008 795 -0.481565
+1008 849 -0.271953
+1008 1008 1
+1009 796 -0.788139
+1009 850 -0.211861
+1009 1009 1
+1010 797 -1
+1010 1010 1
+1011 852 -1
+1011 1011 1
+1012 799 -0.31103
+1012 853 -0.30247
+1012 1012 1
+1013 800 -1
+1013 1013 1
+1014 801 -1
+1014 1014 1
+1015 802 -1
+1015 1015 1
+1016 803 -0.532489
+1016 857 -0.0628155
+1016 1016 1
+1017 804 -0.0925742
+1017 858 -0.388332
+1017 1017 1
+1018 805 -0.23931
+1018 859 -0.456516
+1018 1018 1
+1019 806 -0.0646626
+1019 860 -0.375884
+1019 1019 1
+1020 807 -0.0641104
+1020 861 -0.467698
+1020 1020 1
+1021 808 -0.734721
+1021 862 -0.265279
+1021 1021 1
+1022 809 -0.103601
+1022 863 -0.138687
+1022 1022 1
+1023 810 -0.286785
+1023 864 -0.713215
+1023 1023 1
+1024 811 -0.757335
+1024 865 -0.242665
+1024 1024 1
+1025 812 -0.655777
+1025 1025 1
+1026 813 -0.0920861
+1026 867 -0.907914
+1026 1026 1
+1027 814 -0.5
+1027 1027 1
+1028 815 -0.9985
+1028 869 -0.344249
+1028 1028 1
+1029 816 -0.83315
+1029 870 -0.16685
+1029 1029 1
+1030 817 -0.125281
+1030 871 -0.874719
+1030 1030 1
+1031 818 -0.5
+1031 1031 1
+1032 819 -0.227864
+1032 1032 1
+1033 820 -1
+1033 1033 1
+1034 821 -0.832012
+1034 1034 1
+1035 822 -0.264059
+1035 1035 1
+1036 823 -0.5
+1036 1036 1
+1037 491 1
+1037 990 0.00601169
+1037 991 0.0769497
+1037 993 0.0426509
+1037 995 0.0351297
+1037 996 0.0991594
+1037 997 0.0871696
+1037 998 0.11249
+1037 999 0.0308304
+1037 1001 0.00213969
+1037 1002 0.0139031
+1037 1003 0.0262721
+1037 1004 0.0145279
+1037 1005 0.00560451
+1037 1006 0.00300775
+1037 1007 0.00400615
+1037 1008 0.000870955
+1037 1009 0.0115006
+1037 1011 0.000672661
+1037 1012 0.00300157
+1037 1016 0.0265324
+1037 1017 0.0115976
+1037 1018 0.101441
+1037 1019 0.00208638
+1037 1020 0.0153589
+1037 1021 0.0703368
+1037 1022 0.104046
+1037 1023 0.00244244
+1037 1024 0.0615653
+1037 1025 0.00203911
+1037 1026 0.00588953
+1037 1027 0.000116557
+1037 1028 0.00766433
+1037 1029 0.0102974
+1037 1030 0.0103113
+1037 1033 0.0391587
+1037 1034 0.195369
+1037 1036 0.164905
+1037 1037 1
+1037 1210 -0.501997
+1038 825 -1
+1038 1038 1
+1039 826 -0.5
+1039 1039 1
+1040 827 -0.5
+1040 1040 1
+1041 495 1
+1041 1041 1
+1042 1042 1
+1043 1043 1
+1044 1044 1
+1045 496 0.149083
+1045 1045 1
+1046 1046 1
+1047 1047 1
+1048 497 1
+1048 1048 1
+1049 1049 1
+1050 1050 1
+1051 1051 1
+1052 1052 1
+1053 1053 1
+1054 498 1
+1054 1054 1
+1055 1055 1
+1056 499 0.547921
+1056 1056 1
+1057 1057 1
+1058 1058 1
+1059 1059 1
+1060 1060 1
+1061 1061 1
+1062 1062 1
+1063 1063 1
+1064 1064 1
+1065 1065 1
+1066 1066 1
+1067 1067 1
+1068 1068 1
+1069 1069 1
+1070 1070 1
+1071 1071 1
+1072 1072 1
+1073 1073 1
+1074 500 0.933784
+1074 1074 1
+1075 1075 1
+1076 1076 1
+1077 1077 1
+1078 1078 1
+1079 1079 1
+1080 1080 1
+1081 1081 1
+1082 1082 1
+1083 1083 1
+1084 1084 1
+1085 1085 1
+1086 501 0.26286
+1086 1086 1
+1087 1087 1
+1088 1088 1
+1089 1089 1
+1090 1090 1
+1091 1091 1
+1092 1092 1
+1093 1093 1
+1094 1094 1
+1095 799 -1
+1095 820 -0.0676126
+1095 821 -0.171072
+1095 822 -1
+1095 853 -0.297254
+1095 860 -0.00165756
+1095 870 -0.604874
+1095 871 -0.0830574
+1095 1095 1
+1096 799 -0.389236
+1096 820 -0.0337754
+1096 821 -0.122934
+1096 822 -0.978662
+1096 853 -0.36365
+1096 870 -0.053381
+1096 871 -0.0638712
+1096 1096 1
+1097 799 -0.0614863
+1097 820 -0.00930582
+1097 821 -0.0235457
+1097 822 -0.0121431
+1097 853 -0.304332
+1097 870 -0.477706
+1097 1097 1
+1098 797 -0.00144722
+1098 799 -0.00228269
+1098 800 -0.144243
+1098 801 -0.0319625
+1098 816 -0.0236271
+1098 820 -0.00138258
+1098 821 -0.00233209
+1098 822 -0.174961
+1098 850 -0.00185399
+1098 851 -0.00168575
+1098 853 -0.0106549
+1098 854 -0.00126683
+1098 870 -0.0874002
+1098 871 -0.0194126
+1098 1098 1
+1099 1099 -1
+1099 1151 0.252439
+1099 1203 -0.164485
+1100 1100 -1
+1100 1152 0.833114
+1100 1203 -0.502631
+1101 724 -1
+1101 1101 1
+1102 1102 -1
+1102 1154 1
+1102 1203 -0.690982
+1103 1103 -1
+1103 1155 0.718982
+1103 1203 -0.403858
+1104 1104 -1
+1104 1156 1
+1104 1203 -0.771115
+1105 1105 -1
+1105 1157 0.478418
+1105 1203 -0.268731
+1106 1106 -1
+1106 1158 0.526264
+1106 1203 -0.295607
+1107 1107 -1
+1107 1159 0.360745
+1107 1203 -0.244849
+1108 1108 -1
+1108 1160 0.156087
+1108 1203 -0.169505
+1109 1109 -1
+1109 1161 0.0735158
+1109 1203 -0.0498974
+1110 1110 -1
+1110 1162 0.0914569
+1110 1203 -0.07841
+1111 1111 -1
+1111 1163 0.05
+1111 1203 -0.0509048
+1112 1112 -1
+1112 1164 0.0807692
+1112 1203 -0.0626521
+1113 1113 -1
+1113 1165 0.03
+1113 1203 -0.0271492
+1114 1114 -1
+1114 1166 0.149867
+1114 1203 -0.135626
+1115 1115 -1
+1115 1167 0.06
+1115 1203 -0.0542984
+1116 1116 -1
+1116 1168 0.0580645
+1116 1203 -0.0525469
+1117 1117 -1
+1117 1169 0.06
+1117 1203 -0.0542984
+1118 1118 -1
+1118 1170 0.0439024
+1118 1203 -0.0397306
+1119 1119 -1
+1119 1171 0.0633333
+1119 1203 -0.0542984
+1120 1120 -1
+1120 1172 0.0633333
+1120 1203 -0.0542984
+1121 1121 -1
+1121 1173 0.06
+1121 1203 -0.0542984
+1122 1122 -1
+1122 1174 0.13161
+1122 1203 -0.0974487
+1123 1123 -1
+1123 1175 0.114565
+1123 1203 -0.0717775
+1124 1124 -1
+1124 1176 0.341249
+1124 1203 -0.213799
+1125 1125 -1
+1125 1177 0.0709677
+1125 1203 -0.0525469
+1126 1126 -1
+1126 1178 0.0652174
+1126 1203 -0.0708241
+1127 1127 -1
+1127 1179 0.2
+1127 1203 -0.108597
+1128 1128 -1
+1128 1180 0.0974367
+1128 1203 -0.0835368
+1129 1129 -1
+1129 1181 0.16
+1129 1203 -0.108597
+1130 1130 -1
+1130 1182 0.0489796
+1130 1203 -0.0332439
+1131 1131 -1
+1131 1183 0.0923077
+1131 1203 -0.0626521
+1132 1132 -1
+1132 1184 0.263306
+1132 1203 -0.178714
+1133 1133 -1
+1133 1185 0.0316667
+1133 1203 -0.0271492
+1134 1134 -1
+1134 1186 0.152349
+1134 1203 -0.112805
+1135 1135 -1
+1135 1187 0.756899
+1135 1203 -0.725266
+1136 1136 -1
+1136 1188 0.088
+1136 1203 -0.0651581
+1137 1137 -1
+1137 1189 0.322222
+1137 1203 -0.180995
+1138 1138 -1
+1138 1190 0.0709677
+1138 1203 -0.0525469
+1139 1139 -1
+1139 1191 0.412052
+1139 1203 -0.258159
+1140 1140 -1
+1140 1192 0.100018
+1140 1203 -0.0905135
+1141 1141 -1
+1141 1193 0.2125
+1141 1203 -0.203619
+1142 1142 -1
+1142 1194 1
+1142 1203 -1
+1143 766 -0.16
+1143 1143 1
+1144 767 -0.0854504
+1144 1144 1
+1145 768 -0.2
+1145 1145 1
+1146 769 -0.15
+1146 1146 1
+1147 770 -0.15
+1147 1147 1
+1148 771 -0.16
+1148 1148 1
+1149 772 -0.12
+1149 1149 1
+1150 1150 -1
+1150 1202 0.2125
+1150 1203 -0.203619
+1151 883 -1
+1151 1095 0.131871
+1151 1151 1
+1152 884 -1
+1152 1095 0.131871
+1152 1152 1
+1153 885 -1
+1153 1095 0.202387
+1153 1153 1
+1154 886 -0.622688
+1154 1095 0.151033
+1154 1154 1
+1155 887 -1
+1155 1095 0.131871
+1155 1155 1
+1156 888 -0.805762
+1156 1095 0.181034
+1156 1156 1
+1157 889 -1
+1157 1095 0.131871
+1157 1157 1
+1158 890 -1
+1158 1095 0.131871
+1158 1158 1
+1159 891 -1
+1159 1096 0.184096
+1159 1159 1
+1160 892 -1
+1160 1096 0.184096
+1160 1160 1
+1161 893 -1
+1161 1096 0.184096
+1161 1161 1
+1162 894 -1
+1162 1096 0.184096
+1162 1162 1
+1163 895 -1
+1163 1096 0.184096
+1163 1163 1
+1164 896 -1
+1164 1096 0.184096
+1164 1164 1
+1165 897 -1
+1165 1096 0.184096
+1165 1165 1
+1166 898 -1
+1166 1096 0.184096
+1166 1166 1
+1167 899 -1
+1167 1096 0.184096
+1167 1167 1
+1168 900 -1
+1168 1096 0.184096
+1168 1168 1
+1169 901 -1
+1169 1096 0.184096
+1169 1169 1
+1170 902 -1
+1170 1096 0.184096
+1170 1170 1
+1171 903 -1
+1171 1096 0.184096
+1171 1171 1
+1172 904 -1
+1172 1096 0.184096
+1172 1172 1
+1173 905 -1
+1173 1096 0.184096
+1173 1173 1
+1174 906 -1
+1174 1096 0.184096
+1174 1174 1
+1175 907 -1
+1175 1096 0.184096
+1175 1175 1
+1176 908 -1
+1176 1096 0.184096
+1176 1176 1
+1177 909 -1
+1177 1096 0.184096
+1177 1177 1
+1178 910 -1
+1178 1096 0.184096
+1178 1178 1
+1179 911 -1
+1179 1096 0.184096
+1179 1179 1
+1180 912 -1
+1180 1096 0.184096
+1180 1180 1
+1181 913 -1
+1181 1096 0.184096
+1181 1181 1
+1182 914 -1
+1182 1096 0.184096
+1182 1182 1
+1183 915 -1
+1183 1096 0.184096
+1183 1183 1
+1184 916 -1
+1184 1096 0.184096
+1184 1184 1
+1185 917 -1
+1185 1096 0.184096
+1185 1185 1
+1186 918 -1
+1186 1096 0.184096
+1186 1186 1
+1187 919 -1
+1187 1096 0.184096
+1187 1187 1
+1188 920 -1
+1188 1096 0.184096
+1188 1188 1
+1189 921 -1
+1189 1096 0.184096
+1189 1189 1
+1190 922 -1
+1190 1096 0.184096
+1190 1190 1
+1191 923 -1
+1191 1096 0.184096
+1191 1191 1
+1192 924 -1
+1192 1096 0.184096
+1192 1192 1
+1193 925 -1
+1193 1097 0.958208
+1193 1193 1
+1194 926 -1
+1194 1097 1
+1194 1194 1
+1195 927 -1
+1195 1098 1
+1195 1195 1
+1196 928 -1
+1196 1098 1
+1196 1196 1
+1197 929 -1
+1197 1098 1
+1197 1197 1
+1198 930 -1
+1198 1098 1
+1198 1198 1
+1199 931 -1
+1199 1098 1
+1199 1199 1
+1200 932 -1
+1200 1098 1
+1200 1200 1
+1201 933 -1
+1201 1098 1
+1201 1201 1
+1202 934 -1
+1202 1097 0.958208
+1202 1202 1
+1203 1101 -1
+1203 1153 1
+1203 1203 -1
+1204 53 -0.00356159
+1204 54 -0.0336925
+1204 55 -0.0901731
+1204 56 -0.0566755
+1204 57 -0.0280032
+1204 58 -0.0498668
+1204 59 -0.0237598
+1204 60 -0.0164391
+1204 61 -0.00231136
+1204 62 -0.00248041
+1204 63 -0.00127227
+1204 64 -0.0053773
+1204 65 -0.00257951
+1204 66 -0.00156979
+1204 67 -0.00218603
+1204 68 -0.00861218
+1204 69 -0.00069778
+1204 70 -0.000519145
+1204 71 -0.00132794
+1204 72 -0.000291507
+1204 73 -0.00151601
+1204 74 -9.04287e-05
+1204 75 -0.000142274
+1204 76 -0.00739766
+1204 77 -0.00819739
+1204 78 -0.0020846
+1204 79 -0.00365354
+1204 80 -0.00488446
+1204 81 -0.00122505
+1204 82 -0.0436096
+1204 83 -0.000693262
+1204 84 -0.00165847
+1204 85 -0.00810871
+1204 86 -0.00322366
+1204 87 -0.00127205
+1204 88 -0.0130608
+1204 89 -0.00165249
+1204 90 -0.00177342
+1204 91 -0.0003165
+1204 92 -0.00204896
+1204 93 -0.023698
+1204 94 -0.00135483
+1204 95 -0.00156582
+1204 96 -0.0538427
+1204 97 -0.0221646
+1204 98 -0.142507
+1204 99 -0.0482242
+1204 100 -0.179205
+1204 103 -0.0576347
+1204 104 -0.0317969
+1204 1204 1
+1205 664 -2.1238e-05
+1205 665 -0.00138877
+1205 666 -0.00169529
+1205 667 -0.0130306
+1205 668 -0.0715933
+1205 670 -0.0156391
+1205 671 -0.0196914
+1205 672 -0.000465031
+1205 673 -0.0646045
+1205 674 -1.41097e-05
+1205 675 -0.0213195
+1205 676 -0.0482019
+1205 677 -0.0117546
+1205 678 -9.87693e-05
+1205 679 -0.00290226
+1205 680 -0.00432888
+1205 681 -0.0161348
+1205 682 -0.00365646
+1205 683 -0.00112062
+1205 684 -0.00128508
+1205 685 -0.000721945
+1205 686 -0.00312228
+1205 687 -0.000479361
+1205 688 -0.00376875
+1205 689 -0.00984223
+1205 690 -0.00119829
+1205 691 -8.72301e-05
+1205 692 -0.00276712
+1205 693 -0.002697
+1205 694 -0.0204224
+1205 695 -0.121343
+1205 696 -0.0130022
+1205 697 -0.00675853
+1205 698 -0.0129594
+1205 699 -0.0472033
+1205 700 -0.00285192
+1205 701 -0.0533207
+1205 702 -0.0206797
+1205 703 -0.0094465
+1205 705 -0.00370004
+1205 706 -0.0503387
+1205 707 -0.0259307
+1205 715 -0.288725
+1205 1205 1
+1206 495 -0.11692
+1206 496 -0.0323978
+1206 497 -0.043002
+1206 498 -0.0556474
+1206 499 -0.0339496
+1206 500 -0.021862
+1206 501 -0.252024
+1206 1041 -0.00584602
+1206 1042 -0.000453596
+1206 1043 -0.000104409
+1206 1044 -0.00156987
+1206 1045 -0.0108657
+1206 1047 -0.000602874
+1206 1048 -0.0240811
+1206 1049 -0.00540687
+1206 1050 -0.0217262
+1206 1051 -0.0160277
+1206 1052 -0.00016788
+1206 1053 -0.000827674
+1206 1054 -0.0133554
+1206 1055 -3.36677e-05
+1206 1056 -0.00309803
+1206 1057 -1.20522e-05
+1206 1058 -0.0014971
+1206 1059 -7.5916e-05
+1206 1060 -0.000157007
+1206 1061 -0.00011764
+1206 1062 -5.89512e-07
+1206 1063 -1.72268e-05
+1206 1065 -0.000583617
+1206 1066 -0.00450151
+1206 1067 -0.00143061
+1206 1068 -2.23359e-05
+1206 1069 -0.000210587
+1206 1070 -0.00193471
+1206 1071 -0.00185886
+1206 1072 -0.0156699
+1206 1073 -0.00145537
+1206 1074 -0.00117061
+1206 1075 -0.00473823
+1206 1076 -0.0011633
+1206 1077 -0.00446149
+1206 1078 -0.0120799
+1206 1079 -4.05453e-05
+1206 1080 -0.00253228
+1206 1082 -7.26409e-05
+1206 1083 -0.00411938
+1206 1084 -0.00660947
+1206 1086 -0.0479387
+1206 1092 -0.265067
+1206 1206 1
+1207 1205 0.891321
+1207 1206 -1
+1207 1207 1
+1208 777 -0.00139419
+1208 778 -0.0375281
+1208 780 -0.0234336
+1208 782 -0.142696
+1208 783 -0.154243
+1208 784 -0.148987
+1208 785 -0.0796309
+1208 786 -0.0065742
+1208 788 -0.0102514
+1208 789 -0.0203145
+1208 790 -0.0324831
+1208 791 -0.0112967
+1208 792 -0.00738503
+1208 793 -0.00656445
+1208 794 -0.0246457
+1208 795 -0.00258027
+1208 796 -0.0434266
+1208 799 -0.00447284
+1208 803 -0.0676893
+1208 804 -0.00660502
+1208 805 -0.191619
+1208 806 -0.000646368
+1208 807 -0.00423834
+1208 808 -0.185485
+1208 809 -0.0386895
+1208 810 -0.00363747
+1208 811 -0.179137
+1208 812 -0.00513758
+1208 813 -0.00259841
+1208 814 -0.000209176
+1208 815 -0.04708
+1208 816 -0.041104
+1208 817 -0.00794721
+1208 820 -0.240904
+1208 821 -1
+1208 823 -0.507245
+1208 824 -0.753005
+1208 831 -0.0203645
+1208 832 -0.230801
+1208 834 -0.0646304
+1208 836 -0.000953411
+1208 837 -0.0822229
+1208 838 -3.24572e-05
+1208 839 -0.0597999
+1208 840 -0.0574384
+1208 843 -0.00887517
+1208 844 -0.0194263
+1208 845 -0.0583077
+1208 846 -0.00929663
+1208 847 -0.00299812
+1208 848 -0.0048789
+1208 849 -0.00145715
+1208 850 -0.0116736
+1208 852 -0.00413819
+1208 853 -0.00434974
+1208 857 -0.00798503
+1208 858 -0.0277068
+1208 859 -0.365539
+1208 860 -0.00375734
+1208 861 -0.0309196
+1208 862 -0.0669713
+1208 863 -0.0517922
+1208 864 -0.00904613
+1208 865 -0.057399
+1208 867 -0.0256187
+1208 869 -0.0162316
+1208 870 -0.00823165
+1208 871 -0.0554877
+1208 1208 1
+1209 1095 -1
+1209 1096 -1
+1209 1097 -0.246717
+1209 1209 1
+1210 1208 0.709287
+1210 1210 -1
+1211 1209 1
+1211 1211 -1
+1212 775 -0.869892
+1212 820 -0.00880705
+1212 821 -0.0458645
+1212 1042 1
+1212 1212 1
+1213 776 -0.870138
+1213 820 -0.0088084
+1213 821 -0.0457606
+1213 1043 1
+1213 1213 1
+1214 777 -1
+1214 820 -0.00876915
+1214 821 -0.045843
+1214 1044 1
+1214 1214 1
+1215 779 -0.382316
+1215 1046 1
+1215 1215 1
+1216 780 -0.500512
+1216 820 -0.00877156
+1216 821 -0.0458295
+1216 1047 1
+1216 1216 1
+1217 782 -0.24815
+1217 820 -0.0351032
+1217 821 -0.0125025
+1217 1049 1
+1217 1217 1
+1218 783 -0.267686
+1218 820 -0.0351011
+1218 821 -0.0125047
+1218 1050 1
+1218 1218 1
+1219 784 -0.414376
+1219 820 -0.00877562
+1219 821 -0.0458491
+1219 1051 1
+1219 1219 1
+1220 785 -0.300469
+1220 820 -0.00873086
+1220 821 -0.0458608
+1220 1052 1
+1220 1220 1
+1221 786 -0.228839
+1221 820 -0.00878509
+1221 821 -0.0458508
+1221 1053 1
+1221 1221 1
+1222 788 -0.86965
+1222 820 -0.00853635
+1222 821 -0.0462225
+1222 1055 1
+1222 1222 1
+1223 790 -0.478685
+1223 820 -0.00953845
+1223 821 -0.0453058
+1223 1057 1
+1223 1223 1
+1224 791 -0.870012
+1224 820 -0.00877307
+1224 821 -0.0458465
+1224 1058 1
+1224 1224 1
+1225 792 -0.448312
+1225 820 -0.00870723
+1225 821 -0.0456732
+1225 1059 1
+1225 1225 1
+1226 793 -0.577217
+1226 820 -0.00878635
+1226 821 -0.0459068
+1226 1060 1
+1226 1226 1
+1227 794 -0.810779
+1227 820 -0.00879491
+1227 821 -0.0459516
+1227 1061 1
+1227 1227 1
+1228 795 -0.440689
+1228 821 -0.0463126
+1228 1062 1
+1228 1228 1
+1229 796 -0.870722
+1229 820 -0.00834161
+1229 821 -0.0459604
+1229 1063 1
+1229 1229 1
+1230 797 -1
+1230 1064 1
+1230 1230 1
+1231 798 -0.870034
+1231 820 -0.00876552
+1231 821 -0.0458448
+1231 1065 1
+1231 1231 1
+1232 799 -0.387936
+1232 820 -0.00877231
+1232 821 -0.0458516
+1232 1066 1
+1232 1232 1
+1233 800 -0.869969
+1233 820 -0.00877899
+1233 821 -0.0458588
+1233 1067 1
+1233 1233 1
+1234 801 -0.868035
+1234 820 -0.00900698
+1234 821 -0.0464484
+1234 1068 1
+1234 1234 1
+1235 802 -0.869984
+1235 820 -0.00873443
+1235 821 -0.0458948
+1235 1069 1
+1235 1235 1
+1236 803 -0.517911
+1236 820 -0.00877923
+1236 821 -0.0458485
+1236 1070 1
+1236 1236 1
+1237 804 -0.418376
+1237 820 -0.00878186
+1237 821 -0.045854
+1237 1071 1
+1237 1237 1
+1238 805 -0.605368
+1238 820 -0.0087761
+1238 821 -0.0458489
+1238 1072 1
+1238 1238 1
+1239 806 -0.383285
+1239 820 -0.00876787
+1239 821 -0.0458478
+1239 1073 1
+1239 1239 1
+1240 808 -0.869999
+1240 820 -0.00877683
+1240 821 -0.0458484
+1240 1075 1
+1240 1240 1
+1241 809 -0.210788
+1241 820 -0.00877043
+1241 821 -0.0458588
+1241 1076 1
+1241 1241 1
+1242 810 -0.87001
+1242 820 -0.00877369
+1242 821 -0.0458468
+1242 1077 1
+1242 1242 1
+1243 811 -0.870005
+1243 820 -0.00877432
+1243 821 -0.0458485
+1243 1078 1
+1243 1243 1
+1244 812 -0.571024
+1244 820 -0.00850601
+1244 821 -0.0457889
+1244 1079 1
+1244 1244 1
+1245 813 -0.869995
+1245 820 -0.0087731
+1245 821 -0.0458538
+1245 1080 1
+1245 1245 1
+1246 814 -0.5
+1246 1081 1
+1246 1246 1
+1247 815 -0.870153
+1247 820 -0.00870415
+1247 821 -0.0458532
+1247 1082 1
+1247 1247 1
+1248 816 -0.869995
+1248 820 -0.00877676
+1248 821 -0.0458501
+1248 1083 1
+1248 1248 1
+1249 817 -0.869998
+1249 820 -0.00877485
+1249 821 -0.0458509
+1249 1084 1
+1249 1249 1
+1250 818 -0.5
+1250 1085 1
+1250 1250 1
+1251 820 -0.438768
+1251 1087 1
+1251 1251 1
+1252 821 -0.416813
+1252 1088 1
+1252 1252 1
+1253 822 -0.264059
+1253 1089 1
+1253 1253 1
+1254 823 -0.5
+1254 1090 1
+1254 1254 1
+1255 824 -0.5
+1255 1091 1
+1255 1255 1
+1256 825 -1
+1256 1092 1
+1256 1256 1
+1257 826 -0.5
+1257 1093 1
+1257 1257 1
+1258 827 -0.5
+1258 1094 1
+1258 1258 1
diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt
index 6499e3b49d4..557061c5fed 100644
--- a/omp/CMakeLists.txt
+++ b/omp/CMakeLists.txt
@@ -1,12 +1,17 @@
 add_library(ginkgo_omp $<TARGET_OBJECTS:ginkgo_omp_device> "")
+# we don't split up the dense kernels into distinct compilations
+list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 target_sources(ginkgo_omp
     PRIVATE
+    base/batch_multi_vector_kernels.cpp
     base/device_matrix_data_kernels.cpp
+    base/executor.cpp
     base/index_set_kernels.cpp
     base/scoped_device_id.cpp
     base/version.cpp
     components/prefix_sum_kernels.cpp
     distributed/matrix_kernels.cpp
+    distributed/partition_helpers_kernels.cpp
     distributed/partition_kernels.cpp
     distributed/vector_kernels.cpp
     factorization/cholesky_kernels.cpp
@@ -18,6 +23,8 @@ target_sources(ginkgo_omp
     factorization/par_ict_kernels.cpp
     factorization/par_ilu_kernels.cpp
     factorization/par_ilut_kernels.cpp
+    matrix/batch_dense_kernels.cpp
+    matrix/batch_ell_kernels.cpp
     matrix/coo_kernels.cpp
     matrix/csr_kernels.cpp
     matrix/dense_kernels.cpp
@@ -31,6 +38,7 @@ target_sources(ginkgo_omp
     preconditioner/isai_kernels.cpp
     preconditioner/jacobi_kernels.cpp
     reorder/rcm_kernels.cpp
+    solver/batch_bicgstab_kernels.cpp
     solver/cb_gmres_kernels.cpp
     solver/idr_kernels.cpp
     solver/lower_trs_kernels.cpp
diff --git a/omp/base/batch_multi_vector_kernels.cpp b/omp/base/batch_multi_vector_kernels.cpp
new file mode 100644
index 00000000000..6067e762c98
--- /dev/null
+++ b/omp/base/batch_multi_vector_kernels.cpp
@@ -0,0 +1,187 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/base/batch_multi_vector_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+
+
+#include "core/components/prefix_sum_kernels.hpp"
+#include "reference/base/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+/**
+ * @brief The batch::MultiVector matrix format namespace.
+ * @ref batch::MultiVector
+ * @ingroup batch_multi_vector
+ */
+namespace batch_multi_vector {
+
+
+#include "reference/base/batch_multi_vector_kernels.hpp.inc"
+
+
+template <typename ValueType>
+void scale(std::shared_ptr<const DefaultExecutor> exec,
+           const batch::MultiVector<ValueType>* const alpha,
+           batch::MultiVector<ValueType>* const x)
+{
+    const auto x_ub = host::get_batch_struct(x);
+    const auto alpha_ub = host::get_batch_struct(alpha);
+#pragma omp parallel for
+    for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
+        const auto alpha_b = gko::batch::extract_batch_item(alpha_ub, batch);
+        const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
+        scale_kernel(alpha_b, x_b);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL);
+
+
+template <typename ValueType>
+void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
+                const batch::MultiVector<ValueType>* const alpha,
+                const batch::MultiVector<ValueType>* const x,
+                batch::MultiVector<ValueType>* const y)
+{
+    const auto x_ub = host::get_batch_struct(x);
+    const auto y_ub = host::get_batch_struct(y);
+    const auto alpha_ub = host::get_batch_struct(alpha);
+#pragma omp parallel for
+    for (size_type batch = 0; batch < y->get_num_batch_items(); ++batch) {
+        const auto alpha_b = gko::batch::extract_batch_item(alpha_ub, batch);
+        const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
+        const auto y_b = gko::batch::extract_batch_item(y_ub, batch);
+        add_scaled_kernel(alpha_b, x_b, y_b);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL);
+
+
+template <typename ValueType>
+void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
+                 const batch::MultiVector<ValueType>* const x,
+                 const batch::MultiVector<ValueType>* const y,
+                 batch::MultiVector<ValueType>* const result)
+{
+    const auto x_ub = host::get_batch_struct(x);
+    const auto y_ub = host::get_batch_struct(y);
+    const auto res_ub = host::get_batch_struct(result);
+#pragma omp parallel for
+    for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) {
+        const auto res_b = gko::batch::extract_batch_item(res_ub, batch);
+        const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
+        const auto y_b = gko::batch::extract_batch_item(y_ub, batch);
+        compute_dot_product_kernel(x_b, y_b, res_b);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL);
+
+
+template <typename ValueType>
+void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
+                      const batch::MultiVector<ValueType>* const x,
+                      const batch::MultiVector<ValueType>* const y,
+                      batch::MultiVector<ValueType>* const result)
+{
+    const auto x_ub = host::get_batch_struct(x);
+    const auto y_ub = host::get_batch_struct(y);
+    const auto res_ub = host::get_batch_struct(result);
+#pragma omp parallel for
+    for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) {
+        const auto res_b = gko::batch::extract_batch_item(res_ub, batch);
+        const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
+        const auto y_b = gko::batch::extract_batch_item(y_ub, batch);
+        compute_conj_dot_product_kernel(x_b, y_b, res_b);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL);
+
+
+template <typename ValueType>
+void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
+                   const batch::MultiVector<ValueType>* const x,
+                   batch::MultiVector<remove_complex<ValueType>>* const result)
+{
+    const auto x_ub = host::get_batch_struct(x);
+    const auto res_ub = host::get_batch_struct(result);
+#pragma omp parallel for
+    for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) {
+        const auto res_b = gko::batch::extract_batch_item(res_ub, batch);
+        const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
+        compute_norm2_kernel(x_b, res_b);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL);
+
+
+template <typename ValueType>
+void copy(std::shared_ptr<const DefaultExecutor> exec,
+          const batch::MultiVector<ValueType>* x,
+          batch::MultiVector<ValueType>* result)
+{
+    const auto x_ub = host::get_batch_struct(x);
+    const auto result_ub = host::get_batch_struct(result);
+#pragma omp parallel for
+    for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
+        const auto result_b = gko::batch::extract_batch_item(result_ub, batch);
+        const auto x_b = gko::batch::extract_batch_item(x_ub, batch);
+        copy_kernel(x_b, result_b);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
+
+
+}  // namespace batch_multi_vector
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/test/base/cuda_executor_reset.cpp b/omp/base/executor.cpp
similarity index 63%
rename from cuda/test/base/cuda_executor_reset.cpp
rename to omp/base/executor.cpp
index c8159b9c4d7..49fd1332ed5 100644
--- a/cuda/test/base/cuda_executor_reset.cpp
+++ b/omp/base/executor.cpp
@@ -33,55 +33,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include <thread>
+#include <omp.h>
 
 
-#include <gtest/gtest.h>
+namespace gko {
 
 
-namespace {
-
-
-#define GTEST_ASSERT_NO_EXIT(statement) \
-    ASSERT_EXIT({ {statement} exit(0); }, ::testing::ExitedWithCode(0), "")
-
-
-TEST(DeviceReset, HipCuda)
-{
-    GTEST_ASSERT_NO_EXIT({
-        auto ref = gko::ReferenceExecutor::create();
-        auto hip = gko::HipExecutor::create(0, ref, true);
-        auto cuda = gko::CudaExecutor::create(0, ref, true);
-    });
-}
-
-
-TEST(DeviceReset, CudaHip)
-{
-    GTEST_ASSERT_NO_EXIT({
-        auto ref = gko::ReferenceExecutor::create();
-        auto cuda = gko::CudaExecutor::create(0, ref, true);
-        auto hip = gko::HipExecutor::create(0, ref, true);
-    });
-}
-
-
-void func()
-{
-    auto ref = gko::ReferenceExecutor::create();
-    auto exec = gko::CudaExecutor::create(0, ref, true);
-}
-
-
-TEST(DeviceReset, CudaCuda)
+int OmpExecutor::get_num_omp_threads()
 {
-    GTEST_ASSERT_NO_EXIT({
-        std::thread t1(func);
-        std::thread t2(func);
-        t1.join();
-        t2.join();
-    });
+    int num_threads;
+#pragma omp parallel
+#pragma omp single
+    num_threads = omp_get_num_threads();
+    return num_threads;
 }
 
 
-}  // namespace
+}  // namespace gko
diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp
index d8d081e323b..ef57803ad31 100644
--- a/omp/base/kernel_launch_reduction.hpp
+++ b/omp/base/kernel_launch_reduction.hpp
@@ -62,9 +62,11 @@ void run_kernel_reduction_impl(std::shared_ptr<const OmpExecutor> exec,
                                ValueType* result, size_type size,
                                array<char>& tmp, MappedKernelArgs... args)
 {
-    const auto num_threads = static_cast<int64>(omp_get_max_threads());
     const auto ssize = static_cast<int64>(size);
-    const auto work_per_thread = ceildiv(ssize, num_threads);
+    // Limit the number of threads to the number of columns
+    const auto num_threads = std::min<int64>(omp_get_max_threads(), ssize);
+    const auto work_per_thread =
+        ceildiv(ssize, std::max<int64>(num_threads, 1));
     const auto required_storage = sizeof(ValueType) * num_threads;
     if (tmp.get_num_elems() < required_storage) {
         tmp.resize_and_reset(required_storage);
@@ -73,14 +75,17 @@ void run_kernel_reduction_impl(std::shared_ptr<const OmpExecutor> exec,
 #pragma omp parallel num_threads(num_threads)
     {
         const auto thread_id = omp_get_thread_num();
-        const auto begin = thread_id * work_per_thread;
-        const auto end = std::min(ssize, begin + work_per_thread);
+        if (thread_id < num_threads) {
+            const auto begin = thread_id * work_per_thread;
+            const auto end = std::min(ssize, begin + work_per_thread);
 
-        auto local_partial = identity;
-        for (auto i = begin; i < end; i++) {
-            local_partial = op(local_partial, fn(i, map_to_device(args)...));
+            auto local_partial = identity;
+            for (auto i = begin; i < end; i++) {
+                local_partial =
+                    op(local_partial, fn(i, map_to_device(args)...));
+            }
+            partial[thread_id] = local_partial;
         }
-        partial[thread_id] = local_partial;
     }
     *result =
         finalize(std::accumulate(partial, partial + num_threads, identity, op));
@@ -99,8 +104,9 @@ void run_kernel_reduction_sized_impl(syn::value_list<int, remainder_cols>,
 {
     const auto rows = static_cast<int64>(size[0]);
     const auto cols = static_cast<int64>(size[1]);
-    const auto num_threads = static_cast<int64>(omp_get_max_threads());
-    const auto work_per_thread = ceildiv(rows, num_threads);
+    // Limit the number of threads to the number of columns
+    const auto num_threads = std::min<int64>(omp_get_max_threads(), rows);
+    const auto work_per_thread = ceildiv(rows, std::max<int64>(num_threads, 1));
     const auto required_storage = sizeof(ValueType) * num_threads;
     if (tmp.get_num_elems() < required_storage) {
         tmp.resize_and_reset(required_storage);
@@ -109,43 +115,46 @@ void run_kernel_reduction_sized_impl(syn::value_list<int, remainder_cols>,
     static_assert(remainder_cols < block_size, "remainder too large");
     const auto rounded_cols = cols / block_size * block_size;
     GKO_ASSERT(rounded_cols + remainder_cols == cols);
-#pragma omp parallel
+#pragma omp parallel num_threads(num_threads)
     {
         const auto thread_id = omp_get_thread_num();
-        const auto begin = thread_id * work_per_thread;
-        const auto end = std::min(rows, begin + work_per_thread);
-
-        auto local_partial = identity;
-        if (rounded_cols == 0 || cols == block_size) {
-            // we group all sizes <= block_size here and unroll explicitly
-            constexpr auto local_cols =
-                remainder_cols == 0 ? block_size : remainder_cols;
-            for (auto row = begin; row < end; row++) {
-#pragma unroll
-                for (int64 col = 0; col < local_cols; col++) {
-                    local_partial = op(local_partial, fn(row, col, args...));
-                }
-            }
-        } else {
-            // we operate in block_size blocks plus an explicitly unrolled
-            // remainder
-            for (auto row = begin; row < end; row++) {
-                for (int64 base_col = 0; base_col < rounded_cols;
-                     base_col += block_size) {
+        if (thread_id < num_threads) {
+            const auto begin = thread_id * work_per_thread;
+            const auto end = std::min(rows, begin + work_per_thread);
+
+            auto local_partial = identity;
+            if (rounded_cols == 0 || cols == block_size) {
+                // we group all sizes <= block_size here and unroll explicitly
+                constexpr auto local_cols =
+                    remainder_cols == 0 ? block_size : remainder_cols;
+                for (auto row = begin; row < end; row++) {
 #pragma unroll
-                    for (int64 i = 0; i < block_size; i++) {
+                    for (int64 col = 0; col < local_cols; col++) {
                         local_partial =
-                            op(local_partial, fn(row, base_col + i, args...));
+                            op(local_partial, fn(row, col, args...));
                     }
                 }
+            } else {
+                // we operate in block_size blocks plus an explicitly unrolled
+                // remainder
+                for (auto row = begin; row < end; row++) {
+                    for (int64 base_col = 0; base_col < rounded_cols;
+                         base_col += block_size) {
 #pragma unroll
-                for (int64 i = 0; i < remainder_cols; i++) {
-                    local_partial =
-                        op(local_partial, fn(row, rounded_cols + i, args...));
+                        for (int64 i = 0; i < block_size; i++) {
+                            local_partial = op(local_partial,
+                                               fn(row, base_col + i, args...));
+                        }
+                    }
+#pragma unroll
+                    for (int64 i = 0; i < remainder_cols; i++) {
+                        local_partial = op(local_partial,
+                                           fn(row, rounded_cols + i, args...));
+                    }
                 }
             }
+            partial[thread_id] = local_partial;
         }
-        partial[thread_id] = local_partial;
     }
     *result =
         finalize(std::accumulate(partial, partial + num_threads, identity, op));
@@ -210,12 +219,12 @@ void run_kernel_row_reduction_impl(std::shared_ptr<const OmpExecutor> exec,
     constexpr int block_size = 8;
     const auto rows = static_cast<int64>(size[0]);
     const auto cols = static_cast<int64>(size[1]);
-    const auto num_threads = static_cast<int64>(omp_get_max_threads());
+    const auto available_threads = static_cast<int64>(omp_get_max_threads());
     if (rows <= 0) {
         return;
     }
     // enough work to keep all threads busy or only very small reduction sizes
-    if (rows >= reduction_kernel_oversubscription * num_threads ||
+    if (rows >= reduction_kernel_oversubscription * available_threads ||
         cols < rows) {
 #pragma omp parallel for
         for (int64 row = 0; row < rows; row++) {
@@ -229,8 +238,12 @@ void run_kernel_row_reduction_impl(std::shared_ptr<const OmpExecutor> exec,
         }
     } else {
         // small number of rows and large reduction sizes: do partial sum first
-        const auto work_per_thread = ceildiv(cols, num_threads);
-        const auto required_storage = sizeof(ValueType) * rows * num_threads;
+        const auto num_threads = std::min<int64>(available_threads, cols);
+        const auto work_per_thread =
+            ceildiv(cols, std::max<int64>(num_threads, 1));
+        const auto temp_elems_per_row = num_threads;
+        const auto required_storage =
+            sizeof(ValueType) * rows * temp_elems_per_row;
         if (tmp.get_num_elems() < required_storage) {
             tmp.resize_and_reset(required_storage);
         }
@@ -238,16 +251,19 @@ void run_kernel_row_reduction_impl(std::shared_ptr<const OmpExecutor> exec,
 #pragma omp parallel num_threads(num_threads)
         {
             const auto thread_id = static_cast<int64>(omp_get_thread_num());
-            const auto begin = thread_id * work_per_thread;
-            const auto end = std::min(begin + work_per_thread, cols);
-            for (int64 row = 0; row < rows; row++) {
-                auto local_partial = identity;
-                for (int64 col = begin; col < end; col++) {
-                    local_partial = op(local_partial, [&]() {
-                        return fn(row, col, args...);
-                    }());
+            if (thread_id < num_threads) {
+                const auto begin = thread_id * work_per_thread;
+                const auto end = std::min(begin + work_per_thread, cols);
+                for (int64 row = 0; row < rows; row++) {
+                    auto local_partial = identity;
+                    for (int64 col = begin; col < end; col++) {
+                        local_partial = op(local_partial, [&]() {
+                            return fn(row, col, args...);
+                        }());
+                    }
+                    partial[row * temp_elems_per_row + thread_id] =
+                        local_partial;
                 }
-                partial[row * num_threads + thread_id] = local_partial;
             }
         }
         // then accumulate the partial sums and write to result
@@ -255,10 +271,11 @@ void run_kernel_row_reduction_impl(std::shared_ptr<const OmpExecutor> exec,
         for (int64 row = 0; row < rows; row++) {
             [&] {
                 auto local_partial = identity;
-                for (int64 thread_id = 0; thread_id < num_threads;
+                for (int64 thread_id = 0; thread_id < temp_elems_per_row;
                      thread_id++) {
-                    local_partial = op(local_partial,
-                                       partial[row * num_threads + thread_id]);
+                    local_partial =
+                        op(local_partial,
+                           partial[row * temp_elems_per_row + thread_id]);
                 }
                 result[row * result_stride] = finalize(local_partial);
             }();
@@ -302,12 +319,12 @@ void run_kernel_col_reduction_sized_impl(
 {
     const auto rows = static_cast<int64>(size[0]);
     const auto cols = static_cast<int64>(size[1]);
-    const auto num_threads = static_cast<int64>(omp_get_max_threads());
+    const auto available_threads = static_cast<int64>(omp_get_max_threads());
     static_assert(remainder_cols < block_size, "remainder too large");
     GKO_ASSERT(cols % block_size == remainder_cols);
     const auto num_col_blocks = ceildiv(cols, block_size);
     // enough work to keep all threads busy or only very small reduction sizes
-    if (cols >= reduction_kernel_oversubscription * num_threads ||
+    if (cols >= reduction_kernel_oversubscription * available_threads ||
         rows < cols) {
 #pragma omp parallel for
         for (int64 col_block = 0; col_block < num_col_blocks; col_block++) {
@@ -324,10 +341,14 @@ void run_kernel_col_reduction_sized_impl(
         }
     } else {
         // number of blocks that need to be reduced afterwards
-        const auto reduction_size =
-            ceildiv(reduction_kernel_oversubscription * num_threads, cols);
-        const auto rows_per_thread = ceildiv(rows, reduction_size);
-        const auto required_storage = sizeof(ValueType) * rows * reduction_size;
+        // This reduction_size definition ensures we don't use more temporary
+        // storage than the input vector
+        const auto reduction_size = std::min(
+            rows, ceildiv(reduction_kernel_oversubscription * available_threads,
+                          std::max<int64>(cols, 1)));
+        const auto rows_per_thread =
+            ceildiv(rows, std::max<int64>(reduction_size, 1));
+        const auto required_storage = sizeof(ValueType) * cols * reduction_size;
         if (tmp.get_num_elems() < required_storage) {
             tmp.resize_and_reset(required_storage);
         }
diff --git a/omp/distributed/partition_helpers_kernels.cpp b/omp/distributed/partition_helpers_kernels.cpp
new file mode 100644
index 00000000000..2c006a22885
--- /dev/null
+++ b/omp/distributed/partition_helpers_kernels.cpp
@@ -0,0 +1,72 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/distributed/partition_helpers_kernels.hpp"
+
+
+#include "core/base/iterator_factory.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+namespace partition_helpers {
+
+
+template <typename GlobalIndexType>
+void sort_by_range_start(
+    std::shared_ptr<const DefaultExecutor> exec,
+    array<GlobalIndexType>& range_start_ends,
+    array<experimental::distributed::comm_index_type>& part_ids)
+{
+    auto part_ids_d = part_ids.get_data();
+    auto num_parts = part_ids.get_num_elems();
+    auto start_it = detail::make_permute_iterator(
+        range_start_ends.get_data(), [](const auto i) { return 2 * i; });
+    auto end_it = detail::make_permute_iterator(
+        range_start_ends.get_data() + 1, [](const auto i) { return 2 * i; });
+    auto sort_it = detail::make_zip_iterator(start_it, end_it, part_ids_d);
+    // TODO: use TBB or parallel std with c++17
+    std::stable_sort(sort_it, sort_it + num_parts,
+                     [](const auto& a, const auto& b) {
+                         return std::get<0>(a) < std::get<0>(b);
+                     });
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
+    GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START);
+
+
+}  // namespace partition_helpers
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/factorization/lu_kernels.cpp b/omp/factorization/lu_kernels.cpp
index 8aa187d3ca0..f130efaaa54 100644
--- a/omp/factorization/lu_kernels.cpp
+++ b/omp/factorization/lu_kernels.cpp
@@ -130,6 +130,89 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
 
 
+template <typename IndexType>
+void symbolic_factorize_simple(
+    std::shared_ptr<const DefaultExecutor> exec, const IndexType* row_ptrs,
+    const IndexType* col_idxs, const IndexType* lookup_offsets,
+    const int64* lookup_descs, const int32* lookup_storage,
+    matrix::Csr<float, IndexType>* factors, IndexType* out_row_nnz)
+{
+    const auto num_rows = factors->get_size()[0];
+    const auto factor_row_ptrs = factors->get_const_row_ptrs();
+    const auto factor_cols = factors->get_const_col_idxs();
+    const auto factor_vals = factors->get_values();
+    array<IndexType> diag_idx_array{exec, num_rows};
+    const auto diag_idxs = diag_idx_array.get_data();
+    for (size_type row = 0; row < num_rows; row++) {
+        matrix::csr::device_sparsity_lookup<IndexType> lookup{
+            factor_row_ptrs, factor_cols,  lookup_offsets,
+            lookup_storage,  lookup_descs, row};
+        const auto factor_begin = factor_row_ptrs[row];
+        const auto factor_end = factor_row_ptrs[row + 1];
+        const auto mtx_begin = row_ptrs[row];
+        const auto mtx_end = row_ptrs[row + 1];
+        // initialize the row
+        std::fill(factor_vals + factor_begin, factor_vals + factor_end,
+                  zero<float>());
+        for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1]; nz++) {
+            const auto col = col_idxs[nz];
+            factor_vals[lookup.lookup_unsafe(col) + factor_begin] =
+                one<float>();
+        }
+        diag_idxs[row] = lookup.lookup_unsafe(row) + factor_begin;
+        const auto row_diag = diag_idxs[row];
+        factor_vals[row_diag] = one<float>();
+        // apply factorization
+        for (auto lower_nz = factor_begin; lower_nz < row_diag; lower_nz++) {
+            const auto dep = factor_cols[lower_nz];
+            const auto dep_diag_idx = diag_idxs[dep];
+            const auto dep_diag = factor_vals[dep_diag_idx];
+            const auto dep_end = factor_row_ptrs[dep + 1];
+            if (factor_vals[lower_nz] == one<float>()) {
+                for (auto dep_nz = dep_diag_idx + 1; dep_nz < dep_end;
+                     dep_nz++) {
+                    const auto col = factor_cols[dep_nz];
+                    const auto val = factor_vals[dep_nz];
+                    const auto nz = factor_begin + lookup.lookup_unsafe(col);
+                    if (val == one<float>()) {
+                        factor_vals[nz] = one<float>();
+                    }
+                }
+            }
+        }
+        IndexType row_nnz{};
+        for (auto nz = factor_begin; nz < factor_end; nz++) {
+            row_nnz += factor_vals[nz] == one<float>() ? 1 : 0;
+        }
+        out_row_nnz[row] = row_nnz;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE);
+
+
+template <typename IndexType>
+void symbolic_factorize_simple_finalize(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<float, IndexType>* factors, IndexType* out_col_idxs)
+{
+    const auto col_idxs = factors->get_const_col_idxs();
+    const auto vals = factors->get_const_values();
+    size_type output_idx{};
+    // copy all nonzero entries from the symmetric factor to the unsymmetric
+    // factor
+    for (size_type i = 0; i < factors->get_num_stored_elements(); i++) {
+        if (vals[i] == one<float>()) {
+            out_col_idxs[output_idx] = col_idxs[i];
+            ++output_idx;
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
+    GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE);
+
+
 }  // namespace lu_factorization
 }  // namespace omp
 }  // namespace kernels
diff --git a/omp/matrix/batch_dense_kernels.cpp b/omp/matrix/batch_dense_kernels.cpp
new file mode 100644
index 00000000000..b91a4133dba
--- /dev/null
+++ b/omp/matrix/batch_dense_kernels.cpp
@@ -0,0 +1,117 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/batch_dense_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+/**
+ * @brief The Dense matrix format namespace.
+ * @ref Dense
+ * @ingroup batch_dense
+ */
+namespace batch_dense {
+
+
+#include "reference/matrix/batch_dense_kernels.hpp.inc"
+
+
+template <typename ValueType>
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const batch::matrix::Dense<ValueType>* mat,
+                  const batch::MultiVector<ValueType>* b,
+                  batch::MultiVector<ValueType>* x)
+{
+    const auto b_ub = host::get_batch_struct(b);
+    const auto x_ub = host::get_batch_struct(x);
+    const auto mat_ub = host::get_batch_struct(mat);
+#pragma omp parallel for
+    for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
+        const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
+        const auto b_item = batch::extract_batch_item(b_ub, batch);
+        const auto x_item = batch::extract_batch_item(x_ub, batch);
+        simple_apply_kernel(mat_item, b_item, x_item);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL);
+
+
+template <typename ValueType>
+void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
+                    const batch::MultiVector<ValueType>* alpha,
+                    const batch::matrix::Dense<ValueType>* mat,
+                    const batch::MultiVector<ValueType>* b,
+                    const batch::MultiVector<ValueType>* beta,
+                    batch::MultiVector<ValueType>* x)
+{
+    const auto b_ub = host::get_batch_struct(b);
+    const auto x_ub = host::get_batch_struct(x);
+    const auto mat_ub = host::get_batch_struct(mat);
+    const auto alpha_ub = host::get_batch_struct(alpha);
+    const auto beta_ub = host::get_batch_struct(beta);
+#pragma omp parallel for
+    for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
+        const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
+        const auto b_item = batch::extract_batch_item(b_ub, batch);
+        const auto x_item = batch::extract_batch_item(x_ub, batch);
+        const auto alpha_item = batch::extract_batch_item(alpha_ub, batch);
+        const auto beta_item = batch::extract_batch_item(beta_ub, batch);
+        advanced_apply_kernel(alpha_item.values[0], mat_item, b_item,
+                              beta_item.values[0], x_item);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
+
+
+}  // namespace batch_dense
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/matrix/batch_ell_kernels.cpp b/omp/matrix/batch_ell_kernels.cpp
new file mode 100644
index 00000000000..17710a97366
--- /dev/null
+++ b/omp/matrix/batch_ell_kernels.cpp
@@ -0,0 +1,117 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/batch_ell_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+/**
+ * @brief The Ell matrix format namespace.
+ * @ref Ell
+ * @ingroup batch_ell
+ */
+namespace batch_ell {
+
+
+#include "reference/matrix/batch_ell_kernels.hpp.inc"
+
+
+template <typename ValueType, typename IndexType>
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const batch::matrix::Ell<ValueType, IndexType>* mat,
+                  const batch::MultiVector<ValueType>* b,
+                  batch::MultiVector<ValueType>* x)
+{
+    const auto b_ub = host::get_batch_struct(b);
+    const auto x_ub = host::get_batch_struct(x);
+    const auto mat_ub = host::get_batch_struct(mat);
+#pragma omp parallel for
+    for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
+        const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
+        const auto b_item = batch::extract_batch_item(b_ub, batch);
+        const auto x_item = batch::extract_batch_item(x_ub, batch);
+        simple_apply_kernel(mat_item, b_item, x_item);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+    GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
+                    const batch::MultiVector<ValueType>* alpha,
+                    const batch::matrix::Ell<ValueType, IndexType>* mat,
+                    const batch::MultiVector<ValueType>* b,
+                    const batch::MultiVector<ValueType>* beta,
+                    batch::MultiVector<ValueType>* x)
+{
+    const auto b_ub = host::get_batch_struct(b);
+    const auto x_ub = host::get_batch_struct(x);
+    const auto mat_ub = host::get_batch_struct(mat);
+    const auto alpha_ub = host::get_batch_struct(alpha);
+    const auto beta_ub = host::get_batch_struct(beta);
+#pragma omp parallel for
+    for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
+        const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
+        const auto b_item = batch::extract_batch_item(b_ub, batch);
+        const auto x_item = batch::extract_batch_item(x_ub, batch);
+        const auto alpha_item = batch::extract_batch_item(alpha_ub, batch);
+        const auto beta_item = batch::extract_batch_item(beta_ub, batch);
+        advanced_apply_kernel(alpha_item.values[0], mat_item, b_item,
+                              beta_item.values[0], x_item);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+    GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
+
+
+}  // namespace batch_ell
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp
index 7d4a5a7ebd1..19a173b37bd 100644
--- a/omp/matrix/csr_kernels.cpp
+++ b/omp/matrix/csr_kernels.cpp
@@ -909,6 +909,20 @@ void inv_symm_permute(std::shared_ptr<const DefaultExecutor> exec,
                       const IndexType* perm,
                       const matrix::Csr<ValueType, IndexType>* orig,
                       matrix::Csr<ValueType, IndexType>* permuted)
+{
+    inv_nonsymm_permute(exec, perm, perm, orig, permuted);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inv_nonsymm_permute(std::shared_ptr<const DefaultExecutor> exec,
+                         const IndexType* row_perm,
+                         const IndexType* column_perm,
+                         const matrix::Csr<ValueType, IndexType>* orig,
+                         matrix::Csr<ValueType, IndexType>* permuted)
 {
     auto in_row_ptrs = orig->get_const_row_ptrs();
     auto in_col_idxs = orig->get_const_col_idxs();
@@ -921,26 +935,26 @@ void inv_symm_permute(std::shared_ptr<const DefaultExecutor> exec,
 #pragma omp parallel for
     for (size_type row = 0; row < num_rows; ++row) {
         auto src_row = row;
-        auto dst_row = perm[row];
+        auto dst_row = row_perm[row];
         p_row_ptrs[dst_row] = in_row_ptrs[src_row + 1] - in_row_ptrs[src_row];
     }
     components::prefix_sum_nonnegative(exec, p_row_ptrs, num_rows + 1);
 #pragma omp parallel for
     for (size_type row = 0; row < num_rows; ++row) {
         auto src_row = row;
-        auto dst_row = perm[row];
+        auto dst_row = row_perm[row];
         auto src_begin = in_row_ptrs[src_row];
         auto dst_begin = p_row_ptrs[dst_row];
         auto row_size = in_row_ptrs[src_row + 1] - src_begin;
         for (IndexType i = 0; i < row_size; ++i) {
-            p_col_idxs[dst_begin + i] = perm[in_col_idxs[src_begin + i]];
+            p_col_idxs[dst_begin + i] = column_perm[in_col_idxs[src_begin + i]];
             p_vals[dst_begin + i] = in_vals[src_begin + i];
         }
     }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
+    GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -982,10 +996,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void inverse_row_permute(std::shared_ptr<const OmpExecutor> exec,
-                         const IndexType* perm,
-                         const matrix::Csr<ValueType, IndexType>* orig,
-                         matrix::Csr<ValueType, IndexType>* row_permuted)
+void inv_row_permute(std::shared_ptr<const OmpExecutor> exec,
+                     const IndexType* perm,
+                     const matrix::Csr<ValueType, IndexType>* orig,
+                     matrix::Csr<ValueType, IndexType>* row_permuted)
 {
     auto orig_row_ptrs = orig->get_const_row_ptrs();
     auto orig_col_idxs = orig->get_const_col_idxs();
@@ -1017,7 +1031,146 @@ void inverse_row_permute(std::shared_ptr<const OmpExecutor> exec,
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL);
+    GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inv_symm_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
+                            const ValueType* scale, const IndexType* perm,
+                            const matrix::Csr<ValueType, IndexType>* orig,
+                            matrix::Csr<ValueType, IndexType>* permuted)
+{
+    inv_nonsymm_scale_permute(exec, scale, perm, scale, perm, orig, permuted);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inv_nonsymm_scale_permute(std::shared_ptr<const DefaultExecutor> exec,
+                               const ValueType* row_scale,
+                               const IndexType* row_perm,
+                               const ValueType* col_scale,
+                               const IndexType* col_perm,
+                               const matrix::Csr<ValueType, IndexType>* orig,
+                               matrix::Csr<ValueType, IndexType>* permuted)
+{
+    auto in_row_ptrs = orig->get_const_row_ptrs();
+    auto in_col_idxs = orig->get_const_col_idxs();
+    auto in_vals = orig->get_const_values();
+    auto p_row_ptrs = permuted->get_row_ptrs();
+    auto p_col_idxs = permuted->get_col_idxs();
+    auto p_vals = permuted->get_values();
+    size_type num_rows = orig->get_size()[0];
+
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto src_row = row;
+        auto dst_row = row_perm[row];
+        p_row_ptrs[dst_row] = in_row_ptrs[src_row + 1] - in_row_ptrs[src_row];
+    }
+    components::prefix_sum_nonnegative(exec, p_row_ptrs, num_rows + 1);
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto src_row = row;
+        auto dst_row = row_perm[row];
+        auto src_begin = in_row_ptrs[src_row];
+        auto dst_begin = p_row_ptrs[dst_row];
+        auto row_size = in_row_ptrs[src_row + 1] - src_begin;
+        for (IndexType i = 0; i < row_size; ++i) {
+            const auto dst_col = col_perm[in_col_idxs[src_begin + i]];
+            p_col_idxs[dst_begin + i] = dst_col;
+            p_vals[dst_begin + i] = in_vals[src_begin + i] /
+                                    (row_scale[dst_row] * col_scale[dst_col]);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void row_scale_permute(std::shared_ptr<const OmpExecutor> exec,
+                       const ValueType* scale, const IndexType* perm,
+                       const matrix::Csr<ValueType, IndexType>* orig,
+                       matrix::Csr<ValueType, IndexType>* row_permuted)
+{
+    auto orig_row_ptrs = orig->get_const_row_ptrs();
+    auto orig_col_idxs = orig->get_const_col_idxs();
+    auto orig_vals = orig->get_const_values();
+    auto rp_row_ptrs = row_permuted->get_row_ptrs();
+    auto rp_col_idxs = row_permuted->get_col_idxs();
+    auto rp_vals = row_permuted->get_values();
+    size_type num_rows = orig->get_size()[0];
+
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto src_row = perm[row];
+        auto dst_row = row;
+        rp_row_ptrs[dst_row] =
+            orig_row_ptrs[src_row + 1] - orig_row_ptrs[src_row];
+    }
+    components::prefix_sum_nonnegative(exec, rp_row_ptrs, num_rows + 1);
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto src_row = perm[row];
+        auto dst_row = row;
+        auto src_begin = orig_row_ptrs[src_row];
+        auto dst_begin = rp_row_ptrs[dst_row];
+        auto row_size = orig_row_ptrs[src_row + 1] - src_begin;
+        std::copy_n(orig_col_idxs + src_begin, row_size,
+                    rp_col_idxs + dst_begin);
+        for (IndexType i = 0; i < row_size; i++) {
+            rp_vals[i + dst_begin] = orig_vals[i + src_begin] * scale[src_row];
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inv_row_scale_permute(std::shared_ptr<const OmpExecutor> exec,
+                           const ValueType* scale, const IndexType* perm,
+                           const matrix::Csr<ValueType, IndexType>* orig,
+                           matrix::Csr<ValueType, IndexType>* row_permuted)
+{
+    auto orig_row_ptrs = orig->get_const_row_ptrs();
+    auto orig_col_idxs = orig->get_const_col_idxs();
+    auto orig_vals = orig->get_const_values();
+    auto rp_row_ptrs = row_permuted->get_row_ptrs();
+    auto rp_col_idxs = row_permuted->get_col_idxs();
+    auto rp_vals = row_permuted->get_values();
+    size_type num_rows = orig->get_size()[0];
+
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto src_row = row;
+        auto dst_row = perm[row];
+        rp_row_ptrs[dst_row] =
+            orig_row_ptrs[src_row + 1] - orig_row_ptrs[src_row];
+    }
+    components::prefix_sum_nonnegative(exec, rp_row_ptrs, num_rows + 1);
+#pragma omp parallel for
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto src_row = row;
+        auto dst_row = perm[row];
+        auto src_begin = orig_row_ptrs[src_row];
+        auto dst_begin = rp_row_ptrs[dst_row];
+        auto row_size = orig_row_ptrs[src_row + 1] - src_begin;
+        std::copy_n(orig_col_idxs + src_begin, row_size,
+                    rp_col_idxs + dst_begin);
+        for (IndexType i = 0; i < row_size; i++) {
+            rp_vals[i + dst_begin] = orig_vals[i + src_begin] / scale[dst_row];
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -1134,12 +1287,17 @@ void add_scaled_identity(std::shared_ptr<const OmpExecutor> exec,
     const auto nrows = static_cast<IndexType>(mtx->get_size()[0]);
     const auto row_ptrs = mtx->get_const_row_ptrs();
     const auto vals = mtx->get_values();
+    const auto beta_val = beta->get_const_values()[0];
+    const auto alpha_val = alpha->get_const_values()[0];
 #pragma omp parallel for
     for (IndexType row = 0; row < nrows; row++) {
         for (IndexType iz = row_ptrs[row]; iz < row_ptrs[row + 1]; iz++) {
-            vals[iz] *= beta->get_const_values()[0];
-            if (row == mtx->get_const_col_idxs()[iz]) {
-                vals[iz] += alpha->get_const_values()[0];
+            if (beta_val != one<ValueType>()) {
+                vals[iz] *= beta_val;
+            }
+            if (row == mtx->get_const_col_idxs()[iz] &&
+                alpha_val != zero<ValueType>()) {
+                vals[iz] += alpha_val;
             }
         }
     }
diff --git a/omp/reorder/rcm_kernels.cpp b/omp/reorder/rcm_kernels.cpp
index 579770b9b2f..4de58456cc1 100644
--- a/omp/reorder/rcm_kernels.cpp
+++ b/omp/reorder/rcm_kernels.cpp
@@ -99,7 +99,7 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_RCM_GET_DEGREE_OF_NODES_KERNEL);
 
 // This constant controls how many nodes can be dequeued from the
 // UbfsLinearQueue at once at most. Increasing it reduces lock contention and
-// "unneccesary work", but disturbs queue ordering, generating extra work.
+// "unnecessary work", but disturbs queue ordering, generating extra work.
 constexpr int32 chunk_bound = 512;
 
 
@@ -235,8 +235,8 @@ struct UbfsLinearQueue {
 #define GKO_CMPXCHG_IMPL(ptr, ptr_expected, replace_with) \
     return __atomic_compare_exchange_n(                   \
         ptr, ptr_expected, replace_with, true,            \
-        std::memory_order::memory_order_acq_rel,          \
-        std::memory_order::memory_order_acquire);
+        static_cast<int>(std::memory_order_acq_rel),      \
+        static_cast<int>(std::memory_order_acquire));
 #endif
 
 /**
@@ -633,7 +633,7 @@ vector<IndexType> compute_level_offsets(std::shared_ptr<const OmpExecutor> exec,
 }
 
 
-// Signal value to which the entire permutation is intialized.
+// Signal value to which the entire permutation is initialized.
 // Threads spin on this value, until it is replaced by another value,
 // written by another thread.
 constexpr int32 perm_untouched = -1;
@@ -697,7 +697,7 @@ void write_permutation(std::shared_ptr<const OmpExecutor> exec,
 
                     // Will not be written by multiple threads, but can be read
                     // while written. This is only necessary to guarantee the
-                    // abscence of reads-while-writes.
+                    // absence of reads-while-writes.
                     IndexType neighbour_level;
 #pragma omp atomic read
                     neighbour_level = levels[neighbour];
diff --git a/omp/solver/batch_bicgstab_kernels.cpp b/omp/solver/batch_bicgstab_kernels.cpp
new file mode 100644
index 00000000000..16d4e4b5c61
--- /dev/null
+++ b/omp/solver/batch_bicgstab_kernels.cpp
@@ -0,0 +1,138 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/solver/batch_bicgstab_kernels.hpp"
+
+
+#include "core/solver/batch_dispatch.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+/**
+ * @brief The batch Bicgstab solver namespace.
+ *
+ * @ingroup batch_bicgstab
+ */
+namespace batch_bicgstab {
+
+
+namespace {
+
+
+constexpr int max_num_rhs = 1;
+
+
+#include "reference/base/batch_multi_vector_kernels.hpp.inc"
+#include "reference/matrix/batch_dense_kernels.hpp.inc"
+#include "reference/matrix/batch_ell_kernels.hpp.inc"
+#include "reference/solver/batch_bicgstab_kernels.hpp.inc"
+
+
+}  // unnamed namespace
+
+
+template <typename T>
+using settings = gko::kernels::batch_bicgstab::settings<T>;
+
+
+template <typename ValueType>
+class kernel_caller {
+public:
+    kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
+                  const settings<remove_complex<ValueType>> settings)
+        : exec_{std::move(exec)}, settings_{settings}
+    {}
+
+    template <typename BatchMatrixType, typename PrecondType, typename StopType,
+              typename LogType>
+    void call_kernel(
+        const LogType& logger, const BatchMatrixType& mat, PrecondType precond,
+        const gko::batch::multi_vector::uniform_batch<const ValueType>& b,
+        const gko::batch::multi_vector::uniform_batch<ValueType>& x) const
+    {
+        using real_type = typename gko::remove_complex<ValueType>;
+        const size_type num_batch_items = mat.num_batch_items;
+        const auto num_rows = mat.num_rows;
+        const auto num_rhs = b.num_rhs;
+        if (num_rhs > max_num_rhs) {
+            GKO_NOT_IMPLEMENTED;
+        }
+
+        const int local_size_bytes =
+            gko::kernels::batch_bicgstab::local_memory_requirement<ValueType>(
+                num_rows, num_rhs) +
+            PrecondType::dynamic_work_size(num_rows,
+                                           mat.get_single_item_num_nnz()) *
+                sizeof(ValueType);
+
+#pragma omp parallel for
+        for (size_type batch_id = 0; batch_id < num_batch_items; batch_id++) {
+            // TODO: Align to cache line boundary
+            // TODO: Allocate and free once per thread rather than once per
+            // work-item.
+            auto local_space = array<unsigned char>(exec_, local_size_bytes);
+            batch_entry_bicgstab_impl<StopType, PrecondType, LogType,
+                                      BatchMatrixType, ValueType>(
+                settings_, logger, precond, mat, b, x, batch_id,
+                local_space.get_data());
+        }
+    }
+
+private:
+    const std::shared_ptr<const DefaultExecutor> exec_;
+    const settings<remove_complex<ValueType>> settings_;
+};
+
+
+template <typename ValueType>
+void apply(std::shared_ptr<const DefaultExecutor> exec,
+           const settings<remove_complex<ValueType>>& settings,
+           const batch::BatchLinOp* const mat,
+           const batch::BatchLinOp* const precond,
+           const batch::MultiVector<ValueType>* const b,
+           batch::MultiVector<ValueType>* const x,
+           batch::log::detail::log_data<remove_complex<ValueType>>& logdata)
+{
+    auto dispatcher = batch::solver::create_dispatcher<ValueType>(
+        kernel_caller<ValueType>(exec, settings), settings, mat, precond);
+    dispatcher.apply(b, x, logdata);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL);
+
+
+}  // namespace batch_bicgstab
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/stop/residual_norm_kernels.cpp b/omp/stop/residual_norm_kernels.cpp
index 9b6fdbede64..37edf0d1176 100644
--- a/omp/stop/residual_norm_kernels.cpp
+++ b/omp/stop/residual_norm_kernels.cpp
@@ -65,7 +65,7 @@ void residual_norm(std::shared_ptr<const OmpExecutor> exec,
     bool local_one_changed = false;
 #pragma omp parallel for reduction(|| : local_one_changed)
     for (size_type i = 0; i < tau->get_size()[1]; ++i) {
-        if (tau->at(i) < rel_residual_goal * orig_tau->at(i)) {
+        if (tau->at(i) <= rel_residual_goal * orig_tau->at(i)) {
             stop_status->get_data()[i].converge(stoppingId, setFinalized);
             local_one_changed = true;
         }
@@ -110,7 +110,7 @@ void implicit_residual_norm(
     bool local_one_changed = false;
 #pragma omp parallel for reduction(|| : local_one_changed)
     for (size_type i = 0; i < tau->get_size()[1]; ++i) {
-        if (sqrt(abs(tau->at(i))) < rel_residual_goal * orig_tau->at(i)) {
+        if (sqrt(abs(tau->at(i))) <= rel_residual_goal * orig_tau->at(i)) {
             stop_status->get_data()[i].converge(stoppingId, setFinalized);
             local_one_changed = true;
         }
diff --git a/omp/test/base/CMakeLists.txt b/omp/test/base/CMakeLists.txt
index 4c511b6def7..cfd00fe28cf 100644
--- a/omp/test/base/CMakeLists.txt
+++ b/omp/test/base/CMakeLists.txt
@@ -1,4 +1,2 @@
-ginkgo_create_test(kernel_launch)
-target_compile_definitions(omp_test_base_kernel_launch PRIVATE GKO_COMPILING_OMP)
-target_link_libraries(omp_test_base_kernel_launch PRIVATE OpenMP::OpenMP_CXX)
-ginkgo_create_test(index_set)
+ginkgo_create_omp_test(kernel_launch)
+ginkgo_create_omp_test(index_set)
diff --git a/omp/test/matrix/CMakeLists.txt b/omp/test/matrix/CMakeLists.txt
index 88ab52e9c3f..398921ce75a 100644
--- a/omp/test/matrix/CMakeLists.txt
+++ b/omp/test/matrix/CMakeLists.txt
@@ -1 +1 @@
-ginkgo_create_test(fbcsr_kernels)
+ginkgo_create_omp_test(fbcsr_kernels)
diff --git a/omp/test/reorder/CMakeLists.txt b/omp/test/reorder/CMakeLists.txt
index 8987ae28a48..65aea4a0fdb 100644
--- a/omp/test/reorder/CMakeLists.txt
+++ b/omp/test/reorder/CMakeLists.txt
@@ -1 +1 @@
-ginkgo_create_test(rcm_kernels)
+ginkgo_create_test(rcm_kernels RESOURCE_TYPE cpu)
diff --git a/omp/test/reorder/rcm_kernels.cpp b/omp/test/reorder/rcm_kernels.cpp
index d2996ffb319..9c0797a8812 100644
--- a/omp/test/reorder/rcm_kernels.cpp
+++ b/omp/test/reorder/rcm_kernels.cpp
@@ -62,7 +62,7 @@ class Rcm : public ::testing::Test {
     using Mtx = gko::matrix::Dense<v_type>;
     using CsrMtx = gko::matrix::Csr<v_type, i_type>;
     using reorder_type = gko::reorder::Rcm<v_type, i_type>;
-    using strategy = gko::reorder::starting_strategy;
+    using new_reorder_type = gko::experimental::reorder::Rcm<i_type>;
     using perm_type = gko::matrix::Permutation<i_type>;
 
     Rcm()
@@ -110,24 +110,23 @@ class Rcm : public ::testing::Test {
     }
 
     static bool is_valid_start_node(std::shared_ptr<CsrMtx> mtx,
-                                    std::shared_ptr<reorder_type> reorder,
-                                    i_type start,
-                                    std::vector<bool>& already_visited)
+                                    const i_type* permutation, i_type start,
+                                    std::vector<bool>& already_visited,
+                                    gko::reorder::starting_strategy strategy)
     {
         if (already_visited[start]) {
             return false;
         }
 
-        const auto n = gko::as<perm_type>(reorder->get_permutation())
-                           ->get_permutation_size();
+        const auto n = mtx->get_size()[0];
         auto degrees = std::vector<i_type>(n);
         for (gko::size_type i = 0; i < n; ++i) {
             degrees[i] =
                 mtx->get_const_row_ptrs()[i + 1] - mtx->get_const_row_ptrs()[i];
         }
 
-        switch (reorder->get_parameters().strategy) {
-        case strategy::minimum_degree: {
+        switch (strategy) {
+        case gko::reorder::starting_strategy::minimum_degree: {
             auto min_degree = std::numeric_limits<i_type>::max();
             for (gko::size_type i = 0; i < n; ++i) {
                 if (!already_visited[i] && degrees[i] < min_degree) {
@@ -140,7 +139,7 @@ class Rcm : public ::testing::Test {
             break;
         }
 
-        case strategy::pseudo_peripheral: {
+        case gko::reorder::starting_strategy::pseudo_peripheral: {
             // Check if any valid contender has a lowereq height than the
             // selected start node.
 
@@ -196,10 +195,10 @@ class Rcm : public ::testing::Test {
     }
 
     static bool is_rcm_ordered(std::shared_ptr<CsrMtx> mtx,
-                               std::shared_ptr<reorder_type> reorder)
+                               const i_type* permutation,
+                               gko::reorder::starting_strategy strategy)
     {
-        const auto n = gko::as<perm_type>(reorder->get_permutation())
-                           ->get_permutation_size();
+        const auto n = mtx->get_size()[0];
         const auto row_ptrs = mtx->get_const_row_ptrs();
         const auto col_idxs = mtx->get_const_col_idxs();
         auto degrees = std::vector<i_type>(n);
@@ -210,14 +209,8 @@ class Rcm : public ::testing::Test {
 
         // Following checks for cm ordering, therefore create a reversed perm.
         auto perm = std::vector<i_type>(n);
-        std::copy_n(gko::as<perm_type>(reorder->get_permutation())
-                        ->get_const_permutation(),
-                    n, perm.begin());
-        for (gko::size_type i = 0; i < n / 2; ++i) {
-            const auto tmp = perm[i];
-            perm[i] = perm[n - i - 1];
-            perm[n - i - 1] = tmp;
-        }
+        std::copy_n(permutation, n, perm.begin());
+        std::reverse(perm.begin(), perm.end());
 
         // Now check for cm ordering.
 
@@ -225,8 +218,8 @@ class Rcm : public ::testing::Test {
         std::vector<bool> already_visited(n);
         while (base_offset != n) {
             // Assert valid start node.
-            if (!is_valid_start_node(mtx, reorder, perm[base_offset],
-                                     already_visited)) {
+            if (!is_valid_start_node(mtx, permutation, perm[base_offset],
+                                     already_visited, strategy)) {
                 return false;
             }
 
@@ -331,7 +324,30 @@ TEST_F(Rcm, OmpPermutationIsRcmOrdered)
 
     auto perm = d_reorder_op->get_permutation();
 
-    ASSERT_PRED2(is_rcm_ordered, d_1138_bus_mtx, d_reorder_op);
+    ASSERT_PRED3(is_rcm_ordered, d_1138_bus_mtx, perm->get_const_permutation(),
+                 d_reorder_op->get_parameters().strategy);
+}
+
+TEST_F(Rcm, OmpPermutationIsRcmOrderedMinDegree)
+{
+    d_reorder_op =
+        reorder_type::build()
+            .with_strategy(gko::reorder::starting_strategy::minimum_degree)
+            .on(omp)
+            ->generate(d_1138_bus_mtx);
+
+    auto perm = d_reorder_op->get_permutation();
+
+    ASSERT_PRED3(is_rcm_ordered, d_1138_bus_mtx, perm->get_const_permutation(),
+                 d_reorder_op->get_parameters().strategy);
+}
+
+TEST_F(Rcm, OmpPermutationIsRcmOrderedNewInterface)
+{
+    auto perm = new_reorder_type::build().on(omp)->generate(d_1138_bus_mtx);
+
+    ASSERT_PRED3(is_rcm_ordered, d_1138_bus_mtx, perm->get_const_permutation(),
+                 gko::reorder::starting_strategy::pseudo_peripheral);
 }
 
 }  // namespace
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index ab04aec75a1..f8dff69723b 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_library(ginkgo_reference $<TARGET_OBJECTS:ginkgo_reference_device> "")
 target_sources(ginkgo_reference
     PRIVATE
+    base/batch_multi_vector_kernels.cpp
     base/device_matrix_data_kernels.cpp
     base/index_set_kernels.cpp
     base/scoped_device_id.cpp
@@ -12,6 +13,7 @@ target_sources(ginkgo_reference
     components/precision_conversion_kernels.cpp
     components/prefix_sum_kernels.cpp
     distributed/matrix_kernels.cpp
+    distributed/partition_helpers_kernels.cpp
     distributed/partition_kernels.cpp
     distributed/vector_kernels.cpp
     factorization/cholesky_kernels.cpp
@@ -23,6 +25,8 @@ target_sources(ginkgo_reference
     factorization/par_ict_kernels.cpp
     factorization/par_ilu_kernels.cpp
     factorization/par_ilut_kernels.cpp
+    matrix/batch_dense_kernels.cpp
+    matrix/batch_ell_kernels.cpp
     matrix/coo_kernels.cpp
     matrix/csr_kernels.cpp
     matrix/dense_kernels.cpp
@@ -31,12 +35,15 @@ target_sources(ginkgo_reference
     matrix/fbcsr_kernels.cpp
     matrix/fft_kernels.cpp
     matrix/hybrid_kernels.cpp
+    matrix/permutation_kernels.cpp
+    matrix/scaled_permutation_kernels.cpp
     matrix/sellp_kernels.cpp
     matrix/sparsity_csr_kernels.cpp
     multigrid/pgm_kernels.cpp
     preconditioner/isai_kernels.cpp
     preconditioner/jacobi_kernels.cpp
     reorder/rcm_kernels.cpp
+    solver/batch_bicgstab_kernels.cpp
     solver/bicg_kernels.cpp
     solver/bicgstab_kernels.cpp
     solver/cg_kernels.cpp
diff --git a/reference/base/batch_multi_vector_kernels.cpp b/reference/base/batch_multi_vector_kernels.cpp
new file mode 100644
index 00000000000..89476e61453
--- /dev/null
+++ b/reference/base/batch_multi_vector_kernels.cpp
@@ -0,0 +1,181 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/base/batch_multi_vector_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/range_accessors.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "reference/base/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+/**
+ * @brief The batch::MultiVector matrix format namespace.
+ * @ref batch::MultiVector
+ * @ingroup batch_multi_vector
+ */
+namespace batch_multi_vector {
+
+
+#include "reference/base/batch_multi_vector_kernels.hpp.inc"
+
+
+template <typename ValueType>
+void scale(std::shared_ptr<const DefaultExecutor> exec,
+           const batch::MultiVector<ValueType>* alpha,
+           batch::MultiVector<ValueType>* x)
+{
+    const auto x_ub = host::get_batch_struct(x);
+    const auto alpha_ub = host::get_batch_struct(alpha);
+    for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
+        const auto alpha_b = batch::extract_batch_item(alpha_ub, batch);
+        const auto x_b = batch::extract_batch_item(x_ub, batch);
+        scale_kernel(alpha_b, x_b);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_SCALE_KERNEL);
+
+
+template <typename ValueType>
+void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
+                const batch::MultiVector<ValueType>* alpha,
+                const batch::MultiVector<ValueType>* x,
+                batch::MultiVector<ValueType>* y)
+{
+    const auto x_ub = host::get_batch_struct(x);
+    const auto y_ub = host::get_batch_struct(y);
+    const auto alpha_ub = host::get_batch_struct(alpha);
+    for (size_type batch = 0; batch < y->get_num_batch_items(); ++batch) {
+        const auto alpha_b = batch::extract_batch_item(alpha_ub, batch);
+        const auto x_b = batch::extract_batch_item(x_ub, batch);
+        const auto y_b = batch::extract_batch_item(y_ub, batch);
+        add_scaled_kernel(alpha_b, x_b, y_b);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_ADD_SCALED_KERNEL);
+
+
+template <typename ValueType>
+void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
+                 const batch::MultiVector<ValueType>* x,
+                 const batch::MultiVector<ValueType>* y,
+                 batch::MultiVector<ValueType>* result)
+{
+    const auto x_ub = host::get_batch_struct(x);
+    const auto y_ub = host::get_batch_struct(y);
+    const auto res_ub = host::get_batch_struct(result);
+    for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) {
+        const auto res_b = batch::extract_batch_item(res_ub, batch);
+        const auto x_b = batch::extract_batch_item(x_ub, batch);
+        const auto y_b = batch::extract_batch_item(y_ub, batch);
+        compute_dot_product_kernel(x_b, y_b, res_b);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_DOT_KERNEL);
+
+
+template <typename ValueType>
+void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
+                      const batch::MultiVector<ValueType>* x,
+                      const batch::MultiVector<ValueType>* y,
+                      batch::MultiVector<ValueType>* result)
+{
+    const auto x_ub = host::get_batch_struct(x);
+    const auto y_ub = host::get_batch_struct(y);
+    const auto res_ub = host::get_batch_struct(result);
+    for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) {
+        const auto res_b = batch::extract_batch_item(res_ub, batch);
+        const auto x_b = batch::extract_batch_item(x_ub, batch);
+        const auto y_b = batch::extract_batch_item(y_ub, batch);
+        compute_conj_dot_product_kernel(x_b, y_b, res_b);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_CONJ_DOT_KERNEL);
+
+
+template <typename ValueType>
+void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
+                   const batch::MultiVector<ValueType>* x,
+                   batch::MultiVector<remove_complex<ValueType>>* result)
+{
+    const auto x_ub = host::get_batch_struct(x);
+    const auto res_ub = host::get_batch_struct(result);
+    for (size_type batch = 0; batch < result->get_num_batch_items(); ++batch) {
+        const auto res_b = batch::extract_batch_item(res_ub, batch);
+        const auto x_b = batch::extract_batch_item(x_ub, batch);
+        compute_norm2_kernel(x_b, res_b);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_MULTI_VECTOR_COMPUTE_NORM2_KERNEL);
+
+
+template <typename ValueType>
+void copy(std::shared_ptr<const DefaultExecutor> exec,
+          const batch::MultiVector<ValueType>* x,
+          batch::MultiVector<ValueType>* result)
+{
+    const auto x_ub = host::get_batch_struct(x);
+    const auto result_ub = host::get_batch_struct(result);
+    for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
+        const auto result_b = batch::extract_batch_item(result_ub, batch);
+        const auto x_b = batch::extract_batch_item(x_ub, batch);
+        copy_kernel(x_b, result_b);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
+
+
+}  // namespace batch_multi_vector
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/base/batch_multi_vector_kernels.hpp.inc b/reference/base/batch_multi_vector_kernels.hpp.inc
new file mode 100644
index 00000000000..a14b18ec9f7
--- /dev/null
+++ b/reference/base/batch_multi_vector_kernels.hpp.inc
@@ -0,0 +1,152 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+template <typename ValueType>
+inline void scale_kernel(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+    const gko::batch::multi_vector::batch_item<ValueType>& x)
+{
+    if (alpha.num_rhs == 1) {
+        for (int i = 0; i < x.num_rows; ++i) {
+            for (int j = 0; j < x.num_rhs; ++j) {
+                x.values[i * x.stride + j] *= alpha.values[0];
+            }
+        }
+    } else {
+        for (int i = 0; i < x.num_rows; ++i) {
+            for (int j = 0; j < x.num_rhs; ++j) {
+                x.values[i * x.stride + j] *= alpha.values[j];
+            }
+        }
+    }
+}
+
+
+template <typename ValueType>
+inline void add_scaled_kernel(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<ValueType>& y)
+{
+    if (alpha.num_rhs == 1) {
+        for (int i = 0; i < x.num_rows; ++i) {
+            for (int j = 0; j < x.num_rhs; ++j) {
+                y.values[i * y.stride + j] +=
+                    alpha.values[0] * x.values[i * x.stride + j];
+            }
+        }
+    } else {
+        for (int i = 0; i < x.num_rows; ++i) {
+            for (int j = 0; j < x.num_rhs; ++j) {
+                y.values[i * y.stride + j] +=
+                    alpha.values[j] * x.values[i * x.stride + j];
+            }
+        }
+    }
+}
+
+
+template <typename ValueType>
+inline void compute_dot_product_kernel(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<const ValueType>& y,
+    const gko::batch::multi_vector::batch_item<ValueType>& result)
+{
+    for (int c = 0; c < result.num_rhs; c++) {
+        result.values[c] = gko::zero<ValueType>();
+    }
+
+    for (int r = 0; r < x.num_rows; r++) {
+        for (int c = 0; c < x.num_rhs; c++) {
+            result.values[c] +=
+                x.values[r * x.stride + c] * y.values[r * y.stride + c];
+        }
+    }
+}
+
+
+template <typename ValueType>
+inline void compute_conj_dot_product_kernel(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<const ValueType>& y,
+    const gko::batch::multi_vector::batch_item<ValueType>& result)
+{
+    for (int c = 0; c < result.num_rhs; c++) {
+        result.values[c] = gko::zero<ValueType>();
+    }
+
+    for (int r = 0; r < x.num_rows; r++) {
+        for (int c = 0; c < x.num_rhs; c++) {
+            result.values[c] +=
+                conj(x.values[r * x.stride + c]) * y.values[r * y.stride + c];
+        }
+    }
+}
+
+
+template <typename ValueType>
+inline void compute_norm2_kernel(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<gko::remove_complex<ValueType>>&
+        result)
+{
+    for (int j = 0; j < x.num_rhs; ++j) {
+        result.values[j] = gko::zero<gko::remove_complex<ValueType>>();
+    }
+    for (int i = 0; i < x.num_rows; ++i) {
+        for (int j = 0; j < x.num_rhs; ++j) {
+            result.values[j] += squared_norm(x.values[i * x.stride + j]);
+        }
+    }
+    for (int j = 0; j < x.num_rhs; ++j) {
+        result.values[j] = sqrt(result.values[j]);
+    }
+}
+
+
+/**
+ * Copies the values of one multi-vector into another.
+ *
+ * Note that the output multi-vector should already have memory allocated
+ * and stride set.
+ */
+template <typename ValueType>
+inline void copy_kernel(
+    const gko::batch::multi_vector::batch_item<const ValueType>& in,
+    const gko::batch::multi_vector::batch_item<ValueType>& out)
+{
+    for (int iz = 0; iz < in.num_rows * in.num_rhs; iz++) {
+        const int i = iz / in.num_rhs;
+        const int j = iz % in.num_rhs;
+        out.values[i * out.stride + j] = in.values[i * in.stride + j];
+    }
+}
diff --git a/reference/base/batch_struct.hpp b/reference/base/batch_struct.hpp
new file mode 100644
index 00000000000..0a3dbf37493
--- /dev/null
+++ b/reference/base/batch_struct.hpp
@@ -0,0 +1,95 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_REFERENCE_BASE_BATCH_STRUCT_HPP_
+#define GKO_REFERENCE_BASE_BATCH_STRUCT_HPP_
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+/**
+ * @brief A namespace for shared functionality between omp and reference
+ *  executors.
+ */
+namespace host {
+
+
+/** @file batch_struct.hpp
+ *
+ * Helper functions to generate a batch struct from a batch LinOp.
+ *
+ * A specialization is needed for every format of every kind of linear algebra
+ * object. These are intended to be called on the host.
+ */
+
+
+/**
+ * Generates an immutable uniform batch struct from a batch of multi-vectors.
+ */
+template <typename ValueType>
+inline batch::multi_vector::uniform_batch<const ValueType> get_batch_struct(
+    const batch::MultiVector<ValueType>* const op)
+{
+    return {op->get_const_values(), op->get_num_batch_items(),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1])};
+}
+
+
+/**
+ * Generates a uniform batch struct from a batch of multi-vectors.
+ */
+template <typename ValueType>
+inline batch::multi_vector::uniform_batch<ValueType> get_batch_struct(
+    batch::MultiVector<ValueType>* const op)
+{
+    return {op->get_values(), op->get_num_batch_items(),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1])};
+}
+
+
+}  // namespace host
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_REFERENCE_BASE_BATCH_STRUCT_HPP_
diff --git a/reference/distributed/partition_helpers_kernels.cpp b/reference/distributed/partition_helpers_kernels.cpp
new file mode 100644
index 00000000000..b68c10b1d01
--- /dev/null
+++ b/reference/distributed/partition_helpers_kernels.cpp
@@ -0,0 +1,114 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/distributed/partition_helpers_kernels.hpp"
+
+
+#include "core/base/iterator_factory.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+namespace partition_helpers {
+
+
+template <typename GlobalIndexType>
+void sort_by_range_start(
+    std::shared_ptr<const DefaultExecutor> exec,
+    array<GlobalIndexType>& range_start_ends,
+    array<experimental::distributed::comm_index_type>& part_ids)
+{
+    auto part_ids_d = part_ids.get_data();
+    auto num_parts = part_ids.get_num_elems();
+    auto start_it = detail::make_permute_iterator(
+        range_start_ends.get_data(), [](const auto i) { return 2 * i; });
+    auto end_it = detail::make_permute_iterator(
+        range_start_ends.get_data() + 1, [](const auto i) { return 2 * i; });
+    auto sort_it = detail::make_zip_iterator(start_it, end_it, part_ids_d);
+    std::stable_sort(sort_it, sort_it + num_parts,
+                     [](const auto& a, const auto& b) {
+                         return std::get<0>(a) < std::get<0>(b);
+                     });
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
+    GKO_DECLARE_PARTITION_HELPERS_SORT_BY_RANGE_START);
+
+
+template <typename GlobalIndexType>
+void check_consecutive_ranges(std::shared_ptr<const DefaultExecutor> exec,
+                              const array<GlobalIndexType>& range_start_ends,
+                              bool& result)
+{
+    auto num_parts = range_start_ends.get_num_elems() / 2;
+    auto start_it =
+        detail::make_permute_iterator(range_start_ends.get_const_data() + 2,
+                                      [](const auto i) { return 2 * i; });
+    auto end_it =
+        detail::make_permute_iterator(range_start_ends.get_const_data() + 1,
+                                      [](const auto i) { return 2 * i; });
+    auto range_it = detail::make_zip_iterator(start_it, end_it);
+
+    if (num_parts) {
+        result = std::all_of(
+            range_it, range_it + num_parts - 1,
+            [](const auto& r) { return std::get<0>(r) == std::get<1>(r); });
+    } else {
+        result = true;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
+    GKO_DECLARE_PARTITION_HELPERS_CHECK_CONSECUTIVE_RANGES);
+
+
+template <typename GlobalIndexType>
+void compress_ranges(std::shared_ptr<const DefaultExecutor> exec,
+                     const array<GlobalIndexType>& range_start_ends,
+                     array<GlobalIndexType>& range_offsets)
+{
+    range_offsets.get_data()[0] = range_start_ends.get_const_data()[0];
+    for (int i = 0; i < range_offsets.get_num_elems() - 1; ++i) {
+        range_offsets.get_data()[i + 1] =
+            range_start_ends.get_const_data()[2 * i + 1];
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
+    GKO_DECLARE_PARTITION_HELPERS_COMPRESS_RANGES);
+
+
+}  // namespace partition_helpers
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/distributed/partition_kernels.cpp b/reference/distributed/partition_kernels.cpp
index 6eae93d27d0..e9a2bfe7667 100644
--- a/reference/distributed/partition_kernels.cpp
+++ b/reference/distributed/partition_kernels.cpp
@@ -55,14 +55,16 @@ void count_ranges(std::shared_ptr<const DefaultExecutor> exec,
 template <typename GlobalIndexType>
 void build_from_contiguous(std::shared_ptr<const DefaultExecutor> exec,
                            const array<GlobalIndexType>& ranges,
+                           const array<comm_index_type>& part_id_mapping,
                            GlobalIndexType* range_bounds,
                            comm_index_type* part_ids)
 {
+    bool uses_mapping = part_id_mapping.get_num_elems() > 0;
     range_bounds[0] = 0;
     for (comm_index_type i = 0; i < ranges.get_num_elems() - 1; i++) {
         auto end = ranges.get_const_data()[i + 1];
         range_bounds[i + 1] = end;
-        part_ids[i] = i;
+        part_ids[i] = uses_mapping ? part_id_mapping.get_const_data()[i] : i;
     }
 }
 
diff --git a/reference/factorization/lu_kernels.cpp b/reference/factorization/lu_kernels.cpp
index 8d7b186c924..4c03d5ecd80 100644
--- a/reference/factorization/lu_kernels.cpp
+++ b/reference/factorization/lu_kernels.cpp
@@ -128,6 +128,88 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
 
 
+template <typename IndexType>
+void symbolic_factorize_simple(
+    std::shared_ptr<const DefaultExecutor> exec, const IndexType* row_ptrs,
+    const IndexType* col_idxs, const IndexType* lookup_offsets,
+    const int64* lookup_descs, const int32* lookup_storage,
+    matrix::Csr<float, IndexType>* factors, IndexType* out_row_nnz)
+{
+    const auto num_rows = factors->get_size()[0];
+    const auto factor_row_ptrs = factors->get_const_row_ptrs();
+    const auto factor_cols = factors->get_const_col_idxs();
+    const auto factor_vals = factors->get_values();
+    array<IndexType> diag_idx_array{exec, num_rows};
+    const auto diag_idxs = diag_idx_array.get_data();
+    for (size_type row = 0; row < num_rows; row++) {
+        matrix::csr::device_sparsity_lookup<IndexType> lookup{
+            factor_row_ptrs, factor_cols,  lookup_offsets,
+            lookup_storage,  lookup_descs, row};
+        const auto factor_begin = factor_row_ptrs[row];
+        const auto factor_end = factor_row_ptrs[row + 1];
+        const auto mtx_begin = row_ptrs[row];
+        const auto mtx_end = row_ptrs[row + 1];
+        // initialize the row
+        std::fill(factor_vals + factor_begin, factor_vals + factor_end,
+                  zero<float>());
+        for (auto nz = mtx_begin; nz < mtx_end; nz++) {
+            const auto col = col_idxs[nz];
+            factor_vals[lookup.lookup_unsafe(col) + factor_begin] =
+                one<float>();
+        }
+        diag_idxs[row] = lookup.lookup_unsafe(row) + factor_begin;
+        const auto row_diag = diag_idxs[row];
+        factor_vals[row_diag] = one<float>();
+        // apply factorization
+        for (auto lower_nz = factor_begin; lower_nz < row_diag; lower_nz++) {
+            const auto dep = factor_cols[lower_nz];
+            const auto dep_diag_idx = diag_idxs[dep];
+            const auto dep_end = factor_row_ptrs[dep + 1];
+            if (factor_vals[lower_nz] == one<float>()) {
+                for (auto dep_nz = dep_diag_idx + 1; dep_nz < dep_end;
+                     dep_nz++) {
+                    const auto col = factor_cols[dep_nz];
+                    const auto val = factor_vals[dep_nz];
+                    const auto nz = factor_begin + lookup.lookup_unsafe(col);
+                    if (val == one<float>()) {
+                        factor_vals[nz] = one<float>();
+                    }
+                }
+            }
+        }
+        IndexType row_nnz{};
+        for (auto nz = factor_begin; nz < factor_end; nz++) {
+            row_nnz += factor_vals[nz] == one<float>() ? 1 : 0;
+        }
+        out_row_nnz[row] = row_nnz;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE);
+
+
+template <typename IndexType>
+void symbolic_factorize_simple_finalize(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<float, IndexType>* factors, IndexType* out_col_idxs)
+{
+    const auto col_idxs = factors->get_const_col_idxs();
+    const auto vals = factors->get_const_values();
+    size_type output_idx{};
+    // copy all nonzero entries from the symmetric factor to the unsymmetric
+    // factor
+    for (size_type i = 0; i < factors->get_num_stored_elements(); i++) {
+        if (vals[i] == one<float>()) {
+            out_col_idxs[output_idx] = col_idxs[i];
+            ++output_idx;
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
+    GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE);
+
+
 }  // namespace lu_factorization
 }  // namespace reference
 }  // namespace kernels
diff --git a/reference/log/batch_logger.hpp b/reference/log/batch_logger.hpp
new file mode 100644
index 00000000000..2598c23766f
--- /dev/null
+++ b/reference/log/batch_logger.hpp
@@ -0,0 +1,92 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_REFERENCE_LOG_BATCH_LOGGER_HPP_
+#define GKO_REFERENCE_LOG_BATCH_LOGGER_HPP_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace host {
+namespace batch_log {
+
+
+/**
+ * Logs the final residual norm and iteration count for a batch solver.
+ *
+ * @note Supports only a single RHS per batch item.
+ */
+template <typename RealType>
+class SimpleFinalLogger final {
+public:
+    /**
+     * Constructor
+     *
+     * @param batch_residuals  residuals norms of size
+     *                         num_batch_items.
+     * @param batch_iters  final iteration counts for each
+     *                     linear system in the batch.
+     */
+    SimpleFinalLogger(RealType* const batch_residuals, int* const batch_iters)
+        : final_residuals_{batch_residuals}, final_iters_{batch_iters}
+    {}
+
+    /**
+     * Logs the final iteration count and the final residual norm.
+     *
+     * @param batch_idx  The index of linear system in the batch to log.
+     * @param iter  The final iteration count (0-based).
+     * @param res_norm  Norm of final residual norm
+     */
+    void log_iteration(const size_type batch_idx, const int iter,
+                       const RealType res_norm)
+    {
+        final_iters_[batch_idx] = iter;
+        final_residuals_[batch_idx] = res_norm;
+    }
+
+private:
+    RealType* const final_residuals_;
+    int* const final_iters_;
+};
+
+
+}  // namespace batch_log
+}  // namespace host
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_REFERENCE_LOG_BATCH_LOGGER_HPP_
diff --git a/reference/matrix/batch_dense_kernels.cpp b/reference/matrix/batch_dense_kernels.cpp
new file mode 100644
index 00000000000..87d73bb8e34
--- /dev/null
+++ b/reference/matrix/batch_dense_kernels.cpp
@@ -0,0 +1,115 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/batch_dense_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+/**
+ * @brief The Dense matrix format namespace.
+ * @ref Dense
+ * @ingroup batch_dense
+ */
+namespace batch_dense {
+
+
+#include "reference/matrix/batch_dense_kernels.hpp.inc"
+
+
+template <typename ValueType>
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const batch::matrix::Dense<ValueType>* mat,
+                  const batch::MultiVector<ValueType>* b,
+                  batch::MultiVector<ValueType>* x)
+{
+    const auto b_ub = host::get_batch_struct(b);
+    const auto x_ub = host::get_batch_struct(x);
+    const auto mat_ub = host::get_batch_struct(mat);
+    for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
+        const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
+        const auto b_item = batch::extract_batch_item(b_ub, batch);
+        const auto x_item = batch::extract_batch_item(x_ub, batch);
+        simple_apply_kernel(mat_item, b_item, x_item);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_DENSE_SIMPLE_APPLY_KERNEL);
+
+
+template <typename ValueType>
+void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
+                    const batch::MultiVector<ValueType>* alpha,
+                    const batch::matrix::Dense<ValueType>* mat,
+                    const batch::MultiVector<ValueType>* b,
+                    const batch::MultiVector<ValueType>* beta,
+                    batch::MultiVector<ValueType>* x)
+{
+    const auto b_ub = host::get_batch_struct(b);
+    const auto x_ub = host::get_batch_struct(x);
+    const auto mat_ub = host::get_batch_struct(mat);
+    const auto alpha_ub = host::get_batch_struct(alpha);
+    const auto beta_ub = host::get_batch_struct(beta);
+    for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
+        const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
+        const auto b_item = batch::extract_batch_item(b_ub, batch);
+        const auto x_item = batch::extract_batch_item(x_ub, batch);
+        const auto alpha_item = batch::extract_batch_item(alpha_ub, batch);
+        const auto beta_item = batch::extract_batch_item(beta_ub, batch);
+        advanced_apply_kernel(alpha_item.values[0], mat_item, b_item,
+                              beta_item.values[0], x_item);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_BATCH_DENSE_ADVANCED_APPLY_KERNEL);
+
+
+}  // namespace batch_dense
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/matrix/batch_dense_kernels.hpp.inc b/reference/matrix/batch_dense_kernels.hpp.inc
new file mode 100644
index 00000000000..17144267af1
--- /dev/null
+++ b/reference/matrix/batch_dense_kernels.hpp.inc
@@ -0,0 +1,88 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+template <typename ValueType>
+inline void simple_apply_kernel(
+    const gko::batch::matrix::dense::batch_item<const ValueType>& a,
+    const gko::batch::multi_vector::batch_item<const ValueType>& b,
+    const gko::batch::multi_vector::batch_item<ValueType>& c)
+{
+    for (int row = 0; row < c.num_rows; ++row) {
+        for (int col = 0; col < c.num_rhs; ++col) {
+            c.values[row * c.stride + col] = gko::zero<ValueType>();
+        }
+    }
+
+    for (int row = 0; row < c.num_rows; ++row) {
+        for (int inner = 0; inner < a.num_cols; ++inner) {
+            for (int col = 0; col < c.num_rhs; ++col) {
+                c.values[row * c.stride + col] +=
+                    a.values[row * a.stride + inner] *
+                    b.values[inner * b.stride + col];
+            }
+        }
+    }
+}
+
+
+template <typename ValueType>
+inline void advanced_apply_kernel(
+    const ValueType alpha,
+    const gko::batch::matrix::dense::batch_item<const ValueType>& a,
+    const gko::batch::multi_vector::batch_item<const ValueType>& b,
+    const ValueType beta,
+    const gko::batch::multi_vector::batch_item<ValueType>& c)
+{
+    if (beta != gko::zero<ValueType>()) {
+        for (int row = 0; row < c.num_rows; ++row) {
+            for (int col = 0; col < c.num_rhs; ++col) {
+                c.values[row * c.stride + col] *= beta;
+            }
+        }
+    } else {
+        for (int row = 0; row < c.num_rows; ++row) {
+            for (int col = 0; col < c.num_rhs; ++col) {
+                c.values[row * c.stride + col] = gko::zero<ValueType>();
+            }
+        }
+    }
+
+    for (int row = 0; row < c.num_rows; ++row) {
+        for (int inner = 0; inner < a.num_cols; ++inner) {
+            for (int col = 0; col < c.num_rhs; ++col) {
+                c.values[row * c.stride + col] +=
+                    alpha * a.values[row * a.stride + inner] *
+                    b.values[inner * b.stride + col];
+            }
+        }
+    }
+}
diff --git a/reference/matrix/batch_ell_kernels.cpp b/reference/matrix/batch_ell_kernels.cpp
new file mode 100644
index 00000000000..1d3a0e1ef94
--- /dev/null
+++ b/reference/matrix/batch_ell_kernels.cpp
@@ -0,0 +1,115 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/batch_ell_kernels.hpp"
+
+
+#include <algorithm>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+#include "reference/base/batch_struct.hpp"
+#include "reference/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+/**
+ * @brief The Ell matrix format namespace.
+ * @ref Ell
+ * @ingroup batch_ell
+ */
+namespace batch_ell {
+
+
+#include "reference/matrix/batch_ell_kernels.hpp.inc"
+
+
+template <typename ValueType, typename IndexType>
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
+                  const batch::matrix::Ell<ValueType, IndexType>* mat,
+                  const batch::MultiVector<ValueType>* b,
+                  batch::MultiVector<ValueType>* x)
+{
+    const auto b_ub = host::get_batch_struct(b);
+    const auto x_ub = host::get_batch_struct(x);
+    const auto mat_ub = host::get_batch_struct(mat);
+    for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
+        const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
+        const auto b_item = batch::extract_batch_item(b_ub, batch);
+        const auto x_item = batch::extract_batch_item(x_ub, batch);
+        simple_apply_kernel(mat_item, b_item, x_item);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+    GKO_DECLARE_BATCH_ELL_SIMPLE_APPLY_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
+                    const batch::MultiVector<ValueType>* alpha,
+                    const batch::matrix::Ell<ValueType, IndexType>* mat,
+                    const batch::MultiVector<ValueType>* b,
+                    const batch::MultiVector<ValueType>* beta,
+                    batch::MultiVector<ValueType>* x)
+{
+    const auto b_ub = host::get_batch_struct(b);
+    const auto x_ub = host::get_batch_struct(x);
+    const auto mat_ub = host::get_batch_struct(mat);
+    const auto alpha_ub = host::get_batch_struct(alpha);
+    const auto beta_ub = host::get_batch_struct(beta);
+    for (size_type batch = 0; batch < x->get_num_batch_items(); ++batch) {
+        const auto mat_item = batch::matrix::extract_batch_item(mat_ub, batch);
+        const auto b_item = batch::extract_batch_item(b_ub, batch);
+        const auto x_item = batch::extract_batch_item(x_ub, batch);
+        const auto alpha_item = batch::extract_batch_item(alpha_ub, batch);
+        const auto beta_item = batch::extract_batch_item(beta_ub, batch);
+        advanced_apply_kernel(alpha_item.values[0], mat_item, b_item,
+                              beta_item.values[0], x_item);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
+    GKO_DECLARE_BATCH_ELL_ADVANCED_APPLY_KERNEL);
+
+
+}  // namespace batch_ell
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/matrix/batch_ell_kernels.hpp.inc b/reference/matrix/batch_ell_kernels.hpp.inc
new file mode 100644
index 00000000000..979df1a19bd
--- /dev/null
+++ b/reference/matrix/batch_ell_kernels.hpp.inc
@@ -0,0 +1,80 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+template <typename ValueType, typename IndexType>
+inline void simple_apply_kernel(
+    const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& a,
+    const gko::batch::multi_vector::batch_item<const ValueType>& b,
+    const gko::batch::multi_vector::batch_item<ValueType>& c)
+{
+    for (int row = 0; row < c.num_rows; ++row) {
+        for (int j = 0; j < c.num_rhs; ++j) {
+            c.values[row * c.stride + j] = zero<ValueType>();
+        }
+        for (auto k = 0; k < a.num_stored_elems_per_row; ++k) {
+            auto val = a.values[row + k * a.stride];
+            auto col = a.col_idxs[row + k * a.stride];
+            if (col != invalid_index<IndexType>()) {
+                for (int j = 0; j < c.num_rhs; ++j) {
+                    c.values[row * c.stride + j] +=
+                        val * b.values[col * b.stride + j];
+                }
+            }
+        }
+    }
+}
+
+
+template <typename ValueType, typename IndexType>
+inline void advanced_apply_kernel(
+    const ValueType alpha,
+    const gko::batch::matrix::ell::batch_item<const ValueType, IndexType>& a,
+    const gko::batch::multi_vector::batch_item<const ValueType>& b,
+    const ValueType beta,
+    const gko::batch::multi_vector::batch_item<ValueType>& c)
+{
+    for (int row = 0; row < a.num_rows; ++row) {
+        for (int j = 0; j < c.num_rhs; ++j) {
+            c.values[row * c.stride + j] *= beta;
+        }
+        for (auto k = 0; k < a.num_stored_elems_per_row; ++k) {
+            auto val = a.values[row + k * a.stride];
+            auto col = a.col_idxs[row + k * a.stride];
+            if (col != invalid_index<IndexType>()) {
+                for (int j = 0; j < b.num_rhs; ++j) {
+                    c.values[row * c.stride + j] +=
+                        alpha * val * b.values[col * b.stride + j];
+                }
+            }
+        }
+    }
+}
diff --git a/reference/matrix/batch_struct.hpp b/reference/matrix/batch_struct.hpp
new file mode 100644
index 00000000000..94beff5c2c2
--- /dev/null
+++ b/reference/matrix/batch_struct.hpp
@@ -0,0 +1,133 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_
+#define GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_
+
+
+#include "core/matrix/batch_struct.hpp"
+
+
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include "core/base/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+/**
+ * @brief A namespace for shared functionality between omp and reference
+ *  executors.
+ */
+namespace host {
+
+
+/** @file batch_struct.hpp
+ *
+ * Helper functions to generate a batch struct from a batch LinOp.
+ *
+ * A specialization is needed for every format of every kind of linear algebra
+ * object. These are intended to be called on the host.
+ */
+
+
+/**
+ * Generates an immutable uniform batch struct from a batch of dense matrices.
+ */
+template <typename ValueType>
+inline batch::matrix::dense::uniform_batch<const ValueType> get_batch_struct(
+    const batch::matrix::Dense<ValueType>* const op)
+{
+    return {op->get_const_values(), op->get_num_batch_items(),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1])};
+}
+
+
+/**
+ * Generates a uniform batch struct from a batch of dense matrices.
+ */
+template <typename ValueType>
+inline batch::matrix::dense::uniform_batch<ValueType> get_batch_struct(
+    batch::matrix::Dense<ValueType>* const op)
+{
+    return {op->get_values(), op->get_num_batch_items(),
+            static_cast<int32>(op->get_common_size()[1]),
+            static_cast<int32>(op->get_common_size()[0]),
+            static_cast<int32>(op->get_common_size()[1])};
+}
+
+
+/**
+ * Generates an immutable uniform batch struct from a batch of ell matrices.
+ */
+template <typename ValueType, typename IndexType>
+inline batch::matrix::ell::uniform_batch<const ValueType, const IndexType>
+get_batch_struct(const batch::matrix::Ell<ValueType, IndexType>* const op)
+{
+    return {op->get_const_values(),
+            op->get_const_col_idxs(),
+            op->get_num_batch_items(),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[1]),
+            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
+}
+
+
+/**
+ * Generates a uniform batch struct from a batch of ell matrices.
+ */
+template <typename ValueType, typename IndexType>
+inline batch::matrix::ell::uniform_batch<ValueType, IndexType> get_batch_struct(
+    batch::matrix::Ell<ValueType, IndexType>* const op)
+{
+    return {op->get_values(),
+            op->get_col_idxs(),
+            op->get_num_batch_items(),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[0]),
+            static_cast<IndexType>(op->get_common_size()[1]),
+            static_cast<IndexType>(op->get_num_stored_elements_per_row())};
+}
+
+
+}  // namespace host
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_REFERENCE_MATRIX_BATCH_STRUCT_HPP_
diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp
index 3a05a09cd45..6065884dce8 100644
--- a/reference/matrix/csr_kernels.cpp
+++ b/reference/matrix/csr_kernels.cpp
@@ -834,24 +834,25 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL);
 
 
-template <typename IndexType>
-void invert_permutation(std::shared_ptr<const DefaultExecutor> exec,
-                        size_type size, const IndexType* permutation_indices,
-                        IndexType* inv_permutation)
+template <typename ValueType, typename IndexType>
+void inv_symm_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                      const IndexType* perm,
+                      const matrix::Csr<ValueType, IndexType>* orig,
+                      matrix::Csr<ValueType, IndexType>* permuted)
 {
-    for (IndexType i = 0; i < static_cast<IndexType>(size); ++i) {
-        inv_permutation[permutation_indices[i]] = i;
-    }
+    inv_nonsymm_permute(exec, perm, perm, orig, permuted);
 }
 
-GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_INVERT_PERMUTATION_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
-void inv_symm_permute(std::shared_ptr<const ReferenceExecutor> exec,
-                      const IndexType* perm,
-                      const matrix::Csr<ValueType, IndexType>* orig,
-                      matrix::Csr<ValueType, IndexType>* permuted)
+void inv_nonsymm_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                         const IndexType* row_perm,
+                         const IndexType* column_perm,
+                         const matrix::Csr<ValueType, IndexType>* orig,
+                         matrix::Csr<ValueType, IndexType>* permuted)
 {
     auto in_row_ptrs = orig->get_const_row_ptrs();
     auto in_col_idxs = orig->get_const_col_idxs();
@@ -863,25 +864,25 @@ void inv_symm_permute(std::shared_ptr<const ReferenceExecutor> exec,
 
     for (size_type row = 0; row < num_rows; ++row) {
         auto src_row = row;
-        auto dst_row = perm[row];
+        auto dst_row = row_perm[row];
         p_row_ptrs[dst_row] = in_row_ptrs[src_row + 1] - in_row_ptrs[src_row];
     }
     components::prefix_sum_nonnegative(exec, p_row_ptrs, num_rows + 1);
     for (size_type row = 0; row < num_rows; ++row) {
         auto src_row = row;
-        auto dst_row = perm[row];
+        auto dst_row = row_perm[row];
         auto src_begin = in_row_ptrs[src_row];
         auto dst_begin = p_row_ptrs[dst_row];
         auto row_size = in_row_ptrs[src_row + 1] - src_begin;
         for (IndexType i = 0; i < row_size; ++i) {
-            p_col_idxs[dst_begin + i] = perm[in_col_idxs[src_begin + i]];
+            p_col_idxs[dst_begin + i] = column_perm[in_col_idxs[src_begin + i]];
             p_vals[dst_begin + i] = in_vals[src_begin + i];
         }
     }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_INV_SYMM_PERMUTE_KERNEL);
+    GKO_DECLARE_CSR_INV_NONSYMM_PERMUTE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
@@ -920,10 +921,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void inverse_row_permute(std::shared_ptr<const ReferenceExecutor> exec,
-                         const IndexType* perm,
-                         const matrix::Csr<ValueType, IndexType>* orig,
-                         matrix::Csr<ValueType, IndexType>* row_permuted)
+void inv_row_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                     const IndexType* perm,
+                     const matrix::Csr<ValueType, IndexType>* orig,
+                     matrix::Csr<ValueType, IndexType>* row_permuted)
 {
     auto in_row_ptrs = orig->get_const_row_ptrs();
     auto in_col_idxs = orig->get_const_col_idxs();
@@ -951,21 +952,21 @@ void inverse_row_permute(std::shared_ptr<const ReferenceExecutor> exec,
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL);
+    GKO_DECLARE_CSR_INV_ROW_PERMUTE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
-void inverse_column_permute(std::shared_ptr<const ReferenceExecutor> exec,
-                            const IndexType* perm,
-                            const matrix::Csr<ValueType, IndexType>* orig,
-                            matrix::Csr<ValueType, IndexType>* column_permuted)
+void inv_col_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                     const IndexType* perm,
+                     const matrix::Csr<ValueType, IndexType>* orig,
+                     matrix::Csr<ValueType, IndexType>* col_permuted)
 {
     auto in_row_ptrs = orig->get_const_row_ptrs();
     auto in_col_idxs = orig->get_const_col_idxs();
     auto in_vals = orig->get_const_values();
-    auto cp_row_ptrs = column_permuted->get_row_ptrs();
-    auto cp_col_idxs = column_permuted->get_col_idxs();
-    auto cp_vals = column_permuted->get_values();
+    auto cp_row_ptrs = col_permuted->get_row_ptrs();
+    auto cp_col_idxs = col_permuted->get_col_idxs();
+    auto cp_vals = col_permuted->get_values();
     auto num_rows = orig->get_size()[0];
 
     for (size_type row = 0; row < num_rows; ++row) {
@@ -981,7 +982,167 @@ void inverse_column_permute(std::shared_ptr<const ReferenceExecutor> exec,
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL);
+    GKO_DECLARE_CSR_INV_COL_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inv_symm_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                            const ValueType* scale, const IndexType* perm,
+                            const matrix::Csr<ValueType, IndexType>* orig,
+                            matrix::Csr<ValueType, IndexType>* permuted)
+{
+    inv_nonsymm_scale_permute(exec, scale, perm, scale, perm, orig, permuted);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_SYMM_SCALE_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inv_nonsymm_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                               const ValueType* row_scale,
+                               const IndexType* row_perm,
+                               const ValueType* col_scale,
+                               const IndexType* col_perm,
+                               const matrix::Csr<ValueType, IndexType>* orig,
+                               matrix::Csr<ValueType, IndexType>* permuted)
+{
+    auto in_row_ptrs = orig->get_const_row_ptrs();
+    auto in_col_idxs = orig->get_const_col_idxs();
+    auto in_vals = orig->get_const_values();
+    auto p_row_ptrs = permuted->get_row_ptrs();
+    auto p_col_idxs = permuted->get_col_idxs();
+    auto p_vals = permuted->get_values();
+    size_type num_rows = orig->get_size()[0];
+
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto src_row = row;
+        auto dst_row = row_perm[row];
+        p_row_ptrs[dst_row] = in_row_ptrs[src_row + 1] - in_row_ptrs[src_row];
+    }
+    components::prefix_sum_nonnegative(exec, p_row_ptrs, num_rows + 1);
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto src_row = row;
+        auto dst_row = row_perm[row];
+        auto src_begin = in_row_ptrs[src_row];
+        auto dst_begin = p_row_ptrs[dst_row];
+        auto row_size = in_row_ptrs[src_row + 1] - src_begin;
+        for (IndexType i = 0; i < row_size; ++i) {
+            const auto dst_col = col_perm[in_col_idxs[src_begin + i]];
+            p_col_idxs[dst_begin + i] = dst_col;
+            p_vals[dst_begin + i] = in_vals[src_begin + i] /
+                                    (row_scale[dst_row] * col_scale[dst_col]);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void row_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                       const ValueType* scale, const IndexType* perm,
+                       const matrix::Csr<ValueType, IndexType>* orig,
+                       matrix::Csr<ValueType, IndexType>* row_permuted)
+{
+    auto in_row_ptrs = orig->get_const_row_ptrs();
+    auto in_col_idxs = orig->get_const_col_idxs();
+    auto in_vals = orig->get_const_values();
+    auto rp_row_ptrs = row_permuted->get_row_ptrs();
+    auto rp_col_idxs = row_permuted->get_col_idxs();
+    auto rp_vals = row_permuted->get_values();
+    size_type num_rows = orig->get_size()[0];
+
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto src_row = perm[row];
+        auto dst_row = row;
+        rp_row_ptrs[dst_row] = in_row_ptrs[src_row + 1] - in_row_ptrs[src_row];
+    }
+    components::prefix_sum_nonnegative(exec, rp_row_ptrs, num_rows + 1);
+    for (size_type row = 0; row < num_rows; ++row) {
+        const auto src_row = perm[row];
+        const auto dst_row = row;
+        const auto src_begin = in_row_ptrs[src_row];
+        const auto dst_begin = rp_row_ptrs[dst_row];
+        const auto row_size = in_row_ptrs[src_row + 1] - src_begin;
+        std::copy_n(in_col_idxs + src_begin, row_size, rp_col_idxs + dst_begin);
+        for (IndexType i = 0; i < row_size; i++) {
+            rp_vals[i + dst_begin] = in_vals[i + src_begin] * scale[src_row];
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_ROW_SCALE_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inv_row_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                           const ValueType* scale, const IndexType* perm,
+                           const matrix::Csr<ValueType, IndexType>* orig,
+                           matrix::Csr<ValueType, IndexType>* row_permuted)
+{
+    auto in_row_ptrs = orig->get_const_row_ptrs();
+    auto in_col_idxs = orig->get_const_col_idxs();
+    auto in_vals = orig->get_const_values();
+    auto rp_row_ptrs = row_permuted->get_row_ptrs();
+    auto rp_col_idxs = row_permuted->get_col_idxs();
+    auto rp_vals = row_permuted->get_values();
+    size_type num_rows = orig->get_size()[0];
+
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto src_row = row;
+        auto dst_row = perm[row];
+        rp_row_ptrs[dst_row] = in_row_ptrs[src_row + 1] - in_row_ptrs[src_row];
+    }
+    components::prefix_sum_nonnegative(exec, rp_row_ptrs, num_rows + 1);
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto src_row = row;
+        auto dst_row = perm[row];
+        auto src_begin = in_row_ptrs[src_row];
+        auto dst_begin = rp_row_ptrs[dst_row];
+        auto row_size = in_row_ptrs[src_row + 1] - src_begin;
+        std::copy_n(in_col_idxs + src_begin, row_size, rp_col_idxs + dst_begin);
+        for (IndexType i = 0; i < row_size; i++) {
+            rp_vals[i + dst_begin] = in_vals[i + src_begin] / scale[dst_row];
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_ROW_SCALE_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inv_col_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                           const ValueType* scale, const IndexType* perm,
+                           const matrix::Csr<ValueType, IndexType>* orig,
+                           matrix::Csr<ValueType, IndexType>* col_permuted)
+{
+    auto in_row_ptrs = orig->get_const_row_ptrs();
+    auto in_col_idxs = orig->get_const_col_idxs();
+    auto in_vals = orig->get_const_values();
+    auto cp_row_ptrs = col_permuted->get_row_ptrs();
+    auto cp_col_idxs = col_permuted->get_col_idxs();
+    auto cp_vals = col_permuted->get_values();
+    auto num_rows = orig->get_size()[0];
+
+    for (size_type row = 0; row < num_rows; ++row) {
+        auto row_begin = in_row_ptrs[row];
+        auto row_end = in_row_ptrs[row + 1];
+        cp_row_ptrs[row] = in_row_ptrs[row];
+        for (auto k = row_begin; k < row_end; ++k) {
+            const auto out_col = perm[in_col_idxs[k]];
+            cp_col_idxs[k] = out_col;
+            cp_vals[k] = in_vals[k] / scale[out_col];
+        }
+    }
+    cp_row_ptrs[num_rows] = in_row_ptrs[num_rows];
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_CSR_INV_COL_SCALE_PERMUTE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp
index ba399b0f445..6a19c85cf83 100644
--- a/reference/matrix/dense_kernels.cpp
+++ b/reference/matrix/dense_kernels.cpp
@@ -397,6 +397,27 @@ void compute_norm1(std::shared_ptr<const ReferenceExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
 
 
+template <typename ValueType>
+void compute_mean(std::shared_ptr<const ReferenceExecutor> exec,
+                  const matrix::Dense<ValueType>* x,
+                  matrix::Dense<ValueType>* result, array<char>&)
+{
+    using ValueType_nc = gko::remove_complex<ValueType>;
+    for (size_type j = 0; j < x->get_size()[1]; ++j) {
+        result->at(0, j) = zero<ValueType>();
+    }
+
+    for (size_type i = 0; i < x->get_size()[1]; ++i) {
+        for (size_type j = 0; j < x->get_size()[0]; ++j) {
+            result->at(0, i) += x->at(j, i);
+        }
+        result->at(0, i) /= static_cast<ValueType_nc>(x->get_size()[0]);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_MEAN_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void fill_in_matrix_data(std::shared_ptr<const ReferenceExecutor> exec,
                          const device_matrix_data<ValueType, IndexType>& data,
@@ -841,11 +862,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL);
 
 template <typename ValueType, typename IndexType>
 void symm_permute(std::shared_ptr<const ReferenceExecutor> exec,
-                  const array<IndexType>* permutation_indices,
-                  const matrix::Dense<ValueType>* orig,
+                  const IndexType* perm, const matrix::Dense<ValueType>* orig,
                   matrix::Dense<ValueType>* permuted)
 {
-    auto perm = permutation_indices->get_const_data();
     auto size = orig->get_size()[0];
     for (size_type i = 0; i < size; ++i) {
         for (size_type j = 0; j < size; ++j) {
@@ -860,11 +879,10 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 template <typename ValueType, typename IndexType>
 void inv_symm_permute(std::shared_ptr<const ReferenceExecutor> exec,
-                      const array<IndexType>* permutation_indices,
+                      const IndexType* perm,
                       const matrix::Dense<ValueType>* orig,
                       matrix::Dense<ValueType>* permuted)
 {
-    auto perm = permutation_indices->get_const_data();
     auto size = orig->get_size()[0];
     for (size_type i = 0; i < size; ++i) {
         for (size_type j = 0; j < size; ++j) {
@@ -877,14 +895,46 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void nonsymm_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                     const IndexType* row_perm, const IndexType* col_perm,
+                     const matrix::Dense<ValueType>* orig,
+                     matrix::Dense<ValueType>* permuted)
+{
+    for (size_type i = 0; i < orig->get_size()[0]; ++i) {
+        for (size_type j = 0; j < orig->get_size()[1]; ++j) {
+            permuted->at(i, j) = orig->at(row_perm[i], col_perm[j]);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_NONSYMM_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inv_nonsymm_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                         const IndexType* row_perm, const IndexType* col_perm,
+                         const matrix::Dense<ValueType>* orig,
+                         matrix::Dense<ValueType>* permuted)
+{
+    for (size_type i = 0; i < orig->get_size()[0]; ++i) {
+        for (size_type j = 0; j < orig->get_size()[1]; ++j) {
+            permuted->at(row_perm[i], col_perm[j]) = orig->at(i, j);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_INV_NONSYMM_PERMUTE_KERNEL);
+
+
 template <typename ValueType, typename OutputType, typename IndexType>
 void row_gather(std::shared_ptr<const ReferenceExecutor> exec,
-                const array<IndexType>* row_idxs,
-                const matrix::Dense<ValueType>* orig,
+                const IndexType* rows, const matrix::Dense<ValueType>* orig,
                 matrix::Dense<OutputType>* row_collection)
 {
-    auto rows = row_idxs->get_const_data();
-    for (size_type i = 0; i < row_idxs->get_num_elems(); ++i) {
+    for (size_type i = 0; i < row_collection->get_size()[0]; ++i) {
         for (size_type j = 0; j < orig->get_size()[1]; ++j) {
             row_collection->at(i, j) = orig->at(rows[i], j);
         }
@@ -898,16 +948,15 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(
 template <typename ValueType, typename OutputType, typename IndexType>
 void advanced_row_gather(std::shared_ptr<const ReferenceExecutor> exec,
                          const matrix::Dense<ValueType>* alpha,
-                         const array<IndexType>* row_idxs,
+                         const IndexType* rows,
                          const matrix::Dense<ValueType>* orig,
                          const matrix::Dense<ValueType>* beta,
                          matrix::Dense<OutputType>* row_collection)
 {
     using type = highest_precision<ValueType, OutputType>;
-    auto rows = row_idxs->get_const_data();
     auto scalar_alpha = alpha->at(0, 0);
     auto scalar_beta = beta->at(0, 0);
-    for (size_type i = 0; i < row_idxs->get_num_elems(); ++i) {
+    for (size_type i = 0; i < row_collection->get_size()[0]; ++i) {
         for (size_type j = 0; j < orig->get_size()[1]; ++j) {
             row_collection->at(i, j) =
                 static_cast<type>(scalar_alpha * orig->at(rows[i], j)) +
@@ -922,30 +971,27 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(
 
 
 template <typename ValueType, typename IndexType>
-void column_permute(std::shared_ptr<const ReferenceExecutor> exec,
-                    const array<IndexType>* permutation_indices,
-                    const matrix::Dense<ValueType>* orig,
-                    matrix::Dense<ValueType>* column_permuted)
+void col_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                 const IndexType* perm, const matrix::Dense<ValueType>* orig,
+                 matrix::Dense<ValueType>* col_permuted)
 {
-    auto perm = permutation_indices->get_const_data();
-    for (size_type j = 0; j < orig->get_size()[1]; ++j) {
-        for (size_type i = 0; i < orig->get_size()[0]; ++i) {
-            column_permuted->at(i, j) = orig->at(i, perm[j]);
+    for (size_type i = 0; i < orig->get_size()[0]; ++i) {
+        for (size_type j = 0; j < orig->get_size()[1]; ++j) {
+            col_permuted->at(i, j) = orig->at(i, perm[j]);
         }
     }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL);
+    GKO_DECLARE_DENSE_COL_PERMUTE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
-void inverse_row_permute(std::shared_ptr<const ReferenceExecutor> exec,
-                         const array<IndexType>* permutation_indices,
-                         const matrix::Dense<ValueType>* orig,
-                         matrix::Dense<ValueType>* row_permuted)
+void inv_row_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                     const IndexType* perm,
+                     const matrix::Dense<ValueType>* orig,
+                     matrix::Dense<ValueType>* row_permuted)
 {
-    auto perm = permutation_indices->get_const_data();
     for (size_type i = 0; i < orig->get_size()[0]; ++i) {
         for (size_type j = 0; j < orig->get_size()[1]; ++j) {
             row_permuted->at(perm[i], j) = orig->at(i, j);
@@ -958,21 +1004,176 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void inverse_column_permute(std::shared_ptr<const ReferenceExecutor> exec,
-                            const array<IndexType>* permutation_indices,
+void inv_col_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                     const IndexType* perm,
+                     const matrix::Dense<ValueType>* orig,
+                     matrix::Dense<ValueType>* col_permuted)
+{
+    for (size_type i = 0; i < orig->get_size()[0]; ++i) {
+        for (size_type j = 0; j < orig->get_size()[1]; ++j) {
+            col_permuted->at(i, perm[j]) = orig->at(i, j);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_INV_COL_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void symm_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                        const ValueType* scale, const IndexType* perm,
+                        const matrix::Dense<ValueType>* orig,
+                        matrix::Dense<ValueType>* permuted)
+{
+    for (size_type i = 0; i < orig->get_size()[0]; ++i) {
+        for (size_type j = 0; j < orig->get_size()[1]; ++j) {
+            const auto row = perm[i];
+            const auto col = perm[j];
+            permuted->at(i, j) = scale[row] * scale[col] * orig->at(row, col);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_SYMM_SCALE_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inv_symm_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                            const ValueType* scale, const IndexType* perm,
                             const matrix::Dense<ValueType>* orig,
-                            matrix::Dense<ValueType>* column_permuted)
+                            matrix::Dense<ValueType>* permuted)
 {
-    auto perm = permutation_indices->get_const_data();
-    for (size_type j = 0; j < orig->get_size()[1]; ++j) {
-        for (size_type i = 0; i < orig->get_size()[0]; ++i) {
-            column_permuted->at(i, perm[j]) = orig->at(i, j);
+    for (size_type i = 0; i < orig->get_size()[0]; ++i) {
+        for (size_type j = 0; j < orig->get_size()[1]; ++j) {
+            const auto row = perm[i];
+            const auto col = perm[j];
+            permuted->at(row, col) = orig->at(i, j) / (scale[row] * scale[col]);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_INV_SYMM_SCALE_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void nonsymm_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                           const ValueType* row_scale,
+                           const IndexType* row_perm,
+                           const ValueType* col_scale,
+                           const IndexType* col_perm,
+                           const matrix::Dense<ValueType>* orig,
+                           matrix::Dense<ValueType>* permuted)
+{
+    for (size_type i = 0; i < orig->get_size()[0]; ++i) {
+        for (size_type j = 0; j < orig->get_size()[1]; ++j) {
+            const auto row = row_perm[i];
+            const auto col = col_perm[j];
+            permuted->at(i, j) =
+                row_scale[row] * col_scale[col] * orig->at(row, col);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_NONSYMM_SCALE_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inv_nonsymm_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                               const ValueType* row_scale,
+                               const IndexType* row_perm,
+                               const ValueType* col_scale,
+                               const IndexType* col_perm,
+                               const matrix::Dense<ValueType>* orig,
+                               matrix::Dense<ValueType>* permuted)
+{
+    for (size_type i = 0; i < orig->get_size()[0]; ++i) {
+        for (size_type j = 0; j < orig->get_size()[1]; ++j) {
+            const auto row = row_perm[i];
+            const auto col = col_perm[j];
+            permuted->at(row, col) =
+                orig->at(i, j) / (row_scale[row] * col_scale[col]);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_INV_NONSYMM_SCALE_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void row_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                       const ValueType* scale, const IndexType* perm,
+                       const matrix::Dense<ValueType>* orig,
+                       matrix::Dense<ValueType>* permuted)
+{
+    for (size_type i = 0; i < orig->get_size()[0]; ++i) {
+        for (size_type j = 0; j < orig->get_size()[1]; ++j) {
+            const auto row = perm[i];
+            permuted->at(i, j) = scale[row] * orig->at(row, j);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_ROW_SCALE_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inv_row_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                           const ValueType* scale, const IndexType* perm,
+                           const matrix::Dense<ValueType>* orig,
+                           matrix::Dense<ValueType>* permuted)
+{
+    for (size_type i = 0; i < orig->get_size()[0]; ++i) {
+        for (size_type j = 0; j < orig->get_size()[1]; ++j) {
+            const auto row = perm[i];
+            permuted->at(row, j) = orig->at(i, j) / scale[row];
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_INV_ROW_SCALE_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void col_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                       const ValueType* scale, const IndexType* perm,
+                       const matrix::Dense<ValueType>* orig,
+                       matrix::Dense<ValueType>* permuted)
+{
+    for (size_type i = 0; i < orig->get_size()[0]; ++i) {
+        for (size_type j = 0; j < orig->get_size()[1]; ++j) {
+            const auto col = perm[j];
+            permuted->at(i, j) = scale[col] * orig->at(i, col);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_DENSE_COL_SCALE_PERMUTE_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void inv_col_scale_permute(std::shared_ptr<const ReferenceExecutor> exec,
+                           const ValueType* scale, const IndexType* perm,
+                           const matrix::Dense<ValueType>* orig,
+                           matrix::Dense<ValueType>* permuted)
+{
+    for (size_type i = 0; i < orig->get_size()[0]; ++i) {
+        for (size_type j = 0; j < orig->get_size()[1]; ++j) {
+            const auto col = perm[j];
+            permuted->at(i, col) = orig->at(i, j) / scale[col];
         }
     }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL);
+    GKO_DECLARE_DENSE_INV_COL_SCALE_PERMUTE_KERNEL);
 
 
 template <typename ValueType>
diff --git a/reference/matrix/permutation_kernels.cpp b/reference/matrix/permutation_kernels.cpp
new file mode 100644
index 00000000000..72076d2e69d
--- /dev/null
+++ b/reference/matrix/permutation_kernels.cpp
@@ -0,0 +1,74 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/permutation_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+namespace permutation {
+
+
+template <typename IndexType>
+void invert(std::shared_ptr<const DefaultExecutor> exec,
+            const IndexType* permutation, size_type size,
+            IndexType* output_permutation)
+{
+    for (size_type i = 0; i < size; i++) {
+        output_permutation[permutation[i]] = i;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PERMUTATION_INVERT_KERNEL);
+
+
+template <typename IndexType>
+void compose(std::shared_ptr<const DefaultExecutor> exec,
+             const IndexType* first_permutation,
+             const IndexType* second_permutation, size_type size,
+             IndexType* output_permutation)
+{
+    // P_2 P_1 does a row permutation of P_1 with indices from P_2
+    // row i of P_2 P_1 x accesses row P_2[i] of P_1 x = row P_1[P_2[i]] of x
+    for (size_type i = 0; i < size; i++) {
+        output_permutation[i] = first_permutation[second_permutation[i]];
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PERMUTATION_COMPOSE_KERNEL);
+
+
+}  // namespace permutation
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/matrix/scaled_permutation_kernels.cpp b/reference/matrix/scaled_permutation_kernels.cpp
new file mode 100644
index 00000000000..586ab9b9316
--- /dev/null
+++ b/reference/matrix/scaled_permutation_kernels.cpp
@@ -0,0 +1,90 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/scaled_permutation_kernels.hpp"
+
+
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+namespace scaled_permutation {
+
+
+template <typename ValueType, typename IndexType>
+void invert(std::shared_ptr<const DefaultExecutor> exec,
+            const ValueType* input_scale, const IndexType* input_permutation,
+            size_type size, ValueType* output_scale,
+            IndexType* output_permutation)
+{
+    for (size_type i = 0; i < size; i++) {
+        const auto ip = input_permutation[i];
+        output_permutation[ip] = i;
+        output_scale[i] = one<ValueType>() / input_scale[ip];
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SCALED_PERMUTATION_INVERT_KERNEL);
+
+
+template <typename ValueType, typename IndexType>
+void compose(std::shared_ptr<const DefaultExecutor> exec,
+             const ValueType* first_scale, const IndexType* first_permutation,
+             const ValueType* second_scale, const IndexType* second_permutation,
+             size_type size, ValueType* output_scale,
+             IndexType* output_permutation)
+{
+    // P_2 S_2 P_1 S_1 = P_2 P_1 S'_2 S_1 with S'_2 = P_1^-1 S_2 P_1^-T
+    // P_2 P_1 does a row permutation of P_1 with indices from P_2
+    // row i of P_2 P_1 x accesses row P_2[i] of P_1 x = row P_1[P_2[i]] of x
+    for (size_type i = 0; i < size; i++) {
+        const auto second_permuted = second_permutation[i];
+        const auto combined_permuted = first_permutation[second_permuted];
+        output_permutation[i] = combined_permuted;
+        // output_scale[i] = first_scale[i] * second_scale[inv_first_perm[i]];
+        // second_perm[i] = inv_first_perm[combined_perm[i]];
+        output_scale[combined_permuted] =
+            first_scale[combined_permuted] * second_scale[second_permuted];
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SCALED_PERMUTATION_COMPOSE_KERNEL);
+
+
+}  // namespace scaled_permutation
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/preconditioner/batch_identity.hpp b/reference/preconditioner/batch_identity.hpp
new file mode 100644
index 00000000000..6d6d462e660
--- /dev/null
+++ b/reference/preconditioner/batch_identity.hpp
@@ -0,0 +1,95 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_REFERENCE_PRECONDITIONER_BATCH_IDENTITY_HPP_
+#define GKO_REFERENCE_PRECONDITIONER_BATCH_IDENTITY_HPP_
+
+
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace host {
+namespace batch_preconditioner {
+
+
+/**
+ * Identity preconditioner for batch solvers. Enables unpreconditioned solves
+ * by performing a copy of the preconditioned vector to the un-preconditioned
+ * vector.
+ */
+template <typename ValueType>
+class Identity final {
+public:
+    using value_type = ValueType;
+
+    /**
+     * The size of the work vector required in case of static allocation.
+     */
+    static constexpr int work_size = 0;
+
+    /**
+     * The size of the work vector required in case of dynamic allocation.
+     */
+    static int dynamic_work_size(int, int) { return 0; }
+
+    /**
+     * Sets the input and generates the identity preconditioner.(Nothing needs
+     * to be actually generated.)
+     */
+    template <typename batch_item_type>
+    void generate(size_type, const batch_item_type&, ValueType* const)
+    {}
+
+    /**
+     * Applies the preconditioner to the vector. For the identity
+     * preconditioner, this is equivalent to a copy.
+     */
+    void apply(const gko::batch::multi_vector::batch_item<const ValueType>& r,
+               const gko::batch::multi_vector::batch_item<ValueType>& z) const
+    {
+        for (int i = 0; i < r.num_rows; i++) {
+            z.values[i * z.stride] = r.values[i * r.stride];
+        }
+    }
+};
+
+
+}  // namespace batch_preconditioner
+}  // namespace host
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_REFERENCE_PRECONDITIONER_BATCH_IDENTITY_HPP_
diff --git a/reference/reorder/rcm_kernels.cpp b/reference/reorder/rcm_kernels.cpp
index be14aeb557d..5e357cf775c 100644
--- a/reference/reorder/rcm_kernels.cpp
+++ b/reference/reorder/rcm_kernels.cpp
@@ -111,7 +111,7 @@ std::pair<IndexType, size_type> rls_contender_and_height(
     // The last levels size is required to compute the contender.
     IndexType last_level_size = 0;
 
-    // While there are still nodes whose neighbours haven't been inspected.
+    // While there are still nodes whose neighbors haven't been inspected.
     while (rls_index < rls_offset) {
         auto parent = rls_p[rls_index];
         --current_level_countdown;
@@ -255,12 +255,12 @@ void get_permutation(std::shared_ptr<const ReferenceExecutor> exec,
             ++tail_offset;
         }
 
-        // Get the neigbours of the next vertex,
+        // Get the neighbors of the next vertex,
         // check if they have already been visited,
         // if no, insert them to sort.
         auto prev_head_offset = head_offset;
 
-        // Get the next vertex neighbours.
+        // Get the next vertex neighbors.
         auto row_start = row_ptrs[next_vertex];
         auto row_end = row_ptrs[next_vertex + 1];
         for (auto neighbor_idx = row_start; neighbor_idx < row_end;
@@ -276,7 +276,7 @@ void get_permutation(std::shared_ptr<const ReferenceExecutor> exec,
             }
         }
 
-        // Sort all just-added neighbours by degree.
+        // Sort all just-added neighbors by degree.
         std::sort(
             linear_queue_p + prev_head_offset, linear_queue_p + head_offset,
             [&](IndexType i, IndexType j) { return degrees[i] < degrees[j]; });
diff --git a/reference/solver/batch_bicgstab_kernels.cpp b/reference/solver/batch_bicgstab_kernels.cpp
new file mode 100644
index 00000000000..6580996a77b
--- /dev/null
+++ b/reference/solver/batch_bicgstab_kernels.cpp
@@ -0,0 +1,136 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/solver/batch_bicgstab_kernels.hpp"
+
+
+#include "core/solver/batch_dispatch.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+
+
+/**
+ * @brief The batch Bicgstab solver namespace.
+ *
+ * @ingroup batch_bicgstab
+ */
+namespace batch_bicgstab {
+
+
+namespace {
+
+
+constexpr int max_num_rhs = 1;
+
+
+#include "reference/base/batch_multi_vector_kernels.hpp.inc"
+#include "reference/matrix/batch_dense_kernels.hpp.inc"
+#include "reference/matrix/batch_ell_kernels.hpp.inc"
+#include "reference/solver/batch_bicgstab_kernels.hpp.inc"
+
+
+}  // unnamed namespace
+
+
+template <typename T>
+using settings = gko::kernels::batch_bicgstab::settings<T>;
+
+
+template <typename ValueType>
+class kernel_caller {
+public:
+    kernel_caller(std::shared_ptr<const DefaultExecutor> exec,
+                  const settings<remove_complex<ValueType>> settings)
+        : exec_{std::move(exec)}, settings_{settings}
+    {}
+
+    template <typename BatchMatrixType, typename PrecType, typename StopType,
+              typename LogType>
+    void call_kernel(
+        const LogType& logger, const BatchMatrixType& mat, PrecType prec,
+        const gko::batch::multi_vector::uniform_batch<const ValueType>& b,
+        const gko::batch::multi_vector::uniform_batch<ValueType>& x) const
+    {
+        using real_type = typename gko::remove_complex<ValueType>;
+        const size_type num_batch_items = mat.num_batch_items;
+        const auto num_rows = mat.num_rows;
+        const auto num_rhs = b.num_rhs;
+        if (num_rhs > max_num_rhs) {
+            GKO_NOT_IMPLEMENTED;
+        }
+
+        const size_type local_size_bytes =
+            gko::kernels::batch_bicgstab::local_memory_requirement<ValueType>(
+                num_rows, num_rhs) +
+            PrecType::dynamic_work_size(num_rows,
+                                        mat.get_single_item_num_nnz()) *
+                sizeof(ValueType);
+        array<unsigned char> local_space(exec_, local_size_bytes);
+
+        for (size_type batch_id = 0; batch_id < num_batch_items; batch_id++) {
+            batch_entry_bicgstab_impl<StopType, PrecType, LogType,
+                                      BatchMatrixType, ValueType>(
+                settings_, logger, prec, mat, b, x, batch_id,
+                local_space.get_data());
+        }
+    }
+
+private:
+    const std::shared_ptr<const DefaultExecutor> exec_;
+    const settings<remove_complex<ValueType>> settings_;
+};
+
+
+template <typename ValueType>
+void apply(std::shared_ptr<const DefaultExecutor> exec,
+           const settings<remove_complex<ValueType>>& settings,
+           const batch::BatchLinOp* const mat,
+           const batch::BatchLinOp* const precon,
+           const batch::MultiVector<ValueType>* const b,
+           batch::MultiVector<ValueType>* const x,
+           batch::log::detail::log_data<remove_complex<ValueType>>& log_data)
+{
+    auto dispatcher = batch::solver::create_dispatcher<ValueType>(
+        kernel_caller<ValueType>(exec, settings), settings, mat, precon);
+    dispatcher.apply(b, x, log_data);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_BICGSTAB_APPLY_KERNEL);
+
+
+}  // namespace batch_bicgstab
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
diff --git a/reference/solver/batch_bicgstab_kernels.hpp.inc b/reference/solver/batch_bicgstab_kernels.hpp.inc
new file mode 100644
index 00000000000..1ec38751a0f
--- /dev/null
+++ b/reference/solver/batch_bicgstab_kernels.hpp.inc
@@ -0,0 +1,349 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+template <typename BatchMatrixType_entry, typename ValueType>
+inline void initialize(
+    const BatchMatrixType_entry& A_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& b_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& x_entry,
+    const gko::batch::multi_vector::batch_item<ValueType>& rho_old_entry,
+    const gko::batch::multi_vector::batch_item<ValueType>& omega_entry,
+    const gko::batch::multi_vector::batch_item<ValueType>& alpha_entry,
+    const gko::batch::multi_vector::batch_item<ValueType>& r_entry,
+    const gko::batch::multi_vector::batch_item<ValueType>& r_hat_entry,
+    const gko::batch::multi_vector::batch_item<ValueType>& p_entry,
+    const gko::batch::multi_vector::batch_item<ValueType>& p_hat_entry,
+    const gko::batch::multi_vector::batch_item<ValueType>& v_entry,
+    const gko::batch::multi_vector::batch_item<
+        typename gko::remove_complex<ValueType>>& rhs_norms_entry,
+    const gko::batch::multi_vector::batch_item<
+        typename gko::remove_complex<ValueType>>& res_norms_entry)
+{
+    rho_old_entry.values[0] = one<ValueType>();
+    omega_entry.values[0] = one<ValueType>();
+    alpha_entry.values[0] = one<ValueType>();
+
+    // Compute norms of rhs
+    compute_norm2_kernel<ValueType>(b_entry, rhs_norms_entry);
+
+    // r = b
+    copy_kernel(b_entry, r_entry);
+
+    // r = b - A*x
+    advanced_apply_kernel(static_cast<ValueType>(-1.0), A_entry,
+                          gko::batch::to_const(x_entry),
+                          static_cast<ValueType>(1.0), r_entry);
+    compute_norm2_kernel<ValueType>(gko::batch::to_const(r_entry),
+                                    res_norms_entry);
+
+    for (int r = 0; r < p_entry.num_rows; r++) {
+        r_hat_entry.values[r * r_hat_entry.stride] =
+            r_entry.values[r * r_entry.stride];
+        p_entry.values[r * p_entry.stride] = zero<ValueType>();
+        p_hat_entry.values[r * p_hat_entry.stride] = zero<ValueType>();
+        v_entry.values[r * v_entry.stride] = zero<ValueType>();
+    }
+}
+
+
+template <typename ValueType>
+inline void update_p(
+    const gko::batch::multi_vector::batch_item<const ValueType>& rho_new_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& rho_old_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& omega_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& r_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& v_entry,
+    const gko::batch::multi_vector::batch_item<ValueType>& p_entry)
+{
+    const ValueType beta = (rho_new_entry.values[0] / rho_old_entry.values[0]) *
+                           (alpha_entry.values[0] / omega_entry.values[0]);
+    for (int r = 0; r < p_entry.num_rows; r++) {
+        p_entry.values[r * p_entry.stride] =
+            r_entry.values[r * r_entry.stride] +
+            beta * (p_entry.values[r * p_entry.stride] -
+                    omega_entry.values[0] * v_entry.values[r * v_entry.stride]);
+    }
+}
+
+
+template <typename ValueType>
+inline void compute_alpha(
+    const gko::batch::multi_vector::batch_item<const ValueType>& rho_new_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& r_hat_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& v_entry,
+    const gko::batch::multi_vector::batch_item<ValueType>& alpha_entry)
+{
+    compute_dot_product_kernel<ValueType>(r_hat_entry, v_entry, alpha_entry);
+    alpha_entry.values[0] = rho_new_entry.values[0] / alpha_entry.values[0];
+}
+
+
+template <typename ValueType>
+inline void update_s(
+    const gko::batch::multi_vector::batch_item<const ValueType>& r_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& v_entry,
+    const gko::batch::multi_vector::batch_item<ValueType>& s_entry)
+{
+    for (int r = 0; r < s_entry.num_rows; r++) {
+        s_entry.values[r * s_entry.stride] =
+            r_entry.values[r * r_entry.stride] -
+            alpha_entry.values[0] * v_entry.values[r * v_entry.stride];
+    }
+}
+
+
+template <typename ValueType>
+inline void compute_omega(
+    const gko::batch::multi_vector::batch_item<const ValueType>& t_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& s_entry,
+    const gko::batch::multi_vector::batch_item<ValueType>& temp_entry,
+    const gko::batch::multi_vector::batch_item<ValueType>& omega_entry)
+{
+    compute_dot_product_kernel<ValueType>(t_entry, s_entry, omega_entry);
+    compute_dot_product_kernel<ValueType>(t_entry, t_entry, temp_entry);
+    omega_entry.values[0] /= temp_entry.values[0];
+}
+
+
+template <typename ValueType>
+inline void update_x_and_r(
+    const gko::batch::multi_vector::batch_item<const ValueType>& p_hat_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& s_hat_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& omega_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& s_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& t_entry,
+    const gko::batch::multi_vector::batch_item<ValueType>& x_entry,
+    const gko::batch::multi_vector::batch_item<ValueType>& r_entry)
+{
+    const ValueType omega = omega_entry.values[0];
+    for (int r = 0; r < x_entry.num_rows; r++) {
+        x_entry.values[r * x_entry.stride] =
+            x_entry.values[r * x_entry.stride] +
+            alpha_entry.values[0] * p_hat_entry.values[r * p_hat_entry.stride] +
+            omega * s_hat_entry.values[r * s_hat_entry.stride];
+
+        r_entry.values[r * r_entry.stride] =
+            s_entry.values[r * s_entry.stride] -
+            omega * t_entry.values[r * t_entry.stride];
+    }
+}
+
+template <typename ValueType>
+inline void update_x_middle(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha_entry,
+    const gko::batch::multi_vector::batch_item<const ValueType>& p_hat_entry,
+    const gko::batch::multi_vector::batch_item<ValueType>& x_entry)
+{
+    for (int r = 0; r < x_entry.num_rows; r++) {
+        x_entry.values[r * x_entry.stride] =
+            x_entry.values[r * x_entry.stride] +
+            alpha_entry.values[0] * p_hat_entry.values[r * p_hat_entry.stride];
+    }
+}
+
+
+template <typename StopType, typename PrecType, typename LogType,
+          typename BatchMatrixType, typename ValueType>
+inline void batch_entry_bicgstab_impl(
+    const gko::kernels::batch_bicgstab::settings<remove_complex<ValueType>>&
+        settings,
+    LogType logger, PrecType prec, const BatchMatrixType& a,
+    const gko::batch::multi_vector::uniform_batch<const ValueType>& b,
+    const gko::batch::multi_vector::uniform_batch<ValueType>& x,
+    const size_type batch_item_id, unsigned char* const local_space)
+{
+    using real_type = typename gko::remove_complex<ValueType>;
+    const auto num_rows = a.num_rows;
+    const auto num_rhs = b.num_rhs;
+    GKO_ASSERT(num_rhs <= max_num_rhs);
+
+    unsigned char* const shared_space = local_space;
+    ValueType* const r = reinterpret_cast<ValueType*>(shared_space);
+    ValueType* const r_hat = r + num_rows * num_rhs;
+    ValueType* const p = r_hat + num_rows * num_rhs;
+    ValueType* const p_hat = p + num_rows * num_rhs;
+    ValueType* const v = p_hat + num_rows * num_rhs;
+    ValueType* const s = v + num_rows * num_rhs;
+    ValueType* const s_hat = s + num_rows * num_rhs;
+    ValueType* const t = s_hat + num_rows * num_rhs;
+    ValueType* const prec_work = t + num_rows * num_rhs;
+    ValueType rho_old[max_num_rhs];
+    ValueType rho_new[max_num_rhs];
+    ValueType omega[max_num_rhs];
+    ValueType alpha[max_num_rhs];
+    ValueType temp[max_num_rhs];
+    real_type norms_rhs[max_num_rhs];
+    real_type norms_res[max_num_rhs];
+
+    const auto A_entry = gko::batch::matrix::extract_batch_item(
+        gko::batch::matrix::to_const(a), batch_item_id);
+    const gko::batch::multi_vector::batch_item<const ValueType> b_entry =
+        gko::batch::extract_batch_item(gko::batch::to_const(b), batch_item_id);
+    const gko::batch::multi_vector::batch_item<ValueType> x_entry =
+        gko::batch::extract_batch_item(x, batch_item_id);
+
+    const gko::batch::multi_vector::batch_item<ValueType> r_entry{
+        r, num_rhs, num_rows, num_rhs};
+    const gko::batch::multi_vector::batch_item<ValueType> r_hat_entry{
+        r_hat, num_rhs, num_rows, num_rhs};
+    const gko::batch::multi_vector::batch_item<ValueType> p_entry{
+        p, num_rhs, num_rows, num_rhs};
+    const gko::batch::multi_vector::batch_item<ValueType> p_hat_entry{
+        p_hat, num_rhs, num_rows, num_rhs};
+    const gko::batch::multi_vector::batch_item<ValueType> v_entry{
+        v, num_rhs, num_rows, num_rhs};
+    const gko::batch::multi_vector::batch_item<ValueType> s_entry{
+        s, num_rhs, num_rows, num_rhs};
+    const gko::batch::multi_vector::batch_item<ValueType> s_hat_entry{
+        s_hat, num_rhs, num_rows, num_rhs};
+    const gko::batch::multi_vector::batch_item<ValueType> t_entry{
+        t, num_rhs, num_rows, num_rhs};
+    const gko::batch::multi_vector::batch_item<ValueType> rho_old_entry{
+        rho_old, num_rhs, 1, num_rhs};
+    const gko::batch::multi_vector::batch_item<ValueType> rho_new_entry{
+        rho_new, num_rhs, 1, num_rhs};
+    const gko::batch::multi_vector::batch_item<ValueType> omega_entry{
+        omega, num_rhs, 1, num_rhs};
+    const gko::batch::multi_vector::batch_item<ValueType> alpha_entry{
+        alpha, num_rhs, 1, num_rhs};
+    const gko::batch::multi_vector::batch_item<ValueType> temp_entry{
+        temp, num_rhs, 1, num_rhs};
+    const gko::batch::multi_vector::batch_item<real_type> rhs_norms_entry{
+        norms_rhs, num_rhs, 1, num_rhs};
+    const gko::batch::multi_vector::batch_item<real_type> res_norms_entry{
+        norms_res, num_rhs, 1, num_rhs};
+
+    // generate preconditioner
+    prec.generate(batch_item_id, A_entry, prec_work);
+
+    // initialization
+    // rho_old = 1, omega = 1, alpha = 1
+    // compute b norms
+    // r = b - A*x
+    // compute residual norms
+    // r_hat = r
+    // p = 0
+    // p_hat = 0
+    // v = 0
+    initialize(A_entry, b_entry, gko::batch::to_const(x_entry), rho_old_entry,
+               omega_entry, alpha_entry, r_entry, r_hat_entry, p_entry,
+               p_hat_entry, v_entry, rhs_norms_entry, res_norms_entry);
+
+    // stopping criterion object
+    StopType stop(settings.residual_tol, rhs_norms_entry.values);
+
+    int iter{};
+
+    for (iter = 0; iter < settings.max_iterations; iter++) {
+        if (stop.check_converged(res_norms_entry.values)) {
+            logger.log_iteration(batch_item_id, iter,
+                                 res_norms_entry.values[0]);
+            break;
+        }
+
+        // rho_new =  < r_hat , r > = (r_hat)' * (r)
+        compute_dot_product_kernel<ValueType>(gko::batch::to_const(r_hat_entry),
+                                              gko::batch::to_const(r_entry),
+                                              rho_new_entry);
+
+        // beta = (rho_new / rho_old)*(alpha / omega)
+        // p = r + beta*(p - omega * v)
+        update_p(gko::batch::to_const(rho_new_entry),
+                 gko::batch::to_const(rho_old_entry),
+                 gko::batch::to_const(alpha_entry),
+                 gko::batch::to_const(omega_entry),
+                 gko::batch::to_const(r_entry), gko::batch::to_const(v_entry),
+                 p_entry);
+
+        // p_hat = precond * p
+        prec.apply(gko::batch::to_const(p_entry), p_hat_entry);
+
+        // v = A * p_hat
+        simple_apply_kernel(A_entry, gko::batch::to_const(p_hat_entry),
+                            v_entry);
+
+        // alpha = rho_new / < r_hat , v>
+        compute_alpha(gko::batch::to_const(rho_new_entry),
+                      gko::batch::to_const(r_hat_entry),
+                      gko::batch::to_const(v_entry), alpha_entry);
+
+        // s = r - alpha*v
+        update_s(gko::batch::to_const(r_entry),
+                 gko::batch::to_const(alpha_entry),
+                 gko::batch::to_const(v_entry), s_entry);
+
+        // an estimate of residual norms
+        compute_norm2_kernel<ValueType>(gko::batch::to_const(s_entry),
+                                        res_norms_entry);
+
+        if (stop.check_converged(res_norms_entry.values)) {
+            // update x for the systems
+            // x = x + alpha * p_hat
+            update_x_middle(gko::batch::to_const(alpha_entry),
+                            gko::batch::to_const(p_hat_entry), x_entry);
+            logger.log_iteration(batch_item_id, iter,
+                                 res_norms_entry.values[0]);
+            break;
+        }
+
+        // s_hat = precond * s
+        prec.apply(gko::batch::to_const(s_entry), s_hat_entry);
+
+        // t = A * s_hat
+        simple_apply_kernel(A_entry, gko::batch::to_const(s_hat_entry),
+                            t_entry);
+        // omega = <t,s> / <t,t>
+        compute_omega(gko::batch::to_const(t_entry),
+                      gko::batch::to_const(s_entry), temp_entry, omega_entry);
+
+
+        // x = x + alpha * p_hat + omega * s_hat
+        // r = s - omega * t
+        update_x_and_r(gko::batch::to_const(p_hat_entry),
+                       gko::batch::to_const(s_hat_entry),
+                       gko::batch::to_const(alpha_entry),
+                       gko::batch::to_const(omega_entry),
+                       gko::batch::to_const(s_entry),
+                       gko::batch::to_const(t_entry), x_entry, r_entry);
+
+        compute_norm2_kernel<ValueType>(gko::batch::to_const(r_entry),
+                                        res_norms_entry);
+
+        // rho_old = rho_new
+        copy_kernel(gko::batch::to_const(rho_new_entry), rho_old_entry);
+    }
+
+    logger.log_iteration(batch_item_id, iter, res_norms_entry.values[0]);
+}
diff --git a/reference/stop/batch_criteria.hpp b/reference/stop/batch_criteria.hpp
new file mode 100644
index 00000000000..875132b1320
--- /dev/null
+++ b/reference/stop/batch_criteria.hpp
@@ -0,0 +1,128 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_REFERENCE_STOP_BATCH_CRITERIA_HPP_
+#define GKO_REFERENCE_STOP_BATCH_CRITERIA_HPP_
+
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/stop/batch_stop_enum.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace host {
+namespace batch_stop {
+
+
+/**
+ * Stopping criterion for batch solvers with relative residual threshold.
+ *
+ * @note Supports only one right hand side.
+ */
+template <typename ValueType>
+class SimpleRelResidual {
+public:
+    using real_type = remove_complex<ValueType>;
+
+    /**
+     * Set up the stopping criterion and convergence variable.
+     *
+     * @param rel_res_tol  Tolerance on relative residual norm.
+     * @param rhs_b_norms  The RHS norms.
+     */
+    SimpleRelResidual(const real_type rel_res_tol,
+                      const real_type* const rhs_b_norms)
+        : rel_tol_{rel_res_tol}, rhs_norms_{rhs_b_norms}
+    {}
+
+    /**
+     * Checks whether the right hand side has converged.
+     *
+     * @param residual_norms  Current residual norm.
+     *
+     * @return  true if converged, false otherwise.
+     */
+    bool check_converged(const real_type* const residual_norms) const
+    {
+        return residual_norms[0] <= (rel_tol_ * rhs_norms_[0]);
+    }
+
+private:
+    const real_type rel_tol_;
+    const real_type* const rhs_norms_;
+};
+
+
+/**
+ * Stopping criterion for batch solvers that checks for an absolute residual
+ * threshold.
+ *
+ * @note Supports only one right hand side.
+ */
+template <typename ValueType>
+class SimpleAbsResidual {
+public:
+    using real_type = remove_complex<ValueType>;
+
+    /**
+     * Set up the stopping criterion and convergence variable.
+     *
+     * @param tol  Tolerance on residual norm.
+     * @param dummy  for uniform creation of stopping criteria.
+     */
+    SimpleAbsResidual(const real_type tol, const real_type*) : abs_tol_{tol} {}
+
+    /**
+     * Checks whether the different right hand sides have converged.
+     *
+     * @param residual_norms  current residual norm of each RHS.
+     * @return  true if converged, false otherwise.
+     */
+    bool check_converged(const real_type* const residual_norms) const
+    {
+        return (residual_norms[0] <= abs_tol_);
+    }
+
+private:
+    const real_type abs_tol_;
+};
+
+
+}  // namespace batch_stop
+}  // namespace host
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_REFERENCE_STOP_BATCH_CRITERIA_HPP_
diff --git a/reference/stop/residual_norm_kernels.cpp b/reference/stop/residual_norm_kernels.cpp
index 2f2caa9cc1d..08f9384375b 100644
--- a/reference/stop/residual_norm_kernels.cpp
+++ b/reference/stop/residual_norm_kernels.cpp
@@ -67,7 +67,7 @@ void residual_norm(std::shared_ptr<const ReferenceExecutor> exec,
     *all_converged = true;
     *one_changed = false;
     for (size_type i = 0; i < tau->get_size()[1]; ++i) {
-        if (tau->at(i) < rel_residual_goal * orig_tau->at(i)) {
+        if (tau->at(i) <= rel_residual_goal * orig_tau->at(i)) {
             stop_status->get_data()[i].converge(stoppingId, setFinalized);
             *one_changed = true;
         }
@@ -107,7 +107,7 @@ void implicit_residual_norm(
     *all_converged = true;
     *one_changed = false;
     for (size_type i = 0; i < tau->get_size()[1]; ++i) {
-        if (sqrt(abs(tau->at(i))) < rel_residual_goal * orig_tau->at(i)) {
+        if (sqrt(abs(tau->at(i))) <= rel_residual_goal * orig_tau->at(i)) {
             stop_status->get_data()[i].converge(stoppingId, setFinalized);
             *one_changed = true;
         }
diff --git a/reference/test/base/CMakeLists.txt b/reference/test/base/CMakeLists.txt
index b4d922ec187..7230b329918 100644
--- a/reference/test/base/CMakeLists.txt
+++ b/reference/test/base/CMakeLists.txt
@@ -1,4 +1,5 @@
 ginkgo_create_test(array)
+ginkgo_create_test(batch_multi_vector_kernels)
 ginkgo_create_test(combination)
 ginkgo_create_test(composition)
 ginkgo_create_test(index_set)
diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp
new file mode 100644
index 00000000000..26395bf6791
--- /dev/null
+++ b/reference/test/base/batch_multi_vector_kernels.cpp
@@ -0,0 +1,428 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+
+
+#include <complex>
+#include <memory>
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/base/batch_multi_vector_kernels.hpp"
+#include "core/base/batch_utilities.hpp"
+#include "core/test/utils.hpp"
+#include "core/test/utils/batch_helpers.hpp"
+
+
+template <typename T>
+class MultiVector : public ::testing::Test {
+protected:
+    using value_type = T;
+    using size_type = gko::size_type;
+    using Mtx = gko::batch::MultiVector<value_type>;
+    using DenseMtx = gko::matrix::Dense<value_type>;
+    using ComplexMtx = gko::to_complex<Mtx>;
+    MultiVector()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx_0(gko::batch::initialize<Mtx>(
+              {{I<T>({1.0, -1.0, 1.5}), I<T>({-2.0, 2.0, 3.0})},
+               {{1.0, -2.0, -0.5}, {1.0, -2.5, 4.0}}},
+              exec)),
+          mtx_00(gko::initialize<DenseMtx>(
+              {I<T>({1.0, -1.0, 1.5}), I<T>({-2.0, 2.0, 3.0})}, exec)),
+          mtx_01(gko::initialize<DenseMtx>(
+              {I<T>({1.0, -2.0, -0.5}), I<T>({1.0, -2.5, 4.0})}, exec)),
+          mtx_1(gko::batch::initialize<Mtx>(
+              {{{1.0, -1.0, 2.2}, {-2.0, 2.0, -0.5}},
+               {{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}}},
+              exec)),
+          mtx_10(gko::initialize<DenseMtx>(
+              {I<T>({1.0, -1.0, 2.2}), I<T>({-2.0, 2.0, -0.5})}, exec)),
+          mtx_11(gko::initialize<DenseMtx>({{1.0, 2.5, 3.0}, {1.0, 2.0, 3.0}},
+                                           exec)),
+          mtx_2(gko::batch::initialize<Mtx>(
+              {{{1.0, 1.5}, {6.0, 1.0}, {-0.25, 1.0}},
+               {I<T>({2.0, -2.0}), I<T>({1.0, 3.0}), I<T>({4.0, 3.0})}},
+              exec)),
+          mtx_20(gko::initialize<DenseMtx>(
+              {I<T>({1.0, 1.5}), I<T>({6.0, 1.0}), I<T>({-0.25, 1.0})}, exec)),
+          mtx_21(gko::initialize<DenseMtx>(
+              {I<T>({2.0, -2.0}), I<T>({1.0, 3.0}), I<T>({4.0, 3.0})}, exec)),
+          mtx_3(gko::batch::initialize<Mtx>(
+              {{I<T>({1.0, 1.5}), I<T>({6.0, 1.0})}, {{2.0, -2.0}, {1.0, 3.0}}},
+              exec)),
+          mtx_30(gko::initialize<DenseMtx>({I<T>({1.0, 1.5}), I<T>({6.0, 1.0})},
+                                           exec)),
+          mtx_31(gko::initialize<DenseMtx>(
+              {I<T>({2.0, -2.0}), I<T>({1.0, 3.0})}, exec)),
+          mtx_4(gko::batch::initialize<Mtx>(
+              {{{1.0, 1.5, 3.0}, {6.0, 1.0, 5.0}, {6.0, 1.0, 5.5}},
+               {{2.0, -2.0, 1.5}, {4.0, 3.0, 2.2}, {-1.25, 3.0, 0.5}}},
+              exec)),
+          mtx_5(gko::batch::initialize<Mtx>(
+              {{{1.0, 1.5}, {6.0, 1.0}, {7.0, -4.5}},
+               {I<T>({2.0, -2.0}), I<T>({1.0, 3.0}), I<T>({4.0, 3.0})}},
+              exec)),
+          mtx_6(gko::batch::initialize<Mtx>(
+              {{{1.0, 0.0, 3.0}, {0.0, 3.0, 0.0}, {0.0, 1.0, 5.0}},
+               {{2.0, 0.0, 5.0}, {0.0, 1.0, 0.0}, {0.0, -1.0, 8.0}}},
+              exec))
+    {}
+
+    std::shared_ptr<const gko::ReferenceExecutor> exec;
+    std::unique_ptr<Mtx> mtx_0;
+    std::unique_ptr<DenseMtx> mtx_00;
+    std::unique_ptr<DenseMtx> mtx_01;
+    std::unique_ptr<Mtx> mtx_1;
+    std::unique_ptr<DenseMtx> mtx_10;
+    std::unique_ptr<DenseMtx> mtx_11;
+    std::unique_ptr<Mtx> mtx_2;
+    std::unique_ptr<DenseMtx> mtx_20;
+    std::unique_ptr<DenseMtx> mtx_21;
+    std::unique_ptr<Mtx> mtx_3;
+    std::unique_ptr<DenseMtx> mtx_30;
+    std::unique_ptr<DenseMtx> mtx_31;
+    std::unique_ptr<Mtx> mtx_4;
+    std::unique_ptr<Mtx> mtx_5;
+    std::unique_ptr<Mtx> mtx_6;
+
+    std::default_random_engine rand_engine;
+};
+
+TYPED_TEST_SUITE(MultiVector, gko::test::ValueTypes, TypenameNameGenerator);
+
+
+TYPED_TEST(MultiVector, ScalesData)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::batch::initialize<Mtx>(
+        {{{2.0, -2.0, 1.5}}, {{3.0, -1.0, 0.25}}}, this->exec);
+    auto ualpha = gko::batch::unbatch<gko::batch::MultiVector<T>>(alpha.get());
+
+    this->mtx_0->scale(alpha.get());
+
+    this->mtx_00->scale(ualpha[0].get());
+    this->mtx_01->scale(ualpha[1].get());
+    auto res =
+        gko::batch::unbatch<gko::batch::MultiVector<T>>(this->mtx_0.get());
+    GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_00.get(), 0.);
+    GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_01.get(), 0.);
+}
+
+
+TYPED_TEST(MultiVector, ScalesDataWithScalar)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::batch::initialize<Mtx>({{2.0}, {-2.0}}, this->exec);
+    auto ualpha = gko::batch::unbatch<gko::batch::MultiVector<T>>(alpha.get());
+
+    this->mtx_1->scale(alpha.get());
+
+    this->mtx_10->scale(ualpha[0].get());
+    this->mtx_11->scale(ualpha[1].get());
+    auto res =
+        gko::batch::unbatch<gko::batch::MultiVector<T>>(this->mtx_1.get());
+    GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.);
+    GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.);
+}
+
+
+TYPED_TEST(MultiVector, ScalesDataWithMultipleScalars)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::batch::initialize<Mtx>(
+        {{{2.0, -2.0, -1.5}}, {{2.0, -2.0, 3.0}}}, this->exec);
+    auto ualpha = gko::batch::unbatch<gko::batch::MultiVector<T>>(alpha.get());
+
+    this->mtx_1->scale(alpha.get());
+    this->mtx_10->scale(ualpha[0].get());
+    this->mtx_11->scale(ualpha[1].get());
+
+    auto res =
+        gko::batch::unbatch<gko::batch::MultiVector<T>>(this->mtx_1.get());
+    GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.);
+    GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.);
+}
+
+
+TYPED_TEST(MultiVector, AddsScaled)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::batch::initialize<Mtx>(
+        {{{2.0, -2.0, 1.5}}, {{2.0, -2.0, 3.0}}}, this->exec);
+    auto ualpha = gko::batch::unbatch<gko::batch::MultiVector<T>>(alpha.get());
+
+    this->mtx_1->add_scaled(alpha.get(), this->mtx_0.get());
+
+    this->mtx_10->add_scaled(ualpha[0].get(), this->mtx_00.get());
+    this->mtx_11->add_scaled(ualpha[1].get(), this->mtx_01.get());
+    auto res =
+        gko::batch::unbatch<gko::batch::MultiVector<T>>(this->mtx_1.get());
+    GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.);
+    GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.);
+}
+
+
+TYPED_TEST(MultiVector, AddsScaledWithScalar)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::batch::initialize<Mtx>({{2.0}, {-2.0}}, this->exec);
+    auto ualpha = gko::batch::unbatch<gko::batch::MultiVector<T>>(alpha.get());
+
+    this->mtx_1->add_scaled(alpha.get(), this->mtx_0.get());
+
+    this->mtx_10->add_scaled(ualpha[0].get(), this->mtx_00.get());
+    this->mtx_11->add_scaled(ualpha[1].get(), this->mtx_01.get());
+    auto res =
+        gko::batch::unbatch<gko::batch::MultiVector<T>>(this->mtx_1.get());
+    GKO_ASSERT_MTX_NEAR(res[0].get(), this->mtx_10.get(), 0.);
+    GKO_ASSERT_MTX_NEAR(res[1].get(), this->mtx_11.get(), 0.);
+}
+
+
+TYPED_TEST(MultiVector, AddScaledFailsOnWrongSizes)
+{
+    using Mtx = typename TestFixture::Mtx;
+    auto alpha = gko::batch::initialize<Mtx>(
+        {{2.0, 3.0, 4.0, 5.0}, {-2.0, 2.0, 4.0, 5.0}}, this->exec);
+
+    ASSERT_THROW(this->mtx_1->add_scaled(alpha.get(), this->mtx_2.get()),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(MultiVector, ComputesDot)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto result =
+        Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 3}));
+    auto ures = gko::batch::unbatch<gko::batch::MultiVector<T>>(result.get());
+
+    this->mtx_0->compute_dot(this->mtx_1.get(), result.get());
+
+    this->mtx_00->compute_dot(this->mtx_10.get(), ures[0].get());
+    this->mtx_01->compute_dot(this->mtx_11.get(), ures[1].get());
+    auto res = gko::batch::unbatch<gko::batch::MultiVector<T>>(result.get());
+    GKO_ASSERT_MTX_NEAR(res[0].get(), ures[0].get(), 0.);
+    GKO_ASSERT_MTX_NEAR(res[1].get(), ures[1].get(), 0.);
+}
+
+
+TYPED_TEST(MultiVector, ComputeDotFailsOnWrongInputSize)
+{
+    using Mtx = typename TestFixture::Mtx;
+
+    auto result =
+        Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 3}));
+
+    ASSERT_THROW(this->mtx_1->compute_dot(this->mtx_2.get(), result.get()),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(MultiVector, ComputeDotFailsOnWrongResultSize)
+{
+    using Mtx = typename TestFixture::Mtx;
+
+    auto result =
+        Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 2}));
+
+    ASSERT_THROW(this->mtx_0->compute_dot(this->mtx_1.get(), result.get()),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(MultiVector, ComputesConjDot)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto result =
+        Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 3}));
+    auto ures = gko::batch::unbatch<gko::batch::MultiVector<T>>(result.get());
+
+    this->mtx_0->compute_conj_dot(this->mtx_1.get(), result.get());
+
+    this->mtx_00->compute_conj_dot(this->mtx_10.get(), ures[0].get());
+    this->mtx_01->compute_conj_dot(this->mtx_11.get(), ures[1].get());
+    auto res = gko::batch::unbatch<gko::batch::MultiVector<T>>(result.get());
+    GKO_ASSERT_MTX_NEAR(res[0].get(), ures[0].get(), 0.);
+    GKO_ASSERT_MTX_NEAR(res[1].get(), ures[1].get(), 0.);
+}
+
+
+TYPED_TEST(MultiVector, ComputeConjDotFailsOnWrongInputSize)
+{
+    using Mtx = typename TestFixture::Mtx;
+
+    auto result =
+        Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 3}));
+
+    ASSERT_THROW(this->mtx_1->compute_conj_dot(this->mtx_2.get(), result.get()),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(MultiVector, ComputeConjDotFailsOnWrongResultSize)
+{
+    using Mtx = typename TestFixture::Mtx;
+
+    auto result =
+        Mtx::create(this->exec, gko::batch_dim<2>(2, gko::dim<2>{1, 2}));
+
+    ASSERT_THROW(this->mtx_0->compute_conj_dot(this->mtx_1.get(), result.get()),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(MultiVector, ComputesNorm2)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    using T_nc = gko::remove_complex<T>;
+    using NormVector = gko::batch::MultiVector<T_nc>;
+    auto mtx(gko::batch::initialize<Mtx>(
+        {{I<T>{1.0, 0.0}, I<T>{2.0, 3.0}, I<T>{2.0, 4.0}},
+         {I<T>{-4.0, 2.0}, I<T>{-3.0, -2.0}, I<T>{0.0, 1.0}}},
+        this->exec));
+    auto batch_size = gko::batch_dim<2>(2, gko::dim<2>{1, 2});
+    auto result = NormVector::create(this->exec, batch_size);
+
+    mtx->compute_norm2(result.get());
+
+    EXPECT_EQ(result->at(0, 0, 0), T_nc{3.0});
+    EXPECT_EQ(result->at(0, 0, 1), T_nc{5.0});
+    EXPECT_EQ(result->at(1, 0, 0), T_nc{5.0});
+    EXPECT_EQ(result->at(1, 0, 1), T_nc{3.0});
+}
+
+
+TYPED_TEST(MultiVector, CopiesData)
+{
+    gko::kernels::reference::batch_multi_vector::copy(
+        this->exec, this->mtx_0.get(), this->mtx_1.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(this->mtx_1.get(), this->mtx_0.get(), 0.);
+}
+
+
+TYPED_TEST(MultiVector, ConvertsToPrecision)
+{
+    using MultiVector = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    using OtherT = typename gko::next_precision<T>;
+    using OtherMultiVector = typename gko::batch::MultiVector<OtherT>;
+    auto tmp = OtherMultiVector::create(this->exec);
+    auto res = MultiVector::create(this->exec);
+    // If OtherT is more precise: 0, otherwise r
+    auto residual = r<OtherT>::value < r<T>::value
+                        ? gko::remove_complex<T>{0}
+                        : gko::remove_complex<T>{r<OtherT>::value};
+
+    this->mtx_1->convert_to(tmp.get());
+    tmp->convert_to(res.get());
+
+    auto ures = gko::batch::unbatch<gko::batch::MultiVector<T>>(res.get());
+    auto umtx =
+        gko::batch::unbatch<gko::batch::MultiVector<T>>(this->mtx_1.get());
+    GKO_ASSERT_MTX_NEAR(umtx[0].get(), ures[0].get(), residual);
+    GKO_ASSERT_MTX_NEAR(umtx[1].get(), ures[1].get(), residual);
+}
+
+
+TYPED_TEST(MultiVector, MovesToPrecision)
+{
+    using MultiVector = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    using OtherT = typename gko::next_precision<T>;
+    using OtherMultiVector = typename gko::batch::MultiVector<OtherT>;
+    auto tmp = OtherMultiVector::create(this->exec);
+    auto res = MultiVector::create(this->exec);
+    // If OtherT is more precise: 0, otherwise r
+    auto residual = r<OtherT>::value < r<T>::value
+                        ? gko::remove_complex<T>{0}
+                        : gko::remove_complex<T>{r<OtherT>::value};
+
+    this->mtx_1->move_to(tmp.get());
+    tmp->move_to(res.get());
+
+    auto ures = gko::batch::unbatch<gko::batch::MultiVector<T>>(res.get());
+    auto umtx =
+        gko::batch::unbatch<gko::batch::MultiVector<T>>(this->mtx_1.get());
+    GKO_ASSERT_MTX_NEAR(umtx[0].get(), ures[0].get(), residual);
+    GKO_ASSERT_MTX_NEAR(umtx[1].get(), ures[1].get(), residual);
+}
+
+
+TYPED_TEST(MultiVector, ConvertsEmptyToPrecision)
+{
+    using MultiVector = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    using OtherT = typename gko::next_precision<T>;
+    using OtherMultiVector = typename gko::batch::MultiVector<OtherT>;
+    auto empty = OtherMultiVector::create(this->exec);
+    auto res = MultiVector::create(this->exec);
+
+    empty->convert_to(res.get());
+
+    ASSERT_FALSE(res->get_num_batch_items());
+}
+
+
+TYPED_TEST(MultiVector, MovesEmptyToPrecision)
+{
+    using MultiVector = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    using OtherT = typename gko::next_precision<T>;
+    using OtherMultiVector = typename gko::batch::MultiVector<OtherT>;
+    auto empty = OtherMultiVector::create(this->exec);
+    auto res = MultiVector::create(this->exec);
+
+    empty->move_to(res.get());
+
+    ASSERT_FALSE(res->get_num_batch_items());
+}
diff --git a/reference/test/distributed/CMakeLists.txt b/reference/test/distributed/CMakeLists.txt
index 2985c7b5e11..42ad2d7e1a2 100644
--- a/reference/test/distributed/CMakeLists.txt
+++ b/reference/test/distributed/CMakeLists.txt
@@ -1,3 +1,4 @@
 ginkgo_create_test(matrix_kernels)
+ginkgo_create_test(partition_helpers_kernels)
 ginkgo_create_test(partition_kernels)
 ginkgo_create_test(vector_kernels)
diff --git a/reference/test/distributed/partition_helpers_kernels.cpp b/reference/test/distributed/partition_helpers_kernels.cpp
new file mode 100644
index 00000000000..f0ce4918d01
--- /dev/null
+++ b/reference/test/distributed/partition_helpers_kernels.cpp
@@ -0,0 +1,145 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+
+#include <gtest/gtest-typed-test.h>
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/distributed/partition.hpp>
+
+
+#include "core/distributed/partition_helpers_kernels.hpp"
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+using comm_index_type = gko::experimental::distributed::comm_index_type;
+
+
+template <typename GlobalIndexType>
+class PartitionHelpers : public ::testing::Test {
+protected:
+    using global_index_type = GlobalIndexType;
+
+    PartitionHelpers() : ref(gko::ReferenceExecutor::create()) {}
+
+    std::shared_ptr<const gko::ReferenceExecutor> ref;
+    gko::array<global_index_type> default_range_start_ends{
+        this->ref, {0, 4, 4, 7, 7, 9, 9, 11}};
+    gko::array<comm_index_type> default_part_ids{this->ref, {0, 1, 2, 3}};
+};
+
+TYPED_TEST_SUITE(PartitionHelpers, gko::test::IndexTypes,
+                 TypenameNameGenerator);
+
+
+TYPED_TEST(PartitionHelpers, CanSortByRangeStartIdentity)
+{
+    using itype = typename TestFixture::global_index_type;
+    auto range_start_ends = this->default_range_start_ends;
+    auto part_ids = this->default_part_ids;
+
+    gko::kernels::reference::partition_helpers::sort_by_range_start(
+        this->ref, range_start_ends, part_ids);
+
+    GKO_ASSERT_ARRAY_EQ(range_start_ends, this->default_range_start_ends);
+    GKO_ASSERT_ARRAY_EQ(part_ids, this->default_part_ids);
+}
+
+
+TYPED_TEST(PartitionHelpers, CanSortByRangeStart)
+{
+    using global_index_type = typename TestFixture::global_index_type;
+    gko::array<global_index_type> range_start_ends{this->ref,
+                                                   {7, 9, 4, 7, 0, 4, 9, 11}};
+    gko::array<comm_index_type> result_part_ids{this->ref, {2, 1, 0, 3}};
+    auto part_ids = this->default_part_ids;
+
+    gko::kernels::reference::partition_helpers::sort_by_range_start(
+        this->ref, range_start_ends, part_ids);
+
+    GKO_ASSERT_ARRAY_EQ(range_start_ends, this->default_range_start_ends);
+    GKO_ASSERT_ARRAY_EQ(part_ids, result_part_ids);
+}
+
+
+TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRanges)
+{
+    using global_index_type = typename TestFixture::global_index_type;
+    auto range_start_ends = this->default_range_start_ends;
+    bool result = false;
+
+    gko::kernels::reference::partition_helpers::check_consecutive_ranges(
+        this->ref, range_start_ends, result);
+
+    ASSERT_TRUE(result);
+}
+
+
+TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges)
+{
+    using global_index_type = typename TestFixture::global_index_type;
+    gko::array<global_index_type> range_start_ends{this->ref,
+                                                   {7, 9, 4, 7, 0, 4, 9, 11}};
+    bool result = true;
+
+    gko::kernels::reference::partition_helpers::check_consecutive_ranges(
+        this->ref, range_start_ends, result);
+
+    ASSERT_FALSE(result);
+}
+
+
+TYPED_TEST(PartitionHelpers, CanCompressRanges)
+{
+    using itype = typename TestFixture::global_index_type;
+    auto range_start_ends = this->default_range_start_ends;
+    gko::array<itype> range_offsets{this->ref,
+                                    range_start_ends.get_num_elems() / 2 + 1};
+    gko::array<itype> expected_range_offsets{this->ref, {0, 4, 7, 9, 11}};
+
+    gko::kernels::reference::partition_helpers::compress_ranges(
+        this->ref, range_start_ends, range_offsets);
+
+    GKO_ASSERT_ARRAY_EQ(range_offsets, expected_range_offsets);
+}
+
+
+}  // namespace
diff --git a/reference/test/distributed/partition_kernels.cpp b/reference/test/distributed/partition_kernels.cpp
index 4cc7750a193..f92349ee2eb 100644
--- a/reference/test/distributed/partition_kernels.cpp
+++ b/reference/test/distributed/partition_kernels.cpp
@@ -171,6 +171,28 @@ TYPED_TEST(Partition, BuildsFromRangeWithSingleElement)
 }
 
 
+TYPED_TEST(Partition, BuildsFromRangesWithPartIds)
+{
+    using global_index_type = typename TestFixture::global_index_type;
+    using part_type = typename TestFixture::part_type;
+    gko::array<global_index_type> ranges{this->ref, {0, 5, 5, 7, 9, 10}};
+    gko::array<comm_index_type> part_id{this->ref, {0, 4, 3, 1, 2}};
+
+    auto partition =
+        part_type::build_from_contiguous(this->ref, ranges, part_id);
+
+    EXPECT_EQ(partition->get_size(),
+              ranges.get_data()[ranges.get_num_elems() - 1]);
+    EXPECT_EQ(partition->get_num_ranges(), ranges.get_num_elems() - 1);
+    EXPECT_EQ(partition->get_num_parts(), ranges.get_num_elems() - 1);
+    EXPECT_EQ(partition->get_num_empty_parts(), 1);
+    assert_equal_data(partition->get_range_bounds(), {0, 5, 5, 7, 9, 10});
+    assert_equal_data(partition->get_part_ids(), {0, 4, 3, 1, 2});
+    assert_equal_data(partition->get_range_starting_indices(), {0, 0, 0, 0, 0});
+    assert_equal_data(partition->get_part_sizes(), {5, 2, 1, 2, 0});
+}
+
+
 TYPED_TEST(Partition, BuildsFromGlobalSize)
 {
     using part_type = typename TestFixture::part_type;
diff --git a/reference/test/factorization/cholesky_kernels.cpp b/reference/test/factorization/cholesky_kernels.cpp
index d5fae12a2e9..36bbd7e176e 100644
--- a/reference/test/factorization/cholesky_kernels.cpp
+++ b/reference/test/factorization/cholesky_kernels.cpp
@@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <algorithm>
+#include <initializer_list>
 #include <memory>
 
 
@@ -67,12 +68,15 @@ class Cholesky : public ::testing::Test {
     using index_type =
         typename std::tuple_element<1, decltype(ValueIndexType())>::type;
     using matrix_type = gko::matrix::Csr<value_type, index_type>;
+    using sparsity_matrix_type =
+        gko::matrix::SparsityCsr<value_type, index_type>;
     using elimination_forest =
         gko::factorization::elimination_forest<index_type>;
 
     Cholesky()
         : ref(gko::ReferenceExecutor::create()),
           tmp{ref},
+          ref_row_nnz{ref},
           storage_offsets{ref},
           storage{ref},
           row_descs{ref}
@@ -98,13 +102,27 @@ class Cholesky : public ::testing::Test {
         return result;
     }
 
+    void setup(
+        std::initializer_list<std::initializer_list<value_type>> mtx_list,
+        std::initializer_list<std::initializer_list<value_type>> factor_list)
+    {
+        mtx = gko::initialize<matrix_type>(mtx_list, ref);
+        l_factor_ref = gko::initialize<matrix_type>(factor_list, ref);
+        setup_impl();
+    }
+
     void setup(const char* name_mtx, const char* name_factor)
     {
         std::ifstream stream{name_mtx};
         std::ifstream ref_stream{name_factor};
         mtx = gko::read<matrix_type>(stream, this->ref);
-        num_rows = mtx->get_size()[0];
         l_factor_ref = gko::read<matrix_type>(ref_stream, this->ref);
+        setup_impl();
+    }
+
+    void setup_impl()
+    {
+        num_rows = mtx->get_size()[0];
         combined_ref = combined_factor(l_factor_ref.get());
         l_factor = matrix_type::create(ref, l_factor_ref->get_size(),
                                        l_factor_ref->get_num_stored_elements());
@@ -123,6 +141,13 @@ class Cholesky : public ::testing::Test {
         storage_offsets.resize_and_reset(num_rows + 1);
         row_descs.resize_and_reset(num_rows);
 
+        ref_row_nnz.resize_and_reset(num_rows);
+        const auto ref_row_ptrs = l_factor_ref->get_const_row_ptrs();
+        for (gko::size_type row = 0; row < num_rows; row++) {
+            ref_row_nnz.get_data()[row] =
+                ref_row_ptrs[row + 1] - ref_row_ptrs[row];
+        }
+
         const auto allowed = gko::matrix::csr::sparsity_type::bitmap |
                              gko::matrix::csr::sparsity_type::full |
                              gko::matrix::csr::sparsity_type::hash;
@@ -149,7 +174,7 @@ class Cholesky : public ::testing::Test {
         }
     }
 
-    void forall_matrices(std::function<void()> fn)
+    void forall_matrices(std::function<void()> fn, bool non_spd)
     {
         {
             SCOPED_TRACE("ani1");
@@ -163,11 +188,87 @@ class Cholesky : public ::testing::Test {
                         gko::matrices::location_ani1_amd_chol_mtx);
             fn();
         }
+        {
+            SCOPED_TRACE("example");
+            this->setup(
+                {{4, 0, 1, 0, 0, 0, 0, 1, 0, 0},
+                 {0, 4, 0, 0, 1, 0, 0, 0, 0, 1},
+                 {1, 0, 4.25, 0, 0, 0, 1, 0, 0, 0},
+                 {0, 0, 0, 4, 0, 0, 0, 0, 1, 1},
+                 {0, 1, 0, 0, 4.25, 0, 0, 0, 1, 1},
+                 {0, 0, 0, 0, 0, 4, 2, 4, 0, 0},
+                 {0, 0, 1, 0, 0, 2, 5.25, 0, 0, 0},
+                 {1, 0, 0, 0, 0, 4, 0, 8, 1, 1},
+                 {0, 0, 0, 1, 1, 0, 0, 1, 4, 0},
+                 {0, 1, 0, 1, 1, 0, 0, 1, 0, 4}},
+                {{2, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+                 {0, 2, 0, 0, 0, 0, 0, 0, 0, 0},
+                 {0.5, 0, 2, 0, 0, 0, 0, 0, 0, 0},
+                 {0, 0, 0, 2, 0, 0, 0, 0, 0, 0},
+                 {0, 0.5, 0, 0, 2, 0, 0, 0, 0, 0},
+                 {0, 0, 0, 0, 0, 2, 0, 0, 0, 0},
+                 {0, 0, 0.5, 0, 0, 1, 2, 0, 0, 0},
+                 {0.5, 0, -0.125, 0, 0, 2, -0.96875, 1.67209402770897, 0, 0},
+                 {0, 0, 0, 0.5, 0.5, 0, 0, 0.598052491922453, 1.7726627476498,
+                  0},
+                 {0, 0.5, 0, 0.5, 0.375, 0, 0, 0.598052491922453,
+                  -0.448571948696326, 1.67346688755653}});
+            fn();
+        }
+        {
+            SCOPED_TRACE("separable");
+            this->setup({{4, 0, 1, 0, 0, 0, 0, 0, 0, 0},
+                         {0, 4, 2, 0, 0, 0, 0, 0, 0, 0},
+                         {1, 2, 5.25, 0, 0, 0, 0, 0, 0, 0},
+                         {0, 0, 0, 4, 1, 0, 0, 0, 0, 0},
+                         {0, 0, 0, 1, 4.25, 1, 0, 0, 0, 4},
+                         {0, 0, 0, 0, 1, 4.25, 0, 0, 0, 0},
+                         {0, 0, 0, 0, 0, 0, 4, 1, 0, 4},
+                         {0, 0, 0, 0, 0, 0, 1, 4.25, 0, 0},
+                         {0, 0, 0, 0, 0, 0, 0, 0, 4, 1},
+                         {0, 0, 0, 0, 4, 0, 4, 0, 1, 17.75}},
+                        {{2, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+                         {0, 2, 0, 0, 0, 0, 0, 0, 0, 0},
+                         {0.5, 1, 2, 0, 0, 0, 0, 0, 0, 0},
+                         {0, 0, 0, 2, 0, 0, 0, 0, 0, 0},
+                         {0, 0, 0, 0.5, 2, 0, 0, 0, 0, 0},
+                         {0, 0, 0, 0, 0.5, 2, 0, 0, 0, 0},
+                         {0, 0, 0, 0, 0, 0, 2, 0, 0, 0},
+                         {0, 0, 0, 0, 0, 0, 0.5, 2, 0, 0},
+                         {0, 0, 0, 0, 0, 0, 0, 0, 2, 0},
+                         {0, 0, 0, 0, 2, -0.5, 2, -0.5, 0.5, 3}});
+            fn();
+        }
+        if (non_spd) {
+            SCOPED_TRACE("missing diagonal");
+            this->setup({{1, 0, 1, 0, 0, 0, 0, 0, 0, 0},
+                         {0, 1, 1, 0, 0, 0, 0, 0, 0, 0},
+                         {1, 1, 0, 1, 0, 0, 0, 0, 0, 0},
+                         {0, 0, 1, 1, 1, 0, 0, 0, 0, 0},
+                         {0, 0, 0, 1, 0, 1, 0, 0, 0, 0},
+                         {0, 0, 0, 0, 1, 1, 1, 0, 0, 0},
+                         {0, 0, 0, 0, 0, 1, 1, 1, 0, 1},
+                         {0, 0, 0, 0, 0, 0, 1, 1, 0, 0},
+                         {0, 0, 0, 0, 0, 0, 0, 0, 1, 1},
+                         {0, 0, 0, 0, 0, 0, 1, 0, 1, 0}},
+                        {{1., 0., 0., 0., 0., 0., 0., 0., 0., 0.},
+                         {0., 1., 0., 0., 0., 0., 0., 0., 0., 0.},
+                         {1., 1., 1., 0., 0., 0., 0., 0., 0., 0.},
+                         {0., 0., 1., 1., 0., 0., 0., 0., 0., 0.},
+                         {0., 0., 0., 1., 1., 0., 0., 0., 0., 0.},
+                         {0., 0., 0., 0., 1., 1., 0., 0., 0., 0.},
+                         {0., 0., 0., 0., 0., 1., 1., 0., 0., 0.},
+                         {0., 0., 0., 0., 0., 0., 1., 1., 0., 0.},
+                         {0., 0., 0., 0., 0., 0., 0., 0., 1., 0.},
+                         {0., 0., 0., 0., 0., 0., 1., 1., 1., 1.}});
+            fn();
+        }
     }
 
     std::shared_ptr<const gko::ReferenceExecutor> ref;
     gko::size_type num_rows;
     gko::array<index_type> tmp;
+    gko::array<index_type> ref_row_nnz;
     gko::array<index_type> storage_offsets;
     gko::array<gko::int32> storage;
     gko::array<gko::int64> row_descs;
@@ -183,255 +284,51 @@ TYPED_TEST_SUITE(Cholesky, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
-TYPED_TEST(Cholesky, KernelSymbolicCountExample)
-{
-    using matrix_type = typename TestFixture::matrix_type;
-    using elimination_forest = typename TestFixture::elimination_forest;
-    using index_type = typename TestFixture::index_type;
-    auto mtx = gko::initialize<typename TestFixture::matrix_type>(
-        {{1, 0, 1, 0, 0, 0, 0, 1, 0, 0},
-         {0, 1, 0, 1, 0, 0, 0, 0, 0, 1},
-         {1, 0, 1, 0, 0, 0, 0, 0, 0, 0},
-         {0, 0, 0, 1, 0, 0, 0, 0, 1, 1},
-         {0, 1, 0, 0, 1, 0, 0, 0, 1, 1},
-         {0, 0, 0, 0, 0, 1, 0, 1, 0, 0},
-         {0, 0, 1, 0, 0, 1, 1, 0, 0, 0},
-         {1, 0, 0, 0, 0, 1, 0, 1, 1, 1},
-         {0, 0, 0, 1, 1, 0, 0, 1, 1, 0},
-         {0, 1, 0, 1, 1, 0, 0, 1, 0, 1}},
-        this->ref);
-    std::unique_ptr<elimination_forest> forest;
-    gko::factorization::compute_elim_forest(mtx.get(), forest);
-    gko::array<index_type> row_nnz{this->ref, 10};
-
-    gko::kernels::reference::cholesky::symbolic_count(
-        this->ref, mtx.get(), *forest, row_nnz.get_data(), this->tmp);
-
-    GKO_ASSERT_ARRAY_EQ(row_nnz, I<index_type>({1, 1, 2, 1, 2, 1, 3, 5, 4, 6}));
-}
-
-
-TYPED_TEST(Cholesky, KernelSymbolicFactorizeExample)
-{
-    using matrix_type = typename TestFixture::matrix_type;
-    using elimination_forest = typename TestFixture::elimination_forest;
-    using index_type = typename TestFixture::index_type;
-    auto mtx = gko::initialize<typename TestFixture::matrix_type>(
-        {{1, 0, 1, 0, 0, 0, 0, 1, 0, 0},
-         {0, 1, 0, 1, 0, 0, 0, 0, 0, 1},
-         {1, 0, 1, 0, 0, 0, 0, 0, 0, 0},
-         {0, 0, 0, 1, 0, 0, 0, 0, 1, 1},
-         {0, 1, 0, 0, 1, 0, 0, 0, 1, 1},
-         {0, 0, 0, 0, 0, 1, 0, 1, 0, 0},
-         {0, 0, 1, 0, 0, 1, 1, 0, 0, 0},
-         {1, 0, 0, 0, 0, 1, 0, 1, 1, 1},
-         {0, 0, 0, 1, 1, 0, 0, 1, 1, 0},
-         {0, 1, 0, 1, 1, 0, 0, 1, 0, 1}},
-        this->ref);
-    std::unique_ptr<elimination_forest> forest;
-    gko::factorization::compute_elim_forest(mtx.get(), forest);
-    auto l_factor = matrix_type::create(this->ref, gko::dim<2>{10, 10}, 26);
-    gko::kernels::reference::cholesky::symbolic_count(
-        this->ref, mtx.get(), *forest, l_factor->get_row_ptrs(), this->tmp);
-    gko::kernels::reference::components::prefix_sum_nonnegative(
-        this->ref, l_factor->get_row_ptrs(), 11);
-
-    gko::kernels::reference::cholesky::symbolic_factorize(
-        this->ref, mtx.get(), *forest, l_factor.get(), this->tmp);
-
-    GKO_ASSERT_MTX_EQ_SPARSITY(l_factor,
-                               l({{1., 0., 0., 0., 0., 0., 0., 0., 0., 0.},
-                                  {0., 1., 0., 0., 0., 0., 0., 0., 0., 0.},
-                                  {1., 0., 1., 0., 0., 0., 0., 0., 0., 0.},
-                                  {0., 0., 0., 1., 0., 0., 0., 0., 0., 0.},
-                                  {0., 1., 0., 0., 1., 0., 0., 0., 0., 0.},
-                                  {0., 0., 0., 0., 0., 1., 0., 0., 0., 0.},
-                                  {0., 0., 1., 0., 0., 1., 1., 0., 0., 0.},
-                                  {1., 0., 1., 0., 0., 1., 1., 1., 0., 0.},
-                                  {0., 0., 0., 1., 1., 0., 0., 1., 1., 0.},
-                                  {0., 1., 0., 1., 1., 0., 0., 1., 1., 1.}}));
-}
-
-
-TYPED_TEST(Cholesky, KernelSymbolicCountSeparable)
-{
-    using matrix_type = typename TestFixture::matrix_type;
-    using elimination_forest = typename TestFixture::elimination_forest;
-    using index_type = typename TestFixture::index_type;
-    auto mtx = gko::initialize<typename TestFixture::matrix_type>(
-        {{1, 0, 1, 0, 0, 0, 0, 0, 0, 0},
-         {0, 1, 1, 0, 0, 0, 0, 0, 0, 0},
-         {1, 1, 1, 0, 0, 0, 0, 0, 0, 0},
-         {0, 0, 0, 1, 1, 0, 0, 0, 0, 0},
-         {0, 0, 0, 1, 1, 1, 0, 0, 0, 1},
-         {0, 0, 0, 0, 1, 1, 0, 0, 0, 0},
-         {0, 0, 0, 0, 0, 0, 1, 1, 0, 1},
-         {0, 0, 0, 0, 0, 0, 1, 1, 0, 0},
-         {0, 0, 0, 0, 0, 0, 0, 0, 1, 1},
-         {0, 0, 0, 0, 1, 0, 1, 0, 1, 1}},
-        this->ref);
-    std::unique_ptr<elimination_forest> forest;
-    gko::factorization::compute_elim_forest(mtx.get(), forest);
-    gko::array<index_type> row_nnz{this->ref, 10};
-
-    gko::kernels::reference::cholesky::symbolic_count(
-        this->ref, mtx.get(), *forest, row_nnz.get_data(), this->tmp);
-
-    GKO_ASSERT_ARRAY_EQ(row_nnz, I<index_type>({1, 1, 3, 1, 2, 2, 1, 2, 1, 6}));
-}
-
-
-TYPED_TEST(Cholesky, KernelSymbolicFactorizeSeparable)
+TYPED_TEST(Cholesky, KernelSymbolicCount)
 {
     using matrix_type = typename TestFixture::matrix_type;
-    using index_type = typename TestFixture::index_type;
+    using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type;
     using elimination_forest = typename TestFixture::elimination_forest;
-    auto mtx = gko::initialize<typename TestFixture::matrix_type>(
-        {{1, 0, 1, 0, 0, 0, 0, 0, 0, 0},
-         {0, 1, 1, 0, 0, 0, 0, 0, 0, 0},
-         {1, 1, 1, 0, 0, 0, 0, 0, 0, 0},
-         {0, 0, 0, 1, 1, 0, 0, 0, 0, 0},
-         {0, 0, 0, 1, 1, 1, 0, 0, 0, 1},
-         {0, 0, 0, 0, 1, 1, 0, 0, 0, 0},
-         {0, 0, 0, 0, 0, 0, 1, 1, 0, 1},
-         {0, 0, 0, 0, 0, 0, 1, 1, 0, 0},
-         {0, 0, 0, 0, 0, 0, 0, 0, 1, 1},
-         {0, 0, 0, 0, 1, 0, 1, 0, 1, 1}},
-        this->ref);
-    std::unique_ptr<elimination_forest> forest;
-    gko::factorization::compute_elim_forest(mtx.get(), forest);
-    auto l_factor = matrix_type::create(this->ref, gko::dim<2>{10, 10}, 26);
-    gko::kernels::reference::cholesky::symbolic_count(
-        this->ref, mtx.get(), *forest, l_factor->get_row_ptrs(), this->tmp);
-    gko::kernels::reference::components::prefix_sum_nonnegative(
-        this->ref, l_factor->get_row_ptrs(), 11);
-
-    gko::kernels::reference::cholesky::symbolic_factorize(
-        this->ref, mtx.get(), *forest, l_factor.get(), this->tmp);
-
-    GKO_ASSERT_MTX_EQ_SPARSITY(l_factor,
-                               l({{1., 0., 0., 0., 0., 0., 0., 0., 0., 0.},
-                                  {0., 1., 0., 0., 0., 0., 0., 0., 0., 0.},
-                                  {1., 1., 1., 0., 0., 0., 0., 0., 0., 0.},
-                                  {0., 0., 0., 1., 0., 0., 0., 0., 0., 0.},
-                                  {0., 0., 0., 1., 1., 0., 0., 0., 0., 0.},
-                                  {0., 0., 0., 0., 1., 1., 0., 0., 0., 0.},
-                                  {0., 0., 0., 0., 0., 0., 1., 0., 0., 0.},
-                                  {0., 0., 0., 0., 0., 0., 1., 1., 0., 0.},
-                                  {0., 0., 0., 0., 0., 0., 0., 0., 1., 0.},
-                                  {0., 0., 0., 0., 1., 1., 1., 1., 1., 1.}}));
-}
-
-
-TYPED_TEST(Cholesky, KernelSymbolicCountMissingDiagonal)
-{
-    using matrix_type = typename TestFixture::matrix_type;
     using index_type = typename TestFixture::index_type;
-    using elimination_forest = typename TestFixture::elimination_forest;
-    auto mtx = gko::initialize<typename TestFixture::matrix_type>(
-        {{1, 0, 1, 0, 0, 0, 0, 0, 0, 0},
-         {0, 1, 1, 0, 0, 0, 0, 0, 0, 0},
-         {1, 1, 0, 1, 0, 0, 0, 0, 0, 0},
-         {0, 0, 1, 1, 1, 0, 0, 0, 0, 0},
-         {0, 0, 0, 1, 0, 1, 0, 0, 0, 0},
-         {0, 0, 0, 0, 1, 1, 1, 0, 0, 0},
-         {0, 0, 0, 0, 0, 1, 1, 1, 0, 1},
-         {0, 0, 0, 0, 0, 0, 1, 1, 0, 0},
-         {0, 0, 0, 0, 0, 0, 0, 0, 1, 1},
-         {0, 0, 0, 0, 0, 0, 1, 0, 1, 0}},
-        this->ref);
-    std::unique_ptr<elimination_forest> forest;
-    gko::factorization::compute_elim_forest(mtx.get(), forest);
-    gko::array<index_type> row_nnz{this->ref, 10};
-
-    gko::kernels::reference::cholesky::symbolic_count(
-        this->ref, mtx.get(), *forest, row_nnz.get_data(), this->tmp);
-
-    GKO_ASSERT_ARRAY_EQ(row_nnz, I<index_type>({1, 1, 3, 2, 2, 2, 2, 2, 1, 4}));
+    this->forall_matrices(
+        [this] {
+            gko::factorization::compute_elim_forest(this->mtx.get(),
+                                                    this->forest);
+            gko::array<index_type> row_nnz{this->ref, this->num_rows};
+
+            gko::kernels::reference::cholesky::symbolic_count(
+                this->ref, this->mtx.get(), *this->forest, row_nnz.get_data(),
+                this->tmp);
+
+            GKO_ASSERT_ARRAY_EQ(row_nnz, this->ref_row_nnz);
+        },
+        true);
 }
 
 
-TYPED_TEST(Cholesky, KernelSymbolicFactorizeMissingDiagonal)
+TYPED_TEST(Cholesky, KernelSymbolicFactorize)
 {
     using matrix_type = typename TestFixture::matrix_type;
-    using index_type = typename TestFixture::index_type;
+    using sparsity_matrix_type = typename TestFixture::sparsity_matrix_type;
     using elimination_forest = typename TestFixture::elimination_forest;
-    auto mtx = gko::initialize<typename TestFixture::matrix_type>(
-        {{1, 0, 1, 0, 0, 0, 0, 0, 0, 0},
-         {0, 1, 1, 0, 0, 0, 0, 0, 0, 0},
-         {1, 1, 0, 1, 0, 0, 0, 0, 0, 0},
-         {0, 0, 1, 1, 1, 0, 0, 0, 0, 0},
-         {0, 0, 0, 1, 0, 1, 0, 0, 0, 0},
-         {0, 0, 0, 0, 1, 1, 1, 0, 0, 0},
-         {0, 0, 0, 0, 0, 1, 1, 1, 0, 1},
-         {0, 0, 0, 0, 0, 0, 1, 1, 0, 0},
-         {0, 0, 0, 0, 0, 0, 0, 0, 1, 1},
-         {0, 0, 0, 0, 0, 0, 1, 0, 1, 0}},
-        this->ref);
-    std::unique_ptr<elimination_forest> forest;
-    gko::factorization::compute_elim_forest(mtx.get(), forest);
-    auto l_factor = matrix_type::create(this->ref, gko::dim<2>{10, 10}, 20);
-    gko::kernels::reference::cholesky::symbolic_count(
-        this->ref, mtx.get(), *forest, l_factor->get_row_ptrs(), this->tmp);
-    gko::kernels::reference::components::prefix_sum_nonnegative(
-        this->ref, l_factor->get_row_ptrs(), 11);
-
-    gko::kernels::reference::cholesky::symbolic_factorize(
-        this->ref, mtx.get(), *forest, l_factor.get(), this->tmp);
-
-    GKO_ASSERT_MTX_EQ_SPARSITY(l_factor,
-                               l({{1., 0., 0., 0., 0., 0., 0., 0., 0., 0.},
-                                  {0., 1., 0., 0., 0., 0., 0., 0., 0., 0.},
-                                  {1., 1., 1., 0., 0., 0., 0., 0., 0., 0.},
-                                  {0., 0., 1., 1., 0., 0., 0., 0., 0., 0.},
-                                  {0., 0., 0., 1., 1., 0., 0., 0., 0., 0.},
-                                  {0., 0., 0., 0., 1., 1., 0., 0., 0., 0.},
-                                  {0., 0., 0., 0., 0., 1., 1., 0., 0., 0.},
-                                  {0., 0., 0., 0., 0., 0., 1., 1., 0., 0.},
-                                  {0., 0., 0., 0., 0., 0., 0., 0., 1., 0.},
-                                  {0., 0., 0., 0., 0., 0., 1., 1., 1., 1.}}));
-}
-
-
-TYPED_TEST(Cholesky, KernelSymbolicCountAni1)
-{
     using index_type = typename TestFixture::index_type;
-    using elimination_forest = typename TestFixture::elimination_forest;
-    this->setup(gko::matrices::location_ani1_mtx,
-                gko::matrices::location_ani1_chol_mtx);
-    std::unique_ptr<elimination_forest> forest;
-    gko::factorization::compute_elim_forest(this->mtx.get(), forest);
-    gko::array<index_type> row_nnz{this->ref, this->mtx->get_size()[0]};
-
-    gko::kernels::reference::cholesky::symbolic_count(
-        this->ref, this->mtx.get(), *forest, row_nnz.get_data(), this->tmp);
-
-    GKO_ASSERT_ARRAY_EQ(
-        row_nnz, I<index_type>({1, 2, 3, 3, 2, 2,  7,  7,  7,  8, 8, 7,
-                                8, 8, 8, 8, 2, 10, 10, 10, 10, 9, 8, 8,
-                                8, 7, 8, 2, 8, 8,  7,  5,  8,  6, 4, 4}));
-}
-
-
-TYPED_TEST(Cholesky, KernelSymbolicFactorize)
-{
-    using elimination_forest = typename TestFixture::elimination_forest;
-    this->forall_matrices([this] {
-        std::unique_ptr<elimination_forest> forest;
-        gko::factorization::compute_elim_forest(this->mtx.get(), forest);
-        gko::kernels::reference::cholesky::symbolic_count(
-            this->ref, this->mtx.get(), *forest, this->l_factor->get_row_ptrs(),
-            this->tmp);
-        gko::kernels::reference::components::prefix_sum_nonnegative(
-            this->ref, this->l_factor->get_row_ptrs(),
-            this->mtx->get_size()[0] + 1);
-
-        gko::kernels::reference::cholesky::symbolic_factorize(
-            this->ref, this->mtx.get(), *forest, this->l_factor.get(),
-            this->tmp);
-
-        GKO_ASSERT_MTX_EQ_SPARSITY(this->l_factor, this->l_factor_ref);
-    });
+    this->forall_matrices(
+        [this] {
+            gko::factorization::compute_elim_forest(this->mtx.get(),
+                                                    this->forest);
+            gko::kernels::reference::cholesky::symbolic_count(
+                this->ref, this->mtx.get(), *this->forest,
+                this->l_factor->get_row_ptrs(), this->tmp);
+            gko::kernels::reference::components::prefix_sum_nonnegative(
+                this->ref, this->l_factor->get_row_ptrs(), this->num_rows + 1);
+
+            gko::kernels::reference::cholesky::symbolic_factorize(
+                this->ref, this->mtx.get(), *this->forest, this->l_factor.get(),
+                this->tmp);
+
+            GKO_ASSERT_MTX_EQ_SPARSITY(this->l_factor, this->l_factor_ref);
+        },
+        true);
 }
 
 
@@ -439,14 +336,16 @@ TYPED_TEST(Cholesky, SymbolicFactorize)
 {
     using matrix_type = typename TestFixture::matrix_type;
     using elimination_forest = typename TestFixture::elimination_forest;
-    this->forall_matrices([this] {
-        std::unique_ptr<matrix_type> combined_factor;
-        std::unique_ptr<elimination_forest> forest;
-        gko::factorization::symbolic_cholesky(this->mtx.get(), true,
-                                              combined_factor, forest);
-
-        GKO_ASSERT_MTX_EQ_SPARSITY(combined_factor, this->combined_ref);
-    });
+    this->forall_matrices(
+        [this] {
+            std::unique_ptr<matrix_type> combined_factor;
+            std::unique_ptr<elimination_forest> forest;
+            gko::factorization::symbolic_cholesky(this->mtx.get(), true,
+                                                  combined_factor, forest);
+
+            GKO_ASSERT_MTX_EQ_SPARSITY(combined_factor, this->combined_ref);
+        },
+        true);
 }
 
 
@@ -454,55 +353,39 @@ TYPED_TEST(Cholesky, SymbolicFactorizeOnlyLower)
 {
     using matrix_type = typename TestFixture::matrix_type;
     using elimination_forest = typename TestFixture::elimination_forest;
-    this->forall_matrices([this] {
-        std::unique_ptr<matrix_type> l_factor;
-        std::unique_ptr<elimination_forest> forest;
-        gko::factorization::symbolic_cholesky(this->mtx.get(), false, l_factor,
-                                              forest);
-
-        GKO_ASSERT_MTX_EQ_SPARSITY(l_factor, this->l_factor_ref);
-    });
+    this->forall_matrices(
+        [this] {
+            std::unique_ptr<matrix_type> l_factor;
+            std::unique_ptr<elimination_forest> forest;
+            gko::factorization::symbolic_cholesky(this->mtx.get(), false,
+                                                  l_factor, forest);
+
+            GKO_ASSERT_MTX_EQ_SPARSITY(l_factor, this->l_factor_ref);
+        },
+        true);
 }
 
 
-TYPED_TEST(Cholesky, KernelSymbolicCountAni1Amd)
-{
-    using index_type = typename TestFixture::index_type;
-    using elimination_forest = typename TestFixture::elimination_forest;
-    this->setup(gko::matrices::location_ani1_amd_mtx,
-                gko::matrices::location_ani1_amd_chol_mtx);
-    std::unique_ptr<elimination_forest> forest;
-    gko::factorization::compute_elim_forest(this->mtx.get(), forest);
-    gko::array<index_type> row_nnz{this->ref, this->mtx->get_size()[0]};
-
-    gko::kernels::reference::cholesky::symbolic_count(
-        this->ref, this->mtx.get(), *forest, row_nnz.get_data(), this->tmp);
-
-    GKO_ASSERT_ARRAY_EQ(
-        row_nnz, I<index_type>({1, 1,  2, 3, 5,  4, 1, 2,  3,  4, 1,  2,
-                                2, 2,  5, 1, 4,  4, 4, 1,  2,  3, 4,  3,
-                                8, 10, 4, 8, 10, 7, 7, 13, 21, 6, 11, 14}));
-}
-
-
-TYPED_TEST(Cholesky, KernelForestFromFactor)
+TYPED_TEST(Cholesky, KernelForestFromFactorPlusPostprocessing)
 {
     using matrix_type = typename TestFixture::matrix_type;
     using index_type = typename TestFixture::index_type;
     using elimination_forest = typename TestFixture::elimination_forest;
-    this->forall_matrices([this] {
-        std::unique_ptr<matrix_type> combined_factor;
-        std::unique_ptr<elimination_forest> forest_ref;
-        gko::factorization::symbolic_cholesky(this->mtx.get(), true,
-                                              combined_factor, forest_ref);
-        elimination_forest forest{this->ref,
-                                  static_cast<index_type>(this->num_rows)};
-
-        gko::kernels::reference::cholesky::forest_from_factor(
-            this->ref, combined_factor.get(), forest);
-
-        this->assert_equal_forests(forest, *forest_ref);
-    });
+    this->forall_matrices(
+        [this] {
+            std::unique_ptr<matrix_type> combined_factor;
+            std::unique_ptr<elimination_forest> forest_ref;
+            gko::factorization::symbolic_cholesky(this->mtx.get(), true,
+                                                  combined_factor, forest_ref);
+            elimination_forest forest{this->ref,
+                                      static_cast<index_type>(this->num_rows)};
+
+            gko::kernels::reference::cholesky::forest_from_factor(
+                this->ref, combined_factor.get(), forest);
+
+            this->assert_equal_forests(forest, *forest_ref);
+        },
+        true);
 }
 
 
@@ -510,39 +393,46 @@ TYPED_TEST(Cholesky, KernelInitializeWorks)
 {
     using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::index_type;
-    this->forall_matrices([this] {
-        std::fill_n(this->combined->get_values(),
-                    this->combined->get_num_stored_elements(),
-                    gko::zero<value_type>());
-        gko::array<index_type> diag_idxs{this->ref, this->num_rows};
-        gko::array<index_type> transpose_idxs{
-            this->ref, this->combined->get_num_stored_elements()};
-
-        gko::kernels::reference::cholesky::initialize(
-            this->ref, this->mtx.get(), this->storage_offsets.get_const_data(),
-            this->row_descs.get_const_data(), this->storage.get_const_data(),
-            diag_idxs.get_data(), transpose_idxs.get_data(),
-            this->combined.get());
-
-        GKO_ASSERT_MTX_NEAR(this->mtx, this->combined, 0.0);
-        for (gko::size_type row = 0; row < this->num_rows; row++) {
-            const auto diag_pos = diag_idxs.get_const_data()[row];
-            const auto begin_pos = this->combined->get_const_row_ptrs()[row];
-            const auto end_pos = this->combined->get_const_row_ptrs()[row + 1];
-            ASSERT_GE(diag_pos, begin_pos);
-            ASSERT_LT(diag_pos, end_pos);
-            ASSERT_EQ(this->combined->get_const_col_idxs()[diag_pos], row);
-            for (auto nz = begin_pos; nz < end_pos; nz++) {
-                const auto trans_pos = transpose_idxs.get_const_data()[nz];
-                const auto col = this->combined->get_const_col_idxs()[nz];
-                ASSERT_GE(trans_pos, this->combined->get_const_row_ptrs()[col]);
-                ASSERT_LT(trans_pos,
-                          this->combined->get_const_row_ptrs()[col + 1]);
-                ASSERT_EQ(this->combined->get_const_col_idxs()[trans_pos], row);
-                ASSERT_EQ(transpose_idxs.get_const_data()[trans_pos], nz);
+    this->forall_matrices(
+        [this] {
+            std::fill_n(this->combined->get_values(),
+                        this->combined->get_num_stored_elements(),
+                        gko::zero<value_type>());
+            gko::array<index_type> diag_idxs{this->ref, this->num_rows};
+            gko::array<index_type> transpose_idxs{
+                this->ref, this->combined->get_num_stored_elements()};
+
+            gko::kernels::reference::cholesky::initialize(
+                this->ref, this->mtx.get(),
+                this->storage_offsets.get_const_data(),
+                this->row_descs.get_const_data(),
+                this->storage.get_const_data(), diag_idxs.get_data(),
+                transpose_idxs.get_data(), this->combined.get());
+
+            GKO_ASSERT_MTX_NEAR(this->mtx, this->combined, 0.0);
+            for (gko::size_type row = 0; row < this->num_rows; row++) {
+                const auto diag_pos = diag_idxs.get_const_data()[row];
+                const auto begin_pos =
+                    this->combined->get_const_row_ptrs()[row];
+                const auto end_pos =
+                    this->combined->get_const_row_ptrs()[row + 1];
+                ASSERT_GE(diag_pos, begin_pos);
+                ASSERT_LT(diag_pos, end_pos);
+                ASSERT_EQ(this->combined->get_const_col_idxs()[diag_pos], row);
+                for (auto nz = begin_pos; nz < end_pos; nz++) {
+                    const auto trans_pos = transpose_idxs.get_const_data()[nz];
+                    const auto col = this->combined->get_const_col_idxs()[nz];
+                    ASSERT_GE(trans_pos,
+                              this->combined->get_const_row_ptrs()[col]);
+                    ASSERT_LT(trans_pos,
+                              this->combined->get_const_row_ptrs()[col + 1]);
+                    ASSERT_EQ(this->combined->get_const_col_idxs()[trans_pos],
+                              row);
+                    ASSERT_EQ(transpose_idxs.get_const_data()[trans_pos], nz);
+                }
             }
-        }
-    });
+        },
+        true);
 }
 
 
@@ -550,26 +440,30 @@ TYPED_TEST(Cholesky, KernelFactorizeWorks)
 {
     using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::index_type;
-    this->forall_matrices([this] {
-        gko::array<index_type> diag_idxs{this->ref, this->num_rows};
-        gko::array<index_type> transpose_idxs{
-            this->ref, this->combined->get_num_stored_elements()};
-        gko::array<int> tmp{this->ref};
-        gko::kernels::reference::cholesky::initialize(
-            this->ref, this->mtx.get(), this->storage_offsets.get_const_data(),
-            this->row_descs.get_const_data(), this->storage.get_const_data(),
-            diag_idxs.get_data(), transpose_idxs.get_data(),
-            this->combined.get());
-
-        gko::kernels::reference::cholesky::factorize(
-            this->ref, this->storage_offsets.get_const_data(),
-            this->row_descs.get_const_data(), this->storage.get_const_data(),
-            diag_idxs.get_data(), transpose_idxs.get_data(), *this->forest,
-            this->combined.get(), tmp);
-
-        GKO_ASSERT_MTX_NEAR(this->combined, this->combined_ref,
-                            r<value_type>::value);
-    });
+    this->forall_matrices(
+        [this] {
+            gko::array<index_type> diag_idxs{this->ref, this->num_rows};
+            gko::array<index_type> transpose_idxs{
+                this->ref, this->combined->get_num_stored_elements()};
+            gko::array<int> tmp{this->ref};
+            gko::kernels::reference::cholesky::initialize(
+                this->ref, this->mtx.get(),
+                this->storage_offsets.get_const_data(),
+                this->row_descs.get_const_data(),
+                this->storage.get_const_data(), diag_idxs.get_data(),
+                transpose_idxs.get_data(), this->combined.get());
+
+            gko::kernels::reference::cholesky::factorize(
+                this->ref, this->storage_offsets.get_const_data(),
+                this->row_descs.get_const_data(),
+                this->storage.get_const_data(), diag_idxs.get_data(),
+                transpose_idxs.get_data(), *this->forest, this->combined.get(),
+                tmp);
+
+            GKO_ASSERT_MTX_NEAR(this->combined, this->combined_ref,
+                                r<value_type>::value);
+        },
+        false);
 }
 
 
@@ -577,23 +471,25 @@ TYPED_TEST(Cholesky, FactorizeWorks)
 {
     using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::index_type;
-    this->forall_matrices([this] {
-        auto factory =
-            gko::experimental::factorization::Cholesky<value_type,
-                                                       index_type>::build()
-                .on(this->ref);
-
-        auto cholesky = factory->generate(this->mtx);
-
-        GKO_ASSERT_MTX_NEAR(cholesky->get_combined(), this->combined_ref,
-                            r<value_type>::value);
-        ASSERT_EQ(cholesky->get_storage_type(),
-                  gko::experimental::factorization::storage_type::
-                      symm_combined_cholesky);
-        ASSERT_EQ(cholesky->get_lower_factor(), nullptr);
-        ASSERT_EQ(cholesky->get_upper_factor(), nullptr);
-        ASSERT_EQ(cholesky->get_diagonal(), nullptr);
-    });
+    this->forall_matrices(
+        [this] {
+            auto factory =
+                gko::experimental::factorization::Cholesky<value_type,
+                                                           index_type>::build()
+                    .on(this->ref);
+
+            auto cholesky = factory->generate(this->mtx);
+
+            GKO_ASSERT_MTX_NEAR(cholesky->get_combined(), this->combined_ref,
+                                r<value_type>::value);
+            ASSERT_EQ(cholesky->get_storage_type(),
+                      gko::experimental::factorization::storage_type::
+                          symm_combined_cholesky);
+            ASSERT_EQ(cholesky->get_lower_factor(), nullptr);
+            ASSERT_EQ(cholesky->get_upper_factor(), nullptr);
+            ASSERT_EQ(cholesky->get_diagonal(), nullptr);
+        },
+        false);
 }
 
 
@@ -601,28 +497,30 @@ TYPED_TEST(Cholesky, FactorizeWithKnownSparsityWorks)
 {
     using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::index_type;
-    this->forall_matrices([this] {
-        auto pattern =
-            gko::share(gko::matrix::SparsityCsr<value_type, index_type>::create(
-                this->ref));
-        pattern->copy_from(this->combined_ref.get());
-        auto factory =
-            gko::experimental::factorization::Cholesky<value_type,
-                                                       index_type>::build()
-                .with_symbolic_factorization(pattern)
-                .on(this->ref);
-
-        auto cholesky = factory->generate(this->mtx);
-
-        GKO_ASSERT_MTX_NEAR(cholesky->get_combined(), this->combined_ref,
-                            r<value_type>::value);
-        ASSERT_EQ(cholesky->get_storage_type(),
-                  gko::experimental::factorization::storage_type::
-                      symm_combined_cholesky);
-        ASSERT_EQ(cholesky->get_lower_factor(), nullptr);
-        ASSERT_EQ(cholesky->get_upper_factor(), nullptr);
-        ASSERT_EQ(cholesky->get_diagonal(), nullptr);
-    });
+    this->forall_matrices(
+        [this] {
+            auto pattern = gko::share(
+                gko::matrix::SparsityCsr<value_type, index_type>::create(
+                    this->ref));
+            pattern->copy_from(this->combined_ref.get());
+            auto factory =
+                gko::experimental::factorization::Cholesky<value_type,
+                                                           index_type>::build()
+                    .with_symbolic_factorization(pattern)
+                    .on(this->ref);
+
+            auto cholesky = factory->generate(this->mtx);
+
+            GKO_ASSERT_MTX_NEAR(cholesky->get_combined(), this->combined_ref,
+                                r<value_type>::value);
+            ASSERT_EQ(cholesky->get_storage_type(),
+                      gko::experimental::factorization::storage_type::
+                          symm_combined_cholesky);
+            ASSERT_EQ(cholesky->get_lower_factor(), nullptr);
+            ASSERT_EQ(cholesky->get_upper_factor(), nullptr);
+            ASSERT_EQ(cholesky->get_diagonal(), nullptr);
+        },
+        false);
 }
 
 
diff --git a/reference/test/factorization/factorization.cpp b/reference/test/factorization/factorization.cpp
index d9928491771..2f8231f1da7 100644
--- a/reference/test/factorization/factorization.cpp
+++ b/reference/test/factorization/factorization.cpp
@@ -71,9 +71,13 @@ class Factorization : public ::testing::Test {
         : ref(gko::ReferenceExecutor::create()),
           lower_mtx{gko::initialize<matrix_type>(
               {{1.0, 0.0, 0.0}, {3.0, 1.0, 0.0}, {1.0, 2.0, 1.0}}, ref)},
+          lower_cholesky_mtx{gko::initialize<matrix_type>(
+              {{1.0, 0.0, 0.0}, {3.0, -1.0, 0.0}, {1.0, 2.0, 5.0}}, ref)},
           diagonal{diag_type::create(ref, 3)},
           upper_mtx(gko::initialize<matrix_type>(
               {{1.0, 2.0, 1.0}, {0.0, 1.0, 3.0}, {0.0, 0.0, 1.0}}, ref)),
+          upper_nonunit_mtx(gko::initialize<matrix_type>(
+              {{1.0, 2.0, 1.0}, {0.0, -1.0, 3.0}, {0.0, 0.0, 5.0}}, ref)),
           combined_mtx(gko::initialize<matrix_type>(
               {{1.0, 2.0, 1.0}, {3.0, -1.0, 3.0}, {1.0, 2.0, 5.0}}, ref)),
           input(gko::initialize<vector_type>({1.0, 2.0, 3.0}, ref)),
@@ -88,8 +92,10 @@ class Factorization : public ::testing::Test {
 
     std::shared_ptr<const gko::ReferenceExecutor> ref;
     std::shared_ptr<matrix_type> lower_mtx;
+    std::shared_ptr<matrix_type> lower_cholesky_mtx;
     std::shared_ptr<diag_type> diagonal;
     std::shared_ptr<matrix_type> upper_mtx;
+    std::shared_ptr<matrix_type> upper_nonunit_mtx;
     std::shared_ptr<matrix_type> combined_mtx;
     std::shared_ptr<vector_type> input;
     std::shared_ptr<vector_type> output;
@@ -261,6 +267,87 @@ TYPED_TEST(Factorization, CreateSymmCombinedLDLWorks)
 }
 
 
+TYPED_TEST(Factorization, UnpackCombinedLUWorks)
+{
+    using factorization_type = typename TestFixture::factorization_type;
+    auto fact = factorization_type::create_from_combined_lu(
+        this->combined_mtx->clone());
+
+    auto separated = fact->unpack();
+
+    ASSERT_EQ(separated->get_storage_type(),
+              gko::experimental::factorization::storage_type::composition);
+    ASSERT_EQ(separated->get_combined(), nullptr);
+    ASSERT_EQ(separated->get_diagonal(), nullptr);
+    GKO_ASSERT_MTX_NEAR(separated->get_lower_factor(), this->lower_mtx, 0.0);
+    GKO_ASSERT_MTX_NEAR(separated->get_upper_factor(), this->upper_nonunit_mtx,
+                        0.0);
+}
+
+
+TYPED_TEST(Factorization, UnpackSymmCombinedCholeskyWorks)
+{
+    using matrix_type = typename TestFixture::matrix_type;
+    using factorization_type = typename TestFixture::factorization_type;
+    auto fact = factorization_type::create_from_combined_cholesky(
+        this->combined_mtx->clone());
+
+    auto separated = fact->unpack();
+
+    ASSERT_EQ(separated->get_storage_type(),
+              gko::experimental::factorization::storage_type::symm_composition);
+    ASSERT_EQ(separated->get_combined(), nullptr);
+    ASSERT_EQ(separated->get_diagonal(), nullptr);
+    GKO_ASSERT_MTX_NEAR(separated->get_lower_factor(), this->lower_cholesky_mtx,
+                        0.0);
+    GKO_ASSERT_MTX_NEAR(
+        separated->get_upper_factor(),
+        gko::as<matrix_type>(this->lower_cholesky_mtx->conj_transpose()), 0.0);
+}
+
+
+TYPED_TEST(Factorization, UnpackCompositionWorks)
+{
+    using factorization_type = typename TestFixture::factorization_type;
+    using composition_type = typename TestFixture::composition_type;
+    auto fact = factorization_type::create_from_composition(
+        composition_type::create(this->lower_mtx, this->upper_nonunit_mtx));
+
+    auto separated = fact->unpack();
+
+    ASSERT_EQ(separated->get_storage_type(),
+              gko::experimental::factorization::storage_type::composition);
+    ASSERT_EQ(separated->get_combined(), nullptr);
+    ASSERT_EQ(separated->get_diagonal(), nullptr);
+    GKO_ASSERT_MTX_NEAR(separated->get_lower_factor(), this->lower_mtx, 0.0);
+    GKO_ASSERT_MTX_NEAR(separated->get_upper_factor(), this->upper_nonunit_mtx,
+                        0.0);
+}
+
+
+TYPED_TEST(Factorization, UnpackSymmCompositionWorks)
+{
+    using matrix_type = typename TestFixture::matrix_type;
+    using factorization_type = typename TestFixture::factorization_type;
+    using composition_type = typename TestFixture::composition_type;
+    auto fact = factorization_type::create_from_symm_composition(
+        composition_type::create(this->lower_cholesky_mtx,
+                                 this->lower_cholesky_mtx->conj_transpose()));
+
+    auto separated = fact->unpack();
+
+    ASSERT_EQ(separated->get_storage_type(),
+              gko::experimental::factorization::storage_type::symm_composition);
+    ASSERT_EQ(separated->get_combined(), nullptr);
+    ASSERT_EQ(separated->get_diagonal(), nullptr);
+    GKO_ASSERT_MTX_NEAR(separated->get_lower_factor(), this->lower_cholesky_mtx,
+                        0.0);
+    GKO_ASSERT_MTX_NEAR(
+        separated->get_upper_factor(),
+        gko::as<matrix_type>(this->lower_cholesky_mtx->conj_transpose()), 0.0);
+}
+
+
 TYPED_TEST(Factorization, ApplyFromCompositionWorks)
 {
     using factorization_type = typename TestFixture::factorization_type;
diff --git a/reference/test/factorization/lu_kernels.cpp b/reference/test/factorization/lu_kernels.cpp
index 5cde9f132d3..33bd3ca3209 100644
--- a/reference/test/factorization/lu_kernels.cpp
+++ b/reference/test/factorization/lu_kernels.cpp
@@ -164,6 +164,19 @@ TYPED_TEST(Lu, SymbolicLUWorks)
 }
 
 
+TYPED_TEST(Lu, SymbolicLUNearSymmWorks)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    this->forall_matrices([this] {
+        std::unique_ptr<gko::matrix::Csr<value_type, index_type>> lu;
+        gko::factorization::symbolic_lu_near_symm(this->mtx.get(), lu);
+
+        GKO_ASSERT_MTX_EQ_SPARSITY(lu, this->mtx_lu);
+    });
+}
+
+
 TYPED_TEST(Lu, SymbolicLUWorksWithMissingDiagonal)
 {
     using matrix_type = typename TestFixture::matrix_type;
@@ -252,7 +265,8 @@ TYPED_TEST(Lu, FactorizeSymmetricWorks)
             auto factory =
                 gko::experimental::factorization::Lu<value_type,
                                                      index_type>::build()
-                    .with_symmetric_sparsity(true)
+                    .with_symbolic_algorithm(gko::experimental::factorization::
+                                                 symbolic_type::symmetric)
                     .on(this->ref);
 
             auto lu = factory->generate(this->mtx);
@@ -275,10 +289,38 @@ TYPED_TEST(Lu, FactorizeNonsymmetricWorks)
     using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::index_type;
     this->forall_matrices([this] {
-        auto factory = gko::experimental::factorization::Lu<value_type,
-                                                            index_type>::build()
-                           .with_symmetric_sparsity(false)
-                           .on(this->ref);
+        auto factory =
+            gko::experimental::factorization::Lu<value_type,
+                                                 index_type>::build()
+                .with_symbolic_algorithm(
+                    gko::experimental::factorization::symbolic_type::general)
+                .on(this->ref);
+
+        auto lu = factory->generate(this->mtx);
+
+        GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_combined(), this->mtx_lu);
+        GKO_ASSERT_MTX_NEAR(lu->get_combined(), this->mtx_lu,
+                            15 * r<value_type>::value);
+        ASSERT_EQ(lu->get_storage_type(),
+                  gko::experimental::factorization::storage_type::combined_lu);
+        ASSERT_EQ(lu->get_lower_factor(), nullptr);
+        ASSERT_EQ(lu->get_upper_factor(), nullptr);
+        ASSERT_EQ(lu->get_diagonal(), nullptr);
+    });
+}
+
+
+TYPED_TEST(Lu, FactorizeNearSymmetricWorks)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    this->forall_matrices([this] {
+        auto factory =
+            gko::experimental::factorization::Lu<value_type,
+                                                 index_type>::build()
+                .with_symbolic_algorithm(gko::experimental::factorization::
+                                             symbolic_type::near_symmetric)
+                .on(this->ref);
 
         auto lu = factory->generate(this->mtx);
 
diff --git a/reference/test/log/CMakeLists.txt b/reference/test/log/CMakeLists.txt
index 2d9e8f188cb..44faca51f90 100644
--- a/reference/test/log/CMakeLists.txt
+++ b/reference/test/log/CMakeLists.txt
@@ -1,4 +1,4 @@
 ginkgo_create_test(convergence)
 if (GINKGO_HAVE_PAPI_SDE)
-    ginkgo_create_test(papi PAPI::PAPI)
+    ginkgo_create_test(papi ADDITIONAL_LIBRARIES PAPI::PAPI)
 endif()
diff --git a/reference/test/matrix/CMakeLists.txt b/reference/test/matrix/CMakeLists.txt
index 9670a5df80c..6f3348da432 100644
--- a/reference/test/matrix/CMakeLists.txt
+++ b/reference/test/matrix/CMakeLists.txt
@@ -1,3 +1,5 @@
+ginkgo_create_test(batch_dense_kernels)
+ginkgo_create_test(batch_ell_kernels)
 ginkgo_create_test(coo_kernels)
 ginkgo_create_test(csr_kernels)
 ginkgo_create_test(dense_kernels)
@@ -8,6 +10,7 @@ ginkgo_create_test(fft_kernels)
 ginkgo_create_test(hybrid_kernels)
 ginkgo_create_test(identity)
 ginkgo_create_test(permutation)
+ginkgo_create_test(scaled_permutation)
 ginkgo_create_test(sellp_kernels)
 ginkgo_create_test(sparsity_csr)
 ginkgo_create_test(sparsity_csr_kernels)
diff --git a/reference/test/matrix/batch_dense_kernels.cpp b/reference/test/matrix/batch_dense_kernels.cpp
new file mode 100644
index 00000000000..518d4fe024b
--- /dev/null
+++ b/reference/test/matrix/batch_dense_kernels.cpp
@@ -0,0 +1,218 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+
+#include <complex>
+#include <memory>
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+
+#include "core/matrix/batch_dense_kernels.hpp"
+#include "core/test/utils.hpp"
+
+
+template <typename T>
+class Dense : public ::testing::Test {
+protected:
+    using value_type = T;
+    using size_type = gko::size_type;
+    using BMtx = gko::batch::matrix::Dense<value_type>;
+    using BMVec = gko::batch::MultiVector<value_type>;
+    using DenseMtx = gko::matrix::Dense<value_type>;
+    Dense()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx_0(gko::batch::initialize<BMtx>(
+              {{I<T>({1.0, -1.0, 1.5}), I<T>({-2.0, 2.0, 3.0})},
+               {{1.0, -2.0, -0.5}, {1.0, -2.5, 4.0}}},
+              exec)),
+          mtx_00(gko::initialize<DenseMtx>(
+              {I<T>({1.0, -1.0, 1.5}), I<T>({-2.0, 2.0, 3.0})}, exec)),
+          mtx_01(gko::initialize<DenseMtx>(
+              {I<T>({1.0, -2.0, -0.5}), I<T>({1.0, -2.5, 4.0})}, exec)),
+          b_0(gko::batch::initialize<BMVec>(
+              {{I<T>({1.0, 0.0, 1.0}), I<T>({2.0, 0.0, 1.0}),
+                I<T>({1.0, 0.0, 2.0})},
+               {I<T>({-1.0, 1.0, 1.0}), I<T>({1.0, -1.0, 1.0}),
+                I<T>({1.0, 0.0, 2.0})}},
+              exec)),
+          b_00(gko::initialize<DenseMtx>(
+              {I<T>({1.0, 0.0, 1.0}), I<T>({2.0, 0.0, 1.0}),
+               I<T>({1.0, 0.0, 2.0})},
+              exec)),
+          b_01(gko::initialize<DenseMtx>(
+              {I<T>({-1.0, 1.0, 1.0}), I<T>({1.0, -1.0, 1.0}),
+               I<T>({1.0, 0.0, 2.0})},
+              exec)),
+          x_0(gko::batch::initialize<BMVec>(
+              {{I<T>({2.0, 0.0, 1.0}), I<T>({2.0, 0.0, 2.0})},
+               {I<T>({-2.0, 1.0, 1.0}), I<T>({1.0, -1.0, -1.0})}},
+              exec)),
+          x_00(gko::initialize<DenseMtx>(
+              {I<T>({2.0, 0.0, 1.0}), I<T>({2.0, 0.0, 2.0})}, exec)),
+          x_01(gko::initialize<DenseMtx>(
+              {I<T>({-2.0, 1.0, 1.0}), I<T>({1.0, -1.0, -1.0})}, exec))
+    {}
+
+    std::shared_ptr<const gko::ReferenceExecutor> exec;
+    std::unique_ptr<BMtx> mtx_0;
+    std::unique_ptr<DenseMtx> mtx_00;
+    std::unique_ptr<DenseMtx> mtx_01;
+    std::unique_ptr<BMVec> b_0;
+    std::unique_ptr<DenseMtx> b_00;
+    std::unique_ptr<DenseMtx> b_01;
+    std::unique_ptr<BMVec> x_0;
+    std::unique_ptr<DenseMtx> x_00;
+    std::unique_ptr<DenseMtx> x_01;
+
+    std::default_random_engine rand_engine;
+};
+
+
+TYPED_TEST_SUITE(Dense, gko::test::ValueTypes, TypenameNameGenerator);
+
+
+TYPED_TEST(Dense, AppliesToBatchMultiVector)
+{
+    using T = typename TestFixture::value_type;
+
+    this->mtx_0->apply(this->b_0.get(), this->x_0.get());
+
+    this->mtx_00->apply(this->b_00.get(), this->x_00.get());
+    this->mtx_01->apply(this->b_01.get(), this->x_01.get());
+    auto res = gko::batch::unbatch<gko::batch::MultiVector<T>>(this->x_0.get());
+    GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.);
+    GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.);
+}
+
+
+TYPED_TEST(Dense, AppliesLinearCombinationToBatchMultiVector)
+{
+    using BMtx = typename TestFixture::BMtx;
+    using BMVec = typename TestFixture::BMVec;
+    using DenseMtx = typename TestFixture::DenseMtx;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::batch::initialize<BMVec>({{1.5}, {-1.0}}, this->exec);
+    auto beta = gko::batch::initialize<BMVec>({{2.5}, {-4.0}}, this->exec);
+    auto alpha0 = gko::initialize<DenseMtx>({1.5}, this->exec);
+    auto alpha1 = gko::initialize<DenseMtx>({-1.0}, this->exec);
+    auto beta0 = gko::initialize<DenseMtx>({2.5}, this->exec);
+    auto beta1 = gko::initialize<DenseMtx>({-4.0}, this->exec);
+
+    this->mtx_0->apply(alpha.get(), this->b_0.get(), beta.get(),
+                       this->x_0.get());
+
+    this->mtx_00->apply(alpha0.get(), this->b_00.get(), beta0.get(),
+                        this->x_00.get());
+    this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(),
+                        this->x_01.get());
+    auto res = gko::batch::unbatch<gko::batch::MultiVector<T>>(this->x_0.get());
+    GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), 0.);
+    GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), 0.);
+}
+
+
+TYPED_TEST(Dense, ApplyFailsOnWrongNumberOfResultCols)
+{
+    using BMVec = typename TestFixture::BMVec;
+
+    auto res = BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2}});
+
+    ASSERT_THROW(this->mtx_0->apply(this->b_0.get(), res.get()),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(Dense, ApplyFailsOnWrongNumberOfResultRows)
+{
+    using BMVec = typename TestFixture::BMVec;
+
+    auto res = BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3}});
+
+    ASSERT_THROW(this->mtx_0->apply(this->b_0.get(), res.get()),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(Dense, ApplyFailsOnWrongInnerDimension)
+{
+    using BMVec = typename TestFixture::BMVec;
+
+    auto res =
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}});
+
+    ASSERT_THROW(this->mtx_0->apply(res.get(), this->x_0.get()),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(Dense, AdvancedApplyFailsOnWrongInnerDimension)
+{
+    using BMVec = typename TestFixture::BMVec;
+    auto res =
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}});
+    auto alpha =
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}});
+    auto beta =
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}});
+
+    ASSERT_THROW(
+        this->mtx_0->apply(alpha.get(), res.get(), beta.get(), this->x_0.get()),
+        gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(Dense, AdvancedApplyFailsOnWrongAlphaDimension)
+{
+    using BMVec = typename TestFixture::BMVec;
+    auto res =
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3, 3}});
+    auto alpha =
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 1}});
+    auto beta =
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}});
+
+    ASSERT_THROW(
+        this->mtx_0->apply(alpha.get(), res.get(), beta.get(), this->x_0.get()),
+        gko::DimensionMismatch);
+}
diff --git a/reference/test/matrix/batch_ell_kernels.cpp b/reference/test/matrix/batch_ell_kernels.cpp
new file mode 100644
index 00000000000..d0ab012294c
--- /dev/null
+++ b/reference/test/matrix/batch_ell_kernels.cpp
@@ -0,0 +1,258 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include <complex>
+#include <memory>
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/ell.hpp>
+
+
+#include "core/matrix/batch_ell_kernels.hpp"
+#include "core/test/utils.hpp"
+
+
+template <typename T>
+class Ell : public ::testing::Test {
+protected:
+    using value_type = T;
+    using size_type = gko::size_type;
+    using BMtx = gko::batch::matrix::Ell<value_type>;
+    using BMVec = gko::batch::MultiVector<value_type>;
+    using EllMtx = gko::matrix::Ell<value_type>;
+    using DenseMtx = gko::matrix::Dense<value_type>;
+    Ell()
+        : exec(gko::ReferenceExecutor::create()),
+          mtx_0(gko::batch::initialize<BMtx>(
+              {{I<T>({1.0, -1.0, 1.5}), I<T>({-2.0, 2.0, 3.0})},
+               {{1.0, -2.0, -0.5}, {1.0, -2.5, 4.0}}},
+              exec)),
+          mtx_00(gko::initialize<EllMtx>(
+              {I<T>({1.0, -1.0, 1.5}), I<T>({-2.0, 2.0, 3.0})}, exec)),
+          mtx_01(gko::initialize<EllMtx>(
+              {I<T>({1.0, -2.0, -0.5}), I<T>({1.0, -2.5, 4.0})}, exec)),
+          b_0(gko::batch::initialize<BMVec>(
+              {{I<T>({1.0, 0.0, 1.0}), I<T>({2.0, 0.0, 1.0}),
+                I<T>({1.0, 0.0, 2.0})},
+               {I<T>({-1.0, 1.0, 1.0}), I<T>({1.0, -1.0, 1.0}),
+                I<T>({1.0, 0.0, 2.0})}},
+              exec)),
+          b_00(gko::initialize<DenseMtx>(
+              {I<T>({1.0, 0.0, 1.0}), I<T>({2.0, 0.0, 1.0}),
+               I<T>({1.0, 0.0, 2.0})},
+              exec)),
+          b_01(gko::initialize<DenseMtx>(
+              {I<T>({-1.0, 1.0, 1.0}), I<T>({1.0, -1.0, 1.0}),
+               I<T>({1.0, 0.0, 2.0})},
+              exec)),
+          x_0(gko::batch::initialize<BMVec>(
+              {{I<T>({2.0, 0.0, 1.0}), I<T>({2.0, 0.0, 2.0})},
+               {I<T>({-2.0, 1.0, 1.0}), I<T>({1.0, -1.0, -1.0})}},
+              exec)),
+          x_00(gko::initialize<DenseMtx>(
+              {I<T>({2.0, 0.0, 1.0}), I<T>({2.0, 0.0, 2.0})}, exec)),
+          x_01(gko::initialize<DenseMtx>(
+              {I<T>({-2.0, 1.0, 1.0}), I<T>({1.0, -1.0, -1.0})}, exec))
+    {}
+
+    std::shared_ptr<const gko::ReferenceExecutor> exec;
+    std::unique_ptr<BMtx> mtx_0;
+    std::unique_ptr<EllMtx> mtx_00;
+    std::unique_ptr<EllMtx> mtx_01;
+    std::unique_ptr<BMVec> b_0;
+    std::unique_ptr<DenseMtx> b_00;
+    std::unique_ptr<DenseMtx> b_01;
+    std::unique_ptr<BMVec> x_0;
+    std::unique_ptr<DenseMtx> x_00;
+    std::unique_ptr<DenseMtx> x_01;
+
+    std::ranlux48 rand_engine;
+};
+
+
+TYPED_TEST_SUITE(Ell, gko::test::ValueTypes, TypenameNameGenerator);
+
+
+TYPED_TEST(Ell, AppliesToBatchMultiVector)
+{
+    using T = typename TestFixture::value_type;
+
+    this->mtx_0->apply(this->b_0.get(), this->x_0.get());
+
+    this->mtx_00->apply(this->b_00.get(), this->x_00.get());
+    this->mtx_01->apply(this->b_01.get(), this->x_01.get());
+    auto res = gko::batch::unbatch<gko::batch::MultiVector<T>>(this->x_0.get());
+    GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), r<T>::value);
+    GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), r<T>::value);
+}
+
+
+TYPED_TEST(Ell, ConstAppliesToBatchMultiVector)
+{
+    using T = typename TestFixture::value_type;
+    using BMtx = typename TestFixture::BMtx;
+
+    static_cast<const BMtx*>(this->mtx_0.get())->apply(this->b_0, this->x_0);
+
+    this->mtx_00->apply(this->b_00.get(), this->x_00.get());
+    this->mtx_01->apply(this->b_01.get(), this->x_01.get());
+    auto res = gko::batch::unbatch<gko::batch::MultiVector<T>>(this->x_0.get());
+    GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), r<T>::value);
+    GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), r<T>::value);
+}
+
+
+TYPED_TEST(Ell, AppliesLinearCombinationToBatchMultiVector)
+{
+    using BMtx = typename TestFixture::BMtx;
+    using BMVec = typename TestFixture::BMVec;
+    using DenseMtx = typename TestFixture::DenseMtx;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::batch::initialize<BMVec>({{1.5}, {-1.0}}, this->exec);
+    auto beta = gko::batch::initialize<BMVec>({{2.5}, {-4.0}}, this->exec);
+    auto alpha0 = gko::initialize<DenseMtx>({1.5}, this->exec);
+    auto alpha1 = gko::initialize<DenseMtx>({-1.0}, this->exec);
+    auto beta0 = gko::initialize<DenseMtx>({2.5}, this->exec);
+    auto beta1 = gko::initialize<DenseMtx>({-4.0}, this->exec);
+
+    this->mtx_0->apply(alpha.get(), this->b_0.get(), beta.get(),
+                       this->x_0.get());
+
+    this->mtx_00->apply(alpha0.get(), this->b_00.get(), beta0.get(),
+                        this->x_00.get());
+    this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(),
+                        this->x_01.get());
+    auto res = gko::batch::unbatch<gko::batch::MultiVector<T>>(this->x_0.get());
+    GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), r<T>::value);
+    GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), r<T>::value);
+}
+
+
+TYPED_TEST(Ell, ConstAppliesLinearCombinationToBatchMultiVector)
+{
+    using BMtx = typename TestFixture::BMtx;
+    using BMVec = typename TestFixture::BMVec;
+    using DenseMtx = typename TestFixture::DenseMtx;
+    using T = typename TestFixture::value_type;
+    auto alpha = gko::batch::initialize<BMVec>({{1.5}, {-1.0}}, this->exec);
+    auto beta = gko::batch::initialize<BMVec>({{2.5}, {-4.0}}, this->exec);
+    auto alpha0 = gko::initialize<DenseMtx>({1.5}, this->exec);
+    auto alpha1 = gko::initialize<DenseMtx>({-1.0}, this->exec);
+    auto beta0 = gko::initialize<DenseMtx>({2.5}, this->exec);
+    auto beta1 = gko::initialize<DenseMtx>({-4.0}, this->exec);
+
+    static_cast<const BMtx*>(this->mtx_0.get())
+        ->apply(alpha.get(), this->b_0.get(), beta.get(), this->x_0.get());
+
+    this->mtx_00->apply(alpha0.get(), this->b_00.get(), beta0.get(),
+                        this->x_00.get());
+    this->mtx_01->apply(alpha1.get(), this->b_01.get(), beta1.get(),
+                        this->x_01.get());
+    auto res = gko::batch::unbatch<gko::batch::MultiVector<T>>(this->x_0.get());
+    GKO_ASSERT_MTX_NEAR(res[0].get(), this->x_00.get(), r<T>::value);
+    GKO_ASSERT_MTX_NEAR(res[1].get(), this->x_01.get(), r<T>::value);
+}
+
+
+TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfResultCols)
+{
+    using BMVec = typename TestFixture::BMVec;
+    auto res = BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2}});
+
+    ASSERT_THROW(this->mtx_0->apply(this->b_0.get(), res.get()),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(Ell, ApplyFailsOnWrongNumberOfResultRows)
+{
+    using BMVec = typename TestFixture::BMVec;
+    auto res = BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3}});
+
+    ASSERT_THROW(this->mtx_0->apply(this->b_0.get(), res.get()),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(Ell, ApplyFailsOnWrongInnerDimension)
+{
+    using BMVec = typename TestFixture::BMVec;
+    auto res =
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}});
+
+    ASSERT_THROW(this->mtx_0->apply(res.get(), this->x_0.get()),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(Ell, AdvancedApplyFailsOnWrongInnerDimension)
+{
+    using BMVec = typename TestFixture::BMVec;
+    auto res =
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 3}});
+    auto alpha =
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}});
+    auto beta =
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}});
+
+    ASSERT_THROW(
+        this->mtx_0->apply(alpha.get(), res.get(), beta.get(), this->x_0.get()),
+        gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(Ell, AdvancedApplyFailsOnWrongAlphaDimension)
+{
+    using BMVec = typename TestFixture::BMVec;
+    auto res =
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{3, 3}});
+    auto alpha =
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{2, 1}});
+    auto beta =
+        BMVec::create(this->exec, gko::batch_dim<2>{2, gko::dim<2>{1, 1}});
+
+    ASSERT_THROW(
+        this->mtx_0->apply(alpha.get(), res.get(), beta.get(), this->x_0.get()),
+        gko::DimensionMismatch);
+}
diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp
index 0c5ac3bde53..305eb8bf5ee 100644
--- a/reference/test/matrix/csr_kernels.cpp
+++ b/reference/test/matrix/csr_kernels.cpp
@@ -49,6 +49,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/ell.hpp>
 #include <ginkgo/core/matrix/hybrid.hpp>
 #include <ginkgo/core/matrix/identity.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
+#include <ginkgo/core/matrix/scaled_permutation.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
@@ -56,6 +58,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/matrix/csr_kernels.hpp"
 #include "core/matrix/csr_lookup.hpp"
 #include "core/test/utils.hpp"
+#include "core/test/utils/assertions.hpp"
 
 
 namespace {
@@ -76,6 +79,8 @@ class Csr : public ::testing::Test {
     using Hybrid = gko::matrix::Hybrid<value_type, index_type>;
     using Vec = gko::matrix::Dense<value_type>;
     using MixedVec = gko::matrix::Dense<gko::next_precision<value_type>>;
+    using Perm = gko::matrix::Permutation<index_type>;
+    using ScaledPerm = gko::matrix::ScaledPermutation<value_type, index_type>;
 
     Csr()
         : exec(gko::ReferenceExecutor::create()),
@@ -87,7 +92,21 @@ class Csr : public ::testing::Test {
                                   std::make_shared<typename Mtx::classical>())),
           mtx3_unsorted(
               Mtx::create(exec, gko::dim<2>(3, 3), 7,
-                          std::make_shared<typename Mtx::classical>()))
+                          std::make_shared<typename Mtx::classical>())),
+          perm3(Perm::create(exec, gko::array<index_type>{exec, {1, 2, 0}})),
+          perm3_rev(perm3->compute_inverse()),
+          perm2(Perm::create(exec, gko::array<index_type>{exec, {1, 0}})),
+          perm0(Perm::create(exec)),
+          scale_perm3(ScaledPerm::create(
+              exec, gko::array<value_type>{this->exec, {2.0, 3.0, 5.0}},
+              gko::array<index_type>{exec, {1, 2, 0}})),
+          scale_perm3_rev(ScaledPerm::create(
+              exec, gko::array<value_type>{this->exec, {7.0, 11.0, 13.0}},
+              gko::array<index_type>{exec, {1, 2, 0}})),
+          scale_perm2(ScaledPerm::create(
+              exec, gko::array<value_type>{this->exec, {17.0, 19.0}},
+              gko::array<index_type>{exec, {1, 0}})),
+          scale_perm0(ScaledPerm::create(exec))
     {
         this->create_mtx(mtx.get());
         this->create_mtx2(mtx2.get());
@@ -123,7 +142,7 @@ class Csr : public ::testing::Test {
         value_type* v = m->get_values();
         index_type* c = m->get_col_idxs();
         index_type* r = m->get_row_ptrs();
-        // It keeps an explict zero
+        // It keeps an explicit zero
         /*
          *  1    3   2
          * {0}   5   0
@@ -349,6 +368,14 @@ class Csr : public ::testing::Test {
     std::unique_ptr<Mtx> mtx2;
     std::unique_ptr<Mtx> mtx3_sorted;
     std::unique_ptr<Mtx> mtx3_unsorted;
+    std::unique_ptr<Perm> perm3;
+    std::unique_ptr<Perm> perm3_rev;
+    std::unique_ptr<Perm> perm2;
+    std::unique_ptr<Perm> perm0;
+    std::unique_ptr<ScaledPerm> scale_perm3;
+    std::unique_ptr<ScaledPerm> scale_perm3_rev;
+    std::unique_ptr<ScaledPerm> scale_perm2;
+    std::unique_ptr<ScaledPerm> scale_perm0;
     index_type invalid_index = gko::invalid_index<index_type>();
 };
 
@@ -810,7 +837,7 @@ TYPED_TEST(Csr, ConvertsToPrecision)
     GKO_ASSERT_MTX_NEAR(this->mtx2, res, residual);
     auto first_strategy = this->mtx2->get_strategy();
     auto second_strategy = res->get_strategy();
-    ASSERT_EQ(typeid(*first_strategy), typeid(*second_strategy));
+    GKO_ASSERT_DYNAMIC_TYPE_EQ(first_strategy, second_strategy);
 }
 
 
@@ -835,7 +862,7 @@ TYPED_TEST(Csr, MovesToPrecision)
     GKO_ASSERT_MTX_NEAR(this->mtx2, res, residual);
     auto first_strategy = this->mtx2->get_strategy();
     auto second_strategy = res->get_strategy();
-    ASSERT_EQ(typeid(*first_strategy), typeid(*second_strategy));
+    GKO_ASSERT_DYNAMIC_TYPE_EQ(first_strategy, second_strategy);
 }
 
 
@@ -1284,6 +1311,449 @@ TYPED_TEST(Csr, NonSquareMtxIsTransposable)
 }
 
 
+template <typename ValueType, typename IndexType>
+std::unique_ptr<gko::matrix::Csr<ValueType, IndexType>> csr_from_permutation(
+    gko::matrix::Permutation<IndexType>* perm, bool invert)
+{
+    gko::matrix_data<double, IndexType> double_data;
+    if (invert) {
+        perm->compute_inverse()->write(double_data);
+    } else {
+        perm->write(double_data);
+    }
+    gko::matrix_data<ValueType, IndexType> data;
+    data.size = double_data.size;
+    for (auto entry : double_data.nonzeros) {
+        data.nonzeros.emplace_back(entry.row, entry.column, 1.0);
+    }
+    auto mtx =
+        gko::matrix::Csr<ValueType, IndexType>::create(perm->get_executor());
+    mtx->read(data);
+    return mtx;
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<gko::matrix::Csr<ValueType, IndexType>> csr_from_permutation(
+    gko::matrix::ScaledPermutation<ValueType, IndexType>* perm, bool invert)
+{
+    gko::matrix_data<ValueType, IndexType> data;
+    if (invert) {
+        perm->compute_inverse()->write(data);
+    } else {
+        perm->write(data);
+    }
+    auto mtx =
+        gko::matrix::Csr<ValueType, IndexType>::create(perm->get_executor());
+    mtx->read(data);
+    return mtx;
+}
+
+
+template <typename ValueType, typename IndexType, typename Permutation>
+std::unique_ptr<gko::matrix::Csr<ValueType, IndexType>> ref_permute(
+    gko::matrix::Csr<ValueType, IndexType>* input, Permutation* permutation,
+    gko::matrix::permute_mode mode)
+{
+    using gko::matrix::permute_mode;
+    using Csr = gko::matrix::Csr<ValueType, IndexType>;
+    auto result = input->clone();
+    auto permutation_csr = csr_from_permutation<ValueType>(
+        permutation, (mode & permute_mode::inverse) == permute_mode::inverse);
+    if ((mode & permute_mode::rows) == permute_mode::rows) {
+        // compute P * A
+        permutation_csr->apply(input, result);
+    }
+    if ((mode & permute_mode::columns) == permute_mode::columns) {
+        // compute A * P^T = (P * A^T)^T
+        auto tmp = result->transpose();
+        auto tmp2 = gko::as<Csr>(tmp->clone());
+        permutation_csr->apply(tmp, tmp2);
+        result = gko::as<Csr>(tmp2->transpose());
+    }
+    return result;
+}
+
+
+template <typename ValueType, typename IndexType, typename Permutation>
+std::unique_ptr<gko::matrix::Csr<ValueType, IndexType>> ref_permute(
+    gko::matrix::Csr<ValueType, IndexType>* input, Permutation* row_permutation,
+    Permutation* col_permutation, bool invert)
+{
+    using gko::matrix::permute_mode;
+    using Csr = gko::matrix::Csr<ValueType, IndexType>;
+    auto result = input->clone();
+    auto row_permutation_csr =
+        csr_from_permutation<ValueType>(row_permutation, invert);
+    auto col_permutation_csr =
+        csr_from_permutation<ValueType>(col_permutation, invert);
+    row_permutation_csr->apply(input, result);
+    auto tmp = result->transpose();
+    auto tmp2 = gko::as<Csr>(tmp->clone());
+    col_permutation_csr->apply(tmp, tmp2);
+    return gko::as<Csr>(tmp2->transpose());
+}
+
+
+TYPED_TEST(Csr, Permute)
+{
+    using gko::matrix::permute_mode;
+
+    for (auto mode :
+         {permute_mode::none, permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+
+        auto permuted = this->mtx3_sorted->permute(this->perm3, mode);
+        auto ref_permuted =
+            ref_permute(this->mtx3_sorted.get(), this->perm3.get(), mode);
+
+        GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted);
+        ASSERT_TRUE(permuted->is_sorted_by_column_index());
+    }
+}
+
+
+TYPED_TEST(Csr, PermuteRoundtrip)
+{
+    using gko::matrix::permute_mode;
+
+    for (auto mode :
+         {permute_mode::rows, permute_mode::columns, permute_mode::symmetric,
+          permute_mode::inverse_rows, permute_mode::inverse_columns,
+          permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+
+        auto permuted =
+            this->mtx3_sorted->permute(this->perm3, mode)
+                ->permute(this->perm3, mode ^ permute_mode::inverse);
+
+        GKO_ASSERT_MTX_NEAR(this->mtx3_sorted, permuted, 0.0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(permuted, this->mtx3_sorted);
+        ASSERT_TRUE(permuted->is_sorted_by_column_index());
+    }
+}
+
+
+TYPED_TEST(Csr, PermuteInverted)
+{
+    using gko::matrix::permute_mode;
+
+    for (auto mode :
+         {permute_mode::rows, permute_mode::columns, permute_mode::symmetric}) {
+        SCOPED_TRACE(mode);
+
+        auto permuted = this->mtx3_sorted->permute(this->perm3, mode);
+        auto inv_inv_permuted = this->mtx3_sorted->permute(
+            this->perm3->compute_inverse(), mode | permute_mode::inverse);
+
+        GKO_ASSERT_MTX_NEAR(permuted, inv_inv_permuted, 0.0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(permuted, inv_inv_permuted);
+        ASSERT_TRUE(permuted->is_sorted_by_column_index());
+        ASSERT_TRUE(inv_inv_permuted->is_sorted_by_column_index());
+    }
+}
+
+
+TYPED_TEST(Csr, PermuteRectangular)
+{
+    using gko::matrix::permute_mode;
+
+    for (auto mode :
+         {permute_mode::rows, permute_mode::columns, permute_mode::inverse_rows,
+          permute_mode::inverse_columns}) {
+        auto perm = (mode & permute_mode::rows) == permute_mode::rows
+                        ? this->perm2.get()
+                        : this->perm3.get();
+
+        auto permuted = this->mtx2->permute(perm, mode);
+        auto ref_permuted = ref_permute(this->mtx2.get(), perm, mode);
+
+        GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted);
+        ASSERT_TRUE(permuted->is_sorted_by_column_index());
+    }
+}
+
+
+TYPED_TEST(Csr, PermuteFailsWithIncorrectPermutationSize)
+{
+    using gko::matrix::permute_mode;
+
+    for (auto mode :
+         {/* no permute_mode::none */ permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+
+        ASSERT_THROW(this->mtx3_sorted->permute(this->perm0, mode),
+                     gko::DimensionMismatch);
+    }
+}
+
+
+TYPED_TEST(Csr, NonsymmPermute)
+{
+    auto permuted = this->mtx3_sorted->permute(this->perm3, this->perm3_rev);
+    auto ref_permuted = ref_permute(this->mtx3_sorted.get(), this->perm3.get(),
+                                    this->perm3_rev.get(), false);
+
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+    GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted);
+    ASSERT_TRUE(permuted->is_sorted_by_column_index());
+}
+
+
+TYPED_TEST(Csr, NonsymmPermuteInverse)
+{
+    auto permuted =
+        this->mtx3_sorted->permute(this->perm3, this->perm3_rev, true);
+    auto ref_permuted = ref_permute(this->mtx3_sorted.get(), this->perm3.get(),
+                                    this->perm3_rev.get(), true);
+
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+    GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted);
+    ASSERT_TRUE(permuted->is_sorted_by_column_index());
+}
+
+
+TYPED_TEST(Csr, NonsymmPermuteRectangular)
+{
+    auto permuted = this->mtx2->permute(this->perm2, this->perm3);
+    auto ref_permuted = ref_permute(this->mtx2.get(), this->perm2.get(),
+                                    this->perm3.get(), false);
+
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+    GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted);
+    ASSERT_TRUE(permuted->is_sorted_by_column_index());
+}
+
+
+TYPED_TEST(Csr, NonsymmPermuteInverseRectangular)
+{
+    auto permuted = this->mtx2->permute(this->perm2, this->perm3, true);
+    auto ref_permuted = ref_permute(this->mtx2.get(), this->perm2.get(),
+                                    this->perm3.get(), true);
+
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+    GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted);
+    ASSERT_TRUE(permuted->is_sorted_by_column_index());
+}
+
+
+TYPED_TEST(Csr, NonsymmPermuteRoundtrip)
+{
+    auto permuted = this->mtx3_sorted->permute(this->perm3, this->perm3_rev)
+                        ->permute(this->perm3, this->perm3_rev, true);
+
+    GKO_ASSERT_MTX_NEAR(this->mtx3_sorted, permuted, 0.0);
+    GKO_ASSERT_MTX_EQ_SPARSITY(permuted, this->mtx3_sorted);
+    ASSERT_TRUE(permuted->is_sorted_by_column_index());
+}
+
+
+TYPED_TEST(Csr, NonsymmPermuteInverted)
+{
+    auto permuted = this->mtx3_sorted->permute(this->perm3, this->perm3_rev);
+    auto inv_inv_permuted =
+        this->mtx3_sorted->permute(this->perm3->compute_inverse(),
+                                   this->perm3_rev->compute_inverse(), true);
+
+    GKO_ASSERT_MTX_NEAR(permuted, inv_inv_permuted, 0.0);
+    GKO_ASSERT_MTX_EQ_SPARSITY(permuted, inv_inv_permuted);
+    ASSERT_TRUE(permuted->is_sorted_by_column_index());
+    ASSERT_TRUE(inv_inv_permuted->is_sorted_by_column_index());
+}
+
+
+TYPED_TEST(Csr, NonsymmPermuteFailsWithIncorrectPermutationSize)
+{
+    ASSERT_THROW(this->mtx3_sorted->permute(this->perm0, this->perm3_rev),
+                 gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx3_sorted->permute(this->perm3_rev, this->perm0),
+                 gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx3_sorted->permute(this->perm0, this->perm0),
+                 gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(Csr, ScaledPermute)
+{
+    using gko::matrix::permute_mode;
+    using value_type = typename TestFixture::value_type;
+
+    for (auto mode :
+         {permute_mode::none, permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+
+        auto permuted =
+            this->mtx3_sorted->scale_permute(this->scale_perm3, mode);
+        auto ref_permuted =
+            ref_permute(this->mtx3_sorted.get(), this->scale_perm3.get(), mode);
+
+        GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r<value_type>::value);
+        GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted);
+        ASSERT_TRUE(permuted->is_sorted_by_column_index());
+    }
+}
+
+
+TYPED_TEST(Csr, ScaledPermuteRoundtrip)
+{
+    using gko::matrix::permute_mode;
+    using value_type = typename TestFixture::value_type;
+
+    for (auto mode :
+         {permute_mode::rows, permute_mode::columns, permute_mode::symmetric,
+          permute_mode::inverse_rows, permute_mode::inverse_columns,
+          permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+
+        auto permuted =
+            this->mtx3_sorted->scale_permute(this->scale_perm3, mode)
+                ->scale_permute(this->scale_perm3,
+                                mode ^ permute_mode::inverse);
+
+        GKO_ASSERT_MTX_NEAR(this->mtx3_sorted, permuted, r<value_type>::value);
+        GKO_ASSERT_MTX_EQ_SPARSITY(permuted, this->mtx3_sorted);
+        ASSERT_TRUE(permuted->is_sorted_by_column_index());
+    }
+}
+
+
+TYPED_TEST(Csr, ScaledPermuteRectangular)
+{
+    using gko::matrix::permute_mode;
+    using value_type = typename TestFixture::value_type;
+
+    for (auto mode :
+         {permute_mode::rows, permute_mode::columns, permute_mode::inverse_rows,
+          permute_mode::inverse_columns}) {
+        auto perm = (mode & permute_mode::rows) == permute_mode::rows
+                        ? this->scale_perm2.get()
+                        : this->scale_perm3.get();
+
+        auto permuted = this->mtx2->scale_permute(perm, mode);
+        auto ref_permuted = ref_permute(this->mtx2.get(), perm, mode);
+
+        GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r<value_type>::value);
+        GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted);
+        ASSERT_TRUE(permuted->is_sorted_by_column_index());
+    }
+}
+
+
+TYPED_TEST(Csr, ScaledPermuteFailsWithIncorrectPermutationSize)
+{
+    using gko::matrix::permute_mode;
+
+    for (auto mode :
+         {/* no permute_mode::none */ permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+
+        ASSERT_THROW(this->mtx3_sorted->scale_permute(this->scale_perm0, mode),
+                     gko::DimensionMismatch);
+    }
+}
+
+
+TYPED_TEST(Csr, NonsymmScaledPermute)
+{
+    using value_type = typename TestFixture::value_type;
+
+    auto permuted = this->mtx3_sorted->scale_permute(this->scale_perm3,
+                                                     this->scale_perm3_rev);
+    auto ref_permuted =
+        ref_permute(this->mtx3_sorted.get(), this->scale_perm3.get(),
+                    this->scale_perm3_rev.get(), false);
+
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r<value_type>::value);
+    GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted);
+    ASSERT_TRUE(permuted->is_sorted_by_column_index());
+}
+
+
+TYPED_TEST(Csr, NonsymmScaledPermuteInverse)
+{
+    using value_type = typename TestFixture::value_type;
+
+    auto permuted = this->mtx3_sorted->scale_permute(
+        this->scale_perm3, this->scale_perm3_rev, true);
+    auto ref_permuted =
+        ref_permute(this->mtx3_sorted.get(), this->scale_perm3.get(),
+                    this->scale_perm3_rev.get(), true);
+
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r<value_type>::value);
+    GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted);
+    ASSERT_TRUE(permuted->is_sorted_by_column_index());
+}
+
+
+TYPED_TEST(Csr, NonsymmScaledPermuteRectangular)
+{
+    using value_type = typename TestFixture::value_type;
+
+    auto permuted =
+        this->mtx2->scale_permute(this->scale_perm2, this->scale_perm3);
+    auto ref_permuted = ref_permute(this->mtx2.get(), this->scale_perm2.get(),
+                                    this->scale_perm3.get(), false);
+
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r<value_type>::value);
+    GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted);
+    ASSERT_TRUE(permuted->is_sorted_by_column_index());
+}
+
+
+TYPED_TEST(Csr, NonsymmScaledPermuteInverseRectangular)
+{
+    using value_type = typename TestFixture::value_type;
+
+    auto permuted =
+        this->mtx2->scale_permute(this->scale_perm2, this->scale_perm3, true);
+    auto ref_permuted = ref_permute(this->mtx2.get(), this->scale_perm2.get(),
+                                    this->scale_perm3.get(), true);
+
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r<value_type>::value);
+    GKO_ASSERT_MTX_EQ_SPARSITY(permuted, ref_permuted);
+    ASSERT_TRUE(permuted->is_sorted_by_column_index());
+}
+
+
+TYPED_TEST(Csr, NonsymmScaledPermuteRoundtrip)
+{
+    using value_type = typename TestFixture::value_type;
+
+    auto permuted =
+        this->mtx3_sorted
+            ->scale_permute(this->scale_perm3, this->scale_perm3_rev)
+            ->scale_permute(this->scale_perm3, this->scale_perm3_rev, true);
+
+    GKO_ASSERT_MTX_NEAR(this->mtx3_sorted, permuted, r<value_type>::value);
+    GKO_ASSERT_MTX_EQ_SPARSITY(permuted, this->mtx3_sorted);
+    ASSERT_TRUE(permuted->is_sorted_by_column_index());
+}
+
+
+TYPED_TEST(Csr, NonsymmScaledPermuteFailsWithIncorrectPermutationSize)
+{
+    ASSERT_THROW(this->mtx3_sorted->scale_permute(this->scale_perm0,
+                                                  this->scale_perm3_rev),
+                 gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx3_sorted->scale_permute(this->scale_perm3_rev,
+                                                  this->scale_perm0),
+                 gko::DimensionMismatch);
+    ASSERT_THROW(
+        this->mtx3_sorted->scale_permute(this->scale_perm0, this->scale_perm0),
+        gko::DimensionMismatch);
+}
+
+
 TYPED_TEST(Csr, SquareMatrixIsPermutable)
 {
     using Csr = typename TestFixture::Mtx;
diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp
index 9edab89e382..b70265ed217 100644
--- a/reference/test/matrix/dense_kernels.cpp
+++ b/reference/test/matrix/dense_kernels.cpp
@@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <complex>
 #include <memory>
+#include <numeric>
 #include <random>
 
 
@@ -50,6 +51,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
 #include <ginkgo/core/matrix/hybrid.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
+#include <ginkgo/core/matrix/scaled_permutation.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
@@ -68,7 +71,6 @@ class Dense : public ::testing::Test {
     using Mtx = gko::matrix::Dense<value_type>;
     using MixedMtx = gko::matrix::Dense<gko::next_precision<value_type>>;
     using ComplexMtx = gko::to_complex<Mtx>;
-    using MixedComplexMtx = gko::to_complex<MixedMtx>;
     using RealMtx = gko::remove_complex<Mtx>;
     Dense()
         : exec(gko::ReferenceExecutor::create()),
@@ -97,7 +99,6 @@ class Dense : public ::testing::Test {
     std::unique_ptr<Mtx> mtx6;
     std::unique_ptr<Mtx> mtx7;
     std::unique_ptr<Mtx> mtx8;
-    gko::int32 invalid_index = gko::invalid_index<gko::int32>();
     std::default_random_engine rand_engine;
 
     template <typename MtxType>
@@ -700,6 +701,37 @@ TYPED_TEST(Dense, ComputesNorm1Mixed)
 }
 
 
+TYPED_TEST(Dense, ComputesMean)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+
+    auto iota = Mtx::create(this->exec, gko::dim<2>{10, 1});
+    std::iota(iota->get_values(), iota->get_values() + 10, 1);
+    auto iota_result = Mtx::create(this->exec, gko::dim<2>{1, 1});
+    iota->compute_mean(iota_result.get());
+    GKO_EXPECT_NEAR(iota_result->at(0, 0), T{5.5}, r<T>::value * 10);
+
+    auto result = Mtx::create(this->exec, gko::dim<2>{1, 3});
+
+    this->mtx4->compute_mean(result.get());
+
+    GKO_EXPECT_NEAR(result->at(0, 0), T{0.5}, r<T>::value * 10);
+    GKO_EXPECT_NEAR(result->at(0, 1), T{4.0}, r<T>::value * 10);
+    GKO_EXPECT_NEAR(result->at(0, 2), T{1.0}, r<T>::value * 10);
+}
+
+
+TYPED_TEST(Dense, ComputesMeanFailsOnWrongResultSize)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    auto result = Mtx::create(this->exec, gko::dim<2>{1, 2});
+
+    ASSERT_THROW(this->mtx4->compute_mean(result), gko::DimensionMismatch);
+}
+
+
 TYPED_TEST(Dense, ComputeDotFailsOnWrongInputSize)
 {
     using Mtx = typename TestFixture::Mtx;
@@ -780,3061 +812,2761 @@ TYPED_TEST(Dense, MovesToPrecision)
 }
 
 
-TYPED_TEST(Dense, ConvertsToCoo32)
+TYPED_TEST(Dense, SquareMatrixIsTransposable)
 {
+    using Mtx = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using Coo = typename gko::matrix::Coo<T, gko::int32>;
-    auto coo_mtx = Coo::create(this->mtx4->get_executor());
-
-    this->mtx4->convert_to(coo_mtx);
-    auto v = coo_mtx->get_const_values();
-    auto c = coo_mtx->get_const_col_idxs();
-    auto r = coo_mtx->get_const_row_idxs();
+    auto trans = gko::as<Mtx>(this->mtx5->transpose());
 
-    ASSERT_EQ(coo_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(coo_mtx->get_num_stored_elements(), 4);
-    EXPECT_EQ(r[0], 0);
-    EXPECT_EQ(r[1], 0);
-    EXPECT_EQ(r[2], 0);
-    EXPECT_EQ(r[3], 1);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 2);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{3.0});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{5.0});
+    GKO_ASSERT_MTX_NEAR(
+        trans, l<T>({{1.0, -2.0, 2.1}, {-1.0, 2.0, 3.4}, {-0.5, 4.5, 1.2}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense, MovesToCoo32)
+TYPED_TEST(Dense, SquareMatrixIsTransposableIntoDense)
 {
+    using Mtx = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using Coo = typename gko::matrix::Coo<T, gko::int32>;
-    auto coo_mtx = Coo::create(this->mtx4->get_executor());
+    auto trans = Mtx::create(this->exec, this->mtx5->get_size());
 
-    this->mtx4->move_to(coo_mtx);
-    auto v = coo_mtx->get_const_values();
-    auto c = coo_mtx->get_const_col_idxs();
-    auto r = coo_mtx->get_const_row_idxs();
+    this->mtx5->transpose(trans);
 
-    ASSERT_EQ(coo_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(coo_mtx->get_num_stored_elements(), 4);
-    EXPECT_EQ(r[0], 0);
-    EXPECT_EQ(r[1], 0);
-    EXPECT_EQ(r[2], 0);
-    EXPECT_EQ(r[3], 1);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 2);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{3.0});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{5.0});
+    GKO_ASSERT_MTX_NEAR(
+        trans, l<T>({{1.0, -2.0, 2.1}, {-1.0, 2.0, 3.4}, {-0.5, 4.5, 1.2}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense, ConvertsToCoo64)
+TYPED_TEST(Dense, SquareSubmatrixIsTransposableIntoDense)
 {
+    using Mtx = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using Coo = typename gko::matrix::Coo<T, gko::int64>;
-    auto coo_mtx = Coo::create(this->mtx4->get_executor());
+    auto trans = Mtx::create(this->exec, gko::dim<2>{2, 2}, 4);
 
-    this->mtx4->convert_to(coo_mtx);
-    auto v = coo_mtx->get_const_values();
-    auto c = coo_mtx->get_const_col_idxs();
-    auto r = coo_mtx->get_const_row_idxs();
+    this->mtx5->create_submatrix({0, 2}, {0, 2})->transpose(trans);
 
-    ASSERT_EQ(coo_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(coo_mtx->get_num_stored_elements(), 4);
-    EXPECT_EQ(r[0], 0);
-    EXPECT_EQ(r[1], 0);
-    EXPECT_EQ(r[2], 0);
-    EXPECT_EQ(r[3], 1);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 2);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{3.0});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{5.0});
+    GKO_ASSERT_MTX_NEAR(trans, l<T>({{1.0, -2.0}, {-1.0, 2.0}}), 0.0);
+    ASSERT_EQ(trans->get_stride(), 4);
 }
 
 
-TYPED_TEST(Dense, MovesToCoo64)
+TYPED_TEST(Dense, SquareMatrixIsTransposableIntoDenseFailsForWrongDimensions)
 {
-    using T = typename TestFixture::value_type;
-    using Coo = typename gko::matrix::Coo<T, gko::int64>;
-    auto coo_mtx = Coo::create(this->mtx4->get_executor());
-
-    this->mtx4->move_to(coo_mtx);
-    auto v = coo_mtx->get_const_values();
-    auto c = coo_mtx->get_const_col_idxs();
-    auto r = coo_mtx->get_const_row_idxs();
+    using Mtx = typename TestFixture::Mtx;
 
-    ASSERT_EQ(coo_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(coo_mtx->get_num_stored_elements(), 4);
-    EXPECT_EQ(r[0], 0);
-    EXPECT_EQ(r[1], 0);
-    EXPECT_EQ(r[2], 0);
-    EXPECT_EQ(r[3], 1);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 2);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{3.0});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{5.0});
+    ASSERT_THROW(this->mtx5->transpose(Mtx::create(this->exec)),
+                 gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, ConvertsToCsr32)
+TYPED_TEST(Dense, NonSquareMatrixIsTransposable)
 {
+    using Mtx = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using Csr = typename gko::matrix::Csr<T, gko::int32>;
-    auto csr_s_classical = std::make_shared<typename Csr::classical>();
-    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
-    auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical);
-    auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge);
-
-    this->mtx4->convert_to(csr_mtx_c);
-    this->mtx4->convert_to(csr_mtx_m);
+    auto trans = gko::as<Mtx>(this->mtx4->transpose());
 
-    auto v = csr_mtx_c->get_const_values();
-    auto c = csr_mtx_c->get_const_col_idxs();
-    auto r = csr_mtx_c->get_const_row_ptrs();
-    ASSERT_EQ(csr_mtx_c->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(csr_mtx_c->get_num_stored_elements(), 4);
-    EXPECT_EQ(r[0], 0);
-    EXPECT_EQ(r[1], 3);
-    EXPECT_EQ(r[2], 4);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 2);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{3.0});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{5.0});
-    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
-    GKO_ASSERT_MTX_NEAR(csr_mtx_c, csr_mtx_m, 0.0);
-    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
+    GKO_ASSERT_MTX_NEAR(trans, l<T>({{1.0, 0.0}, {3.0, 5.0}, {2.0, 0.0}}), 0.0);
 }
 
 
-TYPED_TEST(Dense, MovesToCsr32)
+TYPED_TEST(Dense, NonSquareMatrixIsTransposableIntoDense)
 {
+    using Mtx = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using Csr = typename gko::matrix::Csr<T, gko::int32>;
-    auto csr_s_classical = std::make_shared<typename Csr::classical>();
-    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
-    auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical);
-    auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge);
-    auto mtx_clone = this->mtx4->clone();
+    auto trans =
+        Mtx::create(this->exec, gko::transpose(this->mtx4->get_size()));
 
-    this->mtx4->move_to(csr_mtx_c);
-    mtx_clone->move_to(csr_mtx_m);
+    this->mtx4->transpose(trans);
 
-    auto v = csr_mtx_c->get_const_values();
-    auto c = csr_mtx_c->get_const_col_idxs();
-    auto r = csr_mtx_c->get_const_row_ptrs();
-    ASSERT_EQ(csr_mtx_c->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(csr_mtx_c->get_num_stored_elements(), 4);
-    EXPECT_EQ(r[0], 0);
-    EXPECT_EQ(r[1], 3);
-    EXPECT_EQ(r[2], 4);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 2);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{3.0});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{5.0});
-    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
-    GKO_ASSERT_MTX_NEAR(csr_mtx_c, csr_mtx_m, 0.0);
-    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
+    GKO_ASSERT_MTX_NEAR(trans, l<T>({{1.0, 0.0}, {3.0, 5.0}, {2.0, 0.0}}), 0.0);
 }
 
 
-TYPED_TEST(Dense, ConvertsToCsr64)
+TYPED_TEST(Dense, NonSquareSubmatrixIsTransposableIntoDense)
 {
+    using Mtx = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using Csr = typename gko::matrix::Csr<T, gko::int64>;
-    auto csr_s_classical = std::make_shared<typename Csr::classical>();
-    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
-    auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical);
-    auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge);
+    auto trans = Mtx::create(this->exec, gko::dim<2>{2, 1}, 5);
 
-    this->mtx4->convert_to(csr_mtx_c);
-    this->mtx4->convert_to(csr_mtx_m);
+    this->mtx4->create_submatrix({0, 1}, {0, 2})->transpose(trans);
 
-    auto v = csr_mtx_c->get_const_values();
-    auto c = csr_mtx_c->get_const_col_idxs();
-    auto r = csr_mtx_c->get_const_row_ptrs();
-    ASSERT_EQ(csr_mtx_c->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(csr_mtx_c->get_num_stored_elements(), 4);
-    EXPECT_EQ(r[0], 0);
-    EXPECT_EQ(r[1], 3);
-    EXPECT_EQ(r[2], 4);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 2);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{3.0});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{5.0});
-    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
-    GKO_ASSERT_MTX_NEAR(csr_mtx_c, csr_mtx_m, 0.0);
-    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
+    GKO_ASSERT_MTX_NEAR(trans, l({1.0, 3.0}), 0.0);
+    ASSERT_EQ(trans->get_stride(), 5);
 }
 
 
-TYPED_TEST(Dense, MovesToCsr64)
+TYPED_TEST(Dense, NonSquareMatrixIsTransposableIntoDenseFailsForWrongDimensions)
 {
-    using T = typename TestFixture::value_type;
-    using Csr = typename gko::matrix::Csr<T, gko::int64>;
-    auto csr_s_classical = std::make_shared<typename Csr::classical>();
-    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
-    auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical);
-    auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge);
-    auto mtx_clone = this->mtx4->clone();
-
-    this->mtx4->move_to(csr_mtx_c);
-    mtx_clone->move_to(csr_mtx_m);
+    using Mtx = typename TestFixture::Mtx;
 
-    auto v = csr_mtx_c->get_const_values();
-    auto c = csr_mtx_c->get_const_col_idxs();
-    auto r = csr_mtx_c->get_const_row_ptrs();
-    ASSERT_EQ(csr_mtx_c->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(csr_mtx_c->get_num_stored_elements(), 4);
-    EXPECT_EQ(r[0], 0);
-    EXPECT_EQ(r[1], 3);
-    EXPECT_EQ(r[2], 4);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 2);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{3.0});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{5.0});
-    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
-    GKO_ASSERT_MTX_NEAR(csr_mtx_c, csr_mtx_m, 0.0);
-    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
+    ASSERT_THROW(this->mtx4->transpose(Mtx::create(this->exec)),
+                 gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, ConvertsToSparsityCsr32)
+TYPED_TEST(Dense, ExtractsDiagonalFromSquareMatrix)
 {
     using T = typename TestFixture::value_type;
-    using SparsityCsr = typename gko::matrix::SparsityCsr<T, gko::int32>;
-    auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor());
 
-    this->mtx4->convert_to(sparsity_csr_mtx);
-    auto v = sparsity_csr_mtx->get_const_value();
-    auto c = sparsity_csr_mtx->get_const_col_idxs();
-    auto r = sparsity_csr_mtx->get_const_row_ptrs();
+    auto diag = this->mtx5->extract_diagonal();
 
-    ASSERT_EQ(sparsity_csr_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(sparsity_csr_mtx->get_num_nonzeros(), 4);
-    EXPECT_EQ(r[0], 0);
-    EXPECT_EQ(r[1], 3);
-    EXPECT_EQ(r[2], 4);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 2);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], T{1.0});
+    ASSERT_EQ(diag->get_size()[0], 3);
+    ASSERT_EQ(diag->get_size()[1], 3);
+    ASSERT_EQ(diag->get_values()[0], T{1.});
+    ASSERT_EQ(diag->get_values()[1], T{2.});
+    ASSERT_EQ(diag->get_values()[2], T{1.2});
 }
 
 
-TYPED_TEST(Dense, MovesToSparsityCsr32)
+TYPED_TEST(Dense, ExtractsDiagonalFromTallSkinnyMatrix)
 {
     using T = typename TestFixture::value_type;
-    using SparsityCsr = typename gko::matrix::SparsityCsr<T, gko::int32>;
-    auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor());
 
-    this->mtx4->move_to(sparsity_csr_mtx);
-    auto v = sparsity_csr_mtx->get_const_value();
-    auto c = sparsity_csr_mtx->get_const_col_idxs();
-    auto r = sparsity_csr_mtx->get_const_row_ptrs();
+    auto diag = this->mtx4->extract_diagonal();
 
-    ASSERT_EQ(sparsity_csr_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(sparsity_csr_mtx->get_num_nonzeros(), 4);
-    EXPECT_EQ(r[0], 0);
-    EXPECT_EQ(r[1], 3);
-    EXPECT_EQ(r[2], 4);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 2);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], T{1.0});
+    ASSERT_EQ(diag->get_size()[0], 2);
+    ASSERT_EQ(diag->get_size()[1], 2);
+    ASSERT_EQ(diag->get_values()[0], T{1.});
+    ASSERT_EQ(diag->get_values()[1], T{5.});
 }
 
 
-TYPED_TEST(Dense, ConvertsToSparsityCsr64)
+TYPED_TEST(Dense, ExtractsDiagonalFromShortFatMatrix)
 {
     using T = typename TestFixture::value_type;
-    using SparsityCsr = typename gko::matrix::SparsityCsr<T, gko::int64>;
-    auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor());
 
-    this->mtx4->convert_to(sparsity_csr_mtx);
-    auto v = sparsity_csr_mtx->get_const_value();
-    auto c = sparsity_csr_mtx->get_const_col_idxs();
-    auto r = sparsity_csr_mtx->get_const_row_ptrs();
+    auto diag = this->mtx8->extract_diagonal();
 
-    ASSERT_EQ(sparsity_csr_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(sparsity_csr_mtx->get_num_nonzeros(), 4);
-    EXPECT_EQ(r[0], 0);
-    EXPECT_EQ(r[1], 3);
-    EXPECT_EQ(r[2], 4);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 2);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], T{1.0});
+    ASSERT_EQ(diag->get_size()[0], 2);
+    ASSERT_EQ(diag->get_size()[1], 2);
+    ASSERT_EQ(diag->get_values()[0], T{1.});
+    ASSERT_EQ(diag->get_values()[1], T{2.});
 }
 
 
-TYPED_TEST(Dense, MovesToSparsityCsr64)
+TYPED_TEST(Dense, ExtractsDiagonalFromSquareMatrixIntoDiagonal)
 {
     using T = typename TestFixture::value_type;
-    using SparsityCsr = typename gko::matrix::SparsityCsr<T, gko::int64>;
-    auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor());
+    auto diag = gko::matrix::Diagonal<T>::create(this->exec, 3);
 
-    this->mtx4->move_to(sparsity_csr_mtx);
-    auto v = sparsity_csr_mtx->get_const_value();
-    auto c = sparsity_csr_mtx->get_const_col_idxs();
-    auto r = sparsity_csr_mtx->get_const_row_ptrs();
+    this->mtx5->extract_diagonal(diag);
 
-    ASSERT_EQ(sparsity_csr_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(sparsity_csr_mtx->get_num_nonzeros(), 4);
-    EXPECT_EQ(r[0], 0);
-    EXPECT_EQ(r[1], 3);
-    EXPECT_EQ(r[2], 4);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 2);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], T{1.0});
+    ASSERT_EQ(diag->get_size()[0], 3);
+    ASSERT_EQ(diag->get_size()[1], 3);
+    ASSERT_EQ(diag->get_values()[0], T{1.});
+    ASSERT_EQ(diag->get_values()[1], T{2.});
+    ASSERT_EQ(diag->get_values()[2], T{1.2});
 }
 
 
-TYPED_TEST(Dense, ConvertsToEll32)
+TYPED_TEST(Dense, ExtractsDiagonalFromTallSkinnyMatrixIntoDiagonal)
 {
     using T = typename TestFixture::value_type;
-    using Ell = typename gko::matrix::Ell<T, gko::int32>;
-    auto ell_mtx = Ell::create(this->mtx6->get_executor());
+    auto diag = gko::matrix::Diagonal<T>::create(this->exec, 2);
 
-    this->mtx6->convert_to(ell_mtx);
-    auto v = ell_mtx->get_const_values();
-    auto c = ell_mtx->get_const_col_idxs();
+    this->mtx4->extract_diagonal(diag);
 
-    ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2);
-    ASSERT_EQ(ell_mtx->get_num_stored_elements(), 4);
-    ASSERT_EQ(ell_mtx->get_stride(), 2);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 1);
-    EXPECT_EQ(c[3], this->invalid_index);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{1.5});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{0.0});
+    ASSERT_EQ(diag->get_size()[0], 2);
+    ASSERT_EQ(diag->get_size()[1], 2);
+    ASSERT_EQ(diag->get_values()[0], T{1.});
+    ASSERT_EQ(diag->get_values()[1], T{5.});
 }
 
 
-TYPED_TEST(Dense, MovesToEll32)
+TYPED_TEST(Dense, ExtractsDiagonalFromShortFatMatrixIntoDiagonal)
 {
     using T = typename TestFixture::value_type;
-    using Ell = typename gko::matrix::Ell<T, gko::int32>;
-    auto ell_mtx = Ell::create(this->mtx6->get_executor());
+    auto diag = gko::matrix::Diagonal<T>::create(this->exec, 2);
 
-    this->mtx6->move_to(ell_mtx);
-    auto v = ell_mtx->get_const_values();
-    auto c = ell_mtx->get_const_col_idxs();
+    this->mtx8->extract_diagonal(diag);
 
-    ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2);
-    ASSERT_EQ(ell_mtx->get_num_stored_elements(), 4);
-    ASSERT_EQ(ell_mtx->get_stride(), 2);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 1);
-    EXPECT_EQ(c[3], this->invalid_index);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{1.5});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{0.0});
+    ASSERT_EQ(diag->get_size()[0], 2);
+    ASSERT_EQ(diag->get_size()[1], 2);
+    ASSERT_EQ(diag->get_values()[0], T{1.});
+    ASSERT_EQ(diag->get_values()[1], T{2.});
 }
 
 
-TYPED_TEST(Dense, ConvertsToEll64)
+TYPED_TEST(Dense, InplaceAbsolute)
 {
     using T = typename TestFixture::value_type;
-    using Ell = typename gko::matrix::Ell<T, gko::int64>;
-    auto ell_mtx = Ell::create(this->mtx6->get_executor());
 
-    this->mtx6->convert_to(ell_mtx);
-    auto v = ell_mtx->get_const_values();
-    auto c = ell_mtx->get_const_col_idxs();
+    this->mtx5->compute_absolute_inplace();
 
-    ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2);
-    ASSERT_EQ(ell_mtx->get_num_stored_elements(), 4);
-    ASSERT_EQ(ell_mtx->get_stride(), 2);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 1);
-    EXPECT_EQ(c[3], this->invalid_index);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{1.5});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{0.0});
+    GKO_ASSERT_MTX_NEAR(
+        this->mtx5, l<T>({{1.0, 1.0, 0.5}, {2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense, MovesToEll64)
+TYPED_TEST(Dense, InplaceAbsoluteSubMatrix)
 {
     using T = typename TestFixture::value_type;
-    using Ell = typename gko::matrix::Ell<T, gko::int64>;
-    auto ell_mtx = Ell::create(this->mtx6->get_executor());
+    auto mtx = this->mtx5->create_submatrix(gko::span{0, 2}, gko::span{0, 2});
 
-    this->mtx6->move_to(ell_mtx);
-    auto v = ell_mtx->get_const_values();
-    auto c = ell_mtx->get_const_col_idxs();
+    mtx->compute_absolute_inplace();
 
-    ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2);
-    ASSERT_EQ(ell_mtx->get_num_stored_elements(), 4);
-    ASSERT_EQ(ell_mtx->get_stride(), 2);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 1);
-    EXPECT_EQ(c[3], this->invalid_index);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{1.5});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{0.0});
+    GKO_ASSERT_MTX_NEAR(
+        this->mtx5, l<T>({{1.0, 1.0, -0.5}, {2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense, ConvertsToEllWithStride)
+TYPED_TEST(Dense, OutplaceAbsolute)
 {
     using T = typename TestFixture::value_type;
-    using Ell = typename gko::matrix::Ell<T, gko::int32>;
-    auto ell_mtx =
-        Ell::create(this->mtx6->get_executor(), gko::dim<2>{2, 3}, 2, 3);
 
-    this->mtx6->convert_to(ell_mtx);
-    auto v = ell_mtx->get_const_values();
-    auto c = ell_mtx->get_const_col_idxs();
+    auto abs_mtx = this->mtx5->compute_absolute();
 
-    ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2);
-    ASSERT_EQ(ell_mtx->get_num_stored_elements(), 6);
-    ASSERT_EQ(ell_mtx->get_stride(), 3);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], this->invalid_index);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(c[4], this->invalid_index);
-    EXPECT_EQ(c[5], this->invalid_index);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{1.5});
-    EXPECT_EQ(v[2], T{0.0});
-    EXPECT_EQ(v[3], T{2.0});
-    EXPECT_EQ(v[4], T{0.0});
-    EXPECT_EQ(v[5], T{0.0});
+    GKO_ASSERT_MTX_NEAR(
+        abs_mtx, l<T>({{1.0, 1.0, 0.5}, {2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense, MovesToEllWithStride)
+TYPED_TEST(Dense, OutplaceAbsoluteIntoDense)
 {
+    using Mtx = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using Ell = typename gko::matrix::Ell<T, gko::int32>;
-    auto ell_mtx =
-        Ell::create(this->mtx6->get_executor(), gko::dim<2>{2, 3}, 2, 3);
+    auto abs_mtx =
+        gko::remove_complex<Mtx>::create(this->exec, this->mtx5->get_size());
 
-    this->mtx6->move_to(ell_mtx);
-    auto v = ell_mtx->get_const_values();
-    auto c = ell_mtx->get_const_col_idxs();
+    this->mtx5->compute_absolute(abs_mtx);
 
-    ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2);
-    ASSERT_EQ(ell_mtx->get_num_stored_elements(), 6);
-    ASSERT_EQ(ell_mtx->get_stride(), 3);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], this->invalid_index);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(c[4], this->invalid_index);
-    EXPECT_EQ(c[5], this->invalid_index);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{1.5});
-    EXPECT_EQ(v[2], T{0.0});
-    EXPECT_EQ(v[3], T{2.0});
-    EXPECT_EQ(v[4], T{0.0});
-    EXPECT_EQ(v[5], T{0.0});
+    GKO_ASSERT_MTX_NEAR(
+        abs_mtx, l<T>({{1.0, 1.0, 0.5}, {2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense, MovesToHybridAutomatically32)
+TYPED_TEST(Dense, OutplaceAbsoluteSubMatrix)
 {
     using T = typename TestFixture::value_type;
-    using Hybrid = typename gko::matrix::Hybrid<T, gko::int32>;
-    auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor());
+    auto mtx = this->mtx5->create_submatrix(gko::span{0, 2}, gko::span{0, 2});
 
-    this->mtx4->move_to(hybrid_mtx);
-    auto v = hybrid_mtx->get_const_coo_values();
-    auto c = hybrid_mtx->get_const_coo_col_idxs();
-    auto r = hybrid_mtx->get_const_coo_row_idxs();
-    auto n = hybrid_mtx->get_ell_num_stored_elements_per_row();
-    auto p = hybrid_mtx->get_ell_stride();
+    auto abs_mtx = mtx->compute_absolute();
 
-    ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0);
-    ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4);
-    EXPECT_EQ(n, 0);
-    EXPECT_EQ(p, 2);
-    EXPECT_EQ(r[0], 0);
-    EXPECT_EQ(r[1], 0);
-    EXPECT_EQ(r[2], 0);
-    EXPECT_EQ(r[3], 1);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 2);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{3.0});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{5.0});
+    GKO_ASSERT_MTX_NEAR(abs_mtx, l<T>({{1.0, 1.0}, {2.0, 2.0}}), 0);
+    GKO_ASSERT_EQ(abs_mtx->get_stride(), 2);
 }
 
 
-TYPED_TEST(Dense, ConvertsToHybridAutomatically32)
+TYPED_TEST(Dense, OutplaceSubmatrixAbsoluteIntoDense)
 {
+    using Mtx = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using Hybrid = typename gko::matrix::Hybrid<T, gko::int32>;
-    auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor());
+    auto mtx = this->mtx5->create_submatrix(gko::span{0, 2}, gko::span{0, 2});
+    auto abs_mtx =
+        gko::remove_complex<Mtx>::create(this->exec, gko::dim<2>{2, 2}, 4);
 
-    this->mtx4->convert_to(hybrid_mtx);
-    auto v = hybrid_mtx->get_const_coo_values();
-    auto c = hybrid_mtx->get_const_coo_col_idxs();
-    auto r = hybrid_mtx->get_const_coo_row_idxs();
-    auto n = hybrid_mtx->get_ell_num_stored_elements_per_row();
-    auto p = hybrid_mtx->get_ell_stride();
+    mtx->compute_absolute(abs_mtx);
 
-    ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0);
-    ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4);
-    EXPECT_EQ(n, 0);
-    EXPECT_EQ(p, 2);
-    EXPECT_EQ(r[0], 0);
-    EXPECT_EQ(r[1], 0);
-    EXPECT_EQ(r[2], 0);
-    EXPECT_EQ(r[3], 1);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 2);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{3.0});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{5.0});
+    GKO_ASSERT_MTX_NEAR(abs_mtx, l<T>({{1.0, 1.0}, {2.0, 2.0}}), 0);
+    GKO_ASSERT_EQ(abs_mtx->get_stride(), 4);
 }
 
 
-TYPED_TEST(Dense, MovesToHybridAutomatically64)
+TYPED_TEST(Dense, AppliesToComplex)
 {
-    using T = typename TestFixture::value_type;
-    using Hybrid = typename gko::matrix::Hybrid<T, gko::int64>;
-    auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor());
+    using value_type = typename TestFixture::value_type;
+    using complex_type = gko::to_complex<value_type>;
+    using Vec = gko::matrix::Dense<complex_type>;
+    auto exec = gko::ReferenceExecutor::create();
+    auto b =
+        gko::initialize<Vec>({{complex_type{1.0, 0.0}, complex_type{2.0, 1.0}},
+                              {complex_type{2.0, 2.0}, complex_type{3.0, 3.0}},
+                              {complex_type{3.0, 4.0}, complex_type{4.0, 5.0}}},
+                             exec);
+    auto x = Vec::create(exec, gko::dim<2>{2, 2});
 
-    this->mtx4->move_to(hybrid_mtx);
-    auto v = hybrid_mtx->get_const_coo_values();
-    auto c = hybrid_mtx->get_const_coo_col_idxs();
-    auto r = hybrid_mtx->get_const_coo_row_idxs();
-    auto n = hybrid_mtx->get_ell_num_stored_elements_per_row();
-    auto p = hybrid_mtx->get_ell_stride();
+    this->mtx1->apply(b, x);
 
-    ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0);
-    ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4);
-    EXPECT_EQ(n, 0);
-    EXPECT_EQ(p, 2);
-    EXPECT_EQ(r[0], 0);
-    EXPECT_EQ(r[1], 0);
-    EXPECT_EQ(r[2], 0);
-    EXPECT_EQ(r[3], 1);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 2);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{3.0});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{5.0});
+    GKO_ASSERT_MTX_NEAR(
+        x,
+        l({{complex_type{14.0, 16.0}, complex_type{20.0, 22.0}},
+           {complex_type{17.0, 19.0}, complex_type{24.5, 26.5}}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense, ConvertsToHybridAutomatically64)
+TYPED_TEST(Dense, AppliesToMixedComplex)
 {
-    using T = typename TestFixture::value_type;
-    using Hybrid = typename gko::matrix::Hybrid<T, gko::int64>;
-    auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor());
-
-    this->mtx4->convert_to(hybrid_mtx);
-    auto v = hybrid_mtx->get_const_coo_values();
-    auto c = hybrid_mtx->get_const_coo_col_idxs();
-    auto r = hybrid_mtx->get_const_coo_row_idxs();
-    auto n = hybrid_mtx->get_ell_num_stored_elements_per_row();
-    auto p = hybrid_mtx->get_ell_stride();
-
-    ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0);
-    ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4);
-    EXPECT_EQ(n, 0);
-    EXPECT_EQ(p, 2);
-    EXPECT_EQ(r[0], 0);
-    EXPECT_EQ(r[1], 0);
-    EXPECT_EQ(r[2], 0);
-    EXPECT_EQ(r[3], 1);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 2);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{3.0});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{5.0});
-}
+    using mixed_value_type =
+        gko::next_precision<typename TestFixture::value_type>;
+    using mixed_complex_type = gko::to_complex<mixed_value_type>;
+    using Vec = gko::matrix::Dense<mixed_complex_type>;
+    auto exec = gko::ReferenceExecutor::create();
+    auto b = gko::initialize<Vec>(
+        {{mixed_complex_type{1.0, 0.0}, mixed_complex_type{2.0, 1.0}},
+         {mixed_complex_type{2.0, 2.0}, mixed_complex_type{3.0, 3.0}},
+         {mixed_complex_type{3.0, 4.0}, mixed_complex_type{4.0, 5.0}}},
+        exec);
+    auto x = Vec::create(exec, gko::dim<2>{2, 2});
 
+    this->mtx1->apply(b, x);
 
-TYPED_TEST(Dense, MovesToHybridWithStrideAutomatically)
-{
-    using T = typename TestFixture::value_type;
-    using Hybrid = typename gko::matrix::Hybrid<T, gko::int32>;
-    auto hybrid_mtx =
-        Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 0, 3);
-
-    this->mtx4->move_to(hybrid_mtx);
-    auto v = hybrid_mtx->get_const_coo_values();
-    auto c = hybrid_mtx->get_const_coo_col_idxs();
-    auto r = hybrid_mtx->get_const_coo_row_idxs();
-    auto n = hybrid_mtx->get_ell_num_stored_elements_per_row();
-    auto p = hybrid_mtx->get_ell_stride();
-
-    ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0);
-    ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4);
-    EXPECT_EQ(n, 0);
-    EXPECT_EQ(p, 3);
-    EXPECT_EQ(r[0], 0);
-    EXPECT_EQ(r[1], 0);
-    EXPECT_EQ(r[2], 0);
-    EXPECT_EQ(r[3], 1);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 2);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{3.0});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{5.0});
-}
-
-
-TYPED_TEST(Dense, ConvertsToHybridWithStrideAutomatically)
-{
-    using T = typename TestFixture::value_type;
-    using Hybrid = typename gko::matrix::Hybrid<T, gko::int32>;
-    auto hybrid_mtx =
-        Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 0, 3);
-
-    this->mtx4->convert_to(hybrid_mtx);
-    auto v = hybrid_mtx->get_const_coo_values();
-    auto c = hybrid_mtx->get_const_coo_col_idxs();
-    auto r = hybrid_mtx->get_const_coo_row_idxs();
-    auto n = hybrid_mtx->get_ell_num_stored_elements_per_row();
-    auto p = hybrid_mtx->get_ell_stride();
-
-    ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0);
-    ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4);
-    EXPECT_EQ(n, 0);
-    EXPECT_EQ(p, 3);
-    EXPECT_EQ(r[0], 0);
-    EXPECT_EQ(r[1], 0);
-    EXPECT_EQ(r[2], 0);
-    EXPECT_EQ(r[3], 1);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 2);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{3.0});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{5.0});
-}
-
-
-TYPED_TEST(Dense, MovesToHybridWithStrideAndCooLengthByColumns2)
-{
-    using T = typename TestFixture::value_type;
-    using Hybrid = typename gko::matrix::Hybrid<T, gko::int32>;
-    auto hybrid_mtx =
-        Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 2, 3, 3,
-                       std::make_shared<typename Hybrid::column_limit>(2));
-
-    this->mtx4->move_to(hybrid_mtx);
-    auto v = hybrid_mtx->get_const_ell_values();
-    auto c = hybrid_mtx->get_const_ell_col_idxs();
-    auto n = hybrid_mtx->get_ell_num_stored_elements_per_row();
-    auto p = hybrid_mtx->get_ell_stride();
-
-    ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 6);
-    ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 1);
-    EXPECT_EQ(n, 2);
-    EXPECT_EQ(p, 3);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], this->invalid_index);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(c[4], this->invalid_index);
-    EXPECT_EQ(c[5], this->invalid_index);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{5.0});
-    EXPECT_EQ(v[2], T{0.0});
-    EXPECT_EQ(v[3], T{3.0});
-    EXPECT_EQ(v[4], T{0.0});
-    EXPECT_EQ(v[5], T{0.0});
-    EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], T{2.0});
-    EXPECT_EQ(hybrid_mtx->get_const_coo_row_idxs()[0], 0);
-    EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[0], 2);
-}
-
-
-TYPED_TEST(Dense, ConvertsToHybridWithStrideAndCooLengthByColumns2)
-{
-    using T = typename TestFixture::value_type;
-    using Hybrid = typename gko::matrix::Hybrid<T, gko::int32>;
-    auto hybrid_mtx =
-        Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 2, 3, 3,
-                       std::make_shared<typename Hybrid::column_limit>(2));
-
-    this->mtx4->convert_to(hybrid_mtx);
-    auto v = hybrid_mtx->get_const_ell_values();
-    auto c = hybrid_mtx->get_const_ell_col_idxs();
-    auto n = hybrid_mtx->get_ell_num_stored_elements_per_row();
-    auto p = hybrid_mtx->get_ell_stride();
-
-    ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 6);
-    ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 1);
-    EXPECT_EQ(n, 2);
-    EXPECT_EQ(p, 3);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], this->invalid_index);
-    EXPECT_EQ(c[3], 1);
-    EXPECT_EQ(c[4], this->invalid_index);
-    EXPECT_EQ(c[5], this->invalid_index);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{5.0});
-    EXPECT_EQ(v[2], T{0.0});
-    EXPECT_EQ(v[3], T{3.0});
-    EXPECT_EQ(v[4], T{0.0});
-    EXPECT_EQ(v[5], T{0.0});
-    EXPECT_EQ(hybrid_mtx->get_const_coo_row_idxs()[0], 0);
-    EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[0], 2);
-    EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], T{2.0});
-}
-
-
-TYPED_TEST(Dense, MovesToHybridWithStrideByPercent40)
-{
-    using T = typename TestFixture::value_type;
-    using Hybrid = typename gko::matrix::Hybrid<T, gko::int32>;
-    auto hybrid_mtx =
-        Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 1, 3,
-                       std::make_shared<typename Hybrid::imbalance_limit>(0.4));
-
-    this->mtx4->move_to(hybrid_mtx);
-    auto v = hybrid_mtx->get_const_ell_values();
-    auto c = hybrid_mtx->get_const_ell_col_idxs();
-    auto n = hybrid_mtx->get_ell_num_stored_elements_per_row();
-    auto p = hybrid_mtx->get_ell_stride();
-    auto coo_v = hybrid_mtx->get_const_coo_values();
-    auto coo_c = hybrid_mtx->get_const_coo_col_idxs();
-    auto coo_r = hybrid_mtx->get_const_coo_row_idxs();
-
-    ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 3);
-    EXPECT_EQ(n, 1);
-    EXPECT_EQ(p, 3);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], this->invalid_index);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{5.0});
-    EXPECT_EQ(v[2], T{0.0});
-    ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 2);
-    EXPECT_EQ(coo_v[0], T{3.0});
-    EXPECT_EQ(coo_v[1], T{2.0});
-    EXPECT_EQ(coo_c[0], 1);
-    EXPECT_EQ(coo_c[1], 2);
-    EXPECT_EQ(coo_r[0], 0);
-    EXPECT_EQ(coo_r[1], 0);
-}
-
-
-TYPED_TEST(Dense, ConvertsToHybridWithStrideByPercent40)
-{
-    using T = typename TestFixture::value_type;
-    using Hybrid = typename gko::matrix::Hybrid<T, gko::int32>;
-    auto hybrid_mtx =
-        Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 1, 3,
-                       std::make_shared<typename Hybrid::imbalance_limit>(0.4));
-
-    this->mtx4->convert_to(hybrid_mtx);
-    auto v = hybrid_mtx->get_const_ell_values();
-    auto c = hybrid_mtx->get_const_ell_col_idxs();
-    auto n = hybrid_mtx->get_ell_num_stored_elements_per_row();
-    auto p = hybrid_mtx->get_ell_stride();
-    auto coo_v = hybrid_mtx->get_const_coo_values();
-    auto coo_c = hybrid_mtx->get_const_coo_col_idxs();
-    auto coo_r = hybrid_mtx->get_const_coo_row_idxs();
-
-    ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 3);
-    EXPECT_EQ(n, 1);
-    EXPECT_EQ(p, 3);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], this->invalid_index);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{5.0});
-    EXPECT_EQ(v[2], T{0.0});
-    ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 2);
-    EXPECT_EQ(coo_v[0], T{3.0});
-    EXPECT_EQ(coo_v[1], T{2.0});
-    EXPECT_EQ(coo_c[0], 1);
-    EXPECT_EQ(coo_c[1], 2);
-    EXPECT_EQ(coo_r[0], 0);
-    EXPECT_EQ(coo_r[1], 0);
-}
-
-
-TYPED_TEST(Dense, ConvertsToSellp32)
-{
-    using T = typename TestFixture::value_type;
-    using Sellp = typename gko::matrix::Sellp<T, gko::int32>;
-    auto sellp_mtx = Sellp::create(this->mtx7->get_executor());
-
-    this->mtx7->convert_to(sellp_mtx);
-    auto v = sellp_mtx->get_const_values();
-    auto c = sellp_mtx->get_const_col_idxs();
-    auto s = sellp_mtx->get_const_slice_sets();
-    auto l = sellp_mtx->get_const_slice_lengths();
-
-    ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(sellp_mtx->get_total_cols(), 3);
-    ASSERT_EQ(sellp_mtx->get_num_stored_elements(),
-              3 * gko::matrix::default_slice_size);
-    ASSERT_EQ(sellp_mtx->get_slice_size(), gko::matrix::default_slice_size);
-    ASSERT_EQ(sellp_mtx->get_stride_factor(),
-              gko::matrix::default_stride_factor);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[gko::matrix::default_slice_size], 1);
-    EXPECT_EQ(c[gko::matrix::default_slice_size + 1], this->invalid_index);
-    EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2);
-    EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], this->invalid_index);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{1.5});
-    EXPECT_EQ(v[gko::matrix::default_slice_size], T{2.0});
-    EXPECT_EQ(v[gko::matrix::default_slice_size + 1], T{0.0});
-    EXPECT_EQ(v[2 * gko::matrix::default_slice_size], T{3.0});
-    EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], T{0.0});
-    EXPECT_EQ(s[0], 0);
-    EXPECT_EQ(s[1], 3);
-    EXPECT_EQ(l[0], 3);
+    GKO_ASSERT_MTX_NEAR(
+        x,
+        l({{mixed_complex_type{14.0, 16.0}, mixed_complex_type{20.0, 22.0}},
+           {mixed_complex_type{17.0, 19.0}, mixed_complex_type{24.5, 26.5}}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense, MovesToSellp32)
+TYPED_TEST(Dense, AdvancedAppliesToComplex)
 {
-    using T = typename TestFixture::value_type;
-    using Sellp = typename gko::matrix::Sellp<T, gko::int32>;
-    auto sellp_mtx = Sellp::create(this->mtx7->get_executor());
-
-    this->mtx7->move_to(sellp_mtx);
-    auto v = sellp_mtx->get_const_values();
-    auto c = sellp_mtx->get_const_col_idxs();
-    auto s = sellp_mtx->get_const_slice_sets();
-    auto l = sellp_mtx->get_const_slice_lengths();
-
-    ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(sellp_mtx->get_total_cols(), 3);
-    ASSERT_EQ(sellp_mtx->get_num_stored_elements(),
-              3 * gko::matrix::default_slice_size);
-    ASSERT_EQ(sellp_mtx->get_slice_size(), gko::matrix::default_slice_size);
-    ASSERT_EQ(sellp_mtx->get_stride_factor(),
-              gko::matrix::default_stride_factor);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[gko::matrix::default_slice_size], 1);
-    EXPECT_EQ(c[gko::matrix::default_slice_size + 1], this->invalid_index);
-    EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2);
-    EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], this->invalid_index);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{1.5});
-    EXPECT_EQ(v[gko::matrix::default_slice_size], T{2.0});
-    EXPECT_EQ(v[gko::matrix::default_slice_size + 1], T{0.0});
-    EXPECT_EQ(v[2 * gko::matrix::default_slice_size], T{3.0});
-    EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], T{0.0});
-    EXPECT_EQ(s[0], 0);
-    EXPECT_EQ(s[1], 3);
-    EXPECT_EQ(l[0], 3);
-}
-
+    using value_type = typename TestFixture::value_type;
+    using complex_type = gko::to_complex<value_type>;
+    using Dense = gko::matrix::Dense<value_type>;
+    using DenseComplex = gko::matrix::Dense<complex_type>;
+    auto exec = gko::ReferenceExecutor::create();
 
-TYPED_TEST(Dense, ConvertsToSellp64)
-{
-    using T = typename TestFixture::value_type;
-    using Sellp = typename gko::matrix::Sellp<T, gko::int64>;
-    auto sellp_mtx = Sellp::create(this->mtx7->get_executor());
+    auto b = gko::initialize<DenseComplex>(
+        {{complex_type{1.0, 0.0}, complex_type{2.0, 1.0}},
+         {complex_type{2.0, 2.0}, complex_type{3.0, 3.0}},
+         {complex_type{3.0, 4.0}, complex_type{4.0, 5.0}}},
+        exec);
+    auto x = gko::initialize<DenseComplex>(
+        {{complex_type{1.0, 0.0}, complex_type{2.0, 1.0}},
+         {complex_type{2.0, 2.0}, complex_type{3.0, 3.0}}},
+        exec);
+    auto alpha = gko::initialize<Dense>({-1.0}, this->exec);
+    auto beta = gko::initialize<Dense>({2.0}, this->exec);
 
-    this->mtx7->convert_to(sellp_mtx);
-    auto v = sellp_mtx->get_const_values();
-    auto c = sellp_mtx->get_const_col_idxs();
-    auto s = sellp_mtx->get_const_slice_sets();
-    auto l = sellp_mtx->get_const_slice_lengths();
+    this->mtx1->apply(alpha, b, beta, x);
 
-    ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(sellp_mtx->get_total_cols(), 3);
-    ASSERT_EQ(sellp_mtx->get_num_stored_elements(),
-              3 * gko::matrix::default_slice_size);
-    ASSERT_EQ(sellp_mtx->get_slice_size(), gko::matrix::default_slice_size);
-    ASSERT_EQ(sellp_mtx->get_stride_factor(),
-              gko::matrix::default_stride_factor);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[gko::matrix::default_slice_size], 1);
-    EXPECT_EQ(c[gko::matrix::default_slice_size + 1], this->invalid_index);
-    EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2);
-    EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], this->invalid_index);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{1.5});
-    EXPECT_EQ(v[gko::matrix::default_slice_size], T{2.0});
-    EXPECT_EQ(v[gko::matrix::default_slice_size + 1], T{0.0});
-    EXPECT_EQ(v[2 * gko::matrix::default_slice_size], T{3.0});
-    EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], T{0.0});
-    EXPECT_EQ(s[0], 0);
-    EXPECT_EQ(s[1], 3);
-    EXPECT_EQ(l[0], 3);
+    GKO_ASSERT_MTX_NEAR(
+        x,
+        l({{complex_type{-12.0, -16.0}, complex_type{-16.0, -20.0}},
+           {complex_type{-13.0, -15.0}, complex_type{-18.5, -20.5}}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense, MovesToSellp64)
+TYPED_TEST(Dense, AdvancedAppliesToMixedComplex)
 {
-    using T = typename TestFixture::value_type;
-    using Sellp = typename gko::matrix::Sellp<T, gko::int64>;
-    auto sellp_mtx = Sellp::create(this->mtx7->get_executor());
-
-    this->mtx7->move_to(sellp_mtx);
-    auto v = sellp_mtx->get_const_values();
-    auto c = sellp_mtx->get_const_col_idxs();
-    auto s = sellp_mtx->get_const_slice_sets();
-    auto l = sellp_mtx->get_const_slice_lengths();
-
-    ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(sellp_mtx->get_total_cols(), 3);
-    ASSERT_EQ(sellp_mtx->get_num_stored_elements(),
-              3 * gko::matrix::default_slice_size);
-    ASSERT_EQ(sellp_mtx->get_slice_size(), gko::matrix::default_slice_size);
-    ASSERT_EQ(sellp_mtx->get_stride_factor(),
-              gko::matrix::default_stride_factor);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[gko::matrix::default_slice_size], 1);
-    EXPECT_EQ(c[gko::matrix::default_slice_size + 1], this->invalid_index);
-    EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2);
-    EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], this->invalid_index);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{1.5});
-    EXPECT_EQ(v[gko::matrix::default_slice_size], T{2.0});
-    EXPECT_EQ(v[gko::matrix::default_slice_size + 1], T{0.0});
-    EXPECT_EQ(v[2 * gko::matrix::default_slice_size], T{3.0});
-    EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], T{0.0});
-    EXPECT_EQ(s[0], 0);
-    EXPECT_EQ(s[1], 3);
-    EXPECT_EQ(l[0], 3);
-}
-
+    using mixed_value_type =
+        gko::next_precision<typename TestFixture::value_type>;
+    using mixed_complex_type = gko::to_complex<mixed_value_type>;
+    using MixedDense = gko::matrix::Dense<mixed_value_type>;
+    using MixedDenseComplex = gko::matrix::Dense<mixed_complex_type>;
+    auto exec = gko::ReferenceExecutor::create();
 
-TYPED_TEST(Dense, ConvertsToSellpWithSliceSizeAndStrideFactor)
-{
-    using T = typename TestFixture::value_type;
-    using Sellp = typename gko::matrix::Sellp<T, gko::int32>;
-    auto sellp_mtx =
-        Sellp::create(this->mtx7->get_executor(), gko::dim<2>{}, 2, 2, 0);
+    auto b = gko::initialize<MixedDenseComplex>(
+        {{mixed_complex_type{1.0, 0.0}, mixed_complex_type{2.0, 1.0}},
+         {mixed_complex_type{2.0, 2.0}, mixed_complex_type{3.0, 3.0}},
+         {mixed_complex_type{3.0, 4.0}, mixed_complex_type{4.0, 5.0}}},
+        exec);
+    auto x = gko::initialize<MixedDenseComplex>(
+        {{mixed_complex_type{1.0, 0.0}, mixed_complex_type{2.0, 1.0}},
+         {mixed_complex_type{2.0, 2.0}, mixed_complex_type{3.0, 3.0}}},
+        exec);
+    auto alpha = gko::initialize<MixedDense>({-1.0}, this->exec);
+    auto beta = gko::initialize<MixedDense>({2.0}, this->exec);
 
-    this->mtx7->convert_to(sellp_mtx);
-    auto v = sellp_mtx->get_const_values();
-    auto c = sellp_mtx->get_const_col_idxs();
-    auto s = sellp_mtx->get_const_slice_sets();
-    auto l = sellp_mtx->get_const_slice_lengths();
+    this->mtx1->apply(alpha, b, beta, x);
 
-    ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(sellp_mtx->get_total_cols(), 4);
-    ASSERT_EQ(sellp_mtx->get_num_stored_elements(), 8);
-    ASSERT_EQ(sellp_mtx->get_slice_size(), 2);
-    ASSERT_EQ(sellp_mtx->get_stride_factor(), 2);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 1);
-    EXPECT_EQ(c[3], this->invalid_index);
-    EXPECT_EQ(c[4], 2);
-    EXPECT_EQ(c[5], this->invalid_index);
-    EXPECT_EQ(c[6], this->invalid_index);
-    EXPECT_EQ(c[7], this->invalid_index);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{1.5});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{0.0});
-    EXPECT_EQ(v[4], T{3.0});
-    EXPECT_EQ(v[5], T{0.0});
-    EXPECT_EQ(v[6], T{0.0});
-    EXPECT_EQ(v[7], T{0.0});
-    EXPECT_EQ(s[0], 0);
-    EXPECT_EQ(s[1], 4);
-    EXPECT_EQ(l[0], 4);
+    GKO_ASSERT_MTX_NEAR(
+        x,
+        l({{mixed_complex_type{-12.0, -16.0}, mixed_complex_type{-16.0, -20.0}},
+           {mixed_complex_type{-13.0, -15.0},
+            mixed_complex_type{-18.5, -20.5}}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense, MovesToSellpWithSliceSizeAndStrideFactor)
+TYPED_TEST(Dense, MakeComplex)
 {
     using T = typename TestFixture::value_type;
-    using Sellp = typename gko::matrix::Sellp<T, gko::int32>;
-    auto sellp_mtx =
-        Sellp::create(this->mtx7->get_executor(), gko::dim<2>{}, 2, 2, 0);
 
-    this->mtx7->move_to(sellp_mtx);
-    auto v = sellp_mtx->get_const_values();
-    auto c = sellp_mtx->get_const_col_idxs();
-    auto s = sellp_mtx->get_const_slice_sets();
-    auto l = sellp_mtx->get_const_slice_lengths();
-
-    ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3));
-    ASSERT_EQ(sellp_mtx->get_total_cols(), 4);
-    ASSERT_EQ(sellp_mtx->get_num_stored_elements(), 8);
-    ASSERT_EQ(sellp_mtx->get_slice_size(), 2);
-    ASSERT_EQ(sellp_mtx->get_stride_factor(), 2);
-    EXPECT_EQ(c[0], 0);
-    EXPECT_EQ(c[1], 1);
-    EXPECT_EQ(c[2], 1);
-    EXPECT_EQ(c[3], this->invalid_index);
-    EXPECT_EQ(c[4], 2);
-    EXPECT_EQ(c[5], this->invalid_index);
-    EXPECT_EQ(c[6], this->invalid_index);
-    EXPECT_EQ(c[7], this->invalid_index);
-    EXPECT_EQ(v[0], T{1.0});
-    EXPECT_EQ(v[1], T{1.5});
-    EXPECT_EQ(v[2], T{2.0});
-    EXPECT_EQ(v[3], T{0.0});
-    EXPECT_EQ(v[4], T{3.0});
-    EXPECT_EQ(v[5], T{0.0});
-    EXPECT_EQ(v[6], T{0.0});
-    EXPECT_EQ(v[7], T{0.0});
-    EXPECT_EQ(s[0], 0);
-    EXPECT_EQ(s[1], 4);
-    EXPECT_EQ(l[0], 4);
+    auto complex_mtx = this->mtx5->make_complex();
+
+    GKO_ASSERT_MTX_NEAR(complex_mtx, this->mtx5, 0.0);
 }
 
 
-TYPED_TEST(Dense, ConvertsToAndFromSellpWithMoreThanOneSlice)
+TYPED_TEST(Dense, MakeComplexIntoDense)
 {
     using T = typename TestFixture::value_type;
-    using Mtx = typename TestFixture::Mtx;
-    using Sellp = typename gko::matrix::Sellp<T, gko::int32>;
-    auto x = this->template gen_mtx<Mtx>(65, 25);
+    using ComplexMtx = typename TestFixture::ComplexMtx;
+    auto exec = this->mtx5->get_executor();
 
-    auto sellp_mtx = Sellp::create(this->exec);
-    auto dense_mtx = Mtx::create(this->exec);
-    x->convert_to(sellp_mtx);
-    sellp_mtx->convert_to(dense_mtx);
+    auto complex_mtx = ComplexMtx::create(exec, this->mtx5->get_size());
+    this->mtx5->make_complex(complex_mtx);
 
-    GKO_ASSERT_MTX_NEAR(dense_mtx, x, 0.0);
+    GKO_ASSERT_MTX_NEAR(complex_mtx, this->mtx5, 0.0);
 }
 
 
-TYPED_TEST(Dense, ConvertsEmptyToPrecision)
+TYPED_TEST(Dense, MakeComplexIntoDenseFailsForWrongDimensions)
 {
-    using Dense = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision<T>;
-    using OtherDense = typename gko::matrix::Dense<OtherT>;
-    auto empty = OtherDense::create(this->exec);
-    auto res = Dense::create(this->exec);
+    using ComplexMtx = typename TestFixture::ComplexMtx;
+    auto exec = this->mtx5->get_executor();
 
-    empty->convert_to(res);
+    auto complex_mtx = ComplexMtx::create(exec);
 
-    ASSERT_FALSE(res->get_size());
+    ASSERT_THROW(this->mtx5->make_complex(complex_mtx), gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, MovesEmptyToPrecision)
+TYPED_TEST(Dense, GetReal)
 {
-    using Dense = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using OtherT = typename gko::next_precision<T>;
-    using OtherDense = typename gko::matrix::Dense<OtherT>;
-    auto empty = OtherDense::create(this->exec);
-    auto res = Dense::create(this->exec);
 
-    empty->move_to(res);
+    auto real_mtx = this->mtx5->get_real();
 
-    ASSERT_FALSE(res->get_size());
+    GKO_ASSERT_MTX_NEAR(real_mtx, this->mtx5, 0.0);
 }
 
 
-TYPED_TEST(Dense, ConvertsEmptyToCoo)
+TYPED_TEST(Dense, GetRealIntoDense)
 {
-    using Dense = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using Coo = typename gko::matrix::Coo<T, gko::int32>;
-    auto empty = Dense::create(this->exec);
-    auto res = Coo::create(this->exec);
+    using RealMtx = typename TestFixture::RealMtx;
+    auto exec = this->mtx5->get_executor();
 
-    empty->convert_to(res);
+    auto real_mtx = RealMtx::create(exec, this->mtx5->get_size());
+    this->mtx5->get_real(real_mtx);
 
-    ASSERT_EQ(res->get_num_stored_elements(), 0);
-    ASSERT_FALSE(res->get_size());
+    GKO_ASSERT_MTX_NEAR(real_mtx, this->mtx5, 0.0);
 }
 
 
-TYPED_TEST(Dense, MovesEmptyToCoo)
+TYPED_TEST(Dense, GetRealIntoDenseFailsForWrongDimensions)
 {
-    using Dense = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using Coo = typename gko::matrix::Coo<T, gko::int32>;
-    auto empty = Dense::create(this->exec);
-    auto res = Coo::create(this->exec);
-
-    empty->move_to(res);
+    using RealMtx = typename TestFixture::RealMtx;
+    auto exec = this->mtx5->get_executor();
 
-    ASSERT_EQ(res->get_num_stored_elements(), 0);
-    ASSERT_FALSE(res->get_size());
+    auto real_mtx = RealMtx::create(exec);
+    ASSERT_THROW(this->mtx5->get_real(real_mtx), gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, ConvertsEmptyMatrixToCsr)
+TYPED_TEST(Dense, GetImag)
 {
-    using Dense = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using Csr = typename gko::matrix::Csr<T, gko::int32>;
-    auto empty = Dense::create(this->exec);
-    auto res = Csr::create(this->exec);
 
-    empty->convert_to(res);
+    auto imag_mtx = this->mtx5->get_imag();
 
-    ASSERT_EQ(res->get_num_stored_elements(), 0);
-    ASSERT_EQ(*res->get_const_row_ptrs(), 0);
-    ASSERT_FALSE(res->get_size());
+    GKO_ASSERT_MTX_NEAR(
+        imag_mtx, l<T>({{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense, MovesEmptyMatrixToCsr)
+TYPED_TEST(Dense, GetImagIntoDense)
 {
-    using Dense = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using Csr = typename gko::matrix::Csr<T, gko::int32>;
-    auto empty = Dense::create(this->exec);
-    auto res = Csr::create(this->exec);
+    using RealMtx = typename TestFixture::RealMtx;
+    auto exec = this->mtx5->get_executor();
 
-    empty->move_to(res);
+    auto imag_mtx = RealMtx::create(exec, this->mtx5->get_size());
+    this->mtx5->get_imag(imag_mtx);
 
-    ASSERT_EQ(res->get_num_stored_elements(), 0);
-    ASSERT_EQ(*res->get_const_row_ptrs(), 0);
-    ASSERT_FALSE(res->get_size());
+    GKO_ASSERT_MTX_NEAR(
+        imag_mtx, l<T>({{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense, ConvertsEmptyToSparsityCsr)
+TYPED_TEST(Dense, GetImagIntoDenseFailsForWrongDimensions)
 {
-    using Dense = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using SparsityCsr = typename gko::matrix::SparsityCsr<T, gko::int32>;
-    auto empty = Dense::create(this->exec);
-    auto res = SparsityCsr::create(this->exec);
-
-    empty->convert_to(res);
+    using RealMtx = typename TestFixture::RealMtx;
+    auto exec = this->mtx5->get_executor();
 
-    ASSERT_EQ(res->get_num_nonzeros(), 0);
-    ASSERT_EQ(*res->get_const_row_ptrs(), 0);
-    ASSERT_FALSE(res->get_size());
+    auto imag_mtx = RealMtx::create(exec);
+    ASSERT_THROW(this->mtx5->get_imag(imag_mtx), gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, MovesEmptyToSparsityCsr)
+TYPED_TEST(Dense, MakeTemporaryConversionDoesntConvertOnMatch)
 {
-    using Dense = typename TestFixture::Mtx;
+    using Mtx = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using SparsityCsr = typename gko::matrix::SparsityCsr<T, gko::int32>;
-    auto empty = Dense::create(this->exec);
-    auto res = SparsityCsr::create(this->exec);
-
-    empty->move_to(res);
+    auto alpha = gko::initialize<Mtx>({8.0}, this->exec);
 
-    ASSERT_EQ(res->get_num_nonzeros(), 0);
-    ASSERT_EQ(*res->get_const_row_ptrs(), 0);
-    ASSERT_FALSE(res->get_size());
+    ASSERT_EQ(gko::make_temporary_conversion<T>(alpha).get(), alpha.get());
 }
 
 
-TYPED_TEST(Dense, ConvertsEmptyToEll)
+TYPED_TEST(Dense, MakeTemporaryConversionConvertsBack)
 {
-    using Dense = typename TestFixture::Mtx;
+    using MixedMtx = typename TestFixture::MixedMtx;
     using T = typename TestFixture::value_type;
-    using Ell = typename gko::matrix::Ell<T, gko::int32>;
-    auto empty = Dense::create(this->exec);
-    auto res = Ell::create(this->exec);
+    using MixedT = typename MixedMtx::value_type;
+    auto alpha = gko::initialize<MixedMtx>({8.0}, this->exec);
 
-    empty->convert_to(res);
+    {
+        auto conversion = gko::make_temporary_conversion<T>(alpha);
+        conversion->at(0, 0) = T{7.0};
+    }
 
-    ASSERT_EQ(res->get_num_stored_elements(), 0);
-    ASSERT_FALSE(res->get_size());
+    ASSERT_EQ(alpha->at(0, 0), MixedT{7.0});
 }
 
 
-TYPED_TEST(Dense, MovesEmptyToEll)
+TYPED_TEST(Dense, MakeTemporaryConversionConstDoesntConvertBack)
 {
-    using Dense = typename TestFixture::Mtx;
+    using MixedMtx = typename TestFixture::MixedMtx;
     using T = typename TestFixture::value_type;
-    using Ell = typename gko::matrix::Ell<T, gko::int32>;
-    auto empty = Dense::create(this->exec);
-    auto res = Ell::create(this->exec);
+    using MixedT = typename MixedMtx::value_type;
+    auto alpha = gko::initialize<MixedMtx>({8.0}, this->exec);
 
-    empty->move_to(res);
+    {
+        auto conversion = gko::make_temporary_conversion<T>(
+            static_cast<const MixedMtx*>(alpha.get()));
+        alpha->at(0, 0) = MixedT{7.0};
+    }
 
-    ASSERT_EQ(res->get_num_stored_elements(), 0);
-    ASSERT_FALSE(res->get_size());
+    ASSERT_EQ(alpha->at(0, 0), MixedT{7.0});
 }
 
 
-TYPED_TEST(Dense, ConvertsEmptyToHybrid)
+TYPED_TEST(Dense, ScaleAddIdentityRectangular)
 {
-    using Dense = typename TestFixture::Mtx;
     using T = typename TestFixture::value_type;
-    using Hybrid = typename gko::matrix::Hybrid<T, gko::int32>;
-    auto empty = Dense::create(this->exec);
-    auto res = Hybrid::create(this->exec);
+    using Vec = typename TestFixture::Mtx;
+    auto alpha = gko::initialize<Vec>({2.0}, this->exec);
+    auto beta = gko::initialize<Vec>({-1.0}, this->exec);
+    auto b = gko::initialize<Vec>(
+        {I<T>{2.0, 0.0}, I<T>{1.0, 2.5}, I<T>{0.0, -4.0}}, this->exec);
 
-    empty->convert_to(res);
+    b->add_scaled_identity(alpha, beta);
 
-    ASSERT_EQ(res->get_num_stored_elements(), 0);
-    ASSERT_FALSE(res->get_size());
+    GKO_ASSERT_MTX_NEAR(b, l({{0.0, 0.0}, {-1.0, -0.5}, {0.0, 4.0}}), 0.0);
 }
 
 
-TYPED_TEST(Dense, MovesEmptyToHybrid)
-{
-    using Dense = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    using Hybrid = typename gko::matrix::Hybrid<T, gko::int32>;
-    auto empty = Dense::create(this->exec);
-    auto res = Hybrid::create(this->exec);
+template <typename ValueIndexType>
+class DenseWithIndexType
+    : public Dense<
+          typename std::tuple_element<0, decltype(ValueIndexType())>::type> {
+public:
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Permutation = gko::matrix::Permutation<index_type>;
+    using ScaledPermutation =
+        gko::matrix::ScaledPermutation<value_type, index_type>;
 
-    empty->move_to(res);
 
-    ASSERT_EQ(res->get_num_stored_elements(), 0);
-    ASSERT_FALSE(res->get_size());
-}
+    DenseWithIndexType()
+    {
+        perm2 = Permutation::create(this->exec,
+                                    gko::array<index_type>{this->exec, {1, 0}});
+        perm3 = Permutation::create(
+            this->exec, gko::array<index_type>{this->exec, {1, 2, 0}});
+        perm3_rev = Permutation::create(
+            this->exec, gko::array<index_type>{this->exec, {2, 0, 1}});
+        perm0 = Permutation::create(this->exec, 0);
+        scale_perm2 = ScaledPermutation::create(
+            this->exec, gko::array<value_type>{this->exec, {17.0, 19.0}},
+            gko::array<index_type>{this->exec, {1, 0}});
+        scale_perm3 = ScaledPermutation::create(
+            this->exec, gko::array<value_type>{this->exec, {2.0, 3.0, 5.0}},
+            gko::array<index_type>{this->exec, {1, 2, 0}});
+        scale_perm3_rev = ScaledPermutation::create(
+            this->exec, gko::array<value_type>{this->exec, {7.0, 11.0, 13.0}},
+            gko::array<index_type>{this->exec, {2, 0, 1}});
+        scale_perm0 = ScaledPermutation::create(this->exec, 0);
+    }
 
+    std::unique_ptr<Permutation> perm2;
+    std::unique_ptr<Permutation> perm3;
+    std::unique_ptr<Permutation> perm3_rev;
+    std::unique_ptr<Permutation> perm0;
+    std::unique_ptr<ScaledPermutation> scale_perm2;
+    std::unique_ptr<ScaledPermutation> scale_perm3;
+    std::unique_ptr<ScaledPermutation> scale_perm3_rev;
+    std::unique_ptr<ScaledPermutation> scale_perm0;
+};
 
-TYPED_TEST(Dense, ConvertsEmptyToSellp)
-{
-    using Dense = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    using Sellp = typename gko::matrix::Sellp<T, gko::int32>;
-    auto empty = Dense::create(this->exec);
-    auto res = Sellp::create(this->exec);
+TYPED_TEST_SUITE(DenseWithIndexType, gko::test::ValueIndexTypes,
+                 PairTypenameNameGenerator);
 
-    empty->convert_to(res);
 
-    ASSERT_EQ(res->get_num_stored_elements(), 0);
-    ASSERT_EQ(*res->get_const_slice_sets(), 0);
-    ASSERT_FALSE(res->get_size());
+template <typename ValueType, typename IndexType>
+void assert_coo_eq_mtx4(const gko::matrix::Coo<ValueType, IndexType>* coo_mtx)
+{
+    auto v = coo_mtx->get_const_values();
+    auto c = coo_mtx->get_const_col_idxs();
+    auto r = coo_mtx->get_const_row_idxs();
+
+    ASSERT_EQ(coo_mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(coo_mtx->get_num_stored_elements(), 4);
+    EXPECT_EQ(r[0], 0);
+    EXPECT_EQ(r[1], 0);
+    EXPECT_EQ(r[2], 0);
+    EXPECT_EQ(r[3], 1);
+    EXPECT_EQ(c[0], 0);
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[2], 2);
+    EXPECT_EQ(c[3], 1);
+    EXPECT_EQ(v[0], ValueType{1.0});
+    EXPECT_EQ(v[1], ValueType{3.0});
+    EXPECT_EQ(v[2], ValueType{2.0});
+    EXPECT_EQ(v[3], ValueType{5.0});
 }
 
 
-TYPED_TEST(Dense, MovesEmptyToSellp)
+TYPED_TEST(DenseWithIndexType, ConvertsToCoo)
 {
-    using Dense = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    using Sellp = typename gko::matrix::Sellp<T, gko::int32>;
-    auto empty = Dense::create(this->exec);
-    auto res = Sellp::create(this->exec);
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Coo = typename gko::matrix::Coo<value_type, index_type>;
+    auto coo_mtx = Coo::create(this->mtx4->get_executor());
 
-    empty->move_to(res);
+    this->mtx4->convert_to(coo_mtx);
 
-    ASSERT_EQ(res->get_num_stored_elements(), 0);
-    ASSERT_EQ(*res->get_const_slice_sets(), 0);
-    ASSERT_FALSE(res->get_size());
+    assert_coo_eq_mtx4(coo_mtx.get());
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsTransposable)
+TYPED_TEST(DenseWithIndexType, MovesToCoo)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto trans = gko::as<Mtx>(this->mtx5->transpose());
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Coo = typename gko::matrix::Coo<value_type, index_type>;
+    auto coo_mtx = Coo::create(this->mtx4->get_executor());
 
-    GKO_ASSERT_MTX_NEAR(
-        trans, l<T>({{1.0, -2.0, 2.1}, {-1.0, 2.0, 3.4}, {-0.5, 4.5, 1.2}}),
-        0.0);
+    this->mtx4->move_to(coo_mtx);
+
+    assert_coo_eq_mtx4(coo_mtx.get());
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsTransposableIntoDense)
+template <typename ValueType, typename IndexType>
+void assert_csr_eq_mtx4(const gko::matrix::Csr<ValueType, IndexType>* csr_mtx)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto trans = Mtx::create(this->exec, this->mtx5->get_size());
-
-    this->mtx5->transpose(trans);
-
-    GKO_ASSERT_MTX_NEAR(
-        trans, l<T>({{1.0, -2.0, 2.1}, {-1.0, 2.0, 3.4}, {-0.5, 4.5, 1.2}}),
-        0.0);
+    auto v = csr_mtx->get_const_values();
+    auto c = csr_mtx->get_const_col_idxs();
+    auto r = csr_mtx->get_const_row_ptrs();
+    ASSERT_EQ(csr_mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(csr_mtx->get_num_stored_elements(), 4);
+    EXPECT_EQ(r[0], 0);
+    EXPECT_EQ(r[1], 3);
+    EXPECT_EQ(r[2], 4);
+    EXPECT_EQ(c[0], 0);
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[2], 2);
+    EXPECT_EQ(c[3], 1);
+    EXPECT_EQ(v[0], ValueType{1.0});
+    EXPECT_EQ(v[1], ValueType{3.0});
+    EXPECT_EQ(v[2], ValueType{2.0});
+    EXPECT_EQ(v[3], ValueType{5.0});
 }
 
 
-TYPED_TEST(Dense, SquareSubmatrixIsTransposableIntoDense)
+TYPED_TEST(DenseWithIndexType, ConvertsToCsr)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto trans = Mtx::create(this->exec, gko::dim<2>{2, 2}, 4);
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Csr = typename gko::matrix::Csr<value_type, index_type>;
+    auto csr_s_classical = std::make_shared<typename Csr::classical>();
+    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
+    auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical);
+    auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge);
 
-    this->mtx5->create_submatrix({0, 2}, {0, 2})->transpose(trans);
+    this->mtx4->convert_to(csr_mtx_c);
+    this->mtx4->convert_to(csr_mtx_m);
 
-    GKO_ASSERT_MTX_NEAR(trans, l<T>({{1.0, -2.0}, {-1.0, 2.0}}), 0.0);
-    ASSERT_EQ(trans->get_stride(), 4);
+    assert_csr_eq_mtx4(csr_mtx_c.get());
+    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
+    GKO_ASSERT_MTX_NEAR(csr_mtx_c, csr_mtx_m, 0.0);
+    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsTransposableIntoDenseFailsForWrongDimensions)
-{
-    using Mtx = typename TestFixture::Mtx;
+TYPED_TEST(DenseWithIndexType, MovesToCsr)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Csr = typename gko::matrix::Csr<value_type, index_type>;
+    auto csr_s_classical = std::make_shared<typename Csr::classical>();
+    auto csr_s_merge = std::make_shared<typename Csr::merge_path>();
+    auto csr_mtx_c = Csr::create(this->mtx4->get_executor(), csr_s_classical);
+    auto csr_mtx_m = Csr::create(this->mtx4->get_executor(), csr_s_merge);
+    auto mtx_clone = this->mtx4->clone();
+
+    this->mtx4->move_to(csr_mtx_c);
+    mtx_clone->move_to(csr_mtx_m);
 
-    ASSERT_THROW(this->mtx5->transpose(Mtx::create(this->exec)),
-                 gko::DimensionMismatch);
+    assert_csr_eq_mtx4(csr_mtx_c.get());
+    ASSERT_EQ(csr_mtx_c->get_strategy()->get_name(), "classical");
+    GKO_ASSERT_MTX_NEAR(csr_mtx_c, csr_mtx_m, 0.0);
+    ASSERT_EQ(csr_mtx_m->get_strategy()->get_name(), "merge_path");
 }
 
 
-TYPED_TEST(Dense, NonSquareMatrixIsTransposable)
+template <typename ValueType, typename IndexType>
+void assert_sparsity_csr_eq_mtx4(
+    const gko::matrix::SparsityCsr<ValueType, IndexType>* sparsity_csr_mtx)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto trans = gko::as<Mtx>(this->mtx4->transpose());
+    auto v = sparsity_csr_mtx->get_const_value();
+    auto c = sparsity_csr_mtx->get_const_col_idxs();
+    auto r = sparsity_csr_mtx->get_const_row_ptrs();
 
-    GKO_ASSERT_MTX_NEAR(trans, l<T>({{1.0, 0.0}, {3.0, 5.0}, {2.0, 0.0}}), 0.0);
+    ASSERT_EQ(sparsity_csr_mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(sparsity_csr_mtx->get_num_nonzeros(), 4);
+    EXPECT_EQ(r[0], 0);
+    EXPECT_EQ(r[1], 3);
+    EXPECT_EQ(r[2], 4);
+    EXPECT_EQ(c[0], 0);
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[2], 2);
+    EXPECT_EQ(c[3], 1);
+    EXPECT_EQ(v[0], ValueType{1.0});
 }
 
 
-TYPED_TEST(Dense, NonSquareMatrixIsTransposableIntoDense)
+TYPED_TEST(DenseWithIndexType, ConvertsToSparsityCsr)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto trans =
-        Mtx::create(this->exec, gko::transpose(this->mtx4->get_size()));
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using SparsityCsr =
+        typename gko::matrix::SparsityCsr<value_type, index_type>;
+    auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor());
 
-    this->mtx4->transpose(trans);
+    this->mtx4->convert_to(sparsity_csr_mtx);
 
-    GKO_ASSERT_MTX_NEAR(trans, l<T>({{1.0, 0.0}, {3.0, 5.0}, {2.0, 0.0}}), 0.0);
+    assert_sparsity_csr_eq_mtx4(sparsity_csr_mtx.get());
 }
 
 
-TYPED_TEST(Dense, NonSquareSubmatrixIsTransposableIntoDense)
+TYPED_TEST(DenseWithIndexType, MovesToSparsityCsr)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto trans = Mtx::create(this->exec, gko::dim<2>{2, 1}, 5);
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using SparsityCsr =
+        typename gko::matrix::SparsityCsr<value_type, index_type>;
+    auto sparsity_csr_mtx = SparsityCsr::create(this->mtx4->get_executor());
 
-    this->mtx4->create_submatrix({0, 1}, {0, 2})->transpose(trans);
+    this->mtx4->move_to(sparsity_csr_mtx);
 
-    GKO_ASSERT_MTX_NEAR(trans, l({1.0, 3.0}), 0.0);
-    ASSERT_EQ(trans->get_stride(), 5);
+    assert_sparsity_csr_eq_mtx4(sparsity_csr_mtx.get());
 }
 
 
-TYPED_TEST(Dense, NonSquareMatrixIsTransposableIntoDenseFailsForWrongDimensions)
+template <typename ValueType, typename IndexType>
+void assert_ell_eq_mtx6(const gko::matrix::Ell<ValueType, IndexType>* ell_mtx)
 {
-    using Mtx = typename TestFixture::Mtx;
+    auto v = ell_mtx->get_const_values();
+    auto c = ell_mtx->get_const_col_idxs();
 
-    ASSERT_THROW(this->mtx4->transpose(Mtx::create(this->exec)),
-                 gko::DimensionMismatch);
+    ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2);
+    ASSERT_EQ(ell_mtx->get_num_stored_elements(), 4);
+    ASSERT_EQ(ell_mtx->get_stride(), 2);
+    EXPECT_EQ(c[0], 0);
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[2], 1);
+    EXPECT_EQ(c[3], gko::invalid_index<IndexType>());
+    EXPECT_EQ(v[0], ValueType{1.0});
+    EXPECT_EQ(v[1], ValueType{1.5});
+    EXPECT_EQ(v[2], ValueType{2.0});
+    EXPECT_EQ(v[3], ValueType{0.0});
 }
 
 
-TYPED_TEST(Dense, SquareMatrixCanGatherRows)
+TYPED_TEST(DenseWithIndexType, ConvertsToEll)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 0}};
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Ell = typename gko::matrix::Ell<value_type, index_type>;
+    auto ell_mtx = Ell::create(this->mtx6->get_executor());
 
-    auto row_collection = this->mtx5->row_gather(&permute_idxs);
+    this->mtx6->convert_to(ell_mtx);
 
-    GKO_ASSERT_MTX_NEAR(row_collection,
-                        l<T>({{-2.0, 2.0, 4.5}, {1.0, -1.0, -0.5}}), 0.0);
+    assert_ell_eq_mtx6(ell_mtx.get());
 }
 
 
-TYPED_TEST(Dense, SquareMatrixCanGatherRowsIntoDense)
+TYPED_TEST(DenseWithIndexType, MovesToEll)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 0}};
-    auto row_collection = Mtx::create(exec, gko::dim<2>{2, 3});
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Ell = typename gko::matrix::Ell<value_type, index_type>;
+    auto ell_mtx = Ell::create(this->mtx6->get_executor());
 
-    this->mtx5->row_gather(&permute_idxs, row_collection);
+    this->mtx6->move_to(ell_mtx);
 
-    GKO_ASSERT_MTX_NEAR(row_collection,
-                        l<T>({{-2.0, 2.0, 4.5}, {1.0, -1.0, -0.5}}), 0.0);
+    assert_ell_eq_mtx6(ell_mtx.get());
 }
 
 
-TYPED_TEST(Dense, SquareSubmatrixCanGatherRowsIntoDense)
+template <typename ValueType, typename IndexType>
+void assert_strided_ell_eq_mtx6(
+    const gko::matrix::Ell<ValueType, IndexType>* ell_mtx)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 0}};
-    auto row_collection = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
-
-    this->mtx5->create_submatrix({0, 2}, {1, 3})
-        ->row_gather(&permute_idxs, row_collection);
+    constexpr auto invalid_index = gko::invalid_index<IndexType>();
+    auto v = ell_mtx->get_const_values();
+    auto c = ell_mtx->get_const_col_idxs();
 
-    GKO_ASSERT_MTX_NEAR(row_collection, l<T>({{2.0, 4.5}, {-1.0, -0.5}}), 0.0);
-    ASSERT_EQ(row_collection->get_stride(), 4);
+    ASSERT_EQ(ell_mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(ell_mtx->get_num_stored_elements_per_row(), 2);
+    ASSERT_EQ(ell_mtx->get_num_stored_elements(), 6);
+    ASSERT_EQ(ell_mtx->get_stride(), 3);
+    EXPECT_EQ(c[0], 0);
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[2], invalid_index);
+    EXPECT_EQ(c[3], 1);
+    EXPECT_EQ(c[4], invalid_index);
+    EXPECT_EQ(c[5], invalid_index);
+    EXPECT_EQ(v[0], ValueType{1.0});
+    EXPECT_EQ(v[1], ValueType{1.5});
+    EXPECT_EQ(v[2], ValueType{0.0});
+    EXPECT_EQ(v[3], ValueType{2.0});
+    EXPECT_EQ(v[4], ValueType{0.0});
+    EXPECT_EQ(v[5], ValueType{0.0});
 }
 
 
-TYPED_TEST(Dense, NonSquareSubmatrixCanGatherRowsIntoMixedDense)
+TYPED_TEST(DenseWithIndexType, ConvertsToEllWithStride)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using MixedMtx = typename TestFixture::MixedMtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx4->get_executor();
-    gko::array<gko::int32> gather_index{exec, {1, 0, 1}};
-    auto row_collection = MixedMtx::create(exec, gko::dim<2>{3, 3}, 4);
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Ell = typename gko::matrix::Ell<value_type, index_type>;
+    auto ell_mtx =
+        Ell::create(this->mtx6->get_executor(), gko::dim<2>{2, 3}, 2, 3);
 
-    this->mtx4->row_gather(&gather_index, row_collection);
+    this->mtx6->convert_to(ell_mtx);
 
-    GKO_ASSERT_MTX_NEAR(
-        row_collection,
-        l<typename MixedMtx::value_type>(
-            {{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}}),
-        0.0);
+    assert_strided_ell_eq_mtx6(ell_mtx.get());
 }
 
 
-TYPED_TEST(Dense, NonSquareSubmatrixCanAdvancedGatherRowsIntoMixedDense)
+TYPED_TEST(DenseWithIndexType, MovesToEllWithStride)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using MixedMtx = typename TestFixture::MixedMtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx4->get_executor();
-    gko::array<gko::int32> gather_index{exec, {1, 0, 1}};
-    auto row_collection = gko::initialize<MixedMtx>(
-        {{1.0, 0.5, -1.0}, {-1.5, 0.5, 1.0}, {2.0, -3.0, 1.0}}, exec);
-    auto alpha = gko::initialize<MixedMtx>({1.0}, exec);
-    auto beta = gko::initialize<Mtx>({2.0}, exec);
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Ell = typename gko::matrix::Ell<value_type, index_type>;
+    auto ell_mtx =
+        Ell::create(this->mtx6->get_executor(), gko::dim<2>{2, 3}, 2, 3);
 
-    this->mtx4->row_gather(alpha, &gather_index, beta, row_collection);
+    this->mtx6->move_to(ell_mtx);
 
-    GKO_ASSERT_MTX_NEAR(
-        row_collection,
-        l<typename MixedMtx::value_type>(
-            {{2.0, 6.0, -2.0}, {-2.0, 4.0, 4.0}, {4.0, -1.0, 2.0}}),
-        0.0);
+    assert_strided_ell_eq_mtx6(ell_mtx.get());
 }
 
 
-TYPED_TEST(Dense, SquareMatrixGatherRowsIntoDenseFailsForWrongDimensions)
+template <typename ValueType, typename IndexType>
+void assert_hybrid_auto_eq_mtx4(
+    const gko::matrix::Hybrid<ValueType, IndexType>* hybrid_mtx)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 0}};
+    auto v = hybrid_mtx->get_const_coo_values();
+    auto c = hybrid_mtx->get_const_coo_col_idxs();
+    auto r = hybrid_mtx->get_const_coo_row_idxs();
+    auto n = hybrid_mtx->get_ell_num_stored_elements_per_row();
+    auto p = hybrid_mtx->get_ell_stride();
 
-    ASSERT_THROW(this->mtx5->row_gather(&permute_idxs, Mtx::create(exec)),
-                 gko::DimensionMismatch);
+    ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0);
+    ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4);
+    EXPECT_EQ(n, 0);
+    EXPECT_EQ(p, 2);
+    EXPECT_EQ(r[0], 0);
+    EXPECT_EQ(r[1], 0);
+    EXPECT_EQ(r[2], 0);
+    EXPECT_EQ(r[3], 1);
+    EXPECT_EQ(c[0], 0);
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[2], 2);
+    EXPECT_EQ(c[3], 1);
+    EXPECT_EQ(v[0], ValueType{1.0});
+    EXPECT_EQ(v[1], ValueType{3.0});
+    EXPECT_EQ(v[2], ValueType{2.0});
+    EXPECT_EQ(v[3], ValueType{5.0});
 }
 
 
-TYPED_TEST(Dense, SquareMatrixCanGatherRows64)
+TYPED_TEST(DenseWithIndexType, MovesToHybridAutomatically)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 0}};
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Hybrid = typename gko::matrix::Hybrid<value_type, index_type>;
+    auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor());
 
-    auto row_collection = this->mtx5->row_gather(&permute_idxs);
+    this->mtx4->move_to(hybrid_mtx);
 
-    GKO_ASSERT_MTX_NEAR(row_collection,
-                        l<T>({{-2.0, 2.0, 4.5}, {1.0, -1.0, -0.5}}), 0.0);
+    assert_hybrid_auto_eq_mtx4(hybrid_mtx.get());
 }
 
 
-TYPED_TEST(Dense, SquareMatrixCanGatherRowsIntoDense64)
+TYPED_TEST(DenseWithIndexType, ConvertsToHybridAutomatically)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 0}};
-    auto row_collection = Mtx::create(exec, gko::dim<2>{2, 3});
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Hybrid = typename gko::matrix::Hybrid<value_type, index_type>;
+    auto hybrid_mtx = Hybrid::create(this->mtx4->get_executor());
 
-    this->mtx5->row_gather(&permute_idxs, row_collection);
+    this->mtx4->convert_to(hybrid_mtx);
 
-    GKO_ASSERT_MTX_NEAR(row_collection,
-                        l<T>({{-2.0, 2.0, 4.5}, {1.0, -1.0, -0.5}}), 0.0);
+    assert_hybrid_auto_eq_mtx4(hybrid_mtx.get());
 }
 
 
-TYPED_TEST(Dense, SquareSubmatrixCanGatherRowsIntoDense64)
+template <typename ValueType, typename IndexType>
+void assert_hybrid_strided_eq_mtx4(
+    const gko::matrix::Hybrid<ValueType, IndexType>* hybrid_mtx)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 0}};
-    auto row_collection = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
-
-    this->mtx5->create_submatrix({0, 2}, {1, 3})
-        ->row_gather(&permute_idxs, row_collection);
+    auto v = hybrid_mtx->get_const_coo_values();
+    auto c = hybrid_mtx->get_const_coo_col_idxs();
+    auto r = hybrid_mtx->get_const_coo_row_idxs();
+    auto n = hybrid_mtx->get_ell_num_stored_elements_per_row();
+    auto p = hybrid_mtx->get_ell_stride();
 
-    GKO_ASSERT_MTX_NEAR(row_collection, l<T>({{2.0, 4.5}, {-1.0, -0.5}}), 0.0);
-    ASSERT_EQ(row_collection->get_stride(), 4);
+    ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 0);
+    ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 4);
+    EXPECT_EQ(n, 0);
+    EXPECT_EQ(p, 3);
+    EXPECT_EQ(r[0], 0);
+    EXPECT_EQ(r[1], 0);
+    EXPECT_EQ(r[2], 0);
+    EXPECT_EQ(r[3], 1);
+    EXPECT_EQ(c[0], 0);
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[2], 2);
+    EXPECT_EQ(c[3], 1);
+    EXPECT_EQ(v[0], ValueType{1.0});
+    EXPECT_EQ(v[1], ValueType{3.0});
+    EXPECT_EQ(v[2], ValueType{2.0});
+    EXPECT_EQ(v[3], ValueType{5.0});
 }
 
 
-TYPED_TEST(Dense, NonSquareSubmatrixCanGatherRowsIntoMixedDense64)
+TYPED_TEST(DenseWithIndexType, MovesToHybridWithStrideAutomatically)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using MixedMtx = typename TestFixture::MixedMtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx4->get_executor();
-    gko::array<gko::int64> gather_index{exec, {1, 0, 1}};
-    auto row_collection = MixedMtx::create(exec, gko::dim<2>{3, 3}, 4);
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Hybrid = typename gko::matrix::Hybrid<value_type, index_type>;
+    auto hybrid_mtx =
+        Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 0, 3);
 
-    this->mtx4->row_gather(&gather_index, row_collection);
+    this->mtx4->move_to(hybrid_mtx);
 
-    GKO_ASSERT_MTX_NEAR(
-        row_collection,
-        l<typename MixedMtx::value_type>(
-            {{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}}),
-        0.0);
+    assert_hybrid_strided_eq_mtx4(hybrid_mtx.get());
 }
 
 
-TYPED_TEST(Dense, SquareMatrixGatherRowsIntoDenseFailsForWrongDimensions64)
+TYPED_TEST(DenseWithIndexType, ConvertsToHybridWithStrideAutomatically)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 0}};
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Hybrid = typename gko::matrix::Hybrid<value_type, index_type>;
+    auto hybrid_mtx =
+        Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 0, 3);
 
-    ASSERT_THROW(this->mtx5->row_gather(&permute_idxs, Mtx::create(exec)),
-                 gko::DimensionMismatch);
+    this->mtx4->convert_to(hybrid_mtx);
+
+    assert_hybrid_strided_eq_mtx4(hybrid_mtx.get());
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsPermutable)
+template <typename ValueType, typename IndexType>
+void assert_hybrid_limited_eq_mtx4(
+    const gko::matrix::Hybrid<ValueType, IndexType>* hybrid_mtx)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
-
-    auto ref_permuted =
-        gko::as<Mtx>(gko::as<Mtx>(this->mtx5->row_permute(&permute_idxs))
-                         ->column_permute(&permute_idxs));
-    auto permuted = gko::as<Mtx>(this->mtx5->permute(&permute_idxs));
+    constexpr auto invalid_index = gko::invalid_index<IndexType>();
+    auto v = hybrid_mtx->get_const_ell_values();
+    auto c = hybrid_mtx->get_const_ell_col_idxs();
+    auto n = hybrid_mtx->get_ell_num_stored_elements_per_row();
+    auto p = hybrid_mtx->get_ell_stride();
 
-    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+    ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 6);
+    ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 1);
+    EXPECT_EQ(n, 2);
+    EXPECT_EQ(p, 3);
+    EXPECT_EQ(c[0], 0);
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[2], invalid_index);
+    EXPECT_EQ(c[3], 1);
+    EXPECT_EQ(c[4], invalid_index);
+    EXPECT_EQ(c[5], invalid_index);
+    EXPECT_EQ(v[0], ValueType{1.0});
+    EXPECT_EQ(v[1], ValueType{5.0});
+    EXPECT_EQ(v[2], ValueType{0.0});
+    EXPECT_EQ(v[3], ValueType{3.0});
+    EXPECT_EQ(v[4], ValueType{0.0});
+    EXPECT_EQ(v[5], ValueType{0.0});
+    EXPECT_EQ(hybrid_mtx->get_const_coo_values()[0], ValueType{2.0});
+    EXPECT_EQ(hybrid_mtx->get_const_coo_row_idxs()[0], 0);
+    EXPECT_EQ(hybrid_mtx->get_const_coo_col_idxs()[0], 2);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsPermutableIntoDense)
+TYPED_TEST(DenseWithIndexType, MovesToHybridWithStrideAndCooLengthByColumns2)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
-    auto permuted = Mtx::create(exec, this->mtx5->get_size());
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Hybrid = typename gko::matrix::Hybrid<value_type, index_type>;
+    auto hybrid_mtx =
+        Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 2, 3, 3,
+                       std::make_shared<typename Hybrid::column_limit>(2));
 
-    auto ref_permuted =
-        gko::as<Mtx>(gko::as<Mtx>(this->mtx5->row_permute(&permute_idxs))
-                         ->column_permute(&permute_idxs));
-    this->mtx5->permute(&permute_idxs, permuted);
+    this->mtx4->move_to(hybrid_mtx);
 
-    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+    assert_hybrid_limited_eq_mtx4(hybrid_mtx.get());
 }
 
 
-TYPED_TEST(Dense, SquareSubmatrixIsPermutableIntoDense)
+TYPED_TEST(DenseWithIndexType, ConvertsToHybridWithStrideAndCooLengthByColumns2)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 0}};
-    auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
-    auto mtx = this->mtx5->create_submatrix({0, 2}, {1, 3});
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Hybrid = typename gko::matrix::Hybrid<value_type, index_type>;
+    auto hybrid_mtx =
+        Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 2, 3, 3,
+                       std::make_shared<typename Hybrid::column_limit>(2));
 
-    auto ref_permuted =
-        gko::as<Mtx>(gko::as<Mtx>(mtx->row_permute(&permute_idxs))
-                         ->column_permute(&permute_idxs));
-    mtx->permute(&permute_idxs, permuted);
+    this->mtx4->convert_to(hybrid_mtx);
 
-    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
-    ASSERT_EQ(permuted->get_stride(), 4);
+    assert_hybrid_limited_eq_mtx4(hybrid_mtx.get());
 }
 
 
-TYPED_TEST(Dense, NonSquareMatrixPermuteIntoDenseFails)
+template <typename ValueType, typename IndexType>
+void assert_hybrid_percent_eq_mtx4(
+    const gko::matrix::Hybrid<ValueType, IndexType>* hybrid_mtx)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx4->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
+    auto v = hybrid_mtx->get_const_ell_values();
+    auto c = hybrid_mtx->get_const_ell_col_idxs();
+    auto n = hybrid_mtx->get_ell_num_stored_elements_per_row();
+    auto p = hybrid_mtx->get_ell_stride();
+    auto coo_v = hybrid_mtx->get_const_coo_values();
+    auto coo_c = hybrid_mtx->get_const_coo_col_idxs();
+    auto coo_r = hybrid_mtx->get_const_coo_row_idxs();
 
-    ASSERT_THROW(this->mtx4->permute(&permute_idxs, this->mtx4->clone()),
-                 gko::DimensionMismatch);
+    ASSERT_EQ(hybrid_mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(hybrid_mtx->get_ell_num_stored_elements(), 3);
+    EXPECT_EQ(n, 1);
+    EXPECT_EQ(p, 3);
+    EXPECT_EQ(c[0], 0);
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[2], gko::invalid_index<IndexType>());
+    EXPECT_EQ(v[0], ValueType{1.0});
+    EXPECT_EQ(v[1], ValueType{5.0});
+    EXPECT_EQ(v[2], ValueType{0.0});
+    ASSERT_EQ(hybrid_mtx->get_coo_num_stored_elements(), 2);
+    EXPECT_EQ(coo_v[0], ValueType{3.0});
+    EXPECT_EQ(coo_v[1], ValueType{2.0});
+    EXPECT_EQ(coo_c[0], 1);
+    EXPECT_EQ(coo_c[1], 2);
+    EXPECT_EQ(coo_r[0], 0);
+    EXPECT_EQ(coo_r[1], 0);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixPermuteIntoDenseFailsForWrongPermutationSize)
+TYPED_TEST(DenseWithIndexType, MovesToHybridWithStrideByPercent40)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2}};
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Hybrid = typename gko::matrix::Hybrid<value_type, index_type>;
+    auto hybrid_mtx =
+        Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 1, 3,
+                       std::make_shared<typename Hybrid::imbalance_limit>(0.4));
 
-    ASSERT_THROW(this->mtx5->permute(&permute_idxs, this->mtx5->clone()),
-                 gko::ValueMismatch);
+    this->mtx4->move_to(hybrid_mtx);
+
+    assert_hybrid_percent_eq_mtx4(hybrid_mtx.get());
 }
 
 
-TYPED_TEST(Dense, SquareMatrixPermuteIntoDenseFailsForWrongDimensions)
+TYPED_TEST(DenseWithIndexType, ConvertsToHybridWithStrideByPercent40)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Hybrid = typename gko::matrix::Hybrid<value_type, index_type>;
+    auto hybrid_mtx =
+        Hybrid::create(this->mtx4->get_executor(), gko::dim<2>{2, 3}, 1, 3,
+                       std::make_shared<typename Hybrid::imbalance_limit>(0.4));
 
-    ASSERT_THROW(this->mtx5->permute(&permute_idxs, Mtx::create(exec)),
-                 gko::DimensionMismatch);
+    this->mtx4->convert_to(hybrid_mtx);
+
+    assert_hybrid_percent_eq_mtx4(hybrid_mtx.get());
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsInversePermutable)
+template <typename ValueType, typename IndexType>
+void assert_sellp_eq_mtx7(
+    const gko::matrix::Sellp<ValueType, IndexType>* sellp_mtx)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
-
-    auto ref_permuted = gko::as<Mtx>(
-        gko::as<Mtx>(this->mtx5->inverse_row_permute(&permute_idxs))
-            ->inverse_column_permute(&permute_idxs));
-    auto permuted = gko::as<Mtx>(this->mtx5->inverse_permute(&permute_idxs));
+    constexpr auto invalid_index = gko::invalid_index<IndexType>();
+    auto v = sellp_mtx->get_const_values();
+    auto c = sellp_mtx->get_const_col_idxs();
+    auto s = sellp_mtx->get_const_slice_sets();
+    auto l = sellp_mtx->get_const_slice_lengths();
 
-    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+    ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(sellp_mtx->get_total_cols(), 3);
+    ASSERT_EQ(sellp_mtx->get_num_stored_elements(),
+              3 * gko::matrix::default_slice_size);
+    ASSERT_EQ(sellp_mtx->get_slice_size(), gko::matrix::default_slice_size);
+    ASSERT_EQ(sellp_mtx->get_stride_factor(),
+              gko::matrix::default_stride_factor);
+    EXPECT_EQ(c[0], 0);
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[gko::matrix::default_slice_size], 1);
+    EXPECT_EQ(c[gko::matrix::default_slice_size + 1], invalid_index);
+    EXPECT_EQ(c[2 * gko::matrix::default_slice_size], 2);
+    EXPECT_EQ(c[2 * gko::matrix::default_slice_size + 1], invalid_index);
+    EXPECT_EQ(v[0], ValueType{1.0});
+    EXPECT_EQ(v[1], ValueType{1.5});
+    EXPECT_EQ(v[gko::matrix::default_slice_size], ValueType{2.0});
+    EXPECT_EQ(v[gko::matrix::default_slice_size + 1], ValueType{0.0});
+    EXPECT_EQ(v[2 * gko::matrix::default_slice_size], ValueType{3.0});
+    EXPECT_EQ(v[2 * gko::matrix::default_slice_size + 1], ValueType{0.0});
+    EXPECT_EQ(s[0], 0);
+    EXPECT_EQ(s[1], 3);
+    EXPECT_EQ(l[0], 3);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsInversePermutableIntoDense)
+TYPED_TEST(DenseWithIndexType, ConvertsToSellp)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
-    auto permuted = Mtx::create(exec, this->mtx5->get_size());
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Sellp = typename gko::matrix::Sellp<value_type, index_type>;
+    auto sellp_mtx = Sellp::create(this->mtx7->get_executor());
 
-    auto ref_permuted = gko::as<Mtx>(
-        gko::as<Mtx>(this->mtx5->inverse_row_permute(&permute_idxs))
-            ->inverse_column_permute(&permute_idxs));
-    this->mtx5->inverse_permute(&permute_idxs, permuted);
+    this->mtx7->convert_to(sellp_mtx);
 
-    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+    assert_sellp_eq_mtx7(sellp_mtx.get());
 }
 
 
-TYPED_TEST(Dense, SquareSubmatrixIsInversePermutableIntoDense)
+TYPED_TEST(DenseWithIndexType, MovesToSellp)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 0}};
-    auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
-    auto mtx = this->mtx5->create_submatrix({0, 2}, {1, 3});
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Sellp = typename gko::matrix::Sellp<value_type, index_type>;
+    auto sellp_mtx = Sellp::create(this->mtx7->get_executor());
 
-    auto ref_permuted =
-        gko::as<Mtx>(gko::as<Mtx>(mtx->inverse_row_permute(&permute_idxs))
-                         ->inverse_column_permute(&permute_idxs));
-    mtx->inverse_permute(&permute_idxs, permuted);
+    this->mtx7->move_to(sellp_mtx);
 
-    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
-    ASSERT_EQ(permuted->get_stride(), 4);
+    assert_sellp_eq_mtx7(sellp_mtx.get());
 }
 
 
-TYPED_TEST(Dense, NonSquareMatrixInversePermuteIntoDenseFails)
+template <typename ValueType, typename IndexType>
+void assert_sellp_strided_eq_mtx7(
+    const gko::matrix::Sellp<ValueType, IndexType>* sellp_mtx)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx4->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
+    constexpr auto invalid_index = gko::invalid_index<IndexType>();
+    auto v = sellp_mtx->get_const_values();
+    auto c = sellp_mtx->get_const_col_idxs();
+    auto s = sellp_mtx->get_const_slice_sets();
+    auto l = sellp_mtx->get_const_slice_lengths();
 
-    ASSERT_THROW(
-        this->mtx4->inverse_permute(&permute_idxs, this->mtx4->clone()),
-        gko::DimensionMismatch);
+    ASSERT_EQ(sellp_mtx->get_size(), gko::dim<2>(2, 3));
+    ASSERT_EQ(sellp_mtx->get_total_cols(), 4);
+    ASSERT_EQ(sellp_mtx->get_num_stored_elements(), 8);
+    ASSERT_EQ(sellp_mtx->get_slice_size(), 2);
+    ASSERT_EQ(sellp_mtx->get_stride_factor(), 2);
+    EXPECT_EQ(c[0], 0);
+    EXPECT_EQ(c[1], 1);
+    EXPECT_EQ(c[2], 1);
+    EXPECT_EQ(c[3], invalid_index);
+    EXPECT_EQ(c[4], 2);
+    EXPECT_EQ(c[5], invalid_index);
+    EXPECT_EQ(c[6], invalid_index);
+    EXPECT_EQ(c[7], invalid_index);
+    EXPECT_EQ(v[0], ValueType{1.0});
+    EXPECT_EQ(v[1], ValueType{1.5});
+    EXPECT_EQ(v[2], ValueType{2.0});
+    EXPECT_EQ(v[3], ValueType{0.0});
+    EXPECT_EQ(v[4], ValueType{3.0});
+    EXPECT_EQ(v[5], ValueType{0.0});
+    EXPECT_EQ(v[6], ValueType{0.0});
+    EXPECT_EQ(v[7], ValueType{0.0});
+    EXPECT_EQ(s[0], 0);
+    EXPECT_EQ(s[1], 4);
+    EXPECT_EQ(l[0], 4);
 }
 
 
-TYPED_TEST(Dense,
-           SquareMatrixInversePermuteIntoDenseFailsForWrongPermutationSize)
+TYPED_TEST(DenseWithIndexType, ConvertsToSellpWithSliceSizeAndStrideFactor)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {0, 1}};
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Sellp = typename gko::matrix::Sellp<value_type, index_type>;
+    auto sellp_mtx =
+        Sellp::create(this->mtx7->get_executor(), gko::dim<2>{}, 2, 2, 0);
 
-    ASSERT_THROW(
-        this->mtx5->inverse_permute(&permute_idxs, this->mtx5->clone()),
-        gko::ValueMismatch);
+    this->mtx7->convert_to(sellp_mtx);
+
+    assert_sellp_strided_eq_mtx7(sellp_mtx.get());
 }
 
 
-TYPED_TEST(Dense, SquareMatrixInversePermuteIntoDenseFailsForWrongDimensions)
+TYPED_TEST(DenseWithIndexType, MovesToSellpWithSliceSizeAndStrideFactor)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Sellp = typename gko::matrix::Sellp<value_type, index_type>;
+    auto sellp_mtx =
+        Sellp::create(this->mtx7->get_executor(), gko::dim<2>{}, 2, 2, 0);
 
-    ASSERT_THROW(this->mtx5->inverse_permute(&permute_idxs, Mtx::create(exec)),
-                 gko::DimensionMismatch);
+    this->mtx7->move_to(sellp_mtx);
+
+    assert_sellp_strided_eq_mtx7(sellp_mtx.get());
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsPermutable64)
+TYPED_TEST(DenseWithIndexType, ConvertsToAndFromSellpWithMoreThanOneSlice)
 {
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
+    using Sellp = typename gko::matrix::Sellp<value_type, index_type>;
+    auto x = this->template gen_mtx<Mtx>(65, 25);
 
-    auto ref_permuted =
-        gko::as<Mtx>(gko::as<Mtx>(this->mtx5->row_permute(&permute_idxs))
-                         ->column_permute(&permute_idxs));
-    auto permuted = gko::as<Mtx>(this->mtx5->permute(&permute_idxs));
+    auto sellp_mtx = Sellp::create(this->exec);
+    auto dense_mtx = Mtx::create(this->exec);
+    x->convert_to(sellp_mtx);
+    sellp_mtx->convert_to(dense_mtx);
 
-    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+    GKO_ASSERT_MTX_NEAR(dense_mtx, x, 0.0);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsPermutableIntoDense64)
+TYPED_TEST(Dense, ConvertsEmptyToPrecision)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
-    auto permuted = Mtx::create(exec, this->mtx5->get_size());
+    using Dense = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    using OtherT = typename gko::next_precision<T>;
+    using OtherDense = typename gko::matrix::Dense<OtherT>;
+    auto empty = OtherDense::create(this->exec);
+    auto res = Dense::create(this->exec);
 
-    auto ref_permuted =
-        gko::as<Mtx>(gko::as<Mtx>(this->mtx5->row_permute(&permute_idxs))
-                         ->column_permute(&permute_idxs));
-    this->mtx5->permute(&permute_idxs, permuted);
+    empty->convert_to(res);
 
-    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+    ASSERT_FALSE(res->get_size());
 }
 
 
-TYPED_TEST(Dense, SquareSubmatrixIsPermutableIntoDense64)
+TYPED_TEST(Dense, MovesEmptyToPrecision)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 0}};
-    auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
-    auto mtx = this->mtx5->create_submatrix({0, 2}, {1, 3});
+    using Dense = typename TestFixture::Mtx;
+    using T = typename TestFixture::value_type;
+    using OtherT = typename gko::next_precision<T>;
+    using OtherDense = typename gko::matrix::Dense<OtherT>;
+    auto empty = OtherDense::create(this->exec);
+    auto res = Dense::create(this->exec);
 
-    auto ref_permuted =
-        gko::as<Mtx>(gko::as<Mtx>(mtx->row_permute(&permute_idxs))
-                         ->column_permute(&permute_idxs));
-    mtx->permute(&permute_idxs, permuted);
+    empty->move_to(res);
 
-    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
-    ASSERT_EQ(permuted->get_stride(), 4);
+    ASSERT_FALSE(res->get_size());
 }
 
 
-TYPED_TEST(Dense, NonSquareMatrixPermuteIntoDenseFails64)
+TYPED_TEST(DenseWithIndexType, ConvertsEmptyToCoo)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx4->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
+    using Dense = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Coo = typename gko::matrix::Coo<value_type, index_type>;
+    auto empty = Dense::create(this->exec);
+    auto res = Coo::create(this->exec);
 
-    ASSERT_THROW(this->mtx4->permute(&permute_idxs, this->mtx4->clone()),
-                 gko::DimensionMismatch);
+    empty->convert_to(res);
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());
 }
 
 
-TYPED_TEST(Dense, SquareMatrixPermuteIntoDenseFailsForWrongPermutationSize64)
+TYPED_TEST(DenseWithIndexType, MovesEmptyToCoo)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2}};
+    using Dense = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Coo = typename gko::matrix::Coo<value_type, index_type>;
+    auto empty = Dense::create(this->exec);
+    auto res = Coo::create(this->exec);
 
-    ASSERT_THROW(this->mtx5->permute(&permute_idxs, this->mtx5->clone()),
-                 gko::ValueMismatch);
+    empty->move_to(res);
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());
 }
 
 
-TYPED_TEST(Dense, SquareMatrixPermuteIntoDenseFailsForWrongDimensions64)
+TYPED_TEST(DenseWithIndexType, ConvertsEmptyMatrixToCsr)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
+    using Dense = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Csr = typename gko::matrix::Csr<value_type, index_type>;
+    auto empty = Dense::create(this->exec);
+    auto res = Csr::create(this->exec);
 
-    ASSERT_THROW(this->mtx5->permute(&permute_idxs, Mtx::create(exec)),
-                 gko::DimensionMismatch);
+    empty->convert_to(res);
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_EQ(*res->get_const_row_ptrs(), 0);
+    ASSERT_FALSE(res->get_size());
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsInversePermutable64)
+TYPED_TEST(DenseWithIndexType, MovesEmptyMatrixToCsr)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
+    using Dense = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Csr = typename gko::matrix::Csr<value_type, index_type>;
+    auto empty = Dense::create(this->exec);
+    auto res = Csr::create(this->exec);
 
-    auto ref_permuted = gko::as<Mtx>(
-        gko::as<Mtx>(this->mtx5->inverse_row_permute(&permute_idxs))
-            ->inverse_column_permute(&permute_idxs));
-    auto permuted = gko::as<Mtx>(this->mtx5->inverse_permute(&permute_idxs));
+    empty->move_to(res);
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_EQ(*res->get_const_row_ptrs(), 0);
+    ASSERT_FALSE(res->get_size());
+}
+
+
+TYPED_TEST(DenseWithIndexType, ConvertsEmptyToSparsityCsr)
+{
+    using Dense = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using SparsityCsr =
+        typename gko::matrix::SparsityCsr<value_type, index_type>;
+    auto empty = Dense::create(this->exec);
+    auto res = SparsityCsr::create(this->exec);
+
+    empty->convert_to(res);
 
-    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+    ASSERT_EQ(res->get_num_nonzeros(), 0);
+    ASSERT_EQ(*res->get_const_row_ptrs(), 0);
+    ASSERT_FALSE(res->get_size());
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsInversePermutableIntoDense64)
+TYPED_TEST(DenseWithIndexType, MovesEmptyToSparsityCsr)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
-    auto permuted = Mtx::create(exec, this->mtx5->get_size());
+    using Dense = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using SparsityCsr =
+        typename gko::matrix::SparsityCsr<value_type, index_type>;
+    auto empty = Dense::create(this->exec);
+    auto res = SparsityCsr::create(this->exec);
 
-    auto ref_permuted = gko::as<Mtx>(
-        gko::as<Mtx>(this->mtx5->inverse_row_permute(&permute_idxs))
-            ->inverse_column_permute(&permute_idxs));
-    this->mtx5->inverse_permute(&permute_idxs, permuted);
+    empty->move_to(res);
 
-    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+    ASSERT_EQ(res->get_num_nonzeros(), 0);
+    ASSERT_EQ(*res->get_const_row_ptrs(), 0);
+    ASSERT_FALSE(res->get_size());
 }
 
 
-TYPED_TEST(Dense, SquareSubmatrixIsInversePermutableIntoDense64)
+TYPED_TEST(DenseWithIndexType, ConvertsEmptyToEll)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 0}};
-    auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
-    auto mtx = this->mtx5->create_submatrix({0, 2}, {1, 3});
+    using Dense = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Ell = typename gko::matrix::Ell<value_type, index_type>;
+    auto empty = Dense::create(this->exec);
+    auto res = Ell::create(this->exec);
 
-    auto ref_permuted =
-        gko::as<Mtx>(gko::as<Mtx>(mtx->inverse_row_permute(&permute_idxs))
-                         ->inverse_column_permute(&permute_idxs));
-    mtx->inverse_permute(&permute_idxs, permuted);
+    empty->convert_to(res);
 
-    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
-    ASSERT_EQ(permuted->get_stride(), 4);
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());
 }
 
 
-TYPED_TEST(Dense, NonSquareMatrixInversePermuteIntoDenseFails64)
+TYPED_TEST(DenseWithIndexType, MovesEmptyToEll)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx4->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
+    using Dense = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Ell = typename gko::matrix::Ell<value_type, index_type>;
+    auto empty = Dense::create(this->exec);
+    auto res = Ell::create(this->exec);
 
-    ASSERT_THROW(
-        this->mtx4->inverse_permute(&permute_idxs, this->mtx4->clone()),
-        gko::DimensionMismatch);
+    empty->move_to(res);
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());
 }
 
 
-TYPED_TEST(Dense,
-           SquareMatrixInversePermuteIntoDenseFailsForWrongPermutationSize64)
+TYPED_TEST(DenseWithIndexType, ConvertsEmptyToHybrid)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2}};
+    using Dense = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Hybrid = typename gko::matrix::Hybrid<value_type, index_type>;
+    auto empty = Dense::create(this->exec);
+    auto res = Hybrid::create(this->exec);
 
-    ASSERT_THROW(
-        this->mtx5->inverse_permute(&permute_idxs, this->mtx5->clone()),
-        gko::ValueMismatch);
+    empty->convert_to(res);
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());
 }
 
 
-TYPED_TEST(Dense, SquareMatrixInversePermuteIntoDenseFailsForWrongDimensions64)
+TYPED_TEST(DenseWithIndexType, MovesEmptyToHybrid)
 {
-    using Mtx = typename TestFixture::Mtx;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
+    using Dense = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Hybrid = typename gko::matrix::Hybrid<value_type, index_type>;
+    auto empty = Dense::create(this->exec);
+    auto res = Hybrid::create(this->exec);
 
-    ASSERT_THROW(this->mtx5->inverse_permute(&permute_idxs, Mtx::create(exec)),
-                 gko::DimensionMismatch);
+    empty->move_to(res);
+
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_FALSE(res->get_size());
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsRowPermutable)
+TYPED_TEST(DenseWithIndexType, ConvertsEmptyToSellp)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
+    using Dense = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Sellp = typename gko::matrix::Sellp<value_type, index_type>;
+    auto empty = Dense::create(this->exec);
+    auto res = Sellp::create(this->exec);
 
-    auto row_permute = gko::as<Mtx>(this->mtx5->row_permute(&permute_idxs));
+    empty->convert_to(res);
 
-    GKO_ASSERT_MTX_NEAR(
-        row_permute,
-        l<T>({{-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}}), 0.0);
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_EQ(*res->get_const_slice_sets(), 0);
+    ASSERT_FALSE(res->get_size());
 }
 
 
-TYPED_TEST(Dense, NonSquareMatrixIsRowPermutable)
+TYPED_TEST(DenseWithIndexType, MovesEmptyToSellp)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx4->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 0}};
+    using Dense = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Sellp = typename gko::matrix::Sellp<value_type, index_type>;
+    auto empty = Dense::create(this->exec);
+    auto res = Sellp::create(this->exec);
 
-    auto row_permute = gko::as<Mtx>(this->mtx4->row_permute(&permute_idxs));
+    empty->move_to(res);
 
-    GKO_ASSERT_MTX_NEAR(row_permute, l<T>({{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}}),
-                        0.0);
+    ASSERT_EQ(res->get_num_stored_elements(), 0);
+    ASSERT_EQ(*res->get_const_slice_sets(), 0);
+    ASSERT_FALSE(res->get_size());
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsRowPermutableIntoDense)
+template <typename ValueType, typename IndexType>
+std::unique_ptr<gko::matrix::Dense<ValueType>> ref_permute(
+    gko::matrix::Dense<ValueType>* input,
+    gko::matrix::Permutation<IndexType>* permutation,
+    gko::matrix::permute_mode mode)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
-    auto row_permute = Mtx::create(exec, this->mtx5->get_size());
-
-    this->mtx5->row_permute(&permute_idxs, row_permute);
-
-    GKO_ASSERT_MTX_NEAR(
-        row_permute,
-        l<T>({{-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}}), 0.0);
+    using gko::matrix::permute_mode;
+    auto result = input->clone();
+    auto permutation_dense =
+        gko::matrix::Dense<double>::create(input->get_executor());
+    gko::matrix_data<double, IndexType> permutation_data;
+    if ((mode & permute_mode::inverse) == permute_mode::inverse) {
+        permutation->compute_inverse()->write(permutation_data);
+    } else {
+        permutation->write(permutation_data);
+    }
+    permutation_dense->read(permutation_data);
+    if ((mode & permute_mode::rows) == permute_mode::rows) {
+        // compute P * A
+        permutation_dense->apply(input, result);
+    }
+    if ((mode & permute_mode::columns) == permute_mode::columns) {
+        // compute A * P^T = (P * A^T)^T
+        auto tmp = result->transpose();
+        auto tmp2 = gko::as<gko::matrix::Dense<ValueType>>(tmp->clone());
+        permutation_dense->apply(tmp, tmp2);
+        tmp2->transpose(result);
+    }
+    return result;
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<gko::matrix::Dense<ValueType>> ref_permute(
+    gko::matrix::Dense<ValueType>* input,
+    gko::matrix::Permutation<IndexType>* row_permutation,
+    gko::matrix::Permutation<IndexType>* col_permutation, bool invert)
+{
+    using gko::matrix::permute_mode;
+    auto result = input->clone();
+    auto row_permutation_dense =
+        gko::matrix::Dense<double>::create(input->get_executor());
+    auto col_permutation_dense =
+        gko::matrix::Dense<double>::create(input->get_executor());
+    gko::matrix_data<double, IndexType> row_permutation_data;
+    gko::matrix_data<double, IndexType> col_permutation_data;
+    if (invert) {
+        row_permutation->compute_inverse()->write(row_permutation_data);
+        col_permutation->compute_inverse()->write(col_permutation_data);
+    } else {
+        row_permutation->write(row_permutation_data);
+        col_permutation->write(col_permutation_data);
+    }
+    row_permutation_dense->read(row_permutation_data);
+    col_permutation_dense->read(col_permutation_data);
+    row_permutation_dense->apply(input, result);
+    auto tmp = result->transpose();
+    auto tmp2 = gko::as<gko::matrix::Dense<ValueType>>(tmp->clone());
+    col_permutation_dense->apply(tmp, tmp2);
+    tmp2->transpose(result);
+    return result;
 }
 
 
-TYPED_TEST(Dense, SquareSubmatrixIsRowPermutableIntoDense)
+TYPED_TEST(DenseWithIndexType, Permute)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 0}};
-    auto row_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
+    using gko::matrix::permute_mode;
 
-    this->mtx5->create_submatrix({0, 2}, {0, 2})
-        ->row_permute(&permute_idxs, row_permute);
+    for (auto mode :
+         {permute_mode::none, permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+
+        auto permuted = this->mtx5->permute(this->perm3, mode);
+        auto ref_permuted =
+            ref_permute(this->mtx5.get(), this->perm3.get(), mode);
 
-    GKO_ASSERT_MTX_NEAR(row_permute, l<T>({{-2.0, 2.0}, {1.0, -1.0}}), 0.0);
-    ASSERT_EQ(row_permute->get_stride(), 4);
+        GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+    }
 }
 
 
-TYPED_TEST(Dense, SquareMatrixRowPermuteIntoDenseFailsForWrongPermutationSize)
+TYPED_TEST(DenseWithIndexType, PermuteRoundtrip)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2}};
-    auto row_permute = Mtx::create(exec, this->mtx5->get_size());
+    using gko::matrix::permute_mode;
+
+    for (auto mode :
+         {permute_mode::rows, permute_mode::columns, permute_mode::symmetric}) {
+        SCOPED_TRACE(mode);
+
+        auto permuted =
+            this->mtx5->permute(this->perm3, mode)
+                ->permute(this->perm3, mode | permute_mode::inverse);
 
-    ASSERT_THROW(this->mtx5->row_permute(&permute_idxs, row_permute),
-                 gko::ValueMismatch);
+        GKO_ASSERT_MTX_NEAR(this->mtx5, permuted, 0.0);
+    }
 }
 
 
-TYPED_TEST(Dense, SquareMatrixRowPermuteIntoDenseFailsForWrongDimensions)
+TYPED_TEST(DenseWithIndexType, PermuteStridedIntoDense)
 {
+    using gko::matrix::permute_mode;
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
+    auto mtx = Mtx::create(this->exec, this->mtx5->get_size(),
+                           this->mtx5->get_size()[1] + 1);
+    mtx->copy_from(this->mtx5);
 
-    ASSERT_THROW(this->mtx5->row_permute(&permute_idxs, Mtx::create(exec)),
-                 gko::DimensionMismatch);
+    for (auto mode :
+         {permute_mode::none, permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse,
+          permute_mode::inverse_rows, permute_mode::inverse_columns,
+          permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+        auto permuted = Mtx::create(this->exec, this->mtx5->get_size(),
+                                    this->mtx5->get_size()[1] + 2);
+
+        this->mtx5->permute(this->perm3, permuted, mode);
+        auto ref_permuted =
+            ref_permute(this->mtx5.get(), this->perm3.get(), mode);
+
+        GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+    }
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsColPermutable)
+TYPED_TEST(DenseWithIndexType, PermuteRectangular)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
+    using gko::matrix::permute_mode;
 
-    auto c_permute = gko::as<Mtx>(this->mtx5->column_permute(&permute_idxs));
+    auto rpermuted = this->mtx1->permute(this->perm2, permute_mode::rows);
+    auto irpermuted =
+        this->mtx1->permute(this->perm2, permute_mode::inverse_rows);
+    auto cpermuted = this->mtx1->permute(this->perm3, permute_mode::columns);
+    auto icpermuted =
+        this->mtx1->permute(this->perm3, permute_mode::inverse_columns);
+    auto ref_rpermuted =
+        ref_permute(this->mtx1.get(), this->perm2.get(), permute_mode::rows);
+    auto ref_irpermuted = ref_permute(this->mtx1.get(), this->perm2.get(),
+                                      permute_mode::inverse_rows);
+    auto ref_cpermuted =
+        ref_permute(this->mtx1.get(), this->perm3.get(), permute_mode::columns);
+    auto ref_icpermuted = ref_permute(this->mtx1.get(), this->perm3.get(),
+                                      permute_mode::inverse_columns);
 
-    GKO_ASSERT_MTX_NEAR(
-        c_permute, l<T>({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}),
-        0.0);
+    GKO_ASSERT_MTX_NEAR(rpermuted, ref_rpermuted, 0.0);
+    GKO_ASSERT_MTX_NEAR(irpermuted, ref_irpermuted, 0.0);
+    GKO_ASSERT_MTX_NEAR(cpermuted, ref_cpermuted, 0.0);
+    GKO_ASSERT_MTX_NEAR(icpermuted, ref_icpermuted, 0.0);
 }
 
 
-TYPED_TEST(Dense, NonSquareMatrixIsColPermutable)
+TYPED_TEST(DenseWithIndexType, PermuteFailsWithIncorrectPermutationSize)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx4->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
+    using gko::matrix::permute_mode;
 
-    auto c_permute = gko::as<Mtx>(this->mtx4->column_permute(&permute_idxs));
+    for (auto mode :
+         {/* no permute_mode::none */ permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
 
-    GKO_ASSERT_MTX_NEAR(c_permute, l<T>({{3.0, 2.0, 1.0}, {5.0, 0.0, 0.0}}),
-                        0.0);
+        ASSERT_THROW(this->mtx5->permute(this->perm0, mode),
+                     gko::DimensionMismatch);
+    }
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsColPermutableIntoDense)
+TYPED_TEST(DenseWithIndexType, PermuteFailsWithIncorrectOutputSize)
 {
+    using gko::matrix::permute_mode;
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
-    auto c_permute = Mtx::create(exec, this->mtx5->get_size());
+    auto output = Mtx::create(this->exec);
 
-    this->mtx5->column_permute(&permute_idxs, c_permute);
+    for (auto mode :
+         {permute_mode::none, permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
 
-    GKO_ASSERT_MTX_NEAR(
-        c_permute, l<T>({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}),
-        0.0);
+        ASSERT_THROW(this->mtx5->permute(this->perm3, output, mode),
+                     gko::DimensionMismatch);
+    }
 }
 
 
-TYPED_TEST(Dense, SquareSubmatrixIsColPermutableIntoDense)
+TYPED_TEST(DenseWithIndexType, NonsymmPermute)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 0}};
-    auto c_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
-
-    this->mtx5->create_submatrix({0, 2}, {0, 2})
-        ->column_permute(&permute_idxs, c_permute);
+    auto permuted = this->mtx5->permute(this->perm3, this->perm3_rev);
+    auto ref_permuted = ref_permute(this->mtx5.get(), this->perm3.get(),
+                                    this->perm3_rev.get(), false);
 
-    GKO_ASSERT_MTX_NEAR(c_permute, l<T>({{-1.0, 1.0}, {2.0, -2.0}}), 0.0);
-    ASSERT_EQ(c_permute->get_stride(), 4);
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixColPermuteIntoDenseFailsForWrongPermutationSize)
+TYPED_TEST(DenseWithIndexType, NonsymmPermuteInverse)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2}};
-    auto row_permute = Mtx::create(exec, this->mtx5->get_size());
+    auto permuted = this->mtx5->permute(this->perm3, this->perm3_rev, true);
+    auto ref_permuted = ref_permute(this->mtx5.get(), this->perm3.get(),
+                                    this->perm3_rev.get(), true);
 
-    ASSERT_THROW(this->mtx5->column_permute(&permute_idxs, row_permute),
-                 gko::ValueMismatch);
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixColPermuteIntoDenseFailsForWrongDimensions)
+TYPED_TEST(DenseWithIndexType, NonsymmPermuteRectangular)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
+    auto permuted = this->mtx1->permute(this->perm2, this->perm3);
+    auto ref_permuted = ref_permute(this->mtx1.get(), this->perm2.get(),
+                                    this->perm3.get(), false);
 
-    ASSERT_THROW(this->mtx5->column_permute(&permute_idxs, Mtx::create(exec)),
-                 gko::DimensionMismatch);
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsInverseRowPermutable)
+TYPED_TEST(DenseWithIndexType, NonsymmPermuteInverseRectangular)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> inverse_permute_idxs{exec, {1, 2, 0}};
-
-    auto inverse_row_permute =
-        gko::as<Mtx>(this->mtx5->inverse_row_permute(&inverse_permute_idxs));
+    auto permuted = this->mtx1->permute(this->perm2, this->perm3, true);
+    auto ref_permuted = ref_permute(this->mtx1.get(), this->perm2.get(),
+                                    this->perm3.get(), true);
 
-    GKO_ASSERT_MTX_NEAR(
-        inverse_row_permute,
-        l<T>({{2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}}), 0.0);
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
 }
 
 
-TYPED_TEST(Dense, NonSquareMatrixIsInverseRowPermutable)
+TYPED_TEST(DenseWithIndexType, NonsymmPermuteRoundtrip)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx4->get_executor();
-    gko::array<gko::int32> inverse_permute_idxs{exec, {1, 0}};
-
-    auto inverse_row_permute =
-        gko::as<Mtx>(this->mtx4->inverse_row_permute(&inverse_permute_idxs));
+    auto permuted = this->mtx5->permute(this->perm3, this->perm3_rev)
+                        ->permute(this->perm3, this->perm3_rev, true);
 
-    GKO_ASSERT_MTX_NEAR(inverse_row_permute,
-                        l<T>({{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}}), 0.0);
+    GKO_ASSERT_MTX_NEAR(this->mtx5, permuted, 0.0);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsInverseRowPermutableIntoDense)
+TYPED_TEST(DenseWithIndexType, NonsymmPermuteInverseInverted)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
-    auto row_permute = Mtx::create(exec, this->mtx5->get_size());
+    auto inv_permuted = this->mtx5->permute(this->perm3, this->perm3_rev, true);
+    auto preinv_permuted = this->mtx5->permute(this->perm3_rev, this->perm3);
 
-    this->mtx5->inverse_row_permute(&permute_idxs, row_permute);
-
-    GKO_ASSERT_MTX_NEAR(
-        row_permute,
-        l<T>({{2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}}), 0.0);
+    GKO_ASSERT_MTX_NEAR(inv_permuted, preinv_permuted, 0.0);
 }
 
 
-TYPED_TEST(Dense, SquareSubmatrixIsInverseRowPermutableIntoDense)
+TYPED_TEST(DenseWithIndexType, NonsymmPermuteStridedIntoDense)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 0}};
-    auto row_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
+    auto mtx = Mtx::create(this->exec, this->mtx5->get_size(),
+                           this->mtx5->get_size()[1] + 1);
+    auto permuted = Mtx::create(this->exec, this->mtx5->get_size(),
+                                this->mtx5->get_size()[1] + 2);
+    mtx->copy_from(this->mtx5);
 
-    this->mtx5->create_submatrix({0, 2}, {0, 2})
-        ->inverse_row_permute(&permute_idxs, row_permute);
+    mtx->permute(this->perm3, this->perm3_rev, permuted);
+    auto ref_permuted = ref_permute(this->mtx5.get(), this->perm3.get(),
+                                    this->perm3_rev.get(), false);
 
-    GKO_ASSERT_MTX_NEAR(row_permute, l<T>({{-2.0, 2.0}, {1.0, -1.0}}), 0.0);
-    ASSERT_EQ(row_permute->get_stride(), 4);
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
 }
 
 
-TYPED_TEST(Dense,
-           SquareMatrixInverseRowPermuteIntoDenseFailsForWrongPermutationSize)
+TYPED_TEST(DenseWithIndexType, NonsymmPermuteInverseStridedIntoDense)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2}};
-    auto row_permute = Mtx::create(exec, this->mtx5->get_size());
+    auto mtx = Mtx::create(this->exec, this->mtx5->get_size(),
+                           this->mtx5->get_size()[1] + 1);
+    auto permuted = Mtx::create(this->exec, this->mtx5->get_size(),
+                                this->mtx5->get_size()[1] + 2);
+    mtx->copy_from(this->mtx5);
+
+    mtx->permute(this->perm3, this->perm3_rev, permuted, true);
+    auto ref_permuted = ref_permute(this->mtx5.get(), this->perm3.get(),
+                                    this->perm3_rev.get(), true);
 
-    ASSERT_THROW(this->mtx5->inverse_row_permute(&permute_idxs, row_permute),
-                 gko::ValueMismatch);
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixInverseRowPermuteIntoDenseFailsForWrongDimensions)
+TYPED_TEST(DenseWithIndexType, NonsymmPermuteFailsWithIncorrectPermutationSize)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
-
-    ASSERT_THROW(
-        this->mtx5->inverse_row_permute(&permute_idxs, Mtx::create(exec)),
-        gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx5->permute(this->perm0, this->perm3_rev),
+                 gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx5->permute(this->perm3_rev, this->perm0),
+                 gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx5->permute(this->perm0, this->perm0),
+                 gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsInverseColPermutable)
+TYPED_TEST(DenseWithIndexType, SquareMatrixCanGatherRows)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> inverse_permute_idxs{exec, {1, 2, 0}};
+    gko::array<index_type> permute_idxs{exec, {1, 0}};
 
-    auto inverse_c_permute =
-        gko::as<Mtx>(this->mtx5->inverse_column_permute(&inverse_permute_idxs));
+    auto row_collection = this->mtx5->row_gather(&permute_idxs);
 
-    GKO_ASSERT_MTX_NEAR(
-        inverse_c_permute,
-        l<T>({{-0.5, 1.0, -1.0}, {4.5, -2.0, 2.0}, {1.2, 2.1, 3.4}}), 0.0);
+    GKO_ASSERT_MTX_NEAR(row_collection,
+                        l<value_type>({{-2.0, 2.0, 4.5}, {1.0, -1.0, -0.5}}),
+                        0.0);
 }
 
 
-TYPED_TEST(Dense, NonSquareMatrixIsInverseColPermutable)
+TYPED_TEST(DenseWithIndexType, SquareMatrixCanGatherRowsIntoDense)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx4->get_executor();
-    gko::array<gko::int32> inverse_permute_idxs{exec, {1, 2, 0}};
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx5->get_executor();
+    gko::array<index_type> permute_idxs{exec, {1, 0}};
+    auto row_collection = Mtx::create(exec, gko::dim<2>{2, 3});
 
-    auto inverse_c_permute =
-        gko::as<Mtx>(this->mtx4->inverse_column_permute(&inverse_permute_idxs));
+    this->mtx5->row_gather(&permute_idxs, row_collection);
 
-    GKO_ASSERT_MTX_NEAR(inverse_c_permute,
-                        l<T>({{2.0, 1.0, 3.0}, {0.0, 0.0, 5.0}}), 0.0);
+    GKO_ASSERT_MTX_NEAR(row_collection,
+                        l<value_type>({{-2.0, 2.0, 4.5}, {1.0, -1.0, -0.5}}),
+                        0.0);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsInverseColPermutableIntoDense)
+TYPED_TEST(DenseWithIndexType, SquareSubmatrixCanGatherRowsIntoDense)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
-    auto c_permute = Mtx::create(exec, this->mtx5->get_size());
+    gko::array<index_type> permute_idxs{exec, {1, 0}};
+    auto row_collection = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
 
-    this->mtx5->inverse_column_permute(&permute_idxs, c_permute);
+    this->mtx5->create_submatrix({0, 2}, {1, 3})
+        ->row_gather(&permute_idxs, row_collection);
 
-    GKO_ASSERT_MTX_NEAR(
-        c_permute, l<T>({{-0.5, 1.0, -1.0}, {4.5, -2.0, 2.0}, {1.2, 2.1, 3.4}}),
-        0.0);
+    GKO_ASSERT_MTX_NEAR(row_collection,
+                        l<value_type>({{2.0, 4.5}, {-1.0, -0.5}}), 0.0);
+    ASSERT_EQ(row_collection->get_stride(), 4);
 }
 
 
-TYPED_TEST(Dense, SquareSubmatrixIsInverseColPermutableIntoDense)
+TYPED_TEST(DenseWithIndexType, NonSquareSubmatrixCanGatherRowsIntoMixedDense)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 0}};
-    auto c_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
+    using MixedMtx = typename TestFixture::MixedMtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx4->get_executor();
+    gko::array<index_type> gather_index{exec, {1, 0, 1}};
+    auto row_collection = MixedMtx::create(exec, gko::dim<2>{3, 3}, 4);
 
-    this->mtx5->create_submatrix({0, 2}, {0, 2})
-        ->column_permute(&permute_idxs, c_permute);
+    this->mtx4->row_gather(&gather_index, row_collection);
 
-    GKO_ASSERT_MTX_NEAR(c_permute, l<T>({{-1.0, 1.0}, {2.0, -2.0}}), 0.0);
-    ASSERT_EQ(c_permute->get_stride(), 4);
+    GKO_ASSERT_MTX_NEAR(
+        row_collection,
+        l<typename MixedMtx::value_type>(
+            {{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}, {0.0, 5.0, 0.0}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense,
-           SquareMatrixInverseColPermuteIntoDenseFailsForWrongPermutationSize)
+TYPED_TEST(DenseWithIndexType,
+           NonSquareSubmatrixCanAdvancedGatherRowsIntoMixedDense)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2}};
-    auto row_permute = Mtx::create(exec, this->mtx5->get_size());
+    using MixedMtx = typename TestFixture::MixedMtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx4->get_executor();
+    gko::array<index_type> gather_index{exec, {1, 0, 1}};
+    auto row_collection = gko::initialize<MixedMtx>(
+        {{1.0, 0.5, -1.0}, {-1.5, 0.5, 1.0}, {2.0, -3.0, 1.0}}, exec);
+    auto alpha = gko::initialize<MixedMtx>({1.0}, exec);
+    auto beta = gko::initialize<Mtx>({2.0}, exec);
+
+    this->mtx4->row_gather(alpha, &gather_index, beta, row_collection);
 
-    ASSERT_THROW(this->mtx5->inverse_column_permute(&permute_idxs, row_permute),
-                 gko::ValueMismatch);
+    GKO_ASSERT_MTX_NEAR(
+        row_collection,
+        l<typename MixedMtx::value_type>(
+            {{2.0, 6.0, -2.0}, {-2.0, 4.0, 4.0}, {4.0, -1.0, 2.0}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixInverseColPermuteIntoDenseFailsForWrongDimensions)
+TYPED_TEST(DenseWithIndexType,
+           SquareMatrixGatherRowsIntoDenseFailsForWrongDimensions)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int32> permute_idxs{exec, {1, 2, 0}};
-
-    ASSERT_THROW(
-        this->mtx5->inverse_column_permute(&permute_idxs, Mtx::create(exec)),
-        gko::DimensionMismatch);
+    gko::array<index_type> permute_idxs{exec, {1, 0}};
+
+    ASSERT_THROW(this->mtx5->row_gather(&permute_idxs, Mtx::create(exec)),
+                 gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsRowPermutable64)
+TYPED_TEST(DenseWithIndexType, SquareMatrixIsPermutable)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
 
-    auto row_permute = gko::as<Mtx>(this->mtx5->row_permute(&permute_idxs));
+    auto ref_permuted =
+        gko::as<Mtx>(gko::as<Mtx>(this->mtx5->row_permute(&permute_idxs))
+                         ->column_permute(&permute_idxs));
+    auto permuted = gko::as<Mtx>(this->mtx5->permute(&permute_idxs));
 
-    GKO_ASSERT_MTX_NEAR(
-        row_permute,
-        l<T>({{-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}}), 0.0);
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
 }
 
 
-TYPED_TEST(Dense, NonSquareMatrixIsRowPermutable64)
+TYPED_TEST(DenseWithIndexType, SquareMatrixIsPermutableIntoDense)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx4->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 0}};
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx5->get_executor();
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
+    auto permuted = Mtx::create(exec, this->mtx5->get_size());
 
-    auto row_permute = gko::as<Mtx>(this->mtx4->row_permute(&permute_idxs));
+    auto ref_permuted =
+        gko::as<Mtx>(gko::as<Mtx>(this->mtx5->row_permute(&permute_idxs))
+                         ->column_permute(&permute_idxs));
+    this->mtx5->permute(&permute_idxs, permuted);
 
-    GKO_ASSERT_MTX_NEAR(row_permute, l<T>({{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}}),
-                        0.0);
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsRowPermutableIntoDense64)
+TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsPermutableIntoDense)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
-    auto row_permute = Mtx::create(exec, this->mtx5->get_size());
+    gko::array<index_type> permute_idxs{exec, {1, 0}};
+    auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
+    auto mtx = this->mtx5->create_submatrix({0, 2}, {1, 3});
 
-    this->mtx5->row_permute(&permute_idxs, row_permute);
+    auto ref_permuted =
+        gko::as<Mtx>(gko::as<Mtx>(mtx->row_permute(&permute_idxs))
+                         ->column_permute(&permute_idxs));
+    mtx->permute(&permute_idxs, permuted);
 
-    GKO_ASSERT_MTX_NEAR(
-        row_permute,
-        l<T>({{-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}}), 0.0);
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+    ASSERT_EQ(permuted->get_stride(), 4);
 }
 
 
-TYPED_TEST(Dense, SquareSubmatrixIsRowPermutableIntoDense64)
+TYPED_TEST(DenseWithIndexType, NonSquareMatrixPermuteIntoDenseFails)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 0}};
-    auto row_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
-
-    this->mtx5->create_submatrix({0, 2}, {0, 2})
-        ->row_permute(&permute_idxs, row_permute);
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx4->get_executor();
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
 
-    GKO_ASSERT_MTX_NEAR(row_permute, l<T>({{-2.0, 2.0}, {1.0, -1.0}}), 0.0);
-    ASSERT_EQ(row_permute->get_stride(), 4);
+    ASSERT_THROW(this->mtx4->permute(&permute_idxs, this->mtx4->clone()),
+                 gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixRowPermuteIntoDenseFailsForWrongPermutationSize64)
+TYPED_TEST(DenseWithIndexType,
+           SquareMatrixPermuteIntoDenseFailsForWrongPermutationSize)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2}};
-    auto row_permute = Mtx::create(exec, this->mtx5->get_size());
+    gko::array<index_type> permute_idxs{exec, {1, 2}};
 
-    ASSERT_THROW(this->mtx5->row_permute(&permute_idxs, row_permute),
-                 gko::ValueMismatch);
+    ASSERT_THROW(this->mtx5->permute(&permute_idxs, this->mtx5->clone()),
+                 gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixRowPermuteIntoDenseFailsForWrongDimensions64)
+TYPED_TEST(DenseWithIndexType,
+           SquareMatrixPermuteIntoDenseFailsForWrongDimensions)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
 
-    ASSERT_THROW(this->mtx5->row_permute(&permute_idxs, Mtx::create(exec)),
+    ASSERT_THROW(this->mtx5->permute(&permute_idxs, Mtx::create(exec)),
                  gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsColPermutable64)
+TYPED_TEST(DenseWithIndexType, SquareMatrixIsInversePermutable)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
 
-    auto c_permute = gko::as<Mtx>(this->mtx5->column_permute(&permute_idxs));
+    auto ref_permuted = gko::as<Mtx>(
+        gko::as<Mtx>(this->mtx5->inverse_row_permute(&permute_idxs))
+            ->inverse_column_permute(&permute_idxs));
+    auto permuted = gko::as<Mtx>(this->mtx5->inverse_permute(&permute_idxs));
 
-    GKO_ASSERT_MTX_NEAR(
-        c_permute, l<T>({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}),
-        0.0);
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
 }
 
 
-TYPED_TEST(Dense, NonSquareMatrixIsColPermutable64)
+TYPED_TEST(DenseWithIndexType, SquareMatrixIsInversePermutableIntoDense)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx4->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx5->get_executor();
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
+    auto permuted = Mtx::create(exec, this->mtx5->get_size());
 
-    auto c_permute = gko::as<Mtx>(this->mtx4->column_permute(&permute_idxs));
+    auto ref_permuted = gko::as<Mtx>(
+        gko::as<Mtx>(this->mtx5->inverse_row_permute(&permute_idxs))
+            ->inverse_column_permute(&permute_idxs));
+    this->mtx5->inverse_permute(&permute_idxs, permuted);
 
-    GKO_ASSERT_MTX_NEAR(c_permute, l<T>({{3.0, 2.0, 1.0}, {5.0, 0.0, 0.0}}),
-                        0.0);
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsColPermutableIntoDense64)
+TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsInversePermutableIntoDense)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
-    auto c_permute = Mtx::create(exec, this->mtx5->get_size());
+    gko::array<index_type> permute_idxs{exec, {1, 0}};
+    auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
+    auto mtx = this->mtx5->create_submatrix({0, 2}, {1, 3});
 
-    this->mtx5->column_permute(&permute_idxs, c_permute);
+    auto ref_permuted =
+        gko::as<Mtx>(gko::as<Mtx>(mtx->inverse_row_permute(&permute_idxs))
+                         ->inverse_column_permute(&permute_idxs));
+    mtx->inverse_permute(&permute_idxs, permuted);
 
-    GKO_ASSERT_MTX_NEAR(
-        c_permute, l<T>({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}),
-        0.0);
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, 0.0);
+    ASSERT_EQ(permuted->get_stride(), 4);
 }
 
 
-TYPED_TEST(Dense, SquareSubmatrixIsColPermutableIntoDense64)
+TYPED_TEST(DenseWithIndexType, NonSquareMatrixInversePermuteIntoDenseFails)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 0}};
-    auto c_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
-
-    this->mtx5->create_submatrix({0, 2}, {0, 2})
-        ->column_permute(&permute_idxs, c_permute);
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx4->get_executor();
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
 
-    GKO_ASSERT_MTX_NEAR(c_permute, l<T>({{-1.0, 1.0}, {2.0, -2.0}}), 0.0);
-    ASSERT_EQ(c_permute->get_stride(), 4);
+    ASSERT_THROW(
+        this->mtx4->inverse_permute(&permute_idxs, this->mtx4->clone()),
+        gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixColPermuteIntoDenseFailsForWrongPermutationSize64)
+TYPED_TEST(DenseWithIndexType,
+           SquareMatrixInversePermuteIntoDenseFailsForWrongPermutationSize)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2}};
-    auto row_permute = Mtx::create(exec, this->mtx5->get_size());
+    gko::array<index_type> permute_idxs{exec, {0, 1}};
 
-    ASSERT_THROW(this->mtx5->column_permute(&permute_idxs, row_permute),
-                 gko::ValueMismatch);
+    ASSERT_THROW(
+        this->mtx5->inverse_permute(&permute_idxs, this->mtx5->clone()),
+        gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixColPermuteIntoDenseFailsForWrongDimensions64)
+TYPED_TEST(DenseWithIndexType,
+           SquareMatrixInversePermuteIntoDenseFailsForWrongDimensions)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
 
-    ASSERT_THROW(this->mtx5->column_permute(&permute_idxs, Mtx::create(exec)),
+    ASSERT_THROW(this->mtx5->inverse_permute(&permute_idxs, Mtx::create(exec)),
                  gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsInverseRowPermutable64)
+TYPED_TEST(DenseWithIndexType, SquareMatrixIsRowPermutable)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> inverse_permute_idxs{exec, {1, 2, 0}};
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
 
-    auto inverse_row_permute =
-        gko::as<Mtx>(this->mtx5->inverse_row_permute(&inverse_permute_idxs));
+    auto permuted = gko::as<Mtx>(this->mtx5->row_permute(&permute_idxs));
 
     GKO_ASSERT_MTX_NEAR(
-        inverse_row_permute,
-        l<T>({{2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}}), 0.0);
+        permuted,
+        l<value_type>({{-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense, NonSquareMatrixIsInverseRowPermutable64)
+TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsRowPermutable)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx4->get_executor();
-    gko::array<gko::int64> inverse_permute_idxs{exec, {1, 0}};
+    gko::array<index_type> permute_idxs{exec, {1, 0}};
 
-    auto inverse_row_permute =
-        gko::as<Mtx>(this->mtx4->inverse_row_permute(&inverse_permute_idxs));
+    auto permuted = gko::as<Mtx>(this->mtx4->row_permute(&permute_idxs));
 
-    GKO_ASSERT_MTX_NEAR(inverse_row_permute,
-                        l<T>({{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}}), 0.0);
+    GKO_ASSERT_MTX_NEAR(permuted,
+                        l<value_type>({{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}}), 0.0);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsInverseRowPermutableIntoDense64)
+TYPED_TEST(DenseWithIndexType, SquareMatrixIsRowPermutableIntoDense)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
-    auto row_permute = Mtx::create(exec, this->mtx5->get_size());
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
+    auto permuted = Mtx::create(exec, this->mtx5->get_size());
 
-    this->mtx5->inverse_row_permute(&permute_idxs, row_permute);
+    this->mtx5->row_permute(&permute_idxs, permuted);
 
     GKO_ASSERT_MTX_NEAR(
-        row_permute,
-        l<T>({{2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}}), 0.0);
+        permuted,
+        l<value_type>({{-2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense, SquareSubmatrixIsInverseRowPermutableIntoDense64)
+TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsRowPermutableIntoDense)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 0}};
-    auto row_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
+    gko::array<index_type> permute_idxs{exec, {1, 0}};
+    auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
 
     this->mtx5->create_submatrix({0, 2}, {0, 2})
-        ->inverse_row_permute(&permute_idxs, row_permute);
+        ->row_permute(&permute_idxs, permuted);
 
-    GKO_ASSERT_MTX_NEAR(row_permute, l<T>({{-2.0, 2.0}, {1.0, -1.0}}), 0.0);
-    ASSERT_EQ(row_permute->get_stride(), 4);
+    GKO_ASSERT_MTX_NEAR(permuted, l<value_type>({{-2.0, 2.0}, {1.0, -1.0}}),
+                        0.0);
+    ASSERT_EQ(permuted->get_stride(), 4);
 }
 
 
-TYPED_TEST(Dense,
-           SquareMatrixInverseRowPermuteIntoDenseFailsForWrongPermutationSize64)
+TYPED_TEST(DenseWithIndexType,
+           SquareMatrixRowPermuteIntoDenseFailsForWrongPermutationSize)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2}};
-    auto row_permute = Mtx::create(exec, this->mtx5->get_size());
+    gko::array<index_type> permute_idxs{exec, {1, 2}};
+    auto permuted = Mtx::create(exec, this->mtx5->get_size());
 
-    ASSERT_THROW(this->mtx5->inverse_row_permute(&permute_idxs, row_permute),
-                 gko::ValueMismatch);
+    ASSERT_THROW(this->mtx5->row_permute(&permute_idxs, permuted),
+                 gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense,
-           SquareMatrixInverseRowPermuteIntoDenseFailsForWrongDimensions64)
+TYPED_TEST(DenseWithIndexType,
+           SquareMatrixRowPermuteIntoDenseFailsForWrongDimensions)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
 
-    ASSERT_THROW(
-        this->mtx5->inverse_row_permute(&permute_idxs, Mtx::create(exec)),
-        gko::DimensionMismatch);
+    ASSERT_THROW(this->mtx5->row_permute(&permute_idxs, Mtx::create(exec)),
+                 gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsInverseColPermutable64)
+TYPED_TEST(DenseWithIndexType, SquareMatrixIsColPermutable)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> inverse_permute_idxs{exec, {1, 2, 0}};
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
 
-    auto inverse_c_permute =
-        gko::as<Mtx>(this->mtx5->inverse_column_permute(&inverse_permute_idxs));
+    auto permuted = gko::as<Mtx>(this->mtx5->column_permute(&permute_idxs));
 
     GKO_ASSERT_MTX_NEAR(
-        inverse_c_permute,
-        l<T>({{-0.5, 1.0, -1.0}, {4.5, -2.0, 2.0}, {1.2, 2.1, 3.4}}), 0.0);
+        permuted,
+        l<value_type>({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense, NonSquareMatrixIsInverseColPermutable64)
+TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsColPermutable)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx4->get_executor();
-    gko::array<gko::int64> inverse_permute_idxs{exec, {1, 2, 0}};
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
 
-    auto inverse_c_permute =
-        gko::as<Mtx>(this->mtx4->inverse_column_permute(&inverse_permute_idxs));
+    auto permuted = gko::as<Mtx>(this->mtx4->column_permute(&permute_idxs));
 
-    GKO_ASSERT_MTX_NEAR(inverse_c_permute,
-                        l<T>({{2.0, 1.0, 3.0}, {0.0, 0.0, 5.0}}), 0.0);
+    GKO_ASSERT_MTX_NEAR(permuted,
+                        l<value_type>({{3.0, 2.0, 1.0}, {5.0, 0.0, 0.0}}), 0.0);
 }
 
 
-TYPED_TEST(Dense, SquareMatrixIsInverseColPermutableIntoDense64)
+TYPED_TEST(DenseWithIndexType, SquareMatrixIsColPermutableIntoDense)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
-    auto c_permute = Mtx::create(exec, this->mtx5->get_size());
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
+    auto permuted = Mtx::create(exec, this->mtx5->get_size());
 
-    this->mtx5->inverse_column_permute(&permute_idxs, c_permute);
+    this->mtx5->column_permute(&permute_idxs, permuted);
 
     GKO_ASSERT_MTX_NEAR(
-        c_permute, l<T>({{-0.5, 1.0, -1.0}, {4.5, -2.0, 2.0}, {1.2, 2.1, 3.4}}),
+        permuted,
+        l<value_type>({{-1.0, -0.5, 1.0}, {2.0, 4.5, -2.0}, {3.4, 1.2, 2.1}}),
         0.0);
 }
 
 
-TYPED_TEST(Dense, SquareSubmatrixIsInverseColPermutableIntoDense64)
+TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsColPermutableIntoDense)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 0}};
-    auto c_permute = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
+    gko::array<index_type> permute_idxs{exec, {1, 0}};
+    auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
 
     this->mtx5->create_submatrix({0, 2}, {0, 2})
-        ->column_permute(&permute_idxs, c_permute);
+        ->column_permute(&permute_idxs, permuted);
 
-    GKO_ASSERT_MTX_NEAR(c_permute, l<T>({{-1.0, 1.0}, {2.0, -2.0}}), 0.0);
-    ASSERT_EQ(c_permute->get_stride(), 4);
+    GKO_ASSERT_MTX_NEAR(permuted, l<value_type>({{-1.0, 1.0}, {2.0, -2.0}}),
+                        0.0);
+    ASSERT_EQ(permuted->get_stride(), 4);
 }
 
 
-TYPED_TEST(Dense,
-           SquareMatrixInverseColPermuteIntoDenseFailsForWrongPermutationSize64)
+TYPED_TEST(DenseWithIndexType,
+           SquareMatrixColPermuteIntoDenseFailsForWrongPermutationSize)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2}};
-    auto row_permute = Mtx::create(exec, this->mtx5->get_size());
+    gko::array<index_type> permute_idxs{exec, {1, 2}};
+    auto permuted = Mtx::create(exec, this->mtx5->get_size());
 
-    ASSERT_THROW(this->mtx5->inverse_column_permute(&permute_idxs, row_permute),
-                 gko::ValueMismatch);
+    ASSERT_THROW(this->mtx5->column_permute(&permute_idxs, permuted),
+                 gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense,
-           SquareMatrixInverseColPermuteIntoDenseFailsForWrongDimensions64)
+TYPED_TEST(DenseWithIndexType,
+           SquareMatrixColPermuteIntoDenseFailsForWrongDimensions)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
     auto exec = this->mtx5->get_executor();
-    gko::array<gko::int64> permute_idxs{exec, {1, 2, 0}};
-
-    ASSERT_THROW(
-        this->mtx5->inverse_column_permute(&permute_idxs, Mtx::create(exec)),
-        gko::DimensionMismatch);
-}
-
-
-TYPED_TEST(Dense, ExtractsDiagonalFromSquareMatrix)
-{
-    using T = typename TestFixture::value_type;
-
-    auto diag = this->mtx5->extract_diagonal();
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
 
-    ASSERT_EQ(diag->get_size()[0], 3);
-    ASSERT_EQ(diag->get_size()[1], 3);
-    ASSERT_EQ(diag->get_values()[0], T{1.});
-    ASSERT_EQ(diag->get_values()[1], T{2.});
-    ASSERT_EQ(diag->get_values()[2], T{1.2});
+    ASSERT_THROW(this->mtx5->column_permute(&permute_idxs, Mtx::create(exec)),
+                 gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, ExtractsDiagonalFromTallSkinnyMatrix)
+TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseRowPermutable)
 {
-    using T = typename TestFixture::value_type;
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx5->get_executor();
+    gko::array<index_type> inverse_permute_idxs{exec, {1, 2, 0}};
 
-    auto diag = this->mtx4->extract_diagonal();
+    auto permuted =
+        gko::as<Mtx>(this->mtx5->inverse_row_permute(&inverse_permute_idxs));
 
-    ASSERT_EQ(diag->get_size()[0], 2);
-    ASSERT_EQ(diag->get_size()[1], 2);
-    ASSERT_EQ(diag->get_values()[0], T{1.});
-    ASSERT_EQ(diag->get_values()[1], T{5.});
+    GKO_ASSERT_MTX_NEAR(
+        permuted,
+        l<value_type>({{2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense, ExtractsDiagonalFromShortFatMatrix)
+TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsInverseRowPermutable)
 {
-    using T = typename TestFixture::value_type;
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx4->get_executor();
+    gko::array<index_type> inverse_permute_idxs{exec, {1, 0}};
 
-    auto diag = this->mtx8->extract_diagonal();
+    auto permuted =
+        gko::as<Mtx>(this->mtx4->inverse_row_permute(&inverse_permute_idxs));
 
-    ASSERT_EQ(diag->get_size()[0], 2);
-    ASSERT_EQ(diag->get_size()[1], 2);
-    ASSERT_EQ(diag->get_values()[0], T{1.});
-    ASSERT_EQ(diag->get_values()[1], T{2.});
+    GKO_ASSERT_MTX_NEAR(permuted,
+                        l<value_type>({{0.0, 5.0, 0.0}, {1.0, 3.0, 2.0}}), 0.0);
 }
 
 
-TYPED_TEST(Dense, ExtractsDiagonalFromSquareMatrixIntoDiagonal)
+TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseRowPermutableIntoDense)
 {
-    using T = typename TestFixture::value_type;
-    auto diag = gko::matrix::Diagonal<T>::create(this->exec, 3);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx5->get_executor();
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
+    auto permuted = Mtx::create(exec, this->mtx5->get_size());
 
-    this->mtx5->extract_diagonal(diag);
+    this->mtx5->inverse_row_permute(&permute_idxs, permuted);
 
-    ASSERT_EQ(diag->get_size()[0], 3);
-    ASSERT_EQ(diag->get_size()[1], 3);
-    ASSERT_EQ(diag->get_values()[0], T{1.});
-    ASSERT_EQ(diag->get_values()[1], T{2.});
-    ASSERT_EQ(diag->get_values()[2], T{1.2});
+    GKO_ASSERT_MTX_NEAR(
+        permuted,
+        l<value_type>({{2.1, 3.4, 1.2}, {1.0, -1.0, -0.5}, {-2.0, 2.0, 4.5}}),
+        0.0);
 }
 
 
-TYPED_TEST(Dense, ExtractsDiagonalFromTallSkinnyMatrixIntoDiagonal)
+TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsInverseRowPermutableIntoDense)
 {
-    using T = typename TestFixture::value_type;
-    auto diag = gko::matrix::Diagonal<T>::create(this->exec, 2);
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx5->get_executor();
+    gko::array<index_type> permute_idxs{exec, {1, 0}};
+    auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
 
-    this->mtx4->extract_diagonal(diag);
+    this->mtx5->create_submatrix({0, 2}, {0, 2})
+        ->inverse_row_permute(&permute_idxs, permuted);
 
-    ASSERT_EQ(diag->get_size()[0], 2);
-    ASSERT_EQ(diag->get_size()[1], 2);
-    ASSERT_EQ(diag->get_values()[0], T{1.});
-    ASSERT_EQ(diag->get_values()[1], T{5.});
+    GKO_ASSERT_MTX_NEAR(permuted, l<value_type>({{-2.0, 2.0}, {1.0, -1.0}}),
+                        0.0);
+    ASSERT_EQ(permuted->get_stride(), 4);
 }
 
 
-TYPED_TEST(Dense, ExtractsDiagonalFromShortFatMatrixIntoDiagonal)
+TYPED_TEST(DenseWithIndexType,
+           SquareMatrixInverseRowPermuteIntoDenseFailsForWrongPermutationSize)
 {
-    using T = typename TestFixture::value_type;
-    auto diag = gko::matrix::Diagonal<T>::create(this->exec, 2);
-
-    this->mtx8->extract_diagonal(diag);
+    using Mtx = typename TestFixture::Mtx;
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx5->get_executor();
+    gko::array<index_type> permute_idxs{exec, {1, 2}};
+    auto permuted = Mtx::create(exec, this->mtx5->get_size());
 
-    ASSERT_EQ(diag->get_size()[0], 2);
-    ASSERT_EQ(diag->get_size()[1], 2);
-    ASSERT_EQ(diag->get_values()[0], T{1.});
-    ASSERT_EQ(diag->get_values()[1], T{2.});
+    ASSERT_THROW(this->mtx5->inverse_row_permute(&permute_idxs, permuted),
+                 gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, InplaceAbsolute)
+TYPED_TEST(DenseWithIndexType,
+           SquareMatrixInverseRowPermuteIntoDenseFailsForWrongDimensions)
 {
-    using T = typename TestFixture::value_type;
-
-    this->mtx5->compute_absolute_inplace();
+    using Mtx = typename TestFixture::Mtx;
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx5->get_executor();
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
 
-    GKO_ASSERT_MTX_NEAR(
-        this->mtx5, l<T>({{1.0, 1.0, 0.5}, {2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}}),
-        0.0);
+    ASSERT_THROW(
+        this->mtx5->inverse_row_permute(&permute_idxs, Mtx::create(exec)),
+        gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, InplaceAbsoluteSubMatrix)
+TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseColPermutable)
 {
-    using T = typename TestFixture::value_type;
-    auto mtx = this->mtx5->create_submatrix(gko::span{0, 2}, gko::span{0, 2});
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx5->get_executor();
+    gko::array<index_type> inverse_permute_idxs{exec, {1, 2, 0}};
 
-    mtx->compute_absolute_inplace();
+    auto permuted =
+        gko::as<Mtx>(this->mtx5->inverse_column_permute(&inverse_permute_idxs));
 
     GKO_ASSERT_MTX_NEAR(
-        this->mtx5, l<T>({{1.0, 1.0, -0.5}, {2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}}),
+        permuted,
+        l<value_type>({{-0.5, 1.0, -1.0}, {4.5, -2.0, 2.0}, {1.2, 2.1, 3.4}}),
         0.0);
 }
 
 
-TYPED_TEST(Dense, OutplaceAbsolute)
+TYPED_TEST(DenseWithIndexType, NonSquareMatrixIsInverseColPermutable)
 {
-    using T = typename TestFixture::value_type;
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx4->get_executor();
+    gko::array<index_type> inverse_permute_idxs{exec, {1, 2, 0}};
 
-    auto abs_mtx = this->mtx5->compute_absolute();
+    auto permuted =
+        gko::as<Mtx>(this->mtx4->inverse_column_permute(&inverse_permute_idxs));
 
-    GKO_ASSERT_MTX_NEAR(
-        abs_mtx, l<T>({{1.0, 1.0, 0.5}, {2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}}),
-        0.0);
+    GKO_ASSERT_MTX_NEAR(permuted,
+                        l<value_type>({{2.0, 1.0, 3.0}, {0.0, 0.0, 5.0}}), 0.0);
 }
 
 
-TYPED_TEST(Dense, OutplaceAbsoluteIntoDense)
+TYPED_TEST(DenseWithIndexType, SquareMatrixIsInverseColPermutableIntoDense)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto abs_mtx =
-        gko::remove_complex<Mtx>::create(this->exec, this->mtx5->get_size());
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx5->get_executor();
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
+    auto permuted = Mtx::create(exec, this->mtx5->get_size());
 
-    this->mtx5->compute_absolute(abs_mtx);
+    this->mtx5->inverse_column_permute(&permute_idxs, permuted);
 
     GKO_ASSERT_MTX_NEAR(
-        abs_mtx, l<T>({{1.0, 1.0, 0.5}, {2.0, 2.0, 4.5}, {2.1, 3.4, 1.2}}),
+        permuted,
+        l<value_type>({{-0.5, 1.0, -1.0}, {4.5, -2.0, 2.0}, {1.2, 2.1, 3.4}}),
         0.0);
 }
 
 
-TYPED_TEST(Dense, OutplaceAbsoluteSubMatrix)
+TYPED_TEST(DenseWithIndexType, SquareSubmatrixIsInverseColPermutableIntoDense)
 {
-    using T = typename TestFixture::value_type;
-    auto mtx = this->mtx5->create_submatrix(gko::span{0, 2}, gko::span{0, 2});
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx5->get_executor();
+    gko::array<index_type> permute_idxs{exec, {1, 0}};
+    auto permuted = Mtx::create(exec, gko::dim<2>{2, 2}, 4);
 
-    auto abs_mtx = mtx->compute_absolute();
+    this->mtx5->create_submatrix({0, 2}, {0, 2})
+        ->column_permute(&permute_idxs, permuted);
 
-    GKO_ASSERT_MTX_NEAR(abs_mtx, l<T>({{1.0, 1.0}, {2.0, 2.0}}), 0);
-    GKO_ASSERT_EQ(abs_mtx->get_stride(), 2);
+    GKO_ASSERT_MTX_NEAR(permuted, l<value_type>({{-1.0, 1.0}, {2.0, -2.0}}),
+                        0.0);
+    ASSERT_EQ(permuted->get_stride(), 4);
 }
 
 
-TYPED_TEST(Dense, OutplaceSubmatrixAbsoluteIntoDense)
+TYPED_TEST(DenseWithIndexType,
+           SquareMatrixInverseColPermuteIntoDenseFailsForWrongPermutationSize)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto mtx = this->mtx5->create_submatrix(gko::span{0, 2}, gko::span{0, 2});
-    auto abs_mtx =
-        gko::remove_complex<Mtx>::create(this->exec, gko::dim<2>{2, 2}, 4);
-
-    mtx->compute_absolute(abs_mtx);
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx5->get_executor();
+    gko::array<index_type> permute_idxs{exec, {1, 2}};
+    auto permuted = Mtx::create(exec, this->mtx5->get_size());
 
-    GKO_ASSERT_MTX_NEAR(abs_mtx, l<T>({{1.0, 1.0}, {2.0, 2.0}}), 0);
-    GKO_ASSERT_EQ(abs_mtx->get_stride(), 4);
+    ASSERT_THROW(this->mtx5->inverse_column_permute(&permute_idxs, permuted),
+                 gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, AppliesToComplex)
+TYPED_TEST(DenseWithIndexType,
+           SquareMatrixInverseColPermuteIntoDenseFailsForWrongDimensions)
 {
-    using value_type = typename TestFixture::value_type;
-    using complex_type = gko::to_complex<value_type>;
-    using Vec = gko::matrix::Dense<complex_type>;
-    auto exec = gko::ReferenceExecutor::create();
-    auto b =
-        gko::initialize<Vec>({{complex_type{1.0, 0.0}, complex_type{2.0, 1.0}},
-                              {complex_type{2.0, 2.0}, complex_type{3.0, 3.0}},
-                              {complex_type{3.0, 4.0}, complex_type{4.0, 5.0}}},
-                             exec);
-    auto x = Vec::create(exec, gko::dim<2>{2, 2});
-
-    this->mtx1->apply(b, x);
+    using Mtx = typename TestFixture::Mtx;
+    using index_type = typename TestFixture::index_type;
+    auto exec = this->mtx5->get_executor();
+    gko::array<index_type> permute_idxs{exec, {1, 2, 0}};
 
-    GKO_ASSERT_MTX_NEAR(
-        x,
-        l({{complex_type{14.0, 16.0}, complex_type{20.0, 22.0}},
-           {complex_type{17.0, 19.0}, complex_type{24.5, 26.5}}}),
-        0.0);
+    ASSERT_THROW(
+        this->mtx5->inverse_column_permute(&permute_idxs, Mtx::create(exec)),
+        gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, AppliesToMixedComplex)
+template <typename ValueType, typename IndexType>
+std::unique_ptr<gko::matrix::Dense<ValueType>> ref_scaled_permute(
+    gko::matrix::Dense<ValueType>* input,
+    gko::matrix::ScaledPermutation<ValueType, IndexType>* permutation,
+    gko::matrix::permute_mode mode)
 {
-    using mixed_value_type =
-        gko::next_precision<typename TestFixture::value_type>;
-    using mixed_complex_type = gko::to_complex<mixed_value_type>;
-    using Vec = gko::matrix::Dense<mixed_complex_type>;
-    auto exec = gko::ReferenceExecutor::create();
-    auto b = gko::initialize<Vec>(
-        {{mixed_complex_type{1.0, 0.0}, mixed_complex_type{2.0, 1.0}},
-         {mixed_complex_type{2.0, 2.0}, mixed_complex_type{3.0, 3.0}},
-         {mixed_complex_type{3.0, 4.0}, mixed_complex_type{4.0, 5.0}}},
-        exec);
-    auto x = Vec::create(exec, gko::dim<2>{2, 2});
-
-    this->mtx1->apply(b, x);
-
-    GKO_ASSERT_MTX_NEAR(
-        x,
-        l({{mixed_complex_type{14.0, 16.0}, mixed_complex_type{20.0, 22.0}},
-           {mixed_complex_type{17.0, 19.0}, mixed_complex_type{24.5, 26.5}}}),
-        0.0);
+    using gko::matrix::permute_mode;
+    auto result = input->clone();
+    auto permutation_dense =
+        gko::matrix::Dense<ValueType>::create(input->get_executor());
+    gko::matrix_data<ValueType, IndexType> permutation_data;
+    if ((mode & permute_mode::inverse) == permute_mode::inverse) {
+        permutation->compute_inverse()->write(permutation_data);
+    } else {
+        permutation->write(permutation_data);
+    }
+    permutation_dense->read(permutation_data);
+    if ((mode & permute_mode::rows) == permute_mode::rows) {
+        // compute P * A
+        permutation_dense->apply(input, result);
+    }
+    if ((mode & permute_mode::columns) == permute_mode::columns) {
+        // compute A * P^T = (P * A^T)^T
+        auto tmp = result->transpose();
+        auto tmp2 = gko::as<gko::matrix::Dense<ValueType>>(tmp->clone());
+        permutation_dense->apply(tmp, tmp2);
+        tmp2->transpose(result);
+    }
+    return result;
+}
+
+
+template <typename ValueType, typename IndexType>
+std::unique_ptr<gko::matrix::Dense<ValueType>> ref_scaled_permute(
+    gko::matrix::Dense<ValueType>* input,
+    gko::matrix::ScaledPermutation<ValueType, IndexType>* row_permutation,
+    gko::matrix::ScaledPermutation<ValueType, IndexType>* col_permutation,
+    bool invert)
+{
+    using gko::matrix::permute_mode;
+    auto result = input->clone();
+    auto row_permutation_dense =
+        gko::matrix::Dense<ValueType>::create(input->get_executor());
+    auto col_permutation_dense =
+        gko::matrix::Dense<ValueType>::create(input->get_executor());
+    gko::matrix_data<ValueType, IndexType> row_permutation_data;
+    gko::matrix_data<ValueType, IndexType> col_permutation_data;
+    if (invert) {
+        row_permutation->compute_inverse()->write(row_permutation_data);
+        col_permutation->compute_inverse()->write(col_permutation_data);
+    } else {
+        row_permutation->write(row_permutation_data);
+        col_permutation->write(col_permutation_data);
+    }
+    row_permutation_dense->read(row_permutation_data);
+    col_permutation_dense->read(col_permutation_data);
+    row_permutation_dense->apply(input, result);
+    auto tmp = result->transpose();
+    auto tmp2 = gko::as<gko::matrix::Dense<ValueType>>(tmp->clone());
+    col_permutation_dense->apply(tmp, tmp2);
+    tmp2->transpose(result);
+    return result;
 }
 
 
-TYPED_TEST(Dense, AdvancedAppliesToComplex)
+TYPED_TEST(DenseWithIndexType, ScaledPermute)
 {
+    using gko::matrix::permute_mode;
     using value_type = typename TestFixture::value_type;
-    using complex_type = gko::to_complex<value_type>;
-    using Dense = gko::matrix::Dense<value_type>;
-    using DenseComplex = gko::matrix::Dense<complex_type>;
-    auto exec = gko::ReferenceExecutor::create();
 
-    auto b = gko::initialize<DenseComplex>(
-        {{complex_type{1.0, 0.0}, complex_type{2.0, 1.0}},
-         {complex_type{2.0, 2.0}, complex_type{3.0, 3.0}},
-         {complex_type{3.0, 4.0}, complex_type{4.0, 5.0}}},
-        exec);
-    auto x = gko::initialize<DenseComplex>(
-        {{complex_type{1.0, 0.0}, complex_type{2.0, 1.0}},
-         {complex_type{2.0, 2.0}, complex_type{3.0, 3.0}}},
-        exec);
-    auto alpha = gko::initialize<Dense>({-1.0}, this->exec);
-    auto beta = gko::initialize<Dense>({2.0}, this->exec);
+    for (auto mode :
+         {permute_mode::none, permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
 
-    this->mtx1->apply(alpha, b, beta, x);
+        auto permuted = this->mtx5->scale_permute(this->scale_perm3, mode);
+        auto ref_permuted =
+            ref_scaled_permute(this->mtx5.get(), this->scale_perm3.get(), mode);
 
-    GKO_ASSERT_MTX_NEAR(
-        x,
-        l({{complex_type{-12.0, -16.0}, complex_type{-16.0, -20.0}},
-           {complex_type{-13.0, -15.0}, complex_type{-18.5, -20.5}}}),
-        0.0);
+        GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r<value_type>::value);
+    }
 }
 
 
-TYPED_TEST(Dense, AdvancedAppliesToMixedComplex)
+TYPED_TEST(DenseWithIndexType, ScaledPermuteRoundtrip)
 {
-    using mixed_value_type =
-        gko::next_precision<typename TestFixture::value_type>;
-    using mixed_complex_type = gko::to_complex<mixed_value_type>;
-    using MixedDense = gko::matrix::Dense<mixed_value_type>;
-    using MixedDenseComplex = gko::matrix::Dense<mixed_complex_type>;
-    auto exec = gko::ReferenceExecutor::create();
+    using gko::matrix::permute_mode;
+    using value_type = typename TestFixture::value_type;
 
-    auto b = gko::initialize<MixedDenseComplex>(
-        {{mixed_complex_type{1.0, 0.0}, mixed_complex_type{2.0, 1.0}},
-         {mixed_complex_type{2.0, 2.0}, mixed_complex_type{3.0, 3.0}},
-         {mixed_complex_type{3.0, 4.0}, mixed_complex_type{4.0, 5.0}}},
-        exec);
-    auto x = gko::initialize<MixedDenseComplex>(
-        {{mixed_complex_type{1.0, 0.0}, mixed_complex_type{2.0, 1.0}},
-         {mixed_complex_type{2.0, 2.0}, mixed_complex_type{3.0, 3.0}}},
-        exec);
-    auto alpha = gko::initialize<MixedDense>({-1.0}, this->exec);
-    auto beta = gko::initialize<MixedDense>({2.0}, this->exec);
+    for (auto mode :
+         {permute_mode::rows, permute_mode::columns, permute_mode::symmetric}) {
+        SCOPED_TRACE(mode);
 
-    this->mtx1->apply(alpha, b, beta, x);
+        auto permuted = this->mtx5->scale_permute(this->scale_perm3, mode)
+                            ->scale_permute(this->scale_perm3,
+                                            mode | permute_mode::inverse);
 
-    GKO_ASSERT_MTX_NEAR(
-        x,
-        l({{mixed_complex_type{-12.0, -16.0}, mixed_complex_type{-16.0, -20.0}},
-           {mixed_complex_type{-13.0, -15.0},
-            mixed_complex_type{-18.5, -20.5}}}),
-        0.0);
+        GKO_ASSERT_MTX_NEAR(this->mtx5, permuted, r<value_type>::value);
+    }
 }
 
 
-TYPED_TEST(Dense, MakeComplex)
+TYPED_TEST(DenseWithIndexType, ScaledPermuteStridedIntoDense)
 {
-    using T = typename TestFixture::value_type;
+    using gko::matrix::permute_mode;
+    using value_type = typename TestFixture::value_type;
+    using Mtx = typename TestFixture::Mtx;
+    auto mtx = Mtx::create(this->exec, this->mtx5->get_size(),
+                           this->mtx5->get_size()[1] + 1);
+    mtx->copy_from(this->mtx5);
 
-    auto complex_mtx = this->mtx5->make_complex();
+    for (auto mode :
+         {permute_mode::none, permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse,
+          permute_mode::inverse_rows, permute_mode::inverse_columns,
+          permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+        auto permuted = Mtx::create(this->exec, this->mtx5->get_size(),
+                                    this->mtx5->get_size()[1] + 2);
 
-    GKO_ASSERT_MTX_NEAR(complex_mtx, this->mtx5, 0.0);
+        this->mtx5->scale_permute(this->scale_perm3, permuted, mode);
+        auto ref_permuted =
+            ref_scaled_permute(this->mtx5.get(), this->scale_perm3.get(), mode);
+
+        GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r<value_type>::value);
+    }
 }
 
 
-TYPED_TEST(Dense, MakeComplexIntoDense)
+TYPED_TEST(DenseWithIndexType, ScaledPermuteRectangular)
 {
-    using T = typename TestFixture::value_type;
-    using ComplexMtx = typename TestFixture::ComplexMtx;
-    auto exec = this->mtx5->get_executor();
-
-    auto complex_mtx = ComplexMtx::create(exec, this->mtx5->get_size());
-    this->mtx5->make_complex(complex_mtx);
+    using gko::matrix::permute_mode;
+    using value_type = typename TestFixture::value_type;
 
-    GKO_ASSERT_MTX_NEAR(complex_mtx, this->mtx5, 0.0);
+    auto rpermuted =
+        this->mtx1->scale_permute(this->scale_perm2, permute_mode::rows);
+    auto irpermuted = this->mtx1->scale_permute(this->scale_perm2,
+                                                permute_mode::inverse_rows);
+    auto cpermuted =
+        this->mtx1->scale_permute(this->scale_perm3, permute_mode::columns);
+    auto icpermuted = this->mtx1->scale_permute(this->scale_perm3,
+                                                permute_mode::inverse_columns);
+    auto ref_rpermuted = ref_scaled_permute(
+        this->mtx1.get(), this->scale_perm2.get(), permute_mode::rows);
+    auto ref_irpermuted = ref_scaled_permute(
+        this->mtx1.get(), this->scale_perm2.get(), permute_mode::inverse_rows);
+    auto ref_cpermuted = ref_scaled_permute(
+        this->mtx1.get(), this->scale_perm3.get(), permute_mode::columns);
+    auto ref_icpermuted =
+        ref_scaled_permute(this->mtx1.get(), this->scale_perm3.get(),
+                           permute_mode::inverse_columns);
+
+    GKO_ASSERT_MTX_NEAR(rpermuted, ref_rpermuted, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(irpermuted, ref_irpermuted, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(cpermuted, ref_cpermuted, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(icpermuted, ref_icpermuted, r<value_type>::value);
+}
+
+
+TYPED_TEST(DenseWithIndexType, ScaledPermuteFailsWithIncorrectPermutationSize)
+{
+    using gko::matrix::permute_mode;
+
+    for (auto mode :
+         {/* no permute_mode::none */ permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+
+        ASSERT_THROW(this->mtx5->scale_permute(this->scale_perm0, mode),
+                     gko::DimensionMismatch);
+    }
 }
 
 
-TYPED_TEST(Dense, MakeComplexIntoDenseFailsForWrongDimensions)
+TYPED_TEST(DenseWithIndexType, ScaledPermuteFailsWithIncorrectOutputSize)
 {
-    using T = typename TestFixture::value_type;
-    using ComplexMtx = typename TestFixture::ComplexMtx;
-    auto exec = this->mtx5->get_executor();
+    using gko::matrix::permute_mode;
+    using Mtx = typename TestFixture::Mtx;
+    auto output = Mtx::create(this->exec);
 
-    auto complex_mtx = ComplexMtx::create(exec);
+    for (auto mode :
+         {permute_mode::none, permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
 
-    ASSERT_THROW(this->mtx5->make_complex(complex_mtx), gko::DimensionMismatch);
+        ASSERT_THROW(this->mtx5->scale_permute(this->scale_perm3, output, mode),
+                     gko::DimensionMismatch);
+    }
 }
 
 
-TYPED_TEST(Dense, GetReal)
+TYPED_TEST(DenseWithIndexType, NonsymmScaledPermute)
 {
-    using T = typename TestFixture::value_type;
+    using value_type = typename TestFixture::value_type;
 
-    auto real_mtx = this->mtx5->get_real();
+    auto permuted =
+        this->mtx5->scale_permute(this->scale_perm3, this->scale_perm3_rev);
+    auto ref_permuted =
+        ref_scaled_permute(this->mtx5.get(), this->scale_perm3.get(),
+                           this->scale_perm3_rev.get(), false);
 
-    GKO_ASSERT_MTX_NEAR(real_mtx, this->mtx5, 0.0);
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r<value_type>::value);
 }
 
 
-TYPED_TEST(Dense, GetRealIntoDense)
+TYPED_TEST(DenseWithIndexType, NonsymmScaledPermuteInverse)
 {
-    using T = typename TestFixture::value_type;
-    using RealMtx = typename TestFixture::RealMtx;
-    auto exec = this->mtx5->get_executor();
+    using value_type = typename TestFixture::value_type;
 
-    auto real_mtx = RealMtx::create(exec, this->mtx5->get_size());
-    this->mtx5->get_real(real_mtx);
+    auto permuted = this->mtx5->scale_permute(this->scale_perm3,
+                                              this->scale_perm3_rev, true);
+    auto ref_permuted =
+        ref_scaled_permute(this->mtx5.get(), this->scale_perm3.get(),
+                           this->scale_perm3_rev.get(), true);
 
-    GKO_ASSERT_MTX_NEAR(real_mtx, this->mtx5, 0.0);
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r<value_type>::value);
 }
 
 
-TYPED_TEST(Dense, GetRealIntoDenseFailsForWrongDimensions)
+TYPED_TEST(DenseWithIndexType, NonsymmScaledPermuteRectangular)
 {
-    using T = typename TestFixture::value_type;
-    using RealMtx = typename TestFixture::RealMtx;
-    auto exec = this->mtx5->get_executor();
+    using value_type = typename TestFixture::value_type;
 
-    auto real_mtx = RealMtx::create(exec);
-    ASSERT_THROW(this->mtx5->get_real(real_mtx), gko::DimensionMismatch);
+    auto permuted =
+        this->mtx1->scale_permute(this->scale_perm2, this->scale_perm3);
+    auto ref_permuted =
+        ref_scaled_permute(this->mtx1.get(), this->scale_perm2.get(),
+                           this->scale_perm3.get(), false);
+
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r<value_type>::value);
 }
 
 
-TYPED_TEST(Dense, GetImag)
+TYPED_TEST(DenseWithIndexType, NonsymmScaledPermuteInverseRectangular)
 {
-    using T = typename TestFixture::value_type;
+    using value_type = typename TestFixture::value_type;
 
-    auto imag_mtx = this->mtx5->get_imag();
+    auto permuted =
+        this->mtx1->scale_permute(this->scale_perm2, this->scale_perm3, true);
+    auto ref_permuted =
+        ref_scaled_permute(this->mtx1.get(), this->scale_perm2.get(),
+                           this->scale_perm3.get(), true);
 
-    GKO_ASSERT_MTX_NEAR(
-        imag_mtx, l<T>({{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}),
-        0.0);
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r<value_type>::value);
 }
 
 
-TYPED_TEST(Dense, GetImagIntoDense)
+TYPED_TEST(DenseWithIndexType, NonsymmScaledPermuteRoundtrip)
 {
-    using T = typename TestFixture::value_type;
-    using RealMtx = typename TestFixture::RealMtx;
-    auto exec = this->mtx5->get_executor();
+    using value_type = typename TestFixture::value_type;
 
-    auto imag_mtx = RealMtx::create(exec, this->mtx5->get_size());
-    this->mtx5->get_imag(imag_mtx);
+    auto permuted =
+        this->mtx5->scale_permute(this->scale_perm3, this->scale_perm3_rev)
+            ->scale_permute(this->scale_perm3, this->scale_perm3_rev, true);
 
-    GKO_ASSERT_MTX_NEAR(
-        imag_mtx, l<T>({{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}),
-        0.0);
+    GKO_ASSERT_MTX_NEAR(this->mtx5, permuted, r<value_type>::value);
 }
 
 
-TYPED_TEST(Dense, GetImagIntoDenseFailsForWrongDimensions)
+TYPED_TEST(DenseWithIndexType, NonsymmScaledPermuteInverseInverted)
 {
-    using T = typename TestFixture::value_type;
-    using RealMtx = typename TestFixture::RealMtx;
-    auto exec = this->mtx5->get_executor();
+    using value_type = typename TestFixture::value_type;
 
-    auto imag_mtx = RealMtx::create(exec);
-    ASSERT_THROW(this->mtx5->get_imag(imag_mtx), gko::DimensionMismatch);
-}
+    auto inv_permuted = this->mtx5->scale_permute(this->scale_perm3,
+                                                  this->scale_perm3_rev, true);
+    auto preinv_permuted =
+        this->mtx5->scale_permute(this->scale_perm3->compute_inverse(),
+                                  this->scale_perm3_rev->compute_inverse());
 
+    GKO_ASSERT_MTX_NEAR(inv_permuted, preinv_permuted, r<value_type>::value);
+}
 
-TYPED_TEST(Dense, MakeTemporaryConversionDoesntConvertOnMatch)
+TYPED_TEST(DenseWithIndexType, NonsymmScaledPermuteStridedIntoDense)
 {
     using Mtx = typename TestFixture::Mtx;
-    using T = typename TestFixture::value_type;
-    auto alpha = gko::initialize<Mtx>({8.0}, this->exec);
+    using value_type = typename TestFixture::value_type;
+    auto mtx = Mtx::create(this->exec, this->mtx5->get_size(),
+                           this->mtx5->get_size()[1] + 1);
+    auto permuted = Mtx::create(this->exec, this->mtx5->get_size(),
+                                this->mtx5->get_size()[1] + 2);
+    mtx->copy_from(this->mtx5);
 
-    ASSERT_EQ(gko::make_temporary_conversion<T>(alpha).get(), alpha.get());
+    mtx->scale_permute(this->scale_perm3, this->scale_perm3_rev, permuted);
+    auto ref_permuted =
+        ref_scaled_permute(this->mtx5.get(), this->scale_perm3.get(),
+                           this->scale_perm3_rev.get(), false);
+
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r<value_type>::value);
 }
 
 
-TYPED_TEST(Dense, MakeTemporaryConversionConvertsBack)
+TYPED_TEST(DenseWithIndexType, NonsymmScaledPermuteInverseStridedIntoDense)
 {
-    using MixedMtx = typename TestFixture::MixedMtx;
-    using T = typename TestFixture::value_type;
-    using MixedT = typename MixedMtx::value_type;
-    auto alpha = gko::initialize<MixedMtx>({8.0}, this->exec);
-
-    {
-        auto conversion = gko::make_temporary_conversion<T>(alpha);
-        conversion->at(0, 0) = T{7.0};
-    }
+    using Mtx = typename TestFixture::Mtx;
+    using value_type = typename TestFixture::value_type;
+    auto mtx = Mtx::create(this->exec, this->mtx5->get_size(),
+                           this->mtx5->get_size()[1] + 1);
+    auto permuted = Mtx::create(this->exec, this->mtx5->get_size(),
+                                this->mtx5->get_size()[1] + 2);
+    mtx->copy_from(this->mtx5);
+
+    mtx->scale_permute(this->scale_perm3, this->scale_perm3_rev, permuted,
+                       true);
+    auto ref_permuted =
+        ref_scaled_permute(this->mtx5.get(), this->scale_perm3.get(),
+                           this->scale_perm3_rev.get(), true);
 
-    ASSERT_EQ(alpha->at(0, 0), MixedT{7.0});
+    GKO_ASSERT_MTX_NEAR(permuted, ref_permuted, r<value_type>::value);
 }
 
 
-TYPED_TEST(Dense, MakeTemporaryConversionConstDoesntConvertBack)
+TYPED_TEST(DenseWithIndexType, NonsymmScaledPermuteFailsWithIncorrectOutputSize)
 {
-    using MixedMtx = typename TestFixture::MixedMtx;
-    using T = typename TestFixture::value_type;
-    using MixedT = typename MixedMtx::value_type;
-    auto alpha = gko::initialize<MixedMtx>({8.0}, this->exec);
-
-    {
-        auto conversion = gko::make_temporary_conversion<T>(
-            static_cast<const MixedMtx*>(alpha.get()));
-        alpha->at(0, 0) = MixedT{7.0};
-    }
-
-    ASSERT_EQ(alpha->at(0, 0), MixedT{7.0});
+    ASSERT_THROW(
+        this->mtx5->scale_permute(this->scale_perm3, this->scale_perm3,
+                                  TestFixture::Mtx::create(this->exec)),
+        gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Dense, ScaleAddIdentityRectangular)
+TYPED_TEST(DenseWithIndexType,
+           NonsymmScaledPermuteFailsWithIncorrectPermutationSize)
 {
-    using T = typename TestFixture::value_type;
-    using Vec = typename TestFixture::Mtx;
-    using MixedVec = typename TestFixture::MixedMtx;
-    auto alpha = gko::initialize<Vec>({2.0}, this->exec);
-    auto beta = gko::initialize<Vec>({-1.0}, this->exec);
-    auto b = gko::initialize<Vec>(
-        {I<T>{2.0, 0.0}, I<T>{1.0, 2.5}, I<T>{0.0, -4.0}}, this->exec);
-
-    b->add_scaled_identity(alpha, beta);
-
-    GKO_ASSERT_MTX_NEAR(b, l({{0.0, 0.0}, {-1.0, -0.5}, {0.0, 4.0}}), 0.0);
+    ASSERT_THROW(
+        this->mtx5->scale_permute(this->scale_perm0, this->scale_perm3_rev),
+        gko::DimensionMismatch);
+    ASSERT_THROW(
+        this->mtx5->scale_permute(this->scale_perm3_rev, this->scale_perm0),
+        gko::DimensionMismatch);
+    ASSERT_THROW(
+        this->mtx5->scale_permute(this->scale_perm0, this->scale_perm0),
+        gko::DimensionMismatch);
 }
 
 
diff --git a/reference/test/matrix/permutation.cpp b/reference/test/matrix/permutation.cpp
index 2bd2e3d9741..5946eea7eb8 100644
--- a/reference/test/matrix/permutation.cpp
+++ b/reference/test/matrix/permutation.cpp
@@ -33,11 +33,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/permutation.hpp>
 
 
+#include <random>
+
+
 #include <gtest/gtest.h>
 
 
+#include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
-#include <ginkgo/core/base/range.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 
@@ -51,15 +54,33 @@ namespace {
 template <typename ValueIndexType>
 class Permutation : public ::testing::Test {
 protected:
-    using v_type =
+    using value_type =
         typename std::tuple_element<0, decltype(ValueIndexType())>::type;
-    using i_type =
+    using index_type =
         typename std::tuple_element<1, decltype(ValueIndexType())>::type;
-    using Vec = gko::matrix::Dense<v_type>;
-    using Csr = gko::matrix::Csr<v_type, i_type>;
+    using Vec = gko::matrix::Dense<value_type>;
 
     Permutation() : exec(gko::ReferenceExecutor::create()) {}
 
+    std::unique_ptr<gko::matrix::Dense<double>> ref_combine(
+        const gko::matrix::Permutation<index_type>* first,
+        const gko::matrix::Permutation<index_type>* second)
+    {
+        using Mtx = gko::matrix::Dense<double>;
+        const auto exec = first->get_executor();
+        gko::matrix_data<double, index_type> first_perm_data;
+        gko::matrix_data<double, index_type> second_perm_data;
+        first->write(first_perm_data);
+        second->write(second_perm_data);
+        const auto first_mtx = Mtx::create(exec);
+        const auto second_mtx = Mtx::create(exec);
+        first_mtx->read(first_perm_data);
+        second_mtx->read(second_perm_data);
+        auto combined_mtx = first_mtx->clone();
+        second_mtx->apply(first_mtx, combined_mtx);
+        return combined_mtx;
+    }
+
     std::shared_ptr<const gko::Executor> exec;
 };
 
@@ -67,415 +88,156 @@ TYPED_TEST_SUITE(Permutation, gko::test::ValueIndexTypes,
                  PairTypenameNameGenerator);
 
 
-TYPED_TEST(Permutation, AppliesRowPermutationToDense)
+TYPED_TEST(Permutation, Invert)
 {
-    using i_type = typename TestFixture::i_type;
-    using T = typename TestFixture::v_type;
-    using Vec = typename TestFixture::Vec;
-    // clang-format off
-    auto x = gko::initialize<Vec>(
-        {I<T>{2.0, 3.0},
-         I<T>{4.0, 2.5}}, this->exec);
-    // clang-format on
-    auto y = Vec::create(this->exec, gko::dim<2>{2});
-    i_type rdata[] = {1, 0};
+    using index_type = typename TestFixture::index_type;
+    auto perm = gko::matrix::Permutation<index_type>::create(
+        this->exec, gko::array<index_type>{this->exec, {1, 2, 0}});
 
-    auto perm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata));
+    auto inv = perm->compute_inverse();
 
-    perm->apply(x, y);
-    // clang-format off
-    GKO_ASSERT_MTX_NEAR(y,
-                        l({{4.0, 2.5},
-                           {2.0, 3.0}}),
-                        0.0);
-    // clang-format on
+    EXPECT_EQ(inv->get_const_permutation()[0], 2);
+    EXPECT_EQ(inv->get_const_permutation()[1], 0);
+    EXPECT_EQ(inv->get_const_permutation()[2], 1);
 }
 
 
-TYPED_TEST(Permutation, AppliesColPermutationToDense)
+TYPED_TEST(Permutation, Combine)
 {
-    using i_type = typename TestFixture::i_type;
-    using T = typename TestFixture::v_type;
-    using Vec = typename TestFixture::Vec;
-    // clang-format off
-    auto x = gko::initialize<Vec>(
-        {I<T>{2.0, 3.0},
-         I<T>{4.0, 2.5}}, this->exec);
-    // clang-format on
-    auto y = Vec::create(this->exec, gko::dim<2>{2});
-    i_type rdata[] = {1, 0};
-
-    auto perm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata),
-        gko::matrix::column_permute);
-
-    perm->apply(x, y);
-    // clang-format off
-    GKO_ASSERT_MTX_NEAR(y,
-                        l({{3.0, 2.0},
-                           {2.5, 4.0}}),
-                        0.0);
-    // clang-format on
-}
+    using index_type = typename TestFixture::index_type;
+    const auto perm = gko::matrix::Permutation<index_type>::create(
+        this->exec, gko::array<index_type>{this->exec, {1, 2, 0}});
+    const auto perm2 = gko::matrix::Permutation<index_type>::create(
+        this->exec, gko::array<index_type>{this->exec, {0, 2, 1}});
+    const auto ref_combined = this->ref_combine(perm.get(), perm2.get());
 
+    const auto combined = perm->compose(perm2);
 
-TYPED_TEST(Permutation, AppliesRowAndColPermutationToDense)
-{
-    using i_type = typename TestFixture::i_type;
-    using T = typename TestFixture::v_type;
-    using Vec = typename TestFixture::Vec;
-    // clang-format off
-    auto x = gko::initialize<Vec>(
-        {I<T>{2.0, 3.0},
-         I<T>{4.0, 2.5}}, this->exec);
-    // clang-format on
-    auto y1 = Vec::create(this->exec, gko::dim<2>{2});
-    auto y2 = Vec::create(this->exec, gko::dim<2>{2});
-    i_type cdata[] = {1, 0};
-    i_type rdata[] = {1, 0};
-
-    auto rperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, rdata));
-    auto cperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, cdata),
-        gko::matrix::column_permute);
-
-    rperm->apply(x, y1);
-    cperm->apply(y1, y2);
-    // clang-format off
-    GKO_ASSERT_MTX_NEAR(y2,
-                        l({{2.5, 4.0},
-                           {3.0, 2.0}}),
-                        0.0);
-    // clang-format on
+    GKO_ASSERT_MTX_NEAR(combined, ref_combined, 0.0);
 }
 
 
-TYPED_TEST(Permutation, AppliesRowAndColPermutationToDenseWithOneArray)
+TYPED_TEST(Permutation, CombineLarger)
 {
-    using i_type = typename TestFixture::i_type;
-    using T = typename TestFixture::v_type;
-    using Vec = typename TestFixture::Vec;
-    // clang-format off
-    auto x = gko::initialize<Vec>(
-        {I<T>{2.0, 3.0},
-         I<T>{4.0, 2.5}}, this->exec);
-    // clang-format on
-    auto y1 = Vec::create(this->exec, gko::dim<2>{2});
-    i_type data[] = {1, 0};
-
-    auto perm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{2}, gko::make_array_view(this->exec, 2, data),
-        gko::matrix::row_permute | gko::matrix::column_permute);
-
-    perm->apply(x, y1);
-    // clang-format off
-    GKO_ASSERT_MTX_NEAR(y1,
-                        l({{2.5, 4.0},
-                           {3.0, 2.0}}),
-                        0.0);
-    // clang-format on
+    using index_type = typename TestFixture::index_type;
+    const auto perm = gko::matrix::Permutation<index_type>::create(
+        this->exec,
+        gko::array<index_type>{this->exec, {6, 2, 4, 0, 1, 5, 9, 8, 3, 7}});
+    const auto perm2 = gko::matrix::Permutation<index_type>::create(
+        this->exec,
+        gko::array<index_type>{this->exec, {9, 2, 1, 6, 3, 7, 8, 4, 0, 5}});
+    const auto ref_combined = this->ref_combine(perm.get(), perm2.get());
+
+    const auto combined = perm->compose(perm2);
+
+    GKO_ASSERT_MTX_NEAR(combined, ref_combined, 0.0);
 }
 
 
-TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToDense)
+TYPED_TEST(Permutation, CombineWithInverse)
 {
-    using i_type = typename TestFixture::i_type;
-    using Vec = typename TestFixture::Vec;
-    // clang-format off
-    auto x = gko::initialize<Vec>({{2.0, 3.0, 0.0},
-                                  {0.0, 1.0, 0.0},
-                                  {0.0, 4.0, 2.5}},
-                                  this->exec);
-    // clang-format on
-    auto y1 = Vec::create(this->exec, gko::dim<2>{3});
-    auto y2 = Vec::create(this->exec, gko::dim<2>{3});
-    i_type cdata[] = {1, 2, 0};
-    i_type rdata[] = {1, 2, 0};
-
-    auto rperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata),
-        gko::matrix::row_permute | gko::matrix::inverse_permute);
-    auto cperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata),
-        gko::matrix::inverse_permute | gko::matrix::column_permute);
-
-    rperm->apply(x, y1);
-    cperm->apply(y1, y2);
-    // clang-format off
-    GKO_ASSERT_MTX_NEAR(y2,
-                        l({{2.5, 0.0, 4.0},
-                           {0.0, 2.0, 3.0},
-                           {0.0, 0.0, 1.0}}),
-                        0.0);
-    // clang-format on
+    using index_type = typename TestFixture::index_type;
+    const gko::size_type size = 20;
+    auto perm = gko::matrix::Permutation<index_type>::create(this->exec, size);
+    std::iota(perm->get_permutation(), perm->get_permutation() + size, 0);
+    std::shuffle(perm->get_permutation(), perm->get_permutation() + size,
+                 std::default_random_engine{29584});
+
+    auto combined = perm->compose(perm->compute_inverse());
+
+    for (index_type i = 0; i < size; i++) {
+        ASSERT_EQ(combined->get_const_permutation()[i], i);
+    }
 }
 
 
-TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToDenseWithOneArray)
+TYPED_TEST(Permutation, CombineFailsWithMismatchingSize)
 {
-    using i_type = typename TestFixture::i_type;
-    using Vec = typename TestFixture::Vec;
-    // clang-format off
-    auto x = gko::initialize<Vec>({{2.0, 3.0, 0.0},
-                                   {0.0, 1.0, 0.0},
-                                   {0.0, 4.0, 2.5}},
-                                 this->exec);
-    // clang-format on
-    auto y1 = Vec::create(this->exec, gko::dim<2>{3});
-    i_type data[] = {1, 2, 0};
-
-    auto perm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, data),
-        gko::matrix::column_permute | gko::matrix::row_permute |
-            gko::matrix::inverse_permute);
+    using index_type = typename TestFixture::index_type;
+    auto perm = gko::matrix::Permutation<index_type>::create(
+        this->exec, gko::array<index_type>{this->exec, {1, 2, 0}});
+    auto perm0 = gko::matrix::Permutation<index_type>::create(this->exec);
 
-    perm->apply(x, y1);
-    // clang-format off
-    GKO_ASSERT_MTX_NEAR(y1,
-                        l({{2.5, 0.0, 4.0},
-                           {0.0, 2.0, 3.0},
-                           {0.0, 0.0, 1.0}}),
-                        0.0);
-    // clang-format on
+    ASSERT_THROW(perm->compose(perm0), gko::DimensionMismatch);
 }
 
 
-TYPED_TEST(Permutation, AppliesInverseRowPermutationToDense)
+TYPED_TEST(Permutation, Write)
 {
-    using i_type = typename TestFixture::i_type;
-    using Vec = typename TestFixture::Vec;
-    // clang-format off
-    auto x = gko::initialize<Vec>({{2.0, 3.0, 0.0},
-                                 {0.0, 1.0, 0.0},
-                                 {0.0, 4.0, 2.5}},
-                                this->exec);
-    // clang-format on
-    auto y = Vec::create(this->exec, gko::dim<2>{3});
-    i_type rdata[] = {1, 2, 0};
-
-    auto rperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata),
-        gko::matrix::row_permute | gko::matrix::inverse_permute);
+    using index_type = typename TestFixture::index_type;
+    auto perm = gko::matrix::Permutation<index_type>::create(
+        this->exec, gko::array<index_type>{this->exec, {1, 2, 0}});
 
-    rperm->apply(x, y);
-    // clang-format off
-    GKO_ASSERT_MTX_NEAR(y,
-                        l({{0.0, 4.0, 2.5},
-                           {2.0, 3.0, 0.0},
-                           {0.0, 1.0, 0.0}}),
-                          0.0);
-    // clang-format on
+    GKO_ASSERT_MTX_NEAR(
+        perm, l<double>({{0.0, 1.0, 0.0}, {0.0, 0.0, 1.0}, {1.0, 0.0, 0.0}}),
+        0.0);
 }
 
 
-TYPED_TEST(Permutation, AppliesInverseColPermutationToDense)
+TYPED_TEST(Permutation, AppliesRowPermutationToDense)
 {
-    using i_type = typename TestFixture::i_type;
+    using index_type = typename TestFixture::index_type;
+    using T = typename TestFixture::value_type;
     using Vec = typename TestFixture::Vec;
     // clang-format off
-    auto x = gko::initialize<Vec>({{2.0, 3.0, 0.0},
-                                   {0.0, 1.0, 0.0},
-                                   {0.0, 4.0, 2.5}},
-                                  this->exec);
-    // clang-format on
-    auto y = Vec::create(this->exec, gko::dim<2>{3});
-    i_type cdata[] = {1, 2, 0};
-
-    auto cperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata),
-        gko::matrix::inverse_permute | gko::matrix::column_permute);
-
-    cperm->apply(x, y);
-    // clang-format off
-    GKO_ASSERT_MTX_NEAR(y,
-                      l({{0.0, 2.0, 3.0},
-                         {0.0, 0.0, 1.0},
-                         {2.5, 0.0, 4.0}}),
-                      0.0);
-    // clang-format on
-}
-
-
-TYPED_TEST(Permutation, AppliesRowPermutationToCsr)
-{
-    using i_type = typename TestFixture::i_type;
-    using Csr = typename TestFixture::Csr;
-    // clang-format off
-    auto x = gko::initialize<Csr>(
-                                  {{2.0, 3.0, 0.0},
-                                   {0.0, 1.0, 0.0},
-                                   {0.0, 4.0, 2.5}},
-                                  this->exec);
+    auto x = gko::initialize<Vec>(
+        {I<T>{2.0, 3.0},
+         I<T>{4.0, 2.5}}, this->exec);
     // clang-format on
-    auto y = Csr::create(this->exec, gko::dim<2>{3});
-    i_type rdata[] = {1, 2, 0};
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
+    index_type rdata[] = {1, 0};
 
-    auto perm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata));
+    auto perm = gko::matrix::Permutation<index_type>::create(
+        this->exec, gko::make_array_view(this->exec, 2, rdata));
 
     perm->apply(x, y);
     // clang-format off
     GKO_ASSERT_MTX_NEAR(y,
-                        l({{0.0, 1.0, 0.0},
-                           {0.0, 4.0, 2.5},
-                           {2.0, 3.0, 0.0}}),
+                        l({{4.0, 2.5},
+                           {2.0, 3.0}}),
                         0.0);
     // clang-format on
 }
 
 
-TYPED_TEST(Permutation, AppliesColPermutationToCsr)
+TYPED_TEST(Permutation, AdvancedAppliesRowPermutationToDense)
 {
-    using i_type = typename TestFixture::i_type;
-    using Csr = typename TestFixture::Csr;
-    // clang-format off
-    auto x = gko::initialize<Csr>(
-                                  {{2.0, 3.0, 0.0},
-                                   {0.0, 1.0, 0.0},
-                                   {0.0, 4.0, 2.5}},
-                                  this->exec);
-    // clang-format on
-    auto y = Csr::create(this->exec, gko::dim<2>{3});
-    i_type cdata[] = {1, 2, 0};
-
-    auto perm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata),
-        gko::matrix::column_permute);
-
-    perm->apply(x, y);
-    // clang-format off
-    GKO_ASSERT_MTX_NEAR(y,
-                      l({{3.0, 0.0, 2.0},
-                         {1.0, 0.0, 0.0},
-                         {4.0, 2.5, 0.0}}),
-                      0.0);
-    // clang-format on
-}
-
-
-TYPED_TEST(Permutation, AppliesRowAndColPermutationToCsr)
-{
-    using i_type = typename TestFixture::i_type;
-    using Csr = typename TestFixture::Csr;
-    // clang-format off
-    auto x = gko::initialize<Csr>(
-                                  {{2.0, 3.0, 0.0},
-                                   {0.0, 1.0, 0.0},
-                                   {0.0, 4.0, 2.5}},
-                                  this->exec);
-    // clang-format on
-    auto y1 = Csr::create(this->exec, gko::dim<2>{3});
-    auto y2 = Csr::create(this->exec, gko::dim<2>{3});
-    i_type cdata[] = {1, 2, 0};
-    i_type rdata[] = {1, 2, 0};
-
-    auto rperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata));
-    auto cperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata),
-        gko::matrix::column_permute);
-
-    rperm->apply(x, y1);
-    cperm->apply(y1, y2);
+    using index_type = typename TestFixture::index_type;
+    using T = typename TestFixture::value_type;
+    using Vec = typename TestFixture::Vec;
     // clang-format off
-    GKO_ASSERT_MTX_NEAR(y2,
-                      l({{1.0, 0.0, 0.0},
-                         {4.0, 2.5, 0.0},
-                         {3.0, 0.0, 2.0}}),
-                      0.0);
+    auto x = gko::initialize<Vec>(
+        {I<T>{2.0, 3.0},
+         I<T>{4.0, 2.5}}, this->exec);
     // clang-format on
-}
+    auto alpha = gko::initialize<Vec>({2.0}, this->exec);
+    auto beta = gko::initialize<Vec>({-1.0}, this->exec);
+    auto y = x->clone();
+    index_type rdata[] = {1, 0};
 
+    auto perm = gko::matrix::Permutation<index_type>::create(
+        this->exec, gko::make_array_view(this->exec, 2, rdata));
 
-TYPED_TEST(Permutation, AppliesInverseRowPermutationToCsr)
-{
-    using i_type = typename TestFixture::i_type;
-    using Csr = typename TestFixture::Csr;
-    // clang-format off
-    auto x = gko::initialize<Csr>({{2.0, 3.0, 0.0},
-                                   {0.0, 1.0, 0.0},
-                                   {0.0, 4.0, 2.5}},
-                                  this->exec);
-    // clang-format on
-    auto y = Csr::create(this->exec, gko::dim<2>{3});
-    i_type rdata[] = {1, 2, 0};
-
-    auto rperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata),
-        gko::matrix::row_permute | gko::matrix::inverse_permute);
+    perm->apply(alpha, x, beta, y);
 
-    rperm->apply(x, y);
     // clang-format off
     GKO_ASSERT_MTX_NEAR(y,
-                        l({{0.0, 4.0, 2.5},
-                           {2.0, 3.0, 0.0},
-                           {0.0, 1.0, 0.0}}),
-                          0.0);
+                        l({{6.0, 2.0},
+                           {0.0, 3.5}}),
+                        0.0);
     // clang-format on
 }
 
 
-TYPED_TEST(Permutation, AppliesInverseColPermutationToCsr)
+TYPED_TEST(Permutation, ApplyFailsWithNonDenseMatrix)
 {
-    using i_type = typename TestFixture::i_type;
-    using Csr = typename TestFixture::Csr;
-    // clang-format off
-    auto x = gko::initialize<Csr>({{2.0, 3.0, 0.0},
-                                   {0.0, 1.0, 0.0},
-                                   {0.0, 4.0, 2.5}},
-                                  this->exec);
-    // clang-format on
-    auto y = Csr::create(this->exec, gko::dim<2>{3});
-    i_type cdata[] = {1, 2, 0};
-
-    auto cperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata),
-        gko::matrix::inverse_permute | gko::matrix::column_permute);
-
-    cperm->apply(x, y);
-    // clang-format off
-    GKO_ASSERT_MTX_NEAR(y,
-                      l({{0.0, 2.0, 3.0},
-                         {0.0, 0.0, 1.0},
-                         {2.5, 0.0, 4.0}}),
-                      0.0);
-    // clang-format on
-}
-
+    using index_type = typename TestFixture::index_type;
+    using T = typename TestFixture::value_type;
+    auto mtx = gko::matrix::Csr<T, index_type>::create(this->exec);
+    auto mtx2 = mtx->clone();
+    auto perm = gko::matrix::Permutation<index_type>::create(this->exec);
 
-TYPED_TEST(Permutation, AppliesInverseRowAndColPermutationToCsr)
-{
-    using i_type = typename TestFixture::i_type;
-    using Csr = typename TestFixture::Csr;
-    // clang-format off
-    auto x = gko::initialize<Csr>({{2.0, 3.0, 0.0},
-                                   {0.0, 1.0, 0.0},
-                                   {0.0, 4.0, 2.5}},
-                                  this->exec);
-    // clang-format on
-    auto y1 = Csr::create(this->exec, gko::dim<2>{3});
-    auto y2 = Csr::create(this->exec, gko::dim<2>{3});
-    i_type cdata[] = {1, 2, 0};
-    i_type rdata[] = {1, 2, 0};
-
-    auto rperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, rdata),
-        gko::matrix::row_permute | gko::matrix::inverse_permute);
-    auto cperm = gko::matrix::Permutation<i_type>::create(
-        this->exec, gko::dim<2>{3}, gko::make_array_view(this->exec, 3, cdata),
-        gko::matrix::inverse_permute | gko::matrix::column_permute);
-
-    rperm->apply(x, y1);
-    cperm->apply(y1, y2);
-    // clang-format off
-    GKO_ASSERT_MTX_NEAR(y2,
-                        l({{2.5, 0.0, 4.0},
-                           {0.0, 2.0, 3.0},
-                           {0.0, 0.0, 1.0}}),
-                        0.0);
-    // clang-format on
+    ASSERT_THROW(perm->apply(mtx, mtx2), gko::NotSupported);
 }
 
 
diff --git a/reference/test/matrix/scaled_permutation.cpp b/reference/test/matrix/scaled_permutation.cpp
new file mode 100644
index 00000000000..028716694f4
--- /dev/null
+++ b/reference/test/matrix/scaled_permutation.cpp
@@ -0,0 +1,246 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/matrix/scaled_permutation.hpp>
+
+
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
+
+
+#include "core/test/utils.hpp"
+
+
+namespace {
+
+
+template <typename ValueIndexType>
+class ScaledPermutation : public ::testing::Test {
+protected:
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Vec = gko::matrix::Dense<value_type>;
+    using Mtx = gko::matrix::ScaledPermutation<value_type, index_type>;
+
+    ScaledPermutation() : exec(gko::ReferenceExecutor::create())
+    {
+        perm3 = Mtx::create(exec,
+                            gko::array<value_type>{this->exec, {1.0, 2.0, 4.0}},
+                            gko::array<index_type>{this->exec, {1, 2, 0}});
+        perm2 =
+            Mtx::create(exec, gko::array<value_type>{this->exec, {3.0, 5.0}},
+                        gko::array<index_type>{this->exec, {1, 0}});
+    }
+
+    std::unique_ptr<Vec> ref_combine(const Mtx* first, const Mtx* second)
+    {
+        const auto exec = first->get_executor();
+        gko::matrix_data<value_type, index_type> first_perm_data;
+        gko::matrix_data<value_type, index_type> second_perm_data;
+        first->write(first_perm_data);
+        second->write(second_perm_data);
+        const auto first_mtx = Vec::create(exec);
+        const auto second_mtx = Vec::create(exec);
+        first_mtx->read(first_perm_data);
+        second_mtx->read(second_perm_data);
+        auto combined_mtx = first_mtx->clone();
+        second_mtx->apply(first_mtx, combined_mtx);
+        return combined_mtx;
+    }
+
+    std::shared_ptr<const gko::Executor> exec;
+    std::unique_ptr<Mtx> perm3;
+    std::unique_ptr<Mtx> perm2;
+};
+
+TYPED_TEST_SUITE(ScaledPermutation, gko::test::ValueIndexTypes,
+                 PairTypenameNameGenerator);
+
+
+TYPED_TEST(ScaledPermutation, Invert)
+{
+    using T = typename TestFixture::value_type;
+    auto inv = this->perm3->compute_inverse();
+
+    EXPECT_EQ(inv->get_const_permutation()[0], 2);
+    EXPECT_EQ(inv->get_const_permutation()[1], 0);
+    EXPECT_EQ(inv->get_const_permutation()[2], 1);
+    EXPECT_EQ(inv->get_const_scaling_factors()[0], T{0.5});
+    EXPECT_EQ(inv->get_const_scaling_factors()[1], T{0.25});
+    EXPECT_EQ(inv->get_const_scaling_factors()[2], T{1.0});
+}
+
+
+TYPED_TEST(ScaledPermutation, CreateFromPermutation)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Mtx = typename TestFixture::Mtx;
+    auto non_scaled = gko::matrix::Permutation<index_type>::create(
+        this->exec, gko::array<index_type>{this->exec, {1, 2, 0}});
+
+    auto scaled = Mtx::create(non_scaled);
+
+    EXPECT_EQ(scaled->get_const_permutation()[0], 1);
+    EXPECT_EQ(scaled->get_const_permutation()[1], 2);
+    EXPECT_EQ(scaled->get_const_permutation()[2], 0);
+    EXPECT_EQ(scaled->get_const_scaling_factors()[0], gko::one<value_type>());
+    EXPECT_EQ(scaled->get_const_scaling_factors()[1], gko::one<value_type>());
+    EXPECT_EQ(scaled->get_const_scaling_factors()[2], gko::one<value_type>());
+}
+
+
+TYPED_TEST(ScaledPermutation, Combine)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Vec = typename TestFixture::Vec;
+    using Mtx = typename TestFixture::Mtx;
+    const auto other_perm = Mtx::create(
+        this->exec, gko::array<value_type>{this->exec, {3.0, 5.0, 7.0}},
+        gko::array<index_type>{this->exec, {1, 0, 2}});
+    const auto ref_combined =
+        this->ref_combine(this->perm3.get(), other_perm.get());
+
+    const auto combined = this->perm3->compose(other_perm);
+
+    GKO_ASSERT_MTX_NEAR(combined, ref_combined, 0.0);
+}
+
+
+TYPED_TEST(ScaledPermutation, CombineLarger)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    using Vec = typename TestFixture::Vec;
+    using Mtx = typename TestFixture::Mtx;
+    const auto perm = Mtx::create(
+        this->exec,
+        gko::array<value_type>{
+            this->exec,
+            {1.0, 2.0, 3.0, 5.0, 7.0, 11.0, 13.0, 17.0, 19.0, 23.0}},
+        gko::array<index_type>{this->exec, {6, 2, 4, 0, 1, 5, 9, 8, 3, 7}});
+    const auto perm2 = Mtx::create(
+        this->exec,
+        gko::array<value_type>{
+            this->exec,
+            {29.0, 31.0, 37.0, 41.0, 43.0, 47.0, 53.0, 59.0, 61.0, 67.0}},
+        gko::array<index_type>{this->exec, {9, 2, 1, 6, 3, 7, 8, 4, 0, 5}});
+    const auto ref_combined = this->ref_combine(perm.get(), perm2.get());
+
+    const auto combined = perm->compose(perm2);
+
+    GKO_ASSERT_MTX_NEAR(combined, ref_combined, 0.0);
+}
+
+
+TYPED_TEST(ScaledPermutation, CombineWithInverse)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    const gko::size_type size = 20;
+    auto rng = std::default_random_engine{3754};
+    auto dist = std::uniform_real_distribution<gko::remove_complex<value_type>>{
+        1.0, 2.0};
+    auto perm = gko::matrix::ScaledPermutation<value_type, index_type>::create(
+        this->exec, size);
+    std::iota(perm->get_permutation(), perm->get_permutation() + size, 0);
+    std::shuffle(perm->get_permutation(), perm->get_permutation() + size, rng);
+    for (gko::size_type i = 0; i < size; i++) {
+        perm->get_scaling_factors()[i] = dist(rng);
+    }
+
+    auto combined = perm->compose(perm->compute_inverse());
+
+    for (index_type i = 0; i < size; i++) {
+        ASSERT_EQ(combined->get_const_permutation()[i], i);
+        ASSERT_LT(gko::abs(combined->get_const_scaling_factors()[i] -
+                           gko::one<value_type>()),
+                  r<value_type>::value);
+    }
+}
+
+
+TYPED_TEST(ScaledPermutation, CombineFailsWithMismatchingSize)
+{
+    ASSERT_THROW(this->perm3->compose(this->perm2), gko::DimensionMismatch);
+}
+
+
+TYPED_TEST(ScaledPermutation, Write)
+{
+    using T = typename TestFixture::value_type;
+
+    GKO_ASSERT_MTX_NEAR(
+        this->perm3, l<T>({{0.0, 2.0, 0.0}, {0.0, 0.0, 4.0}, {1.0, 0.0, 0.0}}),
+        0.0);
+}
+
+
+TYPED_TEST(ScaledPermutation, AppliesToDense)
+{
+    using T = typename TestFixture::value_type;
+    using Vec = typename TestFixture::Vec;
+    auto x = gko::initialize<Vec>({I<T>{2.0, 3.0}, I<T>{4.0, 2.5}}, this->exec);
+    auto y = Vec::create(this->exec, gko::dim<2>{2});
+
+    this->perm2->apply(x, y);
+
+    GKO_ASSERT_MTX_NEAR(y, l({{20.0, 12.5}, {6.0, 9.0}}), 0.0);
+}
+
+
+TYPED_TEST(ScaledPermutation, AdvancedAppliesToDense)
+{
+    using T = typename TestFixture::value_type;
+    using Vec = typename TestFixture::Vec;
+    auto alpha = gko::initialize<Vec>({2.0}, this->exec);
+    auto beta = gko::initialize<Vec>({-1.0}, this->exec);
+    auto x = gko::initialize<Vec>({I<T>{2.0, 3.0}, I<T>{4.0, 2.5}}, this->exec);
+    auto y = x->clone();
+
+    this->perm2->apply(alpha, x, beta, y);
+
+    GKO_ASSERT_MTX_NEAR(y, l({{38.0, 22.0}, {8.0, 15.5}}), 0.0);
+}
+
+
+}  // namespace
diff --git a/reference/test/matrix/sparsity_csr_kernels.cpp b/reference/test/matrix/sparsity_csr_kernels.cpp
index 4d356ffd828..dde558d27fd 100644
--- a/reference/test/matrix/sparsity_csr_kernels.cpp
+++ b/reference/test/matrix/sparsity_csr_kernels.cpp
@@ -96,7 +96,7 @@ class SparsityCsr : public ::testing::Test {
     {
         index_type* c = m->get_col_idxs();
         index_type* r = m->get_row_ptrs();
-        // It keeps an explict zero
+        // It keeps an explicit zero
         /*
          *  1    1   1
          * {0}   1   0
diff --git a/reference/test/preconditioner/ilu.cpp b/reference/test/preconditioner/ilu.cpp
index 3511de4f011..5150626c898 100644
--- a/reference/test/preconditioner/ilu.cpp
+++ b/reference/test/preconditioner/ilu.cpp
@@ -80,37 +80,29 @@ class Ilu : public ::testing::Test {
           u_factor(gko::initialize<Mtx>(
               {{2., 1., 1.}, {0., 4., 1.}, {0., 0., 3.}}, exec)),
           l_u_composition(Composition::create(l_factor, u_factor)),
-          l_factory(
-              l_solver_type::build()
-                  .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(10u).on(
-                          exec),
-                      gko::stop::Time::build()
-                          .with_time_limit(std::chrono::seconds(6))
-                          .on(exec),
-                      gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<T>::value)
-                          .on(exec))
-                  .on(exec)),
-          u_factory(
-              u_solver_type::build()
-                  .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(10u).on(
-                          exec),
-                      gko::stop::Time::build()
-                          .with_time_limit(std::chrono::seconds(6))
-                          .on(exec),
-                      gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<T>::value)
-                          .on(exec))
-                  .on(exec)),
+          l_factory(l_solver_type::build()
+                        .with_criteria(
+                            gko::stop::Iteration::build().with_max_iters(10u),
+                            gko::stop::Time::build().with_time_limit(
+                                std::chrono::seconds(6)),
+                            gko::stop::ResidualNorm<value_type>::build()
+                                .with_reduction_factor(r<T>::value))
+                        .on(exec)),
+          u_factory(u_solver_type::build()
+                        .with_criteria(
+                            gko::stop::Iteration::build().with_max_iters(10u),
+                            gko::stop::Time::build().with_time_limit(
+                                std::chrono::seconds(6)),
+                            gko::stop::ResidualNorm<value_type>::build()
+                                .with_reduction_factor(r<T>::value))
+                        .on(exec)),
           ilu_pre_factory(ilu_prec_type::build()
-                              .with_l_solver_factory(l_factory)
-                              .with_u_solver_factory(u_factory)
+                              .with_l_solver(l_factory)
+                              .with_u_solver(u_factory)
                               .on(exec)),
           ilu_rev_pre_factory(ilu_rev_prec_type::build()
-                                  .with_l_solver_factory(l_factory)
-                                  .with_u_solver_factory(u_factory)
+                                  .with_l_solver(l_factory)
+                                  .with_u_solver(u_factory)
                                   .on(exec))
     {}
 
@@ -301,7 +293,7 @@ TYPED_TEST(Ilu, SolvesCustomTypeDefaultFactorySingleRhs)
         ilu_prec_type::build().on(this->exec)->generate(this->mtx);
     preconditioner->apply(b, x);
 
-    // Since it uses Bicgstab with default parmeters, the result will not be
+    // Since it uses Bicgstab with default parameters, the result will not be
     // accurate
     GKO_ASSERT_MTX_NEAR(x, l({-0.125, 0.25, 1.0}), 1e-1);
 }
@@ -622,9 +614,8 @@ TEST_F(DefaultIlu, CanBeUsedAsPreconditioner)
 {
     auto solver =
         gko::solver::Bicgstab<>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(2u).on(this->exec))
-            .with_preconditioner(default_ilu_prec_type::build().on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(2u))
+            .with_preconditioner(default_ilu_prec_type::build())
             .on(this->exec)
             ->generate(this->mtx);
     auto x = Mtx::create(this->exec, gko::dim<2>{3, 1});
@@ -643,8 +634,7 @@ TEST_F(DefaultIlu, CanBeUsedAsGeneratedPreconditioner)
         default_ilu_prec_type::build().on(this->exec)->generate(this->mtx);
     auto solver =
         gko::solver::Bicgstab<>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(2u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(2u))
             .with_generated_preconditioner(precond)
             .on(this->exec)
             ->generate(this->mtx);
diff --git a/reference/test/preconditioner/isai_kernels.cpp b/reference/test/preconditioner/isai_kernels.cpp
index eea171d60fe..86d0f40142a 100644
--- a/reference/test/preconditioner/isai_kernels.cpp
+++ b/reference/test/preconditioner/isai_kernels.cpp
@@ -82,16 +82,13 @@ class Isai : public ::testing::Test {
         : exec{gko::ReferenceExecutor::create()},
           excess_solver_factory(
               excess_solver_type::build()
-                  .with_preconditioner(
-                      bj::build().with_max_block_size(16u).on(exec))
+                  .with_preconditioner(bj::build().with_max_block_size(16u))
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(1000u).on(
-                          exec),
+                      gko::stop::Iteration::build().with_max_iters(1000u),
                       gko::stop::ResidualNorm<value_type>::build()
                           .with_baseline(gko::stop::mode::rhs_norm)
                           .with_reduction_factor(
-                              gko::remove_complex<value_type>{1e-6})
-                          .on(exec))
+                              gko::remove_complex<value_type>{1e-6}))
                   .on(exec)),
           a_dense{gko::initialize<Dense>({{2, 1, 2}, {1, -2, 3}, {-1, 1, 1}},
                                          exec)},
diff --git a/reference/test/reorder/CMakeLists.txt b/reference/test/reorder/CMakeLists.txt
index 766decfb749..731fb1de8f5 100644
--- a/reference/test/reorder/CMakeLists.txt
+++ b/reference/test/reorder/CMakeLists.txt
@@ -3,4 +3,6 @@ if(GINKGO_HAVE_METIS)
 endif()
 ginkgo_create_test(rcm)
 ginkgo_create_test(rcm_kernels)
+ginkgo_create_test(mc64)
+ginkgo_create_test(mc64_kernels)
 ginkgo_create_test(scaled_reordered)
diff --git a/reference/test/reorder/mc64.cpp b/reference/test/reorder/mc64.cpp
new file mode 100644
index 00000000000..e06d040e48e
--- /dev/null
+++ b/reference/test/reorder/mc64.cpp
@@ -0,0 +1,167 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/reorder/mc64.hpp>
+
+
+#include <algorithm>
+#include <fstream>
+#include <memory>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/composition.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
+#include <ginkgo/core/matrix/scaled_permutation.hpp>
+#include <ginkgo/core/matrix/sparsity_csr.hpp>
+
+
+#include "core/test/utils.hpp"
+#include "core/test/utils/assertions.hpp"
+
+
+namespace {
+
+
+template <typename ValueIndexType>
+class Mc64 : public ::testing::Test {
+protected:
+    using v_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using i_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using real_type = gko::remove_complex<v_type>;
+    using reorder_type = gko::experimental::reorder::Mc64<v_type, i_type>;
+    using perm_type = gko::matrix::ScaledPermutation<v_type, i_type>;
+    using result_type = gko::Composition<v_type>;
+    using Mtx = gko::matrix::Dense<v_type>;
+    using CsrMtx = gko::matrix::Csr<v_type, i_type>;
+    Mc64()
+        : exec(gko::ReferenceExecutor::create()),
+          mc64_factory(reorder_type::build().on(exec)),
+          id3_mtx(gko::initialize<CsrMtx>(
+              {{1.0, 0.0, 0.0}, {0.0, 1.0, 0.0}, {0.0, 0.0, 1.0}}, exec)),
+          not_id3_mtx(gko::initialize<CsrMtx>(
+              {{1.0, 0.0, 2.0}, {0.0, 1.0, 0.0}, {2.0, 0.0, 1.0}}, exec))
+    {}
+
+    std::pair<std::shared_ptr<const perm_type>,
+              std::shared_ptr<const perm_type>>
+    unpack(const result_type* result)
+    {
+        GKO_ASSERT_EQ(result->get_operators().size(), 2);
+        return std::make_pair(gko::as<perm_type>(result->get_operators()[0]),
+                              gko::as<perm_type>(result->get_operators()[1]));
+    }
+
+    void assert_correct_permutation(const result_type* mc64)
+    {
+        auto perm = unpack(mc64).first->get_const_permutation();
+
+        ASSERT_EQ(perm[0], 0);
+        ASSERT_EQ(perm[1], 1);
+        ASSERT_EQ(perm[2], 2);
+    }
+
+    std::shared_ptr<const gko::Executor> exec;
+    std::shared_ptr<CsrMtx> id3_mtx;
+    std::shared_ptr<CsrMtx> not_id3_mtx;
+    std::unique_ptr<reorder_type> mc64_factory;
+};
+
+TYPED_TEST_SUITE(Mc64, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+
+
+TYPED_TEST(Mc64, HasSensibleDefaults)
+{
+    using real_type = typename TestFixture::real_type;
+
+    ASSERT_EQ(this->mc64_factory->get_parameters().strategy,
+              gko::experimental::reorder::mc64_strategy::max_diagonal_product);
+    ASSERT_EQ(this->mc64_factory->get_parameters().tolerance, real_type{1e-14});
+}
+
+
+TYPED_TEST(Mc64, CanBeCreatedWithReorderingStrategy)
+{
+    using reorder_type = typename TestFixture::reorder_type;
+
+    auto mc64 =
+        reorder_type::build()
+            .with_strategy(
+                gko::experimental::reorder::mc64_strategy::max_diagonal_sum)
+            .on(this->exec)
+            ->generate(this->not_id3_mtx);
+
+    auto perm = this->unpack(mc64.get()).first->get_const_permutation();
+    ASSERT_EQ(perm[0], 2);
+    ASSERT_EQ(perm[1], 1);
+    ASSERT_EQ(perm[2], 0);
+}
+
+
+TYPED_TEST(Mc64, CanBeCreatedEmpty)
+{
+    using reorder_type = typename TestFixture::reorder_type;
+    using matrix_type = typename TestFixture::CsrMtx;
+
+    auto mc64 =
+        reorder_type::build()
+            .with_strategy(
+                gko::experimental::reorder::mc64_strategy::max_diagonal_sum)
+            .on(this->exec)
+            ->generate(matrix_type::create(this->exec));
+
+    ASSERT_FALSE(mc64->get_size());
+}
+
+
+TYPED_TEST(Mc64, CanBeCreatedWithTolerance)
+{
+    using reorder_type = typename TestFixture::reorder_type;
+    using real_type = typename TestFixture::real_type;
+
+    auto mc64 = reorder_type::build()
+                    .with_tolerance(real_type{1e-10})
+                    .on(this->exec)
+                    ->generate(this->id3_mtx);
+
+    this->assert_correct_permutation(mc64.get());
+}
+
+
+}  // namespace
diff --git a/reference/test/reorder/mc64_kernels.cpp b/reference/test/reorder/mc64_kernels.cpp
new file mode 100644
index 00000000000..35db2ca95a8
--- /dev/null
+++ b/reference/test/reorder/mc64_kernels.cpp
@@ -0,0 +1,432 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/reorder/mc64.hpp>
+
+
+#include <algorithm>
+#include <cmath>
+#include <fstream>
+#include <limits>
+#include <memory>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
+
+
+#include "core/components/addressable_pq.hpp"
+#include "core/reorder/mc64.hpp"
+#include "core/test/utils.hpp"
+#include "core/test/utils/assertions.hpp"
+#include "matrices/config.hpp"
+
+
+namespace {
+
+
+template <typename ValueIndexType>
+class Mc64 : public ::testing::Test {
+protected:
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using real_type = gko::remove_complex<value_type>;
+    using matrix_type = gko::matrix::Csr<value_type, index_type>;
+    using perm_type = gko::matrix::ScaledPermutation<value_type, index_type>;
+    // this is a constexpr member functions to avoid having to export the symbol
+    // for the constant variable
+    static constexpr real_type inf()
+    {
+        return std::numeric_limits<real_type>::infinity();
+    }
+
+    Mc64()
+        : ref(gko::ReferenceExecutor::create()),
+          tmp{ref},
+          mtx(gko::initialize<matrix_type>({{1., 2., 0., 0., 3., 0.},
+                                            {5., 1., 0., 0., 0., 0.},
+                                            {0., 0., 0., 6., 0., 4.},
+                                            {0., 0., 4., 0., 0., 3.},
+                                            {0., 0., 0., 4., 2., 0.},
+                                            {0., 5., 8., 0., 0., 0.}},
+                                           ref)),
+          weights{ref, 13},
+          dual_u{ref, 6},
+          distance{ref, 6},
+          row_maxima{ref, 6},
+          initialized_weights_sum{ref, I<real_type>{2., 1., 0., 0., 4., 0., 2.,
+                                                    0., 1., 0., 2., 3., 0.}},
+          initialized_dual_u_sum{ref, I<real_type>{0., 1., 0., 0., 0., 1.}},
+          initialized_row_maxima_sum{ref, I<real_type>{3., 5., 6., 4., 4., 8.}},
+          // if the logarithms are merged together, the rounding messes up the
+          // accuracy for GKO_ASSRT_ARRAY_EQ
+          initialized_weights_product{
+              ref,
+              I<real_type>{static_cast<real_type>(std::log2(3.)),
+                           static_cast<real_type>(std::log2(3.)) -
+                               static_cast<real_type>(std::log2(2.)),
+                           0., 0., static_cast<real_type>(std::log2(5.)), 0.,
+                           static_cast<real_type>(std::log2(6.)) -
+                               static_cast<real_type>(std::log2(4.)),
+                           0.,
+                           static_cast<real_type>(std::log2(4.)) -
+                               static_cast<real_type>(std::log2(3.)),
+                           0.,
+                           static_cast<real_type>(std::log2(4.)) -
+                               static_cast<real_type>(std::log2(2.)),
+                           static_cast<real_type>(std::log2(8.)) -
+                               static_cast<real_type>(std::log2(5.)),
+                           0.}},
+          initialized_dual_u_product{
+              ref, I<real_type>{0.,
+                                static_cast<real_type>(std::log2(3.)) -
+                                    static_cast<real_type>(std::log2(2.)),
+                                0., 0., 0.,
+                                static_cast<real_type>(std::log2(4.)) -
+                                    static_cast<real_type>(std::log2(3.))}},
+          initialized_row_maxima_product{
+              ref, I<real_type>{static_cast<real_type>(std::log2(3.)),
+                                static_cast<real_type>(std::log2(5.)),
+                                static_cast<real_type>(std::log2(6.)),
+                                static_cast<real_type>(std::log2(4.)),
+                                static_cast<real_type>(std::log2(4.)),
+                                static_cast<real_type>(std::log2(8.))}},
+          initialized_distance{
+              ref, I<real_type>{inf(), inf(), inf(), inf(), inf(), inf()}},
+          empty_permutation{ref, I<index_type>{-1, -1, -1, -1, -1, -1}},
+          empty_inverse_permutation{ref, I<index_type>{-1, -1, -1, -1, -1, -1}},
+          empty_matched_idxs{ref, I<index_type>{0, 0, 0, 0, 0, 0}},
+          empty_unmatched_rows{ref, I<index_type>{0, 0, 0, 0, 0, 0}},
+          initial_parents{ref, I<index_type>{0, 0, 0, 0, 0, 0}},
+          initial_generation{ref, I<index_type>{0, 0, 0, 0, 0, 0}},
+          initial_marked_cols{ref, I<index_type>{0, 0, 0, 0, 0, 0}},
+          initial_matched_idxs{ref, I<index_type>{1, 3, 5, 8, 0, 12}},
+          initial_unmatched_rows{ref, I<index_type>{4, -1, 0, 0, 0, 0}},
+          initial_matching_permutation{ref, I<index_type>{1, 0, 3, 5, -1, 2}},
+          initial_matching_inverse_permutation{
+              ref, I<index_type>{1, 0, 5, 2, -1, 3}},
+          final_permutation{ref, I<index_type>{1, 0, 3, 5, 4, 2}},
+          final_inverse_permutation{ref, I<index_type>{1, 0, 5, 2, 4, 3}},
+          final_parents{ref, I<index_type>{0, 0, 3, 4, 4, 2}},
+          final_generation{ref, I<index_type>{0, 0, -4, -4, 0, -4}},
+          final_marked_cols{ref, I<index_type>{3, 5, 2, 0, 0, 0}},
+          final_matched_idxs{ref, I<index_type>{1, 3, 5, 8, 10, 12}},
+          final_weights{ref, I<real_type>{2., 1., 0., 0., 4., 0., 2., 0., 1.,
+                                          0., 2., 3., 0.}},
+          final_dual_u{ref, I<real_type>{0., 1., -1., -2., 0., 0.}},
+          final_distance{ref, I<real_type>{inf(), inf(), 1., 0., inf(), 1.}},
+          zero_tol{1e-14}
+    {}
+
+    std::pair<std::shared_ptr<const perm_type>,
+              std::shared_ptr<const perm_type>>
+    unpack(const gko::Composition<value_type>* result)
+    {
+        return std::make_pair(gko::as<perm_type>(result->get_operators()[0]),
+                              gko::as<perm_type>(result->get_operators()[1]));
+    }
+
+    void assert_array_near(const gko::array<real_type>& a,
+                           const gko::array<real_type>& b, std::string name)
+    {
+        ASSERT_EQ(a.get_num_elems(), b.get_num_elems());
+        for (gko::size_type i = 0; i < a.get_num_elems(); i++) {
+            if (std::isfinite(a.get_const_data()[i]) ||
+                std::isfinite(b.get_const_data()[i])) {
+                ASSERT_NEAR(a.get_const_data()[i], b.get_const_data()[i],
+                            r<value_type>::value)
+                    << name << '[' << i << ']';
+            }
+        }
+    }
+
+    std::shared_ptr<const gko::ReferenceExecutor> ref;
+    gko::array<real_type> tmp;
+    gko::array<real_type> weights;
+    gko::array<real_type> dual_u;
+    gko::array<real_type> distance;
+    gko::array<real_type> row_maxima;
+    gko::array<real_type> initialized_weights_sum;
+    gko::array<real_type> initialized_dual_u_sum;
+    gko::array<real_type> initialized_row_maxima_sum;
+    gko::array<real_type> initialized_weights_product;
+    gko::array<real_type> initialized_dual_u_product;
+    gko::array<real_type> initialized_row_maxima_product;
+    gko::array<real_type> initialized_distance;
+    gko::array<real_type> final_weights;
+    gko::array<real_type> final_dual_u;
+    gko::array<real_type> final_distance;
+    gko::array<index_type> empty_permutation;
+    gko::array<index_type> empty_inverse_permutation;
+    gko::array<index_type> empty_matched_idxs;
+    gko::array<index_type> empty_unmatched_rows;
+    gko::array<index_type> initial_matching_permutation;
+    gko::array<index_type> initial_matching_inverse_permutation;
+    gko::array<index_type> initial_parents;
+    gko::array<index_type> initial_generation;
+    gko::array<index_type> initial_marked_cols;
+    gko::array<index_type> initial_matched_idxs;
+    gko::array<index_type> initial_unmatched_rows;
+    gko::array<index_type> final_permutation;
+    gko::array<index_type> final_inverse_permutation;
+    gko::array<index_type> final_parents;
+    gko::array<index_type> final_generation;
+    gko::array<index_type> final_marked_cols;
+    gko::array<index_type> final_matched_idxs;
+    std::shared_ptr<matrix_type> mtx;
+    const real_type zero_tol;
+};
+
+TYPED_TEST_SUITE(Mc64, gko::test::ValueIndexTypes, PairTypenameNameGenerator);
+
+
+TYPED_TEST(Mc64, InitializeWeightsSum)
+{
+    this->dual_u.fill(
+        std::numeric_limits<typename TestFixture::real_type>::infinity());
+
+    gko::experimental::reorder::mc64::initialize_weights(
+        this->mtx.get(), this->weights, this->dual_u, this->row_maxima,
+        gko::experimental::reorder::mc64_strategy::max_diagonal_sum);
+
+    this->assert_array_near(this->weights, this->initialized_weights_sum,
+                            "weights");
+    this->assert_array_near(this->dual_u, this->initialized_dual_u_sum,
+                            "dual_u");
+    this->assert_array_near(this->row_maxima, this->initialized_row_maxima_sum,
+                            "row_maxima");
+}
+
+
+TYPED_TEST(Mc64, InitializeWeightsProduct)
+{
+    this->dual_u.fill(
+        std::numeric_limits<typename TestFixture::real_type>::infinity());
+
+    gko::experimental::reorder::mc64::initialize_weights(
+        this->mtx.get(), this->weights, this->dual_u, this->row_maxima,
+        gko::experimental::reorder::mc64_strategy::max_diagonal_product);
+
+    this->assert_array_near(this->weights, this->initialized_weights_product,
+                            "weights");
+    this->assert_array_near(this->dual_u, this->initialized_dual_u_product,
+                            "dual_u");
+    this->assert_array_near(this->row_maxima,
+                            this->initialized_row_maxima_product, "row_maxima");
+}
+
+
+TYPED_TEST(Mc64, InitialMatching)
+{
+    const auto num_rows = this->mtx->get_size()[0];
+
+    gko::experimental::reorder::mc64::initial_matching(
+        num_rows, this->mtx->get_const_row_ptrs(),
+        this->mtx->get_const_col_idxs(), this->initialized_weights_sum,
+        this->initialized_dual_u_sum, this->empty_permutation,
+        this->empty_inverse_permutation, this->empty_matched_idxs,
+        this->empty_unmatched_rows, this->zero_tol);
+
+    GKO_ASSERT_ARRAY_EQ(this->empty_permutation,
+                        this->initial_matching_permutation);
+    GKO_ASSERT_ARRAY_EQ(this->empty_inverse_permutation,
+                        this->initial_matching_inverse_permutation);
+    GKO_ASSERT_ARRAY_EQ(this->empty_matched_idxs, this->initial_matched_idxs);
+    GKO_ASSERT_ARRAY_EQ(this->empty_unmatched_rows,
+                        this->initial_unmatched_rows);
+}
+
+
+TYPED_TEST(Mc64, ShortestAugmentingPath)
+{
+    using index_type = typename TestFixture::index_type;
+    using real_type = typename TestFixture::real_type;
+    gko::addressable_priority_queue<real_type, index_type> Q{
+        this->ref, this->mtx->get_size()[0]};
+    std::vector<index_type> q_j{};
+
+    gko::experimental::reorder::mc64::shortest_augmenting_path(
+        this->mtx->get_size()[0], this->mtx->get_const_row_ptrs(),
+        this->mtx->get_const_col_idxs(), this->initialized_weights_sum,
+        this->initialized_dual_u_sum, this->initialized_distance,
+        this->initial_matching_permutation,
+        this->initial_matching_inverse_permutation, 4 * gko::one<index_type>(),
+        this->initial_parents, this->initial_generation,
+        this->initial_marked_cols, this->initial_matched_idxs, Q, q_j,
+        this->zero_tol);
+
+    GKO_ASSERT_ARRAY_EQ(this->initial_matching_permutation,
+                        this->final_permutation);
+    GKO_ASSERT_ARRAY_EQ(this->initial_matching_inverse_permutation,
+                        this->final_inverse_permutation);
+    GKO_ASSERT_ARRAY_EQ(this->initial_parents, this->final_parents);
+    GKO_ASSERT_ARRAY_EQ(this->initial_generation, this->final_generation);
+    GKO_ASSERT_ARRAY_EQ(this->initial_marked_cols, this->final_marked_cols);
+    GKO_ASSERT_ARRAY_EQ(this->initial_matched_idxs, this->final_matched_idxs);
+    this->assert_array_near(this->initialized_weights_sum, this->final_weights,
+                            "weights");
+    this->assert_array_near(this->initialized_dual_u_sum, this->final_dual_u,
+                            "dual_u");
+    this->assert_array_near(this->initialized_distance, this->final_distance,
+                            "distance");
+}
+
+
+TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingExampleSum)
+{
+    using index_type = typename TestFixture::index_type;
+    using real_type = typename TestFixture::real_type;
+    using value_type = typename TestFixture::value_type;
+    auto mc64_factory =
+        gko::experimental::reorder::Mc64<value_type, index_type>::build()
+            .with_strategy(
+                gko::experimental::reorder::mc64_strategy::max_diagonal_sum)
+            .on(this->ref);
+
+    auto mc64 = mc64_factory->generate(this->mtx);
+
+    auto perm_obj = this->unpack(mc64.get()).first;
+    auto perm = perm_obj->get_const_permutation();
+    ASSERT_EQ(perm[0], 1);
+    ASSERT_EQ(perm[1], 0);
+    ASSERT_EQ(perm[2], 5);
+    ASSERT_EQ(perm[3], 2);
+    ASSERT_EQ(perm[4], 4);
+    ASSERT_EQ(perm[5], 3);
+}
+
+
+TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingExampleProduct)
+{
+    using index_type = typename TestFixture::index_type;
+    using value_type = typename TestFixture::value_type;
+    auto mc64_factory =
+        gko::experimental::reorder::Mc64<value_type, index_type>::build()
+            .with_strategy(
+                gko::experimental::reorder::mc64_strategy::max_diagonal_product)
+            .on(this->ref);
+    auto mc64 = mc64_factory->generate(this->mtx);
+
+    auto perm = this->unpack(mc64.get()).first->get_const_permutation();
+    auto row_scaling =
+        this->unpack(mc64.get()).first->get_const_scaling_factors();
+    auto col_scaling =
+        this->unpack(mc64.get()).second->get_const_scaling_factors();
+
+    ASSERT_EQ(perm[0], 1);
+    ASSERT_EQ(perm[1], 5);
+    ASSERT_EQ(perm[2], 3);
+    ASSERT_EQ(perm[3], 4);
+    ASSERT_EQ(perm[4], 0);
+    ASSERT_EQ(perm[5], 2);
+    GKO_ASSERT_NEAR(row_scaling[0], value_type{1. / 3.}, r<value_type>::value);
+    GKO_ASSERT_NEAR(row_scaling[1], value_type{0.2}, r<value_type>::value);
+    GKO_ASSERT_NEAR(row_scaling[2], value_type{0.2}, r<value_type>::value);
+    GKO_ASSERT_NEAR(row_scaling[3], value_type{4. / 15.}, r<value_type>::value);
+    GKO_ASSERT_NEAR(row_scaling[4], value_type{0.3}, r<value_type>::value);
+    GKO_ASSERT_NEAR(row_scaling[5], value_type{2. / 15.}, r<value_type>::value);
+    GKO_ASSERT_NEAR(col_scaling[0], value_type{1.}, r<value_type>::value);
+    GKO_ASSERT_NEAR(col_scaling[1], value_type{1.5}, r<value_type>::value);
+    GKO_ASSERT_NEAR(col_scaling[2], value_type{0.9375}, r<value_type>::value);
+    GKO_ASSERT_NEAR(col_scaling[3], value_type{5. / 6.}, r<value_type>::value);
+    GKO_ASSERT_NEAR(col_scaling[4], value_type{1.}, r<value_type>::value);
+    GKO_ASSERT_NEAR(col_scaling[5], value_type{1.25}, r<value_type>::value);
+}
+
+
+TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingLargeTrivialExampleProduct)
+{
+    using index_type = typename TestFixture::index_type;
+    using real_type = typename TestFixture::real_type;
+    using value_type = typename TestFixture::value_type;
+    using matrix_type = typename TestFixture::matrix_type;
+    using perm_type = typename TestFixture::perm_type;
+    // read input data
+    std::ifstream mtx_stream{gko::matrices::location_1138_bus_mtx};
+    auto mtx = gko::share(gko::read<matrix_type>(mtx_stream, this->ref));
+    std::ifstream result_stream{gko::matrices::location_1138_bus_mc64_result};
+    auto expected_result = gko::read<matrix_type>(result_stream, this->ref);
+    // compute mc64
+    auto mc64_factory =
+        gko::experimental::reorder::Mc64<value_type, index_type>::build()
+            .with_strategy(
+                gko::experimental::reorder::mc64_strategy::max_diagonal_product)
+            .on(this->ref);
+    auto mc64 = mc64_factory->generate(mtx);
+    // get components
+    auto row_perm = gko::as<perm_type>(mc64->get_operators()[0]);
+    auto col_perm = gko::as<perm_type>(mc64->get_operators()[1]);
+
+    mtx = mtx->scale_permute(row_perm, col_perm);
+
+    GKO_ASSERT_MTX_NEAR(mtx, expected_result, r<value_type>::value);
+}
+
+
+TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingLargeExampleProduct)
+{
+    using index_type = typename TestFixture::index_type;
+    using real_type = typename TestFixture::real_type;
+    using value_type = typename TestFixture::value_type;
+    using matrix_type = typename TestFixture::matrix_type;
+    using perm_type = typename TestFixture::perm_type;
+    // read input data
+    std::ifstream mtx_stream{gko::matrices::location_nontrivial_mc64_example};
+    auto mtx = gko::share(gko::read<matrix_type>(mtx_stream, this->ref));
+    mtx->sort_by_column_index();
+    std::ifstream result_stream{gko::matrices::location_nontrivial_mc64_result};
+    auto expected_result = gko::read<matrix_type>(result_stream, this->ref);
+    // compute mc64
+    auto mc64_factory =
+        gko::experimental::reorder::Mc64<value_type, index_type>::build()
+            .with_strategy(
+                gko::experimental::reorder::mc64_strategy::max_diagonal_product)
+            .on(this->ref);
+    auto mc64 = mc64_factory->generate(mtx);
+    // get components
+    auto row_perm = gko::as<perm_type>(mc64->get_operators()[0]);
+    auto col_perm = gko::as<perm_type>(mc64->get_operators()[1]);
+
+    mtx = mtx->scale_permute(row_perm, col_perm);
+
+    GKO_ASSERT_MTX_NEAR(mtx, expected_result, 1e-6);
+    GKO_ASSERT_MTX_EQ_SPARSITY(mtx, expected_result);
+}
+
+
+}  // namespace
diff --git a/reference/test/reorder/rcm.cpp b/reference/test/reorder/rcm.cpp
index c7314c9e26d..3c2286178b7 100644
--- a/reference/test/reorder/rcm.cpp
+++ b/reference/test/reorder/rcm.cpp
@@ -69,12 +69,12 @@ class Rcm : public ::testing::Test {
           rcm_factory(reorder_type::build().on(exec)),
           // clang-format off
           id3_mtx(gko::initialize<CsrMtx>(
-              {{1.0, 0.0, 0.0}, 
-              {0.0, 1.0, 0.0}, 
+              {{1.0, 0.0, 0.0},
+              {0.0, 1.0, 0.0},
               {0.0, 0.0, 1.0}}, exec)),
           not_id3_mtx(gko::initialize<CsrMtx>(
-              {{1.0, 0.0, 1.0}, 
-              {0.0, 1.0, 0.0}, 
+              {{1.0, 0.0, 1.0},
+              {0.0, 1.0, 0.0},
               {1.0, 0.0, 1.0}}, exec)),
           // clang-format on
           reorder_op(rcm_factory->generate(id3_mtx))
diff --git a/reference/test/reorder/rcm_kernels.cpp b/reference/test/reorder/rcm_kernels.cpp
index 4c79af9e73a..ec43de5e1f6 100644
--- a/reference/test/reorder/rcm_kernels.cpp
+++ b/reference/test/reorder/rcm_kernels.cpp
@@ -59,9 +59,9 @@ class Rcm : public ::testing::Test {
     using i_type = int;
     using CsrMtx = gko::matrix::Csr<v_type, i_type>;
     using reorder_type = gko::reorder::Rcm<v_type, i_type>;
+    using new_reorder_type = gko::experimental::reorder::Rcm<i_type>;
     using perm_type = gko::matrix::Permutation<i_type>;
 
-
     Rcm()
         : exec(gko::ReferenceExecutor::create()),
           // clang-format off
@@ -83,6 +83,17 @@ class Rcm : public ::testing::Test {
                                          {1., 0., 0., 0., 1., 0., 0., 1., 1.},
                                          {1., 0., 0., 1., 1., 0., 0., 1., 1.}},
                                         exec)),
+        p_mtx_1_lower(gko::initialize<CsrMtx>(
+                                        {{1., 0., 0., 0., 0., 0., 0., 0., 0.},
+                                         {0., 1., 0., 0., 0., 0., 0., 0., 0.},
+                                         {0., 1., 1., 0., 0., 0., 0., 0., 0.},
+                                         {1., 1., 0., 1., 0., 0., 0., 0., 0.},
+                                         {1., 0., 0., 1., 1., 0., 0., 0., 0.},
+                                         {1., 1., 1., 1., 1., 1., 0., 0., 0.},
+                                         {0., 1., 1., 1., 1., 1., 1., 0., 0.},
+                                         {1., 0., 0., 0., 1., 0., 0., 1., 0.},
+                                         {1., 0., 0., 1., 1., 0., 0., 1., 1.}},
+                                        exec)),
           // clang-format on
           rcm_factory(reorder_type::build().on(exec)),
           reorder_op_0(rcm_factory->generate(p_mtx_0)),
@@ -95,10 +106,11 @@ class Rcm : public ::testing::Test {
     std::unique_ptr<reorder_type> reorder_op_0;
     std::shared_ptr<CsrMtx> p_mtx_1;
     std::unique_ptr<reorder_type> reorder_op_1;
+    std::shared_ptr<CsrMtx> p_mtx_1_lower;
 
     static bool is_permutation(const perm_type* input_perm)
     {
-        const auto perm_size = input_perm->get_permutation_size();
+        const auto perm_size = input_perm->get_size()[0];
         auto perm_sorted = std::vector<i_type>(perm_size);
         std::copy_n(input_perm->get_const_permutation(), perm_size,
                     perm_sorted.begin());
@@ -140,4 +152,29 @@ TEST_F(Rcm, PermutesPerfectFullBand)
 }
 
 
+TEST_F(Rcm, NewInterfaceWorksOnSymmetric)
+{
+    std::vector<i_type> correct = {7, 8, 0, 4, 3, 5, 6, 1, 2};
+
+    auto permutation =
+        new_reorder_type::build().with_skip_symmetrize(true).on(exec)->generate(
+            p_mtx_1);
+
+    auto p = permutation->get_const_permutation();
+    ASSERT_TRUE(std::equal(p, p + correct.size(), correct.begin()));
+}
+
+
+TEST_F(Rcm, NewInterfaceWorksOnNonsymmetric)
+{
+    std::vector<i_type> correct = {7, 8, 0, 4, 3, 5, 6, 1, 2};
+
+    auto permutation =
+        new_reorder_type::build().on(exec)->generate(p_mtx_1_lower);
+
+    auto p = permutation->get_const_permutation();
+    ASSERT_TRUE(std::equal(p, p + correct.size(), correct.begin()));
+}
+
+
 }  // namespace
diff --git a/reference/test/reorder/scaled_reordered.cpp b/reference/test/reorder/scaled_reordered.cpp
index 8789ded37ca..7200f587d02 100644
--- a/reference/test/reorder/scaled_reordered.cpp
+++ b/reference/test/reorder/scaled_reordered.cpp
@@ -54,6 +54,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/test/utils.hpp"
 
 
+GKO_BEGIN_DISABLE_DEPRECATION_WARNINGS
+
+
 namespace {
 
 
@@ -110,11 +113,9 @@ class ScaledReordered : public ::testing::Test {
           solver_factory(
               Bicgstab::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(100u).on(
-                          exec),
+                      gko::stop::Iteration::build().with_max_iters(100u),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .on(exec)),
           tol{r<value_type>::value}
     {
@@ -570,3 +571,6 @@ TYPED_TEST(ScaledReordered, SolvesMultipleRhs)
 
 
 }  // namespace
+
+
+GKO_END_DISABLE_DEPRECATION_WARNINGS
diff --git a/reference/test/solver/CMakeLists.txt b/reference/test/solver/CMakeLists.txt
index 95fd0e4f932..04d7a9f4619 100644
--- a/reference/test/solver/CMakeLists.txt
+++ b/reference/test/solver/CMakeLists.txt
@@ -1,3 +1,4 @@
+ginkgo_create_test(batch_bicgstab_kernels)
 ginkgo_create_test(bicg_kernels)
 ginkgo_create_test(bicgstab_kernels)
 ginkgo_create_test(cg_kernels)
diff --git a/reference/test/solver/batch_bicgstab_kernels.cpp b/reference/test/solver/batch_bicgstab_kernels.cpp
new file mode 100644
index 00000000000..211318e8a8f
--- /dev/null
+++ b/reference/test/solver/batch_bicgstab_kernels.cpp
@@ -0,0 +1,308 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/solver/batch_bicgstab.hpp>
+
+
+#include <memory>
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/log/batch_logger.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include "core/base/batch_utilities.hpp"
+#include "core/matrix/batch_dense_kernels.hpp"
+#include "core/solver/batch_bicgstab_kernels.hpp"
+#include "core/test/utils.hpp"
+#include "core/test/utils/batch_helpers.hpp"
+
+
+template <typename T>
+class BatchBicgstab : public ::testing::Test {
+protected:
+    using value_type = T;
+    using real_type = gko::remove_complex<value_type>;
+    using solver_type = gko::batch::solver::Bicgstab<value_type>;
+    using Mtx = gko::batch::matrix::Dense<value_type>;
+    using EllMtx = gko::batch::matrix::Ell<value_type>;
+    using MVec = gko::batch::MultiVector<value_type>;
+    using RealMVec = gko::batch::MultiVector<real_type>;
+    using Settings = gko::kernels::batch_bicgstab::settings<real_type>;
+    using LogData = gko::batch::log::detail::log_data<real_type>;
+    using LinSys = gko::test::LinearSystem<Mtx>;
+
+    BatchBicgstab()
+        : exec(gko::ReferenceExecutor::create()),
+          mat(gko::share(gko::test::generate_3pt_stencil_batch_matrix<Mtx>(
+              exec, num_batch_items, num_rows))),
+          linear_system(gko::test::generate_batch_linear_system(mat, num_rhs))
+    {
+        auto executor = this->exec;
+        solve_lambda = [executor](const Settings opts,
+                                  const gko::batch::BatchLinOp* prec,
+                                  const Mtx* mtx, const MVec* b, MVec* x,
+                                  LogData& log_data) {
+            gko::kernels::reference::batch_bicgstab::apply<
+                typename Mtx::value_type>(executor, opts, mtx, prec, b, x,
+                                          log_data);
+        };
+    }
+
+    std::shared_ptr<const gko::ReferenceExecutor> exec;
+    const real_type eps = 1e-3;
+    const gko::size_type num_batch_items = 2;
+    const int num_rows = 15;
+    const int num_rhs = 1;
+    const Settings solver_settings{100, eps,
+                                   gko::batch::stop::tolerance_type::relative};
+    std::shared_ptr<const Mtx> mat;
+    LinSys linear_system;
+    std::function<void(const Settings, const gko::batch::BatchLinOp*,
+                       const Mtx*, const MVec*, MVec*, LogData&)>
+        solve_lambda;
+};
+
+TYPED_TEST_SUITE(BatchBicgstab, gko::test::RealValueTypes);
+
+
+TYPED_TEST(BatchBicgstab, SolvesStencilSystem)
+{
+    auto res = gko::test::solve_linear_system(this->exec, this->solve_lambda,
+                                              this->solver_settings,
+                                              this->linear_system);
+
+    for (size_t i = 0; i < this->num_batch_items; i++) {
+        ASSERT_LE(res.host_res_norm->get_const_values()[i] /
+                      this->linear_system.host_rhs_norm->get_const_values()[i],
+                  this->solver_settings.residual_tol);
+    }
+    GKO_ASSERT_BATCH_MTX_NEAR(res.x, this->linear_system.exact_sol,
+                              this->eps * 10);
+}
+
+
+TYPED_TEST(BatchBicgstab, StencilSystemLoggerLogsResidual)
+{
+    using value_type = typename TestFixture::value_type;
+    using real_type = gko::remove_complex<value_type>;
+
+    auto res = gko::test::solve_linear_system(this->exec, this->solve_lambda,
+                                              this->solver_settings,
+                                              this->linear_system);
+
+    const int ref_iters = 2;
+    auto iter_array = res.log_data->iter_counts.get_const_data();
+    auto res_log_array = res.log_data->res_norms.get_const_data();
+    for (size_t i = 0; i < this->num_batch_items; i++) {
+        ASSERT_LE(
+            res_log_array[i] / this->linear_system.host_rhs_norm->at(i, 0, 0),
+            this->solver_settings.residual_tol);
+        ASSERT_NEAR(res_log_array[i], res.host_res_norm->get_const_values()[i],
+                    10 * this->eps);
+    }
+}
+
+
+TYPED_TEST(BatchBicgstab, StencilSystemLoggerLogsIterations)
+{
+    using value_type = typename TestFixture::value_type;
+    using Settings = typename TestFixture::Settings;
+    using real_type = gko::remove_complex<value_type>;
+    const int ref_iters = 5;
+    const Settings solver_settings{ref_iters, 0,
+                                   gko::batch::stop::tolerance_type::relative};
+
+    auto res = gko::test::solve_linear_system(
+        this->exec, this->solve_lambda, solver_settings, this->linear_system);
+
+    auto iter_array = res.log_data->iter_counts.get_const_data();
+    for (size_t i = 0; i < this->num_batch_items; i++) {
+        ASSERT_EQ(iter_array[i], ref_iters);
+    }
+}
+
+
+TYPED_TEST(BatchBicgstab, CanSolveDenseSystem)
+{
+    using value_type = typename TestFixture::value_type;
+    using real_type = gko::remove_complex<value_type>;
+    using Solver = typename TestFixture::solver_type;
+    using Mtx = typename TestFixture::Mtx;
+    const real_type tol = 1e-5;
+    const int max_iters = 1000;
+    auto solver_factory =
+        Solver::build()
+            .with_max_iterations(max_iters)
+            .with_tolerance(tol)
+            .with_tolerance_type(gko::batch::stop::tolerance_type::relative)
+            .on(this->exec);
+    const int num_rows = 13;
+    const size_t num_batch_items = 5;
+    const int num_rhs = 1;
+    auto stencil_mat =
+        gko::share(gko::test::generate_3pt_stencil_batch_matrix<Mtx>(
+            this->exec, num_batch_items, num_rows));
+    auto linear_system =
+        gko::test::generate_batch_linear_system(stencil_mat, num_rhs);
+    auto solver = gko::share(solver_factory->generate(linear_system.matrix));
+
+    auto res =
+        gko::test::solve_linear_system(this->exec, linear_system, solver);
+
+    GKO_ASSERT_BATCH_MTX_NEAR(res.x, linear_system.exact_sol, tol * 10);
+    for (size_t i = 0; i < num_batch_items; i++) {
+        ASSERT_LE(res.host_res_norm->get_const_values()[i] /
+                      linear_system.host_rhs_norm->get_const_values()[i],
+                  tol);
+    }
+}
+
+
+TYPED_TEST(BatchBicgstab, ApplyLogsResAndIters)
+{
+    using value_type = typename TestFixture::value_type;
+    using real_type = gko::remove_complex<value_type>;
+    using Solver = typename TestFixture::solver_type;
+    using Mtx = typename TestFixture::Mtx;
+    using Logger = gko::batch::log::BatchConvergence<value_type>;
+    const real_type tol = 1e-5;
+    const int max_iters = 1000;
+    auto solver_factory =
+        Solver::build()
+            .with_max_iterations(max_iters)
+            .with_tolerance(tol)
+            .with_tolerance_type(gko::batch::stop::tolerance_type::relative)
+            .on(this->exec);
+    const int num_rows = 13;
+    const size_t num_batch_items = 5;
+    const int num_rhs = 1;
+    std::shared_ptr<Logger> logger = Logger::create();
+    auto stencil_mat =
+        gko::share(gko::test::generate_3pt_stencil_batch_matrix<Mtx>(
+            this->exec, num_batch_items, num_rows));
+    auto linear_system =
+        gko::test::generate_batch_linear_system(stencil_mat, num_rhs);
+    auto solver = gko::share(solver_factory->generate(linear_system.matrix));
+
+    solver->add_logger(logger);
+    auto res =
+        gko::test::solve_linear_system(this->exec, linear_system, solver);
+    solver->remove_logger(logger);
+
+    auto iter_counts = logger->get_num_iterations();
+    auto res_norm = logger->get_residual_norm();
+    GKO_ASSERT_BATCH_MTX_NEAR(res.x, linear_system.exact_sol, tol * 50);
+    for (size_t i = 0; i < num_batch_items; i++) {
+        auto rel_res_norm = res.host_res_norm->get_const_values()[i] /
+                            linear_system.host_rhs_norm->get_const_values()[i];
+        ASSERT_LE(iter_counts.get_const_data()[i], max_iters);
+        EXPECT_LE(res_norm.get_const_data()[i], tol * 50);
+        ASSERT_LE(rel_res_norm, tol * 50);
+    }
+}
+
+
+TYPED_TEST(BatchBicgstab, CanSolveEllSystem)
+{
+    using value_type = typename TestFixture::value_type;
+    using real_type = gko::remove_complex<value_type>;
+    using Solver = typename TestFixture::solver_type;
+    using Mtx = typename TestFixture::EllMtx;
+    const real_type tol = 1e-5;
+    const int max_iters = 1000;
+    auto solver_factory =
+        Solver::build()
+            .with_max_iterations(max_iters)
+            .with_tolerance(tol)
+            .with_tolerance_type(gko::batch::stop::tolerance_type::relative)
+            .on(this->exec);
+    const int num_rows = 13;
+    const size_t num_batch_items = 2;
+    const int num_rhs = 1;
+    auto stencil_mat =
+        gko::share(gko::test::generate_3pt_stencil_batch_matrix<Mtx>(
+            this->exec, num_batch_items, num_rows, 3));
+    auto linear_system =
+        gko::test::generate_batch_linear_system(stencil_mat, num_rhs);
+    auto solver = gko::share(solver_factory->generate(linear_system.matrix));
+
+    auto res =
+        gko::test::solve_linear_system(this->exec, linear_system, solver);
+
+    GKO_ASSERT_BATCH_MTX_NEAR(res.x, linear_system.exact_sol, tol * 10);
+    for (size_t i = 0; i < num_batch_items; i++) {
+        ASSERT_LE(res.host_res_norm->get_const_values()[i] /
+                      linear_system.host_rhs_norm->get_const_values()[i],
+                  tol * 10);
+    }
+}
+
+
+TYPED_TEST(BatchBicgstab, CanSolveDenseHpdSystem)
+{
+    using value_type = typename TestFixture::value_type;
+    using real_type = gko::remove_complex<value_type>;
+    using Solver = typename TestFixture::solver_type;
+    using Mtx = typename TestFixture::Mtx;
+    const real_type tol = 1e-5;
+    const int max_iters = 1000;
+    auto solver_factory =
+        Solver::build()
+            .with_max_iterations(max_iters)
+            .with_tolerance(tol)
+            .with_tolerance_type(gko::batch::stop::tolerance_type::absolute)
+            .on(this->exec);
+    const int num_rows = 65;
+    const gko::size_type num_batch_items = 5;
+    const int num_rhs = 1;
+    auto diag_dom_mat =
+        gko::share(gko::test::generate_diag_dominant_batch_matrix<Mtx>(
+            this->exec, num_batch_items, num_rows, true));
+    auto linear_system =
+        gko::test::generate_batch_linear_system(diag_dom_mat, num_rhs);
+    auto solver = gko::share(solver_factory->generate(linear_system.matrix));
+
+    auto res =
+        gko::test::solve_linear_system(this->exec, linear_system, solver);
+
+    GKO_ASSERT_BATCH_MTX_NEAR(res.x, linear_system.exact_sol, tol * 50);
+    for (size_t i = 0; i < num_batch_items; i++) {
+        ASSERT_LE(res.host_res_norm->get_const_values()[i], tol * 50);
+    }
+}
diff --git a/reference/test/solver/bicg_kernels.cpp b/reference/test/solver/bicg_kernels.cpp
index e317677b2de..aa27eb4afa3 100644
--- a/reference/test/solver/bicg_kernels.cpp
+++ b/reference/test/solver/bicg_kernels.cpp
@@ -64,17 +64,14 @@ class Bicg : public ::testing::Test {
               {{2, -1.0, 0.0}, {-1.0, 2, -1.0}, {0.0, -1.0, 2}}, exec)),
           stopped{},
           non_stopped{},
-          bicg_factory(
-              Solver::build()
-                  .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(4u).on(exec),
-                      gko::stop::Time::build()
-                          .with_time_limit(std::chrono::seconds(6))
-                          .on(exec),
-                      gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
-                  .on(exec)),
+          bicg_factory(Solver::build()
+                           .with_criteria(
+                               gko::stop::Iteration::build().with_max_iters(4u),
+                               gko::stop::Time::build().with_time_limit(
+                                   std::chrono::seconds(6)),
+                               gko::stop::ResidualNorm<value_type>::build()
+                                   .with_reduction_factor(r<value_type>::value))
+                           .on(exec)),
           mtx_big(gko::initialize<Mtx>(
               {{8828.0, 2673.0, 4150.0, -3139.5, 3829.5, 5856.0},
                {2673.0, 10765.5, 1805.0, 73.0, 1966.0, 3919.5},
@@ -86,20 +83,16 @@ class Bicg : public ::testing::Test {
           bicg_factory_big(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(100u).on(
-                          exec),
+                      gko::stop::Iteration::build().with_max_iters(100u),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .on(exec)),
           bicg_factory_big2(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(100u).on(
-                          exec),
+                      gko::stop::Iteration::build().with_max_iters(100u),
                       gko::stop::ImplicitResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .on(exec)),
           mtx_non_symmetric(gko::initialize<Mtx>(
               {{1.0, 2.0, 3.0}, {3.0, 2.0, -1.0}, {0.0, -1.0, 2}}, exec))
diff --git a/reference/test/solver/bicgstab_kernels.cpp b/reference/test/solver/bicgstab_kernels.cpp
index ec44b6b6f17..70302e95796 100644
--- a/reference/test/solver/bicgstab_kernels.cpp
+++ b/reference/test/solver/bicgstab_kernels.cpp
@@ -69,36 +69,29 @@ class Bicgstab : public ::testing::Test {
           bicgstab_factory(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(8u).on(exec),
-                      gko::stop::Time::build()
-                          .with_time_limit(std::chrono::seconds(6))
-                          .on(exec),
+                      gko::stop::Iteration::build().with_max_iters(8u),
+                      gko::stop::Time::build().with_time_limit(
+                          std::chrono::seconds(6)),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .on(exec)),
           bicgstab_factory2(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(8u).on(exec),
-                      gko::stop::Time::build()
-                          .with_time_limit(std::chrono::seconds(6))
-                          .on(exec),
+                      gko::stop::Iteration::build().with_max_iters(8u),
+                      gko::stop::Time::build().with_time_limit(
+                          std::chrono::seconds(6)),
                       gko::stop::ImplicitResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .on(exec)),
           bicgstab_factory_precision(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(50u).on(
-                          exec),
-                      gko::stop::Time::build()
-                          .with_time_limit(std::chrono::seconds(6))
-                          .on(exec),
+                      gko::stop::Iteration::build().with_max_iters(50u),
+                      gko::stop::Time::build().with_time_limit(
+                          std::chrono::seconds(6)),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .on(exec))
     {
         auto small_size = gko::dim<2>{2, 2};
diff --git a/reference/test/solver/cb_gmres_kernels.cpp b/reference/test/solver/cb_gmres_kernels.cpp
index 1127d7caff7..60d2a32b9ee 100644
--- a/reference/test/solver/cb_gmres_kernels.cpp
+++ b/reference/test/solver/cb_gmres_kernels.cpp
@@ -77,15 +77,12 @@ class CbGmres : public ::testing::Test {
               gmres_type::build()
                   .with_storage_precision(storage_prec)
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(100u).on(
-                          exec),
-                      gko::stop::Time::build()
-                          .with_time_limit(std::chrono::seconds(6))
-                          .on(exec),
+                      gko::stop::Iteration::build().with_max_iters(100u),
+                      gko::stop::Time::build().with_time_limit(
+                          std::chrono::seconds(6)),
                       gko::stop::ResidualNorm<value_type>::build()
                           .with_baseline(gko::stop::mode::initial_resnorm)
-                          .with_reduction_factor(this->reduction_factor())
-                          .on(exec))
+                          .with_reduction_factor(this->reduction_factor()))
                   .on(exec)),
           mtx_big(gko::initialize<Mtx>(
               {{2295.7, -764.8, 1166.5, 428.9, 291.7, -774.5},
@@ -99,12 +96,10 @@ class CbGmres : public ::testing::Test {
               gmres_type::build()
                   .with_storage_precision(storage_prec)
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(100u).on(
-                          exec),
+                      gko::stop::Iteration::build().with_max_iters(100u),
                       gko::stop::ResidualNorm<value_type>::build()
                           .with_baseline(gko::stop::mode::initial_resnorm)
-                          .with_reduction_factor(this->reduction_factor())
-                          .on(exec))
+                          .with_reduction_factor(this->reduction_factor()))
                   .on(exec)),
           mtx_medium(
               gko::initialize<Mtx>({{-86.40, 153.30, -108.90, 8.60, -61.60},
@@ -271,16 +266,12 @@ TYPED_TEST(CbGmres, SolvesStencilSystem2)
     auto factory =
         gmres_type::build()
             .with_storage_precision(this->storage_prec)
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(100u).on(
-                    this->exec),
-                gko::stop::Time::build()
-                    .with_time_limit(std::chrono::seconds(6))
-                    .on(this->exec),
-                gko::stop::ResidualNorm<T>::build()
-                    .with_baseline(gko::stop::mode::initial_resnorm)
-                    .with_reduction_factor(this->reduction_factor())
-                    .on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(100u),
+                           gko::stop::Time::build().with_time_limit(
+                               std::chrono::seconds(6)),
+                           gko::stop::ResidualNorm<T>::build()
+                               .with_baseline(gko::stop::mode::initial_resnorm)
+                               .with_reduction_factor(this->reduction_factor()))
             .on(this->exec);
     auto solver = factory->generate(this->mtx2);
     auto b = gko::initialize<Mtx>({33.0, 20.0, 20.0}, this->exec);
@@ -526,13 +517,10 @@ TYPED_TEST(CbGmres, SolvesBigDenseSystem1WithRestart)
         gmres_type::build()
             .with_krylov_dim(4u)
             .with_storage_precision(this->storage_prec)
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(200u).on(
-                    this->exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_baseline(gko::stop::mode::initial_resnorm)
-                    .with_reduction_factor(this->reduction_factor())
-                    .on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(200u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_baseline(gko::stop::mode::initial_resnorm)
+                               .with_reduction_factor(this->reduction_factor()))
             .on(this->exec);
     auto solver = cb_gmres_factory_restart->generate(this->mtx_medium);
     auto b = gko::initialize<Mtx>(
@@ -554,17 +542,13 @@ TYPED_TEST(CbGmres, SolvesWithPreconditioner)
     auto cb_gmres_factory_preconditioner =
         gmres_type::build()
             .with_storage_precision(this->storage_prec)
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(100u).on(
-                    this->exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_baseline(gko::stop::mode::initial_resnorm)
-                    .with_reduction_factor(this->reduction_factor())
-                    .on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(100u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_baseline(gko::stop::mode::initial_resnorm)
+                               .with_reduction_factor(this->reduction_factor()))
             .with_preconditioner(
                 gko::preconditioner::Jacobi<value_type>::build()
-                    .with_max_block_size(3u)
-                    .on(this->exec))
+                    .with_max_block_size(3u))
             .on(this->exec);
     auto solver = cb_gmres_factory_preconditioner->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
diff --git a/reference/test/solver/cg_kernels.cpp b/reference/test/solver/cg_kernels.cpp
index 76b8cf55946..c089442488f 100644
--- a/reference/test/solver/cg_kernels.cpp
+++ b/reference/test/solver/cg_kernels.cpp
@@ -64,18 +64,14 @@ class Cg : public ::testing::Test {
               {{2, -1.0, 0.0}, {-1.0, 2, -1.0}, {0.0, -1.0, 2}}, exec)),
           stopped{},
           non_stopped{},
-          cg_factory(
-              Solver::build()
-                  .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(400u).on(
-                          exec),
-                      gko::stop::Time::build()
-                          .with_time_limit(std::chrono::seconds(6))
-                          .on(exec),
-                      gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
-                  .on(exec)),
+          cg_factory(Solver::build()
+                         .with_criteria(
+                             gko::stop::Iteration::build().with_max_iters(400u),
+                             gko::stop::Time::build().with_time_limit(
+                                 std::chrono::seconds(6)),
+                             gko::stop::ResidualNorm<value_type>::build()
+                                 .with_reduction_factor(r<value_type>::value))
+                         .on(exec)),
           mtx_big(gko::initialize<Mtx>(
               {{8828.0, 2673.0, 4150.0, -3139.5, 3829.5, 5856.0},
                {2673.0, 10765.5, 1805.0, 73.0, 1966.0, 3919.5},
@@ -87,20 +83,16 @@ class Cg : public ::testing::Test {
           cg_factory_big(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(100u).on(
-                          exec),
+                      gko::stop::Iteration::build().with_max_iters(100u),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .on(exec)),
           cg_factory_big2(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(100u).on(
-                          exec),
+                      gko::stop::Iteration::build().with_max_iters(100u),
                       gko::stop::ImplicitResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .on(exec))
     {
         auto small_size = gko::dim<2>{2, 2};
diff --git a/reference/test/solver/cgs_kernels.cpp b/reference/test/solver/cgs_kernels.cpp
index 9c3ce2071a7..91c7c1e821b 100644
--- a/reference/test/solver/cgs_kernels.cpp
+++ b/reference/test/solver/cgs_kernels.cpp
@@ -65,15 +65,12 @@ class Cgs : public ::testing::Test {
               {{1.0, -3.0, 0.0}, {-4.0, 1.0, -3.0}, {2.0, -1.0, 2.0}}, exec)),
           stopped{},
           non_stopped{},
-          cgs_factory(
-              Solver::build()
-                  .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(40u).on(
-                          exec),
-                      gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
-                  .on(exec)),
+          cgs_factory(Solver::build()
+                          .with_criteria(
+                              gko::stop::Iteration::build().with_max_iters(40u),
+                              gko::stop::ResidualNorm<value_type>::build()
+                                  .with_reduction_factor(r<value_type>::value))
+                          .on(exec)),
           mtx_big(
               gko::initialize<Mtx>({{-99.0, 87.0, -67.0, -62.0, -68.0, -19.0},
                                     {-30.0, -17.0, -1.0, 9.0, 23.0, 77.0},
@@ -85,20 +82,16 @@ class Cgs : public ::testing::Test {
           cgs_factory_big(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(100u).on(
-                          exec),
+                      gko::stop::Iteration::build().with_max_iters(100u),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .on(exec)),
           cgs_factory_big2(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(100u).on(
-                          exec),
+                      gko::stop::Iteration::build().with_max_iters(100u),
                       gko::stop::ImplicitResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .on(exec))
     {
         auto small_size = gko::dim<2>{2, 2};
diff --git a/reference/test/solver/direct.cpp b/reference/test/solver/direct.cpp
index 617015bac1f..7e9ddb6e4c4 100644
--- a/reference/test/solver/direct.cpp
+++ b/reference/test/solver/direct.cpp
@@ -77,8 +77,9 @@ class Direct : public ::testing::Test {
                 .with_factorization(
                     gko::experimental::factorization::Lu<value_type,
                                                          index_type>::build()
-                        .with_symmetric_sparsity(true)
-                        .on(exec))
+                        .with_symbolic_algorithm(
+                            gko::experimental::factorization::symbolic_type::
+                                symmetric))
                 .on(exec);
         solver = factory->generate(mtx);
         std::normal_distribution<gko::remove_complex<value_type>> dist(0, 1);
diff --git a/reference/test/solver/fcg_kernels.cpp b/reference/test/solver/fcg_kernels.cpp
index e8163752689..3dd4149405e 100644
--- a/reference/test/solver/fcg_kernels.cpp
+++ b/reference/test/solver/fcg_kernels.cpp
@@ -65,17 +65,14 @@ class Fcg : public ::testing::Test {
               {{2, -1.0, 0.0}, {-1.0, 2, -1.0}, {0.0, -1.0, 2}}, exec)),
           stopped{},
           non_stopped{},
-          fcg_factory(
-              Solver::build()
-                  .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(4u).on(exec),
-                      gko::stop::Time::build()
-                          .with_time_limit(std::chrono::seconds(6))
-                          .on(exec),
-                      gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
-                  .on(exec)),
+          fcg_factory(Solver::build()
+                          .with_criteria(
+                              gko::stop::Iteration::build().with_max_iters(4u),
+                              gko::stop::Time::build().with_time_limit(
+                                  std::chrono::seconds(6)),
+                              gko::stop::ResidualNorm<value_type>::build()
+                                  .with_reduction_factor(r<value_type>::value))
+                          .on(exec)),
           mtx_big(gko::initialize<Mtx>(
               {{8828.0, 2673.0, 4150.0, -3139.5, 3829.5, 5856.0},
                {2673.0, 10765.5, 1805.0, 73.0, 1966.0, 3919.5},
@@ -87,20 +84,16 @@ class Fcg : public ::testing::Test {
           fcg_factory_big(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(100u).on(
-                          exec),
+                      gko::stop::Iteration::build().with_max_iters(100u),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .on(exec)),
           fcg_factory_big2(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(100u).on(
-                          exec),
+                      gko::stop::Iteration::build().with_max_iters(100u),
                       gko::stop::ImplicitResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .on(exec))
     {
         auto small_size = gko::dim<2>{2, 2};
diff --git a/reference/test/solver/gcr_kernels.cpp b/reference/test/solver/gcr_kernels.cpp
index 888cbc3b4fe..8943a131d2b 100644
--- a/reference/test/solver/gcr_kernels.cpp
+++ b/reference/test/solver/gcr_kernels.cpp
@@ -72,18 +72,15 @@ class Gcr : public ::testing::Test {
           non_stopped{},
           mtx(gko::initialize<Mtx>(
               {{1.0, 2.0, 3.0}, {3.0, 2.0, -1.0}, {0.0, -1.0, 2}}, exec)),
-          gcr_factory(
-              Solver::build()
-                  .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(4u).on(exec),
-                      gko::stop::Time::build()
-                          .with_time_limit(std::chrono::seconds(6))
-                          .on(exec),
-                      gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
-                  .with_krylov_dim(3u)
-                  .on(exec)),
+          gcr_factory(Solver::build()
+                          .with_criteria(
+                              gko::stop::Iteration::build().with_max_iters(4u),
+                              gko::stop::Time::build().with_time_limit(
+                                  std::chrono::seconds(6)),
+                              gko::stop::ResidualNorm<value_type>::build()
+                                  .with_reduction_factor(r<value_type>::value))
+                          .with_krylov_dim(3u)
+                          .on(exec)),
           mtx_big(gko::initialize<Mtx>(
               {{2295.7, -764.8, 1166.5, 428.9, 291.7, -774.5},
                {2752.6, -1127.7, 1212.8, -299.1, 987.7, 786.8},
@@ -95,20 +92,16 @@ class Gcr : public ::testing::Test {
           gcr_factory_big(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(100u).on(
-                          exec),
+                      gko::stop::Iteration::build().with_max_iters(100u),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .on(exec)),
           gcr_factory_big2(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(100u).on(
-                          exec),
+                      gko::stop::Iteration::build().with_max_iters(100u),
                       gko::stop::ImplicitResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .on(exec)),
           mtx_medium(
               gko::initialize<Mtx>({{-86.40, 153.30, -108.90, 8.60, -61.60},
@@ -581,12 +574,9 @@ TYPED_TEST(Gcr, SolvesBigDenseSystem1WithRestart)
     auto gcr_factory_restart =
         Solver::build()
             .with_krylov_dim(4u)
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(200u).on(
-                    this->exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(r<value_type>::value)
-                    .on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(200u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(r<value_type>::value))
             .on(this->exec);
     auto solver = gcr_factory_restart->generate(this->mtx_medium);
     auto b = gko::initialize<Mtx>(
@@ -607,16 +597,12 @@ TYPED_TEST(Gcr, SolvesWithPreconditioner)
     using value_type = typename TestFixture::value_type;
     auto gcr_factory_preconditioner =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(100u).on(
-                    this->exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(r<value_type>::value)
-                    .on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(100u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(r<value_type>::value))
             .with_preconditioner(
                 gko::preconditioner::Jacobi<value_type>::build()
-                    .with_max_block_size(3u)
-                    .on(this->exec))
+                    .with_max_block_size(3u))
             .on(this->exec);
     auto solver = gcr_factory_preconditioner->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp
index 585fec833bc..4c651e7917b 100644
--- a/reference/test/solver/gmres_kernels.cpp
+++ b/reference/test/solver/gmres_kernels.cpp
@@ -76,13 +76,11 @@ class Gmres : public ::testing::Test {
           gmres_factory(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(4u).on(exec),
-                      gko::stop::Time::build()
-                          .with_time_limit(std::chrono::seconds(6))
-                          .on(exec),
+                      gko::stop::Iteration::build().with_max_iters(4u),
+                      gko::stop::Time::build().with_time_limit(
+                          std::chrono::seconds(6)),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .with_krylov_dim(3u)
                   .on(exec)),
           mtx_big(gko::initialize<Mtx>(
@@ -96,20 +94,16 @@ class Gmres : public ::testing::Test {
           gmres_factory_big(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(100u).on(
-                          exec),
+                      gko::stop::Iteration::build().with_max_iters(100u),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .on(exec)),
           gmres_factory_big2(
               Solver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(100u).on(
-                          exec),
+                      gko::stop::Iteration::build().with_max_iters(100u),
                       gko::stop::ImplicitResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .on(exec)),
           mtx_medium(
               gko::initialize<Mtx>({{-86.40, 153.30, -108.90, 8.60, -61.60},
@@ -724,12 +718,9 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem1WithRestart)
     auto gmres_factory_restart =
         Solver::build()
             .with_krylov_dim(4u)
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(200u).on(
-                    this->exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(r<value_type>::value)
-                    .on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(200u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(r<value_type>::value))
             .on(this->exec);
     auto solver = gmres_factory_restart->generate(this->mtx_medium);
     auto b = gko::initialize<Mtx>(
@@ -750,16 +741,12 @@ TYPED_TEST(Gmres, SolvesWithPreconditioner)
     using value_type = typename TestFixture::value_type;
     auto gmres_factory_preconditioner =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(100u).on(
-                    this->exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(r<value_type>::value)
-                    .on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(100u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(r<value_type>::value))
             .with_preconditioner(
                 gko::preconditioner::Jacobi<value_type>::build()
-                    .with_max_block_size(3u)
-                    .on(this->exec))
+                    .with_max_block_size(3u))
             .on(this->exec);
     auto solver = gmres_factory_preconditioner->generate(this->mtx_big);
     auto b = gko::initialize<Mtx>(
diff --git a/reference/test/solver/idr_kernels.cpp b/reference/test/solver/idr_kernels.cpp
index 3e74e0c319b..da1b73a035c 100644
--- a/reference/test/solver/idr_kernels.cpp
+++ b/reference/test/solver/idr_kernels.cpp
@@ -62,30 +62,24 @@ class Idr : public ::testing::Test {
         : exec(gko::ReferenceExecutor::create()),
           mtx(gko::initialize<Mtx>(
               {{1.0, -3.0, 0.0}, {-4.0, 1.0, -3.0}, {2.0, -1.0, 2.0}}, exec)),
-          idr_factory(
-              Solver::build()
-                  .with_deterministic(true)
-                  .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(8u).on(exec),
-                      gko::stop::Time::build()
-                          .with_time_limit(std::chrono::seconds(6))
-                          .on(exec),
-                      gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
-                  .on(exec)),
+          idr_factory(Solver::build()
+                          .with_deterministic(true)
+                          .with_criteria(
+                              gko::stop::Iteration::build().with_max_iters(8u),
+                              gko::stop::Time::build().with_time_limit(
+                                  std::chrono::seconds(6)),
+                              gko::stop::ResidualNorm<value_type>::build()
+                                  .with_reduction_factor(r<value_type>::value))
+                          .on(exec)),
           idr_factory_precision(
               Solver::build()
                   .with_deterministic(true)
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(50u).on(
-                          exec),
-                      gko::stop::Time::build()
-                          .with_time_limit(std::chrono::seconds(6))
-                          .on(exec),
+                      gko::stop::Iteration::build().with_max_iters(50u),
+                      gko::stop::Time::build().with_time_limit(
+                          std::chrono::seconds(6)),
                       gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .on(exec))
     {}
 
diff --git a/reference/test/solver/ir_kernels.cpp b/reference/test/solver/ir_kernels.cpp
index 8b4255b72ef..fc6154f3366 100644
--- a/reference/test/solver/ir_kernels.cpp
+++ b/reference/test/solver/ir_kernels.cpp
@@ -65,15 +65,12 @@ class Ir : public ::testing::Test {
           // Eigenvalues of mtx are 0.9, 1.0 and 1.1
           // Richardson iteration, converges since
           // | relaxation_factor * lambda - 1 | < 1
-          ir_factory(
-              Solver::build()
-                  .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(30u).on(
-                          exec),
-                      gko::stop::ResidualNorm<value_type>::build()
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
-                  .on(exec))
+          ir_factory(Solver::build()
+                         .with_criteria(
+                             gko::stop::Iteration::build().with_max_iters(30u),
+                             gko::stop::ResidualNorm<value_type>::build()
+                                 .with_reduction_factor(r<value_type>::value))
+                         .on(exec))
     {}
 
     std::shared_ptr<const gko::ReferenceExecutor> exec;
@@ -187,11 +184,9 @@ TYPED_TEST(Ir, SolvesTriangularSystemWithIterativeInnerSolver)
 
     auto solver_factory =
         gko::solver::Ir<value_type>::build()
-            .with_criteria(gko::stop::Iteration::build().with_max_iters(30u).on(
-                               this->exec),
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(30u),
                            gko::stop::ResidualNorm<value_type>::build()
-                               .with_reduction_factor(r<value_type>::value)
-                               .on(this->exec))
+                               .with_reduction_factor(r<value_type>::value))
             .with_solver(inner_solver_factory)
             .on(this->exec);
     auto b = gko::initialize<Mtx>({3.9, 9.0, 2.2}, this->exec);
@@ -354,16 +349,15 @@ TYPED_TEST(Ir, RichardsonSolvesTriangularSystem)
 {
     using Mtx = typename TestFixture::Mtx;
     using value_type = typename TestFixture::value_type;
-    auto solver = gko::solver::Ir<value_type>::build()
-                      .with_criteria(
-                          gko::stop::Iteration::build().with_max_iters(100u).on(
-                              this->exec),
-                          gko::stop::ResidualNorm<value_type>::build()
-                              .with_reduction_factor(r<value_type>::value)
-                              .on(this->exec))
-                      .with_relaxation_factor(value_type{0.9})
-                      .on(this->exec)
-                      ->generate(this->mtx);
+    auto solver =
+        gko::solver::Ir<value_type>::build()
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(100u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(r<value_type>::value)
+                               .on(this->exec))
+            .with_relaxation_factor(value_type{0.9})
+            .on(this->exec)
+            ->generate(this->mtx);
     auto b = gko::initialize<Mtx>({3.9, 9.0, 2.2}, this->exec);
     auto x = gko::initialize<Mtx>({0.0, 0.0, 0.0}, this->exec);
 
@@ -386,12 +380,10 @@ TYPED_TEST(Ir, RichardsonSolvesTriangularSystemWithIterativeInnerSolver)
             .on(this->exec));
     auto solver_factory =
         gko::solver::Ir<value_type>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(100u).on(
-                    this->exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(r<value_type>::value)
-                    .on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(100u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(r<value_type>::value)
+                               .on(this->exec))
             .with_relaxation_factor(value_type{0.9})
             .with_solver(inner_solver_factory)
             .on(this->exec);
@@ -410,8 +402,7 @@ TYPED_TEST(Ir, RichardsonTransposedSolvesTriangularSystem)
     using value_type = typename TestFixture::value_type;
     auto solver =
         gko::solver::Ir<value_type>::build()
-            .with_criteria(gko::stop::Iteration::build().with_max_iters(30u).on(
-                               this->exec),
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(30u),
                            gko::stop::ResidualNorm<value_type>::build()
                                .with_reduction_factor(r<value_type>::value)
                                .on(this->exec))
@@ -433,8 +424,7 @@ TYPED_TEST(Ir, RichardsonConjTransposedSolvesTriangularSystem)
     using value_type = typename TestFixture::value_type;
     auto solver =
         gko::solver::Ir<value_type>::build()
-            .with_criteria(gko::stop::Iteration::build().with_max_iters(30u).on(
-                               this->exec),
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(30u),
                            gko::stop::ResidualNorm<value_type>::build()
                                .with_reduction_factor(r<value_type>::value)
                                .on(this->exec))
@@ -457,8 +447,7 @@ TYPED_TEST(Ir, ApplyWithGivenInitialGuessModeIsEquivalentToRef)
     using initial_guess_mode = gko::solver::initial_guess_mode;
     auto ref_solver =
         gko::solver::Ir<value_type>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(1u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
             .on(this->exec)
             ->generate(this->mtx);
     auto b = gko::initialize<Mtx>({3.9, 9.0, 2.2}, this->exec);
@@ -466,9 +455,7 @@ TYPED_TEST(Ir, ApplyWithGivenInitialGuessModeIsEquivalentToRef)
                        initial_guess_mode::zero}) {
         auto solver =
             gko::solver::Ir<value_type>::build()
-                .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(1u).on(
-                        this->exec))
+                .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
                 .with_default_initial_guess(guess)
                 .on(this->exec)
                 ->generate(this->mtx);
diff --git a/reference/test/solver/multigrid_kernels.cpp b/reference/test/solver/multigrid_kernels.cpp
index 3efb9d41c5e..c35db0b1427 100644
--- a/reference/test/solver/multigrid_kernels.cpp
+++ b/reference/test/solver/multigrid_kernels.cpp
@@ -289,30 +289,26 @@ class Multigrid : public ::testing::Test {
                                  .on(exec)),
           smoother_factory(gko::give(
               Smoother::build()
-                  .with_solver(
-                      InnerSolver::build().with_max_block_size(1u).on(exec))
+                  .with_solver(InnerSolver::build().with_max_block_size(1u))
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(1u).on(exec))
+                      gko::stop::Iteration::build().with_max_iters(1u))
                   .on(exec))),
           coarsest_factory(
               CoarsestSolver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(4u).on(exec),
-                      gko::stop::Time::build()
-                          .with_time_limit(std::chrono::seconds(6))
-                          .on(exec),
+                      gko::stop::Iteration::build().with_max_iters(4u),
+                      gko::stop::Time::build().with_time_limit(
+                          std::chrono::seconds(6)),
                       gko::stop::ResidualNorm<value_type>::build()
                           .with_baseline(gko::stop::mode::initial_resnorm)
-                          .with_reduction_factor(r<value_type>::value)
-                          .on(exec))
+                          .with_reduction_factor(r<value_type>::value))
                   .on(exec)),
           coarsestnext_factory(
               CoarsestNextSolver::build()
                   .with_criteria(
-                      gko::stop::Iteration::build().with_max_iters(4u).on(exec),
-                      gko::stop::Time::build()
-                          .with_time_limit(std::chrono::seconds(6))
-                          .on(exec))
+                      gko::stop::Iteration::build().with_max_iters(4u),
+                      gko::stop::Time::build().with_time_limit(
+                          std::chrono::seconds(6)))
                   .on(exec)),
           rp_factory(DummyRPFactory::build().on(exec)),
           lo_factory(DummyFactory::build().on(exec)),
@@ -357,14 +353,12 @@ class Multigrid : public ::testing::Test {
                 .with_mid_case(gko::solver::multigrid::mid_smooth_type::both)
                 .with_mg_level(coarse_factory)
                 .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(4u).on(exec),
-                    gko::stop::Time::build()
-                        .with_time_limit(std::chrono::seconds(6))
-                        .on(exec),
+                    gko::stop::Iteration::build().with_max_iters(4u),
+                    gko::stop::Time::build().with_time_limit(
+                        std::chrono::seconds(6)),
                     gko::stop::ResidualNorm<value_type>::build()
                         .with_baseline(gko::stop::mode::initial_resnorm)
-                        .with_reduction_factor(r<value_type>::value)
-                        .on(exec))
+                        .with_reduction_factor(r<value_type>::value))
                 .with_cycle(cycle)
                 .with_min_coarse_rows(1u)
                 .on(exec));
@@ -382,14 +376,12 @@ class Multigrid : public ::testing::Test {
                 .with_mid_case(gko::solver::multigrid::mid_smooth_type::both)
                 .with_mg_level(coarse_factory, coarsenext_factory)
                 .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(200u).on(exec),
-                    gko::stop::Time::build()
-                        .with_time_limit(std::chrono::seconds(100))
-                        .on(exec),
+                    gko::stop::Iteration::build().with_max_iters(200u),
+                    gko::stop::Time::build().with_time_limit(
+                        std::chrono::seconds(100)),
                     gko::stop::ResidualNorm<value_type>::build()
                         .with_baseline(gko::stop::mode::initial_resnorm)
-                        .with_reduction_factor(r<value_type>::value)
-                        .on(exec))
+                        .with_reduction_factor(r<value_type>::value))
                 .with_cycle(cycle)
                 .with_min_coarse_rows(1u)
                 .on(exec));
@@ -413,9 +405,7 @@ class Multigrid : public ::testing::Test {
                     gko::matrix::IdentityFactory<value_type>::create(exec))
                 .with_post_uses_pre(false)
                 .with_mid_case(mid_case)
-                .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(1u).on(
-                        this->exec))
+                .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
                 .with_cycle(cycle)
                 .with_min_coarse_rows(1u)
                 .on(this->exec));
@@ -435,9 +425,7 @@ class Multigrid : public ::testing::Test {
                 .with_coarsest_solver(this->lo_factory)
                 .with_post_uses_pre(true)
                 .with_mid_case(mid_case)
-                .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(1u).on(
-                        this->exec))
+                .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
                 .with_cycle(cycle)
                 .with_min_coarse_rows(1u)
                 .on(this->exec));
@@ -1273,8 +1261,7 @@ TYPED_TEST(Multigrid, ZeroGuessIgnoresInput)
             .with_coarsest_solver(this->coarsest_factory)
             .with_max_levels(2u)
             .with_mg_level(this->coarse_factory)
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(1u).on(this->exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
             .with_min_coarse_rows(1u);
     auto normal_mg = common_part
                          .with_default_initial_guess(
diff --git a/reference/test/stop/residual_norm_kernels.cpp b/reference/test/stop/residual_norm_kernels.cpp
index cc8d145231e..18c3a5d57af 100644
--- a/reference/test/stop/residual_norm_kernels.cpp
+++ b/reference/test/stop/residual_norm_kernels.cpp
@@ -54,6 +54,7 @@ class ResidualNorm : public ::testing::Test {
 protected:
     using Mtx = gko::matrix::Dense<T>;
     using NormVector = gko::matrix::Dense<gko::remove_complex<T>>;
+    using ValueType = T;
 
     ResidualNorm()
     {
@@ -102,6 +103,39 @@ TYPED_TEST(ResidualNorm, CanCreateFactory)
     ASSERT_EQ(this->abs_factory_->get_executor(), this->exec_);
 }
 
+TYPED_TEST(ResidualNorm, CheckIfResZeroConverges)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using NormVector = typename TestFixture::NormVector;
+    using T = typename TestFixture::ValueType;
+    using mode = typename gko::stop::mode;
+    std::shared_ptr<gko::LinOp> mtx = gko::initialize<Mtx>({1.0}, this->exec_);
+    std::shared_ptr<gko::LinOp> rhs = gko::initialize<Mtx>({0.0}, this->exec_);
+    std::shared_ptr<gko::LinOp> x = gko::initialize<Mtx>({0.0}, this->exec_);
+    std::shared_ptr<gko::LinOp> res_norm =
+        gko::initialize<NormVector>({0.0}, this->exec_);
+
+    for (auto baseline :
+         {mode::rhs_norm, mode::initial_resnorm, mode::absolute}) {
+        gko::remove_complex<T> factor =
+            (baseline == mode::absolute) ? 0.0 : r<T>::value;
+        auto criterion = gko::stop::ResidualNorm<T>::build()
+                             .with_reduction_factor(factor)
+                             .with_baseline(baseline)
+                             .on(this->exec_)
+                             ->generate(mtx, rhs, x.get(), nullptr);
+        constexpr gko::uint8 RelativeStoppingId{1};
+        bool one_changed{};
+        gko::array<gko::stopping_status> stop_status(this->exec_, 1);
+        stop_status.get_data()[0].reset();
+
+        EXPECT_TRUE(criterion->update().residual_norm(res_norm).check(
+            RelativeStoppingId, true, &stop_status, &one_changed));
+        EXPECT_TRUE(stop_status.get_data()[0].has_converged());
+        EXPECT_TRUE(one_changed);
+    }
+}
+
 
 TYPED_TEST(ResidualNorm, CannotCreateCriterionWithoutNeededInput)
 {
@@ -240,7 +274,7 @@ TYPED_TEST(ResidualNorm, WaitsTillResidualGoal)
 }
 
 
-TYPED_TEST(ResidualNorm, SelfCalulatesThrowWithoutMatrix)
+TYPED_TEST(ResidualNorm, SelfCalculatesThrowWithoutMatrix)
 {
     using Mtx = typename TestFixture::Mtx;
     using NormVector = typename TestFixture::NormVector;
@@ -297,7 +331,7 @@ TYPED_TEST(ResidualNorm, SelfCalulatesThrowWithoutMatrix)
 }
 
 
-TYPED_TEST(ResidualNorm, RelativeSelfCalulatesThrowWithoutRhs)
+TYPED_TEST(ResidualNorm, RelativeSelfCalculatesThrowWithoutRhs)
 {
     // only relative residual norm allows generation without rhs.
     using Mtx = typename TestFixture::Mtx;
@@ -322,7 +356,7 @@ TYPED_TEST(ResidualNorm, RelativeSelfCalulatesThrowWithoutRhs)
 }
 
 
-TYPED_TEST(ResidualNorm, SelfCalulatesAndWaitsTillResidualGoal)
+TYPED_TEST(ResidualNorm, SelfCalculatesAndWaitsTillResidualGoal)
 {
     using Mtx = typename TestFixture::Mtx;
     using NormVector = typename TestFixture::NormVector;
@@ -776,6 +810,7 @@ class ImplicitResidualNorm : public ::testing::Test {
 protected:
     using Mtx = gko::matrix::Dense<T>;
     using NormVector = gko::matrix::Dense<gko::remove_complex<T>>;
+    using ValueType = T;
 
     ImplicitResidualNorm()
     {
@@ -820,6 +855,40 @@ TYPED_TEST(ImplicitResidualNorm, CanCreateFactory)
     ASSERT_EQ(this->factory_->get_executor(), this->exec_);
 }
 
+TYPED_TEST(ImplicitResidualNorm, CheckIfResZeroConverges)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::ValueType;
+    using mode = typename gko::stop::mode;
+    std::shared_ptr<gko::LinOp> mtx = gko::initialize<Mtx>({1.0}, this->exec_);
+    std::shared_ptr<gko::LinOp> rhs = gko::initialize<Mtx>({0.0}, this->exec_);
+    std::shared_ptr<gko::LinOp> x = gko::initialize<Mtx>({0.0}, this->exec_);
+    std::shared_ptr<gko::LinOp> implicit_sq_res_norm =
+        gko::initialize<Mtx>({0.0}, this->exec_);
+
+    for (auto baseline :
+         {mode::rhs_norm, mode::initial_resnorm, mode::absolute}) {
+        gko::remove_complex<T> factor =
+            (baseline == mode::absolute) ? 0.0 : r<T>::value;
+        auto criterion = gko::stop::ImplicitResidualNorm<T>::build()
+                             .with_reduction_factor(factor)
+                             .with_baseline(baseline)
+                             .on(this->exec_)
+                             ->generate(mtx, rhs, x.get(), nullptr);
+        constexpr gko::uint8 RelativeStoppingId{1};
+        bool one_changed{};
+        gko::array<gko::stopping_status> stop_status(this->exec_, 1);
+        stop_status.get_data()[0].reset();
+
+        EXPECT_TRUE(
+            criterion->update()
+                .implicit_sq_residual_norm(implicit_sq_res_norm)
+                .check(RelativeStoppingId, true, &stop_status, &one_changed));
+        EXPECT_TRUE(stop_status.get_data()[0].has_converged());
+        EXPECT_TRUE(one_changed);
+    }
+}
+
 
 TYPED_TEST(ImplicitResidualNorm, CannotCreateCriterionWithoutBAndInitRes)
 {
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 8a6eb305b6a..6e72dbdf0aa 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -14,3 +14,4 @@ add_subdirectory(preconditioner)
 add_subdirectory(reorder)
 add_subdirectory(solver)
 add_subdirectory(stop)
+add_subdirectory(tools)
diff --git a/test/base/CMakeLists.txt b/test/base/CMakeLists.txt
index 80026fdabe1..d0567f45403 100644
--- a/test/base/CMakeLists.txt
+++ b/test/base/CMakeLists.txt
@@ -1,3 +1,4 @@
+ginkgo_create_common_test(batch_multi_vector_kernels)
 ginkgo_create_common_and_reference_test(device_matrix_data_kernels)
 ginkgo_create_common_device_test(kernel_launch_generic)
 ginkgo_create_common_and_reference_test(executor)
diff --git a/test/base/batch_multi_vector_kernels.cpp b/test/base/batch_multi_vector_kernels.cpp
new file mode 100644
index 00000000000..6f4eb3d05a8
--- /dev/null
+++ b/test/base/batch_multi_vector_kernels.cpp
@@ -0,0 +1,340 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+
+
+#include <memory>
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/math.hpp>
+
+
+#include "core/base/batch_multi_vector_kernels.hpp"
+#include "core/base/batch_utilities.hpp"
+#include "core/test/utils.hpp"
+#include "core/test/utils/assertions.hpp"
+#include "core/test/utils/batch_helpers.hpp"
+#include "test/utils/executor.hpp"
+
+
+class MultiVector : public CommonTestFixture {
+protected:
+    using Mtx = gko::batch::MultiVector<value_type>;
+    using NormVector = gko::batch::MultiVector<gko::remove_complex<value_type>>;
+    using ComplexMtx = gko::batch::MultiVector<std::complex<value_type>>;
+
+    MultiVector() : rand_engine(15) {}
+
+    template <typename MtxType>
+    std::unique_ptr<MtxType> gen_mtx(const gko::size_type num_batch_items,
+                                     gko::size_type num_rows,
+                                     gko::size_type num_cols)
+    {
+        return gko::test::generate_random_batch_matrix<MtxType>(
+            num_batch_items, num_rows, num_cols,
+            std::uniform_int_distribution<>(num_cols, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    void set_up_vector_data(gko::size_type num_vecs, const int num_rows = 252,
+                            bool different_alpha = false)
+    {
+        x = gen_mtx<Mtx>(batch_size, num_rows, num_vecs);
+        y = gen_mtx<Mtx>(batch_size, num_rows, num_vecs);
+        c_x = gen_mtx<ComplexMtx>(batch_size, num_rows, num_vecs);
+        c_y = gen_mtx<ComplexMtx>(batch_size, num_rows, num_vecs);
+        if (different_alpha) {
+            alpha = gen_mtx<Mtx>(batch_size, 1, num_vecs);
+            beta = gen_mtx<Mtx>(batch_size, 1, num_vecs);
+        } else {
+            alpha = gko::batch::initialize<Mtx>(batch_size, {2.0}, ref);
+            beta = gko::batch::initialize<Mtx>(batch_size, {-0.5}, ref);
+        }
+        dx = gko::clone(exec, x);
+        dy = gko::clone(exec, y);
+        dc_x = gko::clone(exec, c_x);
+        dc_y = gko::clone(exec, c_y);
+        dalpha = gko::clone(exec, alpha);
+        dbeta = gko::clone(exec, beta);
+        expected = Mtx::create(
+            ref, gko::batch_dim<2>(batch_size, gko::dim<2>{1, num_vecs}));
+        dresult = Mtx::create(
+            exec, gko::batch_dim<2>(batch_size, gko::dim<2>{1, num_vecs}));
+    }
+
+    std::default_random_engine rand_engine;
+
+    const size_t batch_size = 11;
+    std::unique_ptr<Mtx> x;
+    std::unique_ptr<ComplexMtx> c_x;
+    std::unique_ptr<ComplexMtx> c_y;
+    std::unique_ptr<Mtx> y;
+    std::unique_ptr<Mtx> alpha;
+    std::unique_ptr<Mtx> beta;
+    std::unique_ptr<Mtx> expected;
+    std::unique_ptr<Mtx> square;
+    std::unique_ptr<Mtx> dresult;
+    std::unique_ptr<Mtx> dx;
+    std::unique_ptr<ComplexMtx> dc_x;
+    std::unique_ptr<ComplexMtx> dc_y;
+    std::unique_ptr<Mtx> dy;
+    std::unique_ptr<Mtx> dalpha;
+    std::unique_ptr<Mtx> dbeta;
+    std::unique_ptr<Mtx> dsquare;
+};
+
+
+TEST_F(MultiVector, SingleVectorAddScaledIsEquivalentToRef)
+{
+    set_up_vector_data(1);
+
+    x->add_scaled(alpha.get(), y.get());
+    dx->add_scaled(dalpha.get(), dy.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(dx, x, r<value_type>::value);
+}
+
+
+TEST_F(MultiVector, MultipleVectorAddScaledIsEquivalentToRef)
+{
+    set_up_vector_data(20);
+
+    x->add_scaled(alpha.get(), y.get());
+    dx->add_scaled(dalpha.get(), dy.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 5 * r<value_type>::value);
+}
+
+
+TEST_F(MultiVector, MultipleVectorAddScaledWithDifferentAlphaIsEquivalentToRef)
+{
+    set_up_vector_data(20, 252, true);
+
+    x->add_scaled(alpha.get(), y.get());
+    dx->add_scaled(dalpha.get(), dy.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 5 * r<value_type>::value);
+}
+
+
+TEST_F(MultiVector, SingleVectorScaleIsEquivalentToRef)
+{
+    set_up_vector_data(1);
+
+    x->scale(alpha.get());
+    dx->scale(dalpha.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 5 * r<value_type>::value);
+}
+
+
+TEST_F(MultiVector, MultipleVectorScaleIsEquivalentToRef)
+{
+    set_up_vector_data(20);
+
+    x->scale(alpha.get());
+    dx->scale(dalpha.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 5 * r<value_type>::value);
+}
+
+
+TEST_F(MultiVector, MultipleVectorScaleWithDifferentAlphaIsEquivalentToRef)
+{
+    set_up_vector_data(20, true);
+
+    x->scale(alpha.get());
+    dx->scale(dalpha.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(dx, x, 5 * r<value_type>::value);
+}
+
+
+TEST_F(MultiVector, ComputeNorm2SingleSmallIsEquivalentToRef)
+{
+    set_up_vector_data(1, 10);
+    auto norm_size =
+        gko::batch_dim<2>(batch_size, gko::dim<2>{1, x->get_common_size()[1]});
+    auto norm_expected = NormVector::create(this->ref, norm_size);
+    auto dnorm = NormVector::create(this->exec, norm_size);
+
+    x->compute_norm2(norm_expected.get());
+    dx->compute_norm2(dnorm.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(norm_expected, dnorm, 5 * r<value_type>::value);
+}
+
+
+TEST_F(MultiVector, ComputeNorm2SingleIsEquivalentToRef)
+{
+    set_up_vector_data(1);
+    auto norm_size =
+        gko::batch_dim<2>(batch_size, gko::dim<2>{1, x->get_common_size()[1]});
+    auto norm_expected = NormVector::create(this->ref, norm_size);
+    auto dnorm = NormVector::create(this->exec, norm_size);
+
+    x->compute_norm2(norm_expected.get());
+    dx->compute_norm2(dnorm.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(norm_expected, dnorm, 5 * r<value_type>::value);
+}
+
+
+TEST_F(MultiVector, ComputeNorm2IsEquivalentToRef)
+{
+    set_up_vector_data(20);
+    auto norm_size =
+        gko::batch_dim<2>(batch_size, gko::dim<2>{1, x->get_common_size()[1]});
+    auto norm_expected = NormVector::create(this->ref, norm_size);
+    auto dnorm = NormVector::create(this->exec, norm_size);
+
+    x->compute_norm2(norm_expected.get());
+    dx->compute_norm2(dnorm.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(norm_expected, dnorm, 5 * r<value_type>::value);
+}
+
+
+TEST_F(MultiVector, ComputeDotIsEquivalentToRef)
+{
+    set_up_vector_data(20);
+    auto dot_size =
+        gko::batch_dim<2>(batch_size, gko::dim<2>{1, x->get_common_size()[1]});
+    auto dot_expected = Mtx::create(this->ref, dot_size);
+    auto ddot = Mtx::create(this->exec, dot_size);
+    auto cdot_expected = ComplexMtx::create(this->ref, dot_size);
+    auto dc_dot = ComplexMtx::create(this->exec, dot_size);
+
+    x->compute_dot(y.get(), dot_expected.get());
+    dx->compute_dot(dy.get(), ddot.get());
+    c_x->compute_dot(c_y.get(), cdot_expected.get());
+    dc_x->compute_dot(dc_y.get(), dc_dot.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(dot_expected, ddot, 5 * r<value_type>::value);
+    GKO_ASSERT_BATCH_MTX_NEAR(cdot_expected, dc_dot, 5 * r<value_type>::value);
+}
+
+
+TEST_F(MultiVector, ComputeDotSingleIsEquivalentToRef)
+{
+    set_up_vector_data(1);
+    auto dot_size =
+        gko::batch_dim<2>(batch_size, gko::dim<2>{1, x->get_common_size()[1]});
+    auto dot_expected = Mtx::create(this->ref, dot_size);
+    auto ddot = Mtx::create(this->exec, dot_size);
+
+    x->compute_dot(y.get(), dot_expected.get());
+    dx->compute_dot(dy.get(), ddot.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(dot_expected, ddot, 5 * r<value_type>::value);
+}
+
+
+TEST_F(MultiVector, ComputeDotSingleSmallIsEquivalentToRef)
+{
+    set_up_vector_data(1, 10);
+    auto dot_size =
+        gko::batch_dim<2>(batch_size, gko::dim<2>{1, x->get_common_size()[1]});
+    auto dot_expected = Mtx::create(this->ref, dot_size);
+    auto ddot = Mtx::create(this->exec, dot_size);
+
+    x->compute_dot(y.get(), dot_expected.get());
+    dx->compute_dot(dy.get(), ddot.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(dot_expected, ddot, 5 * r<value_type>::value);
+}
+
+
+TEST_F(MultiVector, ComputeConjDotIsEquivalentToRef)
+{
+    set_up_vector_data(20);
+    auto dot_size =
+        gko::batch_dim<2>(batch_size, gko::dim<2>{1, x->get_common_size()[1]});
+    auto dot_expected = Mtx::create(this->ref, dot_size);
+    auto ddot = Mtx::create(this->exec, dot_size);
+    auto cdot_expected = ComplexMtx::create(this->ref, dot_size);
+    auto dc_dot = ComplexMtx::create(this->exec, dot_size);
+
+    x->compute_conj_dot(y.get(), dot_expected.get());
+    dx->compute_conj_dot(dy.get(), ddot.get());
+    c_x->compute_conj_dot(c_y.get(), cdot_expected.get());
+    dc_x->compute_conj_dot(dc_y.get(), dc_dot.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(dot_expected, ddot, 5 * r<value_type>::value);
+    GKO_ASSERT_BATCH_MTX_NEAR(cdot_expected, dc_dot, 5 * r<value_type>::value);
+}
+
+
+TEST_F(MultiVector, ComputeConjDotSingleIsEquivalentToRef)
+{
+    set_up_vector_data(1);
+    auto dot_size =
+        gko::batch_dim<2>(batch_size, gko::dim<2>{1, x->get_common_size()[1]});
+    auto dot_expected = Mtx::create(this->ref, dot_size);
+    auto ddot = Mtx::create(this->exec, dot_size);
+
+    x->compute_conj_dot(y.get(), dot_expected.get());
+    dx->compute_conj_dot(dy.get(), ddot.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(dot_expected, ddot, 5 * r<value_type>::value);
+}
+
+
+TEST_F(MultiVector, CopySingleIsEquivalentToRef)
+{
+    set_up_vector_data(1);
+
+    gko::kernels::reference::batch_multi_vector::copy(this->ref, x.get(),
+                                                      y.get());
+    gko::kernels::EXEC_NAMESPACE::batch_multi_vector::copy(this->exec, dx.get(),
+                                                           dy.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(dy, y, 0.0);
+}
+
+
+TEST_F(MultiVector, CopyIsEquivalentToRef)
+{
+    set_up_vector_data(20);
+
+    gko::kernels::reference::batch_multi_vector::copy(this->ref, x.get(),
+                                                      y.get());
+    gko::kernels::EXEC_NAMESPACE::batch_multi_vector::copy(this->exec, dx.get(),
+                                                           dy.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(dy, y, 0.0);
+}
diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp
index a90a5ea6c70..d4a0f83c819 100644
--- a/test/base/kernel_launch_generic.cpp
+++ b/test/base/kernel_launch_generic.cpp
@@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common/unified/base/kernel_launch.hpp"
 
 
+#include <algorithm>
 #include <memory>
 #include <type_traits>
 
@@ -321,104 +322,182 @@ TEST_F(KernelLaunch, Runs2DDense)
 
 void run1d_reduction(std::shared_ptr<gko::EXEC_TYPE> exec)
 {
-    gko::array<int64> output{exec, 1};
+    gko::array<int64> output{exec, {-1l}};
+    auto run_reduction = [&](int64 init, size_type size) {
+        gko::kernels::EXEC_NAMESPACE::run_kernel_reduction(
+            exec,
+            [] GKO_KERNEL(auto i, auto a, auto dummy) {
+                static_assert(is_same<decltype(i), int64>::value, "index");
+                static_assert(is_same<decltype(a), int64*>::value, "value");
+                static_assert(is_same<decltype(dummy), int64>::value, "dummy");
+                return i + 1;
+            },
+            [] GKO_KERNEL(auto i, auto j) { return i + j; },
+            [] GKO_KERNEL(auto j) { return j * 2; }, init, output.get_data(),
+            size, output, move_only_val);
+    };
 
-    gko::kernels::EXEC_NAMESPACE::run_kernel_reduction(
-        exec,
-        [] GKO_KERNEL(auto i, auto a, auto dummy) {
-            static_assert(is_same<decltype(i), int64>::value, "index");
-            static_assert(is_same<decltype(a), int64*>::value, "value");
-            static_assert(is_same<decltype(dummy), int64>::value, "dummy");
-            return i + 1;
-        },
-        [] GKO_KERNEL(auto i, auto j) { return i + j; },
-        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(),
-        size_type{100000}, output, move_only_val);
+    {
+        SCOPED_TRACE("Size 0");
+        run_reduction(int64{0}, size_type{0});
 
-    // 2 * sum i=0...99999 (i+1)
-    ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL);
+        ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), int64{0});
+    }
 
-    gko::kernels::EXEC_NAMESPACE::run_kernel_reduction(
-        exec,
-        [] GKO_KERNEL(auto i, auto a, auto dummy) {
-            static_assert(is_same<decltype(i), int64>::value, "index");
-            static_assert(is_same<decltype(a), int64*>::value, "value");
-            static_assert(is_same<decltype(dummy), int64>::value, "dummy");
-            return i + 1;
-        },
-        [] GKO_KERNEL(auto i, auto j) {
-            static_assert(is_same<decltype(i), int64>::value, "a");
-            static_assert(is_same<decltype(i), int64>::value, "b");
-            return i + j;
-        },
-        [] GKO_KERNEL(auto j) {
-            static_assert(is_same<decltype(j), int64>::value, "value");
-            return j * 2;
-        },
-        int64{}, output.get_data(), size_type{100}, output, move_only_val);
+    {
+        SCOPED_TRACE("Size 100000");
+        run_reduction(int64{0}, size_type{100000});
+
+        ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()),
+                  int64{10000100000});
+    }
+
+    {
+        SCOPED_TRACE("Size 100");
+        run_reduction(int64{0}, size_type{100});
 
-    // 2 * sum i=0...99 (i+1)
-    ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL);
+        // 2 * sum i=0...99 (i+1)
+        ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()),
+                  int64{10100});
+    }
 }
 
 TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); }
 
 
-void run2d_reduction(std::shared_ptr<gko::EXEC_TYPE> exec)
+void run1d_reduction_cached(std::shared_ptr<gko::EXEC_TYPE> exec,
+                            std::vector<size_type> sizes)
 {
     gko::array<int64> output{exec, 1};
+    gko::array<char> temp(exec);
+    for (const auto& size : sizes) {
+        temp.clear();
+        gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached(
+            exec, [] GKO_KERNEL(auto i) { return i + 1; },
+            [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); },
+            [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(),
+            size, temp);
+
+        ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()),
+                  static_cast<int64>(size));
+        // The temporary storage (used for partial sums) must be smaller than
+        // the input array
+        ASSERT_LE(temp.get_num_elems(), size * sizeof(int64));
+    }
+}
 
-    gko::kernels::EXEC_NAMESPACE::run_kernel_reduction(
-        exec,
-        [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) {
-            static_assert(is_same<decltype(i), int64>::value, "index");
-            static_assert(is_same<decltype(j), int64>::value, "index");
-            static_assert(is_same<decltype(a), int64*>::value, "value");
-            static_assert(is_same<decltype(dummy), int64>::value, "dummy");
-            return (i + 1) * (j + 1);
-        },
-        [] GKO_KERNEL(auto i, auto j) {
-            static_assert(is_same<decltype(i), int64>::value, "a");
-            static_assert(is_same<decltype(i), int64>::value, "b");
-            return i + j;
-        },
-        [] GKO_KERNEL(auto j) {
-            static_assert(is_same<decltype(j), int64>::value, "value");
-            return j * 4;
-        },
-        int64{}, output.get_data(), gko::dim<2>{1000, 100}, output,
-        move_only_val);
+TEST_F(KernelLaunch, Reduction1DCached)
+{
+    run1d_reduction_cached(exec, {10, 1000, 1000000, 1234567, 7654321});
+}
 
-    // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1)
-    ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000LL);
 
-    gko::kernels::EXEC_NAMESPACE::run_kernel_reduction(
-        exec,
-        [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) {
-            static_assert(is_same<decltype(i), int64>::value, "index");
-            static_assert(is_same<decltype(j), int64>::value, "index");
-            static_assert(is_same<decltype(a), int64*>::value, "value");
-            static_assert(is_same<decltype(dummy), int64>::value, "dummy");
-            return (i + 1) * (j + 1);
-        },
-        [] GKO_KERNEL(auto i, auto j) {
-            static_assert(is_same<decltype(i), int64>::value, "a");
-            static_assert(is_same<decltype(i), int64>::value, "b");
-            return i + j;
-        },
-        [] GKO_KERNEL(auto j) {
-            static_assert(is_same<decltype(j), int64>::value, "value");
-            return j * 4;
-        },
-        int64{}, output.get_data(), gko::dim<2>{10, 10}, output, move_only_val);
+void run2d_reduction(std::shared_ptr<gko::EXEC_TYPE> exec)
+{
+    gko::array<int64> output{exec, {-1l}};
+    auto run_reduction = [&](int64 init, gko::dim<2> size) {
+        gko::kernels::EXEC_NAMESPACE::run_kernel_reduction(
+            exec,
+            [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) {
+                static_assert(is_same<decltype(i), int64>::value, "index");
+                static_assert(is_same<decltype(j), int64>::value, "index");
+                static_assert(is_same<decltype(a), int64*>::value, "value");
+                static_assert(is_same<decltype(dummy), int64>::value, "dummy");
+                return (i + 1) * (j + 1);
+            },
+            [] GKO_KERNEL(auto i, auto j) {
+                static_assert(is_same<decltype(i), int64>::value, "a");
+                static_assert(is_same<decltype(i), int64>::value, "b");
+                return i + j;
+            },
+            [] GKO_KERNEL(auto j) {
+                static_assert(is_same<decltype(j), int64>::value, "value");
+                return j * 4;
+            },
+            init, output.get_data(), size, output, move_only_val);
+    };
 
-    // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1)
-    ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL);
+    {
+        SCOPED_TRACE("Dim 0x0");
+        run_reduction(int64{0}, gko::dim<2>{0, 0});
+
+        ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), int64{0});
+    }
+
+    {
+        SCOPED_TRACE("Dim 0x10");
+        run_reduction(int64{0}, gko::dim<2>{0, 10});
+
+        ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), int64{0});
+    }
+
+    {
+        SCOPED_TRACE("Dim 10x0");
+        run_reduction(int64{0}, gko::dim<2>{10, 0});
+
+        ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), int64{0});
+    }
+
+    {
+        SCOPED_TRACE("Dim 1000x100");
+        run_reduction(int64{0}, gko::dim<2>{1000, 100});
+
+        // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1)
+        ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()),
+                  int64{10110100000});
+    }
+
+    {
+        SCOPED_TRACE("Dim 10x10");
+        run_reduction(int64{0}, gko::dim<2>{10, 10});
+
+        // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1)
+        ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()),
+                  int64{12100});
+    }
 }
 
 TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); }
 
 
+void run2d_reduction_cached(std::shared_ptr<gko::EXEC_TYPE> exec,
+                            std::vector<gko::dim<2>> dims)
+{
+    gko::array<int64> output{exec, 1};
+    gko::array<char> temp(exec);
+    for (const auto& dim : dims) {
+        temp.clear();
+        gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached(
+            exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; },
+            [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); },
+            [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(),
+            dim, temp);
+
+        ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()),
+                  static_cast<int64>(dim[0] + dim[1]));
+        // The temporary storage (used for partial sums) must be smaller than
+        // the input array
+        ASSERT_LE(temp.get_num_elems(), dim[0] * dim[1] * sizeof(int64));
+    }
+}
+
+TEST_F(KernelLaunch, Reduction2DCached)
+{
+    run2d_reduction_cached(exec, {{20, 10},
+                                  {10, 3000},
+                                  {1000, 5},
+                                  {30, 50},
+                                  {600, 500},
+                                  {500, 600},
+                                  {1000, 900},
+                                  {900, 1000},
+                                  {1, 100000},
+                                  {100000, 1},
+                                  {500000, 20},
+                                  {20, 500000}});
+}
+
+
 void run2d_row_reduction(std::shared_ptr<gko::EXEC_TYPE> exec)
 {
     for (auto num_rows : {0, 100, 1000, 10000}) {
@@ -468,6 +547,49 @@ void run2d_row_reduction(std::shared_ptr<gko::EXEC_TYPE> exec)
 TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); }
 
 
+void run2d_row_reduction_cached(std::shared_ptr<gko::EXEC_TYPE> exec,
+                                std::vector<gko::dim<2>> dims)
+{
+    const size_type result_stride = 1;
+    gko::array<char> temp(exec);
+    for (const auto& dim : dims) {
+        gko::array<int64> host_ref{exec->get_master(), dim[0]};
+        gko::array<int64> output{exec, host_ref};
+        temp.clear();
+        for (int64 i = 0; i < host_ref.get_num_elems(); ++i) {
+            host_ref.get_data()[i] = dim[1] + i + 1;
+        }
+
+        gko::kernels::EXEC_NAMESPACE::run_kernel_row_reduction_cached(
+            exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; },
+            [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); },
+            [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(),
+            result_stride, dim, temp);
+
+        GKO_ASSERT_ARRAY_EQ(host_ref, output);
+        // The temporary storage (used for partial sums) must be smaller than
+        // the input array
+        ASSERT_LE(temp.get_num_elems(), dim[0] * dim[1] * sizeof(int64));
+    }
+}
+
+TEST_F(KernelLaunch, ReductionRowCached)
+{
+    run2d_row_reduction_cached(exec, {{20, 10},
+                                      {10, 3000},
+                                      {1000, 5},
+                                      {30, 50},
+                                      {600, 500},
+                                      {500, 600},
+                                      {1000, 900},
+                                      {900, 1000},
+                                      {1, 100000},
+                                      {100000, 1},
+                                      {500000, 20},
+                                      {20, 500000}});
+}
+
+
 void run2d_col_reduction(std::shared_ptr<gko::EXEC_TYPE> exec)
 {
     // empty, most threads idle, most threads busy, multiple blocks
@@ -517,3 +639,43 @@ void run2d_col_reduction(std::shared_ptr<gko::EXEC_TYPE> exec)
 }
 
 TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); }
+
+
+void run2d_col_reduction_cached(std::shared_ptr<gko::EXEC_TYPE> exec,
+                                std::vector<gko::dim<2>> dims)
+{
+    gko::array<char> temp(exec);
+    for (const auto& dim : dims) {
+        gko::array<int64> host_ref{exec->get_master(), dim[1]};
+        gko::array<int64> output{exec, host_ref};
+        temp.clear();
+        for (int64 i = 0; i < host_ref.get_num_elems(); ++i) {
+            host_ref.get_data()[i] = dim[0] + i + 1;
+        }
+
+        gko::kernels::EXEC_NAMESPACE::run_kernel_col_reduction_cached(
+            exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; },
+            [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); },
+            [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(),
+            dim, temp);
+
+        GKO_ASSERT_ARRAY_EQ(host_ref, output);
+        ASSERT_LE(temp.get_num_elems(), dim[0] * dim[1] * sizeof(int64));
+    }
+}
+
+TEST_F(KernelLaunch, ReductionColCached)
+{
+    run2d_col_reduction_cached(exec, {{20, 10},
+                                      {10, 3000},
+                                      {1000, 5},
+                                      {30, 50},
+                                      {600, 500},
+                                      {500, 600},
+                                      {1000, 900},
+                                      {900, 1000},
+                                      {1, 100000},
+                                      {100000, 1},
+                                      {500000, 20},
+                                      {20, 500000}});
+}
diff --git a/test/distributed/CMakeLists.txt b/test/distributed/CMakeLists.txt
index 1c8e9b1e8fc..32b3810ea31 100644
--- a/test/distributed/CMakeLists.txt
+++ b/test/distributed/CMakeLists.txt
@@ -1,3 +1,4 @@
 ginkgo_create_common_test(matrix_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(partition_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(vector_kernels DISABLE_EXECUTORS dpcpp)
+ginkgo_create_common_and_reference_test(partition_helper_kernels)
diff --git a/test/distributed/partition_helper_kernels.cpp b/test/distributed/partition_helper_kernels.cpp
new file mode 100644
index 00000000000..44d7ac2b8f0
--- /dev/null
+++ b/test/distributed/partition_helper_kernels.cpp
@@ -0,0 +1,277 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <gtest/gtest-typed-test.h>
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "core/base/iterator_factory.hpp"
+#include "core/distributed/partition_helpers_kernels.hpp"
+#include "core/test/utils.hpp"
+#include "test/utils/executor.hpp"
+
+
+using gko::experimental::distributed::comm_index_type;
+
+
+// TODO: remove with c++17
+template <typename T>
+T clamp(const T& v, const T& lo, const T& hi)
+{
+    return v < lo ? lo : (v > hi ? hi : v);
+}
+
+
+template <typename IndexType>
+std::vector<IndexType> create_iota(IndexType min, IndexType max)
+{
+    std::vector<IndexType> iota(clamp(max - min, IndexType(0), max));
+    std::iota(iota.begin(), iota.end(), min);
+    return iota;
+}
+
+
+template <typename IndexType>
+std::vector<IndexType> create_range_offsets(gko::size_type num_ranges)
+{
+    std::default_random_engine engine;
+    std::uniform_int_distribution<IndexType> dist(5, 10);
+    std::vector<IndexType> range_sizes(num_ranges);
+    std::generate(range_sizes.begin(), range_sizes.end(),
+                  [&]() { return dist(engine); });
+
+    std::vector<IndexType> range_offsets(num_ranges + 1, 0);
+    std::partial_sum(range_sizes.begin(), range_sizes.end(),
+                     range_offsets.begin() + 1);
+    return range_offsets;
+}
+
+
+template <typename IndexType>
+std::vector<IndexType> create_ranges(
+    const std::vector<IndexType>& range_offsets)
+{
+    assert(range_offsets.size() >= 2);
+    gko::size_type num_ranges = range_offsets.size() - 1;
+    std::vector<IndexType> ranges(num_ranges * 2, 0);
+    for (gko::size_type i = 1; i < num_ranges; ++i) {
+        ranges[2 * i - 1] = range_offsets[i];
+        ranges[2 * i] = range_offsets[i];
+    }
+    ranges.back() = range_offsets.back();
+    return ranges;
+}
+
+
+template <typename IndexType>
+std::vector<IndexType> create_ranges(gko::size_type num_ranges)
+{
+    auto range_offsets = create_range_offsets<IndexType>(num_ranges);
+
+    return create_ranges(range_offsets);
+}
+
+
+std::vector<std::size_t> sample_unique(std::size_t min, std::size_t max,
+                                       gko::size_type n)
+{
+    std::default_random_engine engine;
+    auto values = create_iota(min, max);
+    std::shuffle(values.begin(), values.end(), engine);
+    values.erase(values.begin() + clamp(n, gko::size_type(0), values.size()),
+                 values.end());
+    return values;
+}
+
+
+template <typename IndexType>
+std::vector<IndexType> remove_indices(const std::vector<IndexType>& source,
+                                      std::vector<std::size_t> idxs)
+{
+    std::sort(idxs.begin(), idxs.end(), std::greater<>{});
+    auto result = source;
+    for (auto idx : idxs) {
+        result.erase(result.begin() + 2 * idx, result.begin() + 2 * idx + 1);
+    }
+    return result;
+}
+
+
+template <typename IndexType>
+gko::array<IndexType> make_array(std::shared_ptr<const gko::Executor> exec,
+                                 const std::vector<IndexType>& v)
+{
+    return gko::array<IndexType>(exec, v.begin(), v.end());
+}
+
+
+template <typename IndexType>
+std::pair<std::vector<IndexType>, std::vector<comm_index_type>>
+shuffle_range_and_pid(const std::vector<IndexType>& ranges,
+                      const std::vector<comm_index_type>& pid)
+{
+    std::default_random_engine engine;
+
+    auto result = std::make_pair(ranges, pid);
+
+    auto num_ranges = result.second.size();
+    auto range_start_it = gko::detail::make_permute_iterator(
+        result.first.begin(), [](const auto i) { return 2 * i; });
+    auto range_end_it = gko::detail::make_permute_iterator(
+        result.first.begin() + 1, [](const auto i) { return 2 * i; });
+    auto zip_it = gko::detail::make_zip_iterator(range_start_it, range_end_it,
+                                                 result.second.begin());
+    std::shuffle(zip_it, zip_it + num_ranges, engine);
+
+    return result;
+}
+
+
+template <typename IndexType>
+class PartitionHelpers : public CommonTestFixture {
+protected:
+    using index_type = IndexType;
+};
+
+TYPED_TEST_SUITE(PartitionHelpers, gko::test::IndexTypes,
+                 TypenameNameGenerator);
+
+
+TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRanges)
+{
+    using index_type = typename TestFixture::index_type;
+    auto offsets = make_array(this->exec, create_ranges<index_type>(100));
+    bool result = false;
+
+    gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges(
+        this->exec, offsets, result);
+
+    ASSERT_TRUE(result);
+}
+
+
+TYPED_TEST(PartitionHelpers, CanCheckNonConsecutiveRanges)
+{
+    using index_type = typename TestFixture::index_type;
+    auto full_range_ends = create_ranges<index_type>(100);
+    auto removal_idxs = sample_unique(0, full_range_ends.size() / 2, 4);
+    auto start_ends =
+        make_array(this->exec, remove_indices(full_range_ends, removal_idxs));
+    bool result = true;
+
+    gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges(
+        this->exec, start_ends, result);
+
+    ASSERT_FALSE(result);
+}
+
+
+TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleRange)
+{
+    using index_type = typename TestFixture::index_type;
+    auto start_ends = make_array(this->ref, create_ranges<index_type>(1));
+    bool result = false;
+
+    gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges(
+        this->exec, start_ends, result);
+
+    ASSERT_TRUE(result);
+}
+
+
+TYPED_TEST(PartitionHelpers, CanCheckConsecutiveRangesWithSingleElement)
+{
+    using index_type = typename TestFixture::index_type;
+    auto start_ends = gko::array<index_type>(this->exec, {1});
+    bool result = false;
+
+    gko::kernels::EXEC_NAMESPACE::partition_helpers::check_consecutive_ranges(
+        this->exec, start_ends, result);
+
+    ASSERT_TRUE(result);
+}
+
+
+TYPED_TEST(PartitionHelpers, CanSortConsecutiveRanges)
+{
+    using index_type = typename TestFixture::index_type;
+    auto start_ends = make_array(this->exec, create_ranges<index_type>(100));
+    auto part_ids = create_iota<comm_index_type>(0, 100);
+    auto part_ids_arr = gko::array<comm_index_type>(
+        this->exec, part_ids.begin(), part_ids.end());
+    auto expected_start_ends = start_ends;
+    auto expected_part_ids = part_ids_arr;
+
+    gko::kernels::EXEC_NAMESPACE::partition_helpers::sort_by_range_start(
+        this->exec, start_ends, part_ids_arr);
+
+    GKO_ASSERT_ARRAY_EQ(expected_start_ends, start_ends);
+    GKO_ASSERT_ARRAY_EQ(expected_part_ids, part_ids_arr);
+}
+
+
+TYPED_TEST(PartitionHelpers, CanSortNonConsecutiveRanges)
+{
+    using index_type = typename TestFixture::index_type;
+    auto ranges = create_ranges<index_type>(100);
+    auto part_ids = create_iota(0, 100);
+    auto shuffled = shuffle_range_and_pid(ranges, part_ids);
+    auto expected_start_ends = make_array(this->exec, ranges);
+    auto expected_part_ids = gko::array<comm_index_type>(
+        this->exec, part_ids.begin(), part_ids.end());
+    auto start_ends = make_array(this->exec, shuffled.first);
+    auto part_ids_arr = gko::array<comm_index_type>(
+        this->exec, shuffled.second.begin(), shuffled.second.end());
+
+    gko::kernels::EXEC_NAMESPACE::partition_helpers::sort_by_range_start(
+        this->exec, start_ends, part_ids_arr);
+
+    GKO_ASSERT_ARRAY_EQ(expected_start_ends, start_ends);
+    GKO_ASSERT_ARRAY_EQ(expected_part_ids, part_ids_arr);
+}
+
+
+TYPED_TEST(PartitionHelpers, CanCompressRanges)
+{
+    using index_type = typename TestFixture::index_type;
+    auto expected_offsets = create_range_offsets<index_type>(100);
+    auto ranges = make_array(this->exec, create_ranges(expected_offsets));
+    gko::array<index_type> offsets{this->exec, expected_offsets.size()};
+
+    gko::kernels::EXEC_NAMESPACE::partition_helpers::compress_ranges(
+        this->exec, ranges, offsets);
+
+    GKO_ASSERT_ARRAY_EQ(offsets, make_array(this->exec, expected_offsets));
+}
diff --git a/test/distributed/partition_kernels.cpp b/test/distributed/partition_kernels.cpp
index 686d1432da5..7033abb37ef 100644
--- a/test/distributed/partition_kernels.cpp
+++ b/test/distributed/partition_kernels.cpp
@@ -276,6 +276,22 @@ TYPED_TEST(Partition, BuildsFromContiguousWithSingleEntry)
 }
 
 
+TYPED_TEST(Partition, BuildsFromContiguousWithPartId)
+{
+    using global_index_type = typename TestFixture::global_index_type;
+    using part_type = typename TestFixture::part_type;
+    gko::array<global_index_type> ranges{this->ref,
+                                         {0, 1234, 3134, 4578, 16435, 60000}};
+    gko::array<comm_index_type> part_id{this->ref, {0, 4, 3, 1, 2}};
+    gko::array<global_index_type> dranges{this->exec, ranges};
+
+    auto part = part_type::build_from_contiguous(this->ref, ranges, part_id);
+    auto dpart = part_type::build_from_contiguous(this->exec, dranges, part_id);
+
+    this->assert_equal(part, dpart);
+}
+
+
 TYPED_TEST(Partition, BuildsFromGlobalSize)
 {
     using global_index_type = typename TestFixture::global_index_type;
diff --git a/test/factorization/lu_kernels.cpp b/test/factorization/lu_kernels.cpp
index 509ed0415f1..9580a551323 100644
--- a/test/factorization/lu_kernels.cpp
+++ b/test/factorization/lu_kernels.cpp
@@ -53,6 +53,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/factorization/cholesky_kernels.hpp"
 #include "core/factorization/elimination_forest.hpp"
+#include "core/factorization/symbolic.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/matrix/csr_lookup.hpp"
 #include "core/test/utils.hpp"
@@ -238,19 +239,92 @@ TYPED_TEST(Lu, KernelFactorizeIsEquivalentToRef)
 }
 
 
+TYPED_TEST(Lu, SymbolicCholeskyWorks)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    this->forall_matrices([this] {
+        std::unique_ptr<gko::matrix::Csr<value_type, index_type>> dlu;
+        std::unique_ptr<gko::factorization::elimination_forest<index_type>>
+            forest;
+        gko::factorization::symbolic_cholesky(this->dmtx.get(), true, dlu,
+                                              forest);
+
+        GKO_ASSERT_MTX_EQ_SPARSITY(dlu, this->dmtx_lu);
+    });
+}
+
+
+TYPED_TEST(Lu, SymbolicLUWorks)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    this->forall_matrices([this] {
+        std::unique_ptr<gko::matrix::Csr<value_type, index_type>> dlu;
+        gko::factorization::symbolic_lu(this->dmtx.get(), dlu);
+
+        GKO_ASSERT_MTX_EQ_SPARSITY(dlu, this->dmtx_lu);
+    });
+}
+
+
+TYPED_TEST(Lu, SymbolicLUNearSymmWorks)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    this->forall_matrices([this] {
+        std::unique_ptr<gko::matrix::Csr<value_type, index_type>> dlu;
+        gko::factorization::symbolic_lu_near_symm(this->dmtx.get(), dlu);
+
+        GKO_ASSERT_MTX_EQ_SPARSITY(dlu, this->dmtx_lu);
+    });
+}
+
+
 TYPED_TEST(Lu, GenerateSymmWithUnknownSparsityIsEquivalentToRef)
 {
     using value_type = typename TestFixture::value_type;
     using index_type = typename TestFixture::index_type;
     this->forall_matrices([this] {
-        auto factory = gko::experimental::factorization::Lu<value_type,
-                                                            index_type>::build()
-                           .with_symmetric_sparsity(true)
-                           .on(this->ref);
+        auto factory =
+            gko::experimental::factorization::Lu<value_type,
+                                                 index_type>::build()
+                .with_symbolic_algorithm(
+                    gko::experimental::factorization::symbolic_type::symmetric)
+                .on(this->ref);
+        auto dfactory =
+            gko::experimental::factorization::Lu<value_type,
+                                                 index_type>::build()
+                .with_symbolic_algorithm(
+                    gko::experimental::factorization::symbolic_type::symmetric)
+                .on(this->exec);
+
+        auto lu = factory->generate(this->mtx);
+        auto dlu = dfactory->generate(this->dmtx);
+
+        GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_combined(), dlu->get_combined());
+        GKO_ASSERT_MTX_NEAR(lu->get_combined(), dlu->get_combined(),
+                            r<value_type>::value);
+    });
+}
+
+
+TYPED_TEST(Lu, GenerateNearSymmWithUnknownSparsityIsEquivalentToRef)
+{
+    using value_type = typename TestFixture::value_type;
+    using index_type = typename TestFixture::index_type;
+    this->forall_matrices([this] {
+        auto factory =
+            gko::experimental::factorization::Lu<value_type,
+                                                 index_type>::build()
+                .with_symbolic_algorithm(gko::experimental::factorization::
+                                             symbolic_type::near_symmetric)
+                .on(this->ref);
         auto dfactory =
             gko::experimental::factorization::Lu<value_type,
                                                  index_type>::build()
-                .with_symmetric_sparsity(true)
+                .with_symbolic_algorithm(gko::experimental::factorization::
+                                             symbolic_type::near_symmetric)
                 .on(this->exec);
 
         auto lu = factory->generate(this->mtx);
diff --git a/test/matrix/CMakeLists.txt b/test/matrix/CMakeLists.txt
index a9cf267a3c8..d49373811dc 100644
--- a/test/matrix/CMakeLists.txt
+++ b/test/matrix/CMakeLists.txt
@@ -1,3 +1,5 @@
+ginkgo_create_common_test(batch_dense_kernels)
+ginkgo_create_common_test(batch_ell_kernels)
 ginkgo_create_common_device_test(csr_kernels)
 ginkgo_create_common_test(csr_kernels2)
 ginkgo_create_common_test(coo_kernels)
@@ -12,5 +14,7 @@ else()
 endif()
 ginkgo_create_common_test(hybrid_kernels)
 ginkgo_create_common_test(matrix)
+ginkgo_create_common_test(permutation_kernels)
+ginkgo_create_common_test(scaled_permutation_kernels)
 ginkgo_create_common_test(sellp_kernels)
 ginkgo_create_common_test(sparsity_csr_kernels)
diff --git a/test/matrix/batch_dense_kernels.cpp b/test/matrix/batch_dense_kernels.cpp
new file mode 100644
index 00000000000..fa75a8f61e4
--- /dev/null
+++ b/test/matrix/batch_dense_kernels.cpp
@@ -0,0 +1,138 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/batch_dense_kernels.hpp"
+
+
+#include <memory>
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+
+#include "core/base/batch_utilities.hpp"
+#include "core/test/utils.hpp"
+#include "core/test/utils/assertions.hpp"
+#include "core/test/utils/batch_helpers.hpp"
+#include "test/utils/executor.hpp"
+
+
+class Dense : public CommonTestFixture {
+protected:
+    using BMtx = gko::batch::matrix::Dense<value_type>;
+    using BMVec = gko::batch::MultiVector<value_type>;
+
+    Dense() : rand_engine(15) {}
+
+    template <typename BMtxType>
+    std::unique_ptr<BMtxType> gen_mtx(const gko::size_type num_batch_items,
+                                      gko::size_type num_rows,
+                                      gko::size_type num_cols)
+    {
+        return gko::test::generate_random_batch_matrix<BMtxType>(
+            num_batch_items, num_rows, num_cols,
+            std::uniform_int_distribution<>(num_cols, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    void set_up_apply_data(gko::size_type num_rows, gko::size_type num_vecs = 1)
+    {
+        const gko::size_type num_cols = 32;
+        mat = gen_mtx<BMtx>(batch_size, num_rows, num_cols);
+        y = gen_mtx<BMVec>(batch_size, num_cols, num_vecs);
+        alpha = gen_mtx<BMVec>(batch_size, 1, 1);
+        beta = gen_mtx<BMVec>(batch_size, 1, 1);
+        dmat = gko::clone(exec, mat);
+        dy = gko::clone(exec, y);
+        dalpha = gko::clone(exec, alpha);
+        dbeta = gko::clone(exec, beta);
+        expected = BMVec::create(
+            ref,
+            gko::batch_dim<2>(batch_size, gko::dim<2>{num_rows, num_vecs}));
+        expected->fill(gko::one<value_type>());
+        dresult = gko::clone(exec, expected);
+    }
+
+    std::default_random_engine rand_engine;
+
+    const gko::size_type batch_size = 11;
+    std::unique_ptr<BMtx> mat;
+    std::unique_ptr<BMVec> y;
+    std::unique_ptr<BMVec> alpha;
+    std::unique_ptr<BMVec> beta;
+    std::unique_ptr<BMVec> expected;
+    std::unique_ptr<BMVec> dresult;
+    std::unique_ptr<BMtx> dmat;
+    std::unique_ptr<BMVec> dy;
+    std::unique_ptr<BMVec> dalpha;
+    std::unique_ptr<BMVec> dbeta;
+};
+
+
+TEST_F(Dense, SingleVectorApplyIsEquivalentToRefForSmallMatrices)
+{
+    set_up_apply_data(10);
+
+    mat->apply(y.get(), expected.get());
+    dmat->apply(dy.get(), dresult.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, r<value_type>::value);
+}
+
+
+TEST_F(Dense, SingleVectorApplyIsEquivalentToRef)
+{
+    set_up_apply_data(257);
+
+    mat->apply(y.get(), expected.get());
+    dmat->apply(dy.get(), dresult.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, r<value_type>::value);
+}
+
+
+TEST_F(Dense, SingleVectorAdvancedApplyIsEquivalentToRef)
+{
+    set_up_apply_data(257);
+
+    mat->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmat->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, r<value_type>::value);
+}
diff --git a/test/matrix/batch_ell_kernels.cpp b/test/matrix/batch_ell_kernels.cpp
new file mode 100644
index 00000000000..7a4c6558c5d
--- /dev/null
+++ b/test/matrix/batch_ell_kernels.cpp
@@ -0,0 +1,143 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/matrix/batch_ell_kernels.hpp"
+
+
+#include <memory>
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+
+
+#include "core/base/batch_utilities.hpp"
+#include "core/test/utils.hpp"
+#include "core/test/utils/assertions.hpp"
+#include "core/test/utils/batch_helpers.hpp"
+#include "test/utils/executor.hpp"
+
+
+class Ell : public CommonTestFixture {
+protected:
+    using BMtx = gko::batch::matrix::Ell<value_type, gko::int32>;
+    using BMVec = gko::batch::MultiVector<value_type>;
+
+    Ell() : rand_engine(15) {}
+
+    template <typename BMtxType>
+    std::unique_ptr<BMtxType> gen_mtx(const gko::size_type num_batch_items,
+                                      gko::size_type num_rows,
+                                      gko::size_type num_cols,
+                                      int num_elems_per_row)
+    {
+        return gko::test::generate_random_batch_matrix<BMtxType>(
+            num_batch_items, num_rows, num_cols,
+            std::uniform_int_distribution<>(num_elems_per_row,
+                                            num_elems_per_row),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref,
+            num_elems_per_row);
+    }
+
+    std::unique_ptr<BMVec> gen_mvec(const gko::size_type num_batch_items,
+                                    gko::size_type num_rows,
+                                    gko::size_type num_cols)
+    {
+        return gko::test::generate_random_batch_matrix<BMVec>(
+            num_batch_items, num_rows, num_cols,
+            std::uniform_int_distribution<>(num_cols, num_cols),
+            std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
+    }
+
+    void set_up_apply_data(gko::size_type num_vecs = 1,
+                           int num_elems_per_row = 5)
+    {
+        const gko::size_type num_rows = 252;
+        const gko::size_type num_cols = 32;
+        GKO_ASSERT(num_elems_per_row <= num_cols);
+        mat = gen_mtx<BMtx>(batch_size, num_rows, num_cols, num_elems_per_row);
+        y = gen_mvec(batch_size, num_cols, num_vecs);
+        alpha = gen_mvec(batch_size, 1, 1);
+        beta = gen_mvec(batch_size, 1, 1);
+        dmat = gko::clone(exec, mat);
+        dy = gko::clone(exec, y);
+        dalpha = gko::clone(exec, alpha);
+        dbeta = gko::clone(exec, beta);
+        expected = BMVec::create(
+            ref,
+            gko::batch_dim<2>(batch_size, gko::dim<2>{num_rows, num_vecs}));
+        expected->fill(gko::one<value_type>());
+        dresult = gko::clone(exec, expected);
+    }
+
+    std::ranlux48 rand_engine;
+
+    const gko::size_type batch_size = 11;
+    std::unique_ptr<BMtx> mat;
+    std::unique_ptr<BMVec> y;
+    std::unique_ptr<BMVec> alpha;
+    std::unique_ptr<BMVec> beta;
+    std::unique_ptr<BMVec> expected;
+    std::unique_ptr<BMVec> dresult;
+    std::unique_ptr<BMtx> dmat;
+    std::unique_ptr<BMVec> dy;
+    std::unique_ptr<BMVec> dalpha;
+    std::unique_ptr<BMVec> dbeta;
+};
+
+
+TEST_F(Ell, SingleVectorApplyIsEquivalentToRef)
+{
+    set_up_apply_data(1);
+
+    mat->apply(y.get(), expected.get());
+    dmat->apply(dy.get(), dresult.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, r<value_type>::value);
+}
+
+
+TEST_F(Ell, SingleVectorAdvancedApplyIsEquivalentToRef)
+{
+    set_up_apply_data(1);
+
+    mat->apply(alpha.get(), y.get(), beta.get(), expected.get());
+    dmat->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
+
+    GKO_ASSERT_BATCH_MTX_NEAR(dresult, expected, r<value_type>::value);
+}
diff --git a/test/matrix/csr_kernels2.cpp b/test/matrix/csr_kernels2.cpp
index 4d3ffa61323..1f1d459f330 100644
--- a/test/matrix/csr_kernels2.cpp
+++ b/test/matrix/csr_kernels2.cpp
@@ -48,6 +48,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/ell.hpp>
 #include <ginkgo/core/matrix/hybrid.hpp>
 #include <ginkgo/core/matrix/identity.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
+#include <ginkgo/core/matrix/scaled_permutation.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
@@ -55,6 +57,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_kernels.hpp"
 #include "core/test/utils.hpp"
+#include "core/test/utils/assertions.hpp"
 #include "core/test/utils/unsort_matrix.hpp"
 #include "core/utils/matrix_utils.hpp"
 #include "test/utils/executor.hpp"
@@ -68,6 +71,8 @@ class Csr : public CommonTestFixture {
     using Mtx = gko::matrix::Csr<value_type>;
     using ComplexVec = gko::matrix::Dense<std::complex<value_type>>;
     using ComplexMtx = gko::matrix::Csr<std::complex<value_type>>;
+    using Perm = gko::matrix::Permutation<index_type>;
+    using ScaledPerm = gko::matrix::ScaledPermutation<value_type, index_type>;
 
     Csr()
 #ifdef GINKGO_FAST_TESTS
@@ -162,8 +167,8 @@ class Csr : public CommonTestFixture {
         beta2 = gko::initialize<Vec2>({-1.0}, ref);
         dmtx = Mtx::create(exec, strategy);
         dmtx->copy_from(mtx);
-        square_dmtx = Mtx::create(exec, strategy);
-        square_dmtx->copy_from(square_mtx);
+        dsquare_mtx = Mtx::create(exec, strategy);
+        dsquare_mtx->copy_from(square_mtx);
         dresult = gko::clone(exec, expected);
         dresult2 = gko::clone(exec, expected2);
         dy = gko::clone(exec, y);
@@ -180,8 +185,22 @@ class Csr : public CommonTestFixture {
         std::vector<int> tmp2(mtx->get_size()[1], 0);
         std::iota(tmp2.begin(), tmp2.end(), 0);
         std::shuffle(tmp2.begin(), tmp2.end(), rng);
+        std::vector<value_type> scale(mtx->get_size()[0]);
+        std::vector<value_type> scale2(mtx->get_size()[1]);
+        std::uniform_real_distribution<value_type> dist(1, 2);
+        auto gen = [&] { return dist(rng); };
+        std::generate(scale.begin(), scale.end(), gen);
+        std::generate(scale2.begin(), scale2.end(), gen);
         rpermute_idxs = std::make_unique<Arr>(ref, tmp.begin(), tmp.end());
         cpermute_idxs = std::make_unique<Arr>(ref, tmp2.begin(), tmp2.end());
+        rpermutation = Perm::create(ref, *rpermute_idxs);
+        cpermutation = Perm::create(ref, *cpermute_idxs);
+        srpermutation = ScaledPerm::create(
+            ref, gko::array<value_type>(ref, scale.begin(), scale.end()),
+            *rpermute_idxs);
+        scpermutation = ScaledPerm::create(
+            ref, gko::array<value_type>(ref, scale2.begin(), scale2.end()),
+            *cpermute_idxs);
     }
 
     template <typename StrategyType>
@@ -192,8 +211,8 @@ class Csr : public CommonTestFixture {
         complex_mtx = ComplexMtx::create(ref, strategy);
         complex_mtx->move_from(
             gen_mtx<ComplexVec>(mtx_size[0], mtx_size[1], 1));
-        complex_dmtx = ComplexMtx::create(exec, strategy);
-        complex_dmtx->copy_from(complex_mtx);
+        dcomplex_mtx = ComplexMtx::create(exec, strategy);
+        dcomplex_mtx->copy_from(complex_mtx);
     }
 
     void unsort_mtx()
@@ -220,8 +239,8 @@ class Csr : public CommonTestFixture {
 
     std::unique_ptr<Mtx> dmtx;
     std::unique_ptr<Mtx> dmtx2;
-    std::unique_ptr<ComplexMtx> complex_dmtx;
-    std::unique_ptr<Mtx> square_dmtx;
+    std::unique_ptr<ComplexMtx> dcomplex_mtx;
+    std::unique_ptr<Mtx> dsquare_mtx;
     std::unique_ptr<Vec> dresult;
     std::unique_ptr<Vec2> dresult2;
     std::unique_ptr<Vec> dy;
@@ -232,6 +251,10 @@ class Csr : public CommonTestFixture {
     std::unique_ptr<Vec2> dbeta2;
     std::unique_ptr<Arr> rpermute_idxs;
     std::unique_ptr<Arr> cpermute_idxs;
+    std::unique_ptr<Perm> rpermutation;
+    std::unique_ptr<Perm> cpermutation;
+    std::unique_ptr<ScaledPerm> srpermutation;
+    std::unique_ptr<ScaledPerm> scpermutation;
 };
 
 
@@ -510,11 +533,11 @@ TEST_F(Csr, AdvancedApplyToCsrMatrixIsEquivalentToRef)
     auto d_trans = dmtx->transpose();
 
     mtx->apply(alpha, trans, beta, square_mtx);
-    dmtx->apply(dalpha, d_trans, dbeta, square_dmtx);
+    dmtx->apply(dalpha, d_trans, dbeta, dsquare_mtx);
 
-    GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, r<value_type>::value);
-    GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx);
-    ASSERT_TRUE(square_dmtx->is_sorted_by_column_index());
+    GKO_ASSERT_MTX_NEAR(dsquare_mtx, square_mtx, r<value_type>::value);
+    GKO_ASSERT_MTX_EQ_SPARSITY(dsquare_mtx, square_mtx);
+    ASSERT_TRUE(dsquare_mtx->is_sorted_by_column_index());
 }
 
 
@@ -525,11 +548,11 @@ TEST_F(Csr, SimpleApplyToCsrMatrixIsEquivalentToRef)
     auto d_trans = dmtx->transpose();
 
     mtx->apply(trans, square_mtx);
-    dmtx->apply(d_trans, square_dmtx);
+    dmtx->apply(d_trans, dsquare_mtx);
 
-    GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, r<value_type>::value);
-    GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx);
-    ASSERT_TRUE(square_dmtx->is_sorted_by_column_index());
+    GKO_ASSERT_MTX_NEAR(dsquare_mtx, square_mtx, r<value_type>::value);
+    GKO_ASSERT_MTX_EQ_SPARSITY(dsquare_mtx, square_mtx);
+    ASSERT_TRUE(dsquare_mtx->is_sorted_by_column_index());
 }
 
 
@@ -542,11 +565,11 @@ TEST_F(Csr, SimpleApplyToSparseCsrMatrixIsEquivalentToRef)
     dmtx2->copy_from(mtx2);
 
     mtx->apply(mtx2, square_mtx);
-    dmtx->apply(dmtx2, square_dmtx);
+    dmtx->apply(dmtx2, dsquare_mtx);
 
-    GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx);
-    GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, r<value_type>::value);
-    ASSERT_TRUE(square_dmtx->is_sorted_by_column_index());
+    GKO_ASSERT_MTX_EQ_SPARSITY(dsquare_mtx, square_mtx);
+    GKO_ASSERT_MTX_NEAR(dsquare_mtx, square_mtx, r<value_type>::value);
+    ASSERT_TRUE(dsquare_mtx->is_sorted_by_column_index());
 }
 
 
@@ -560,11 +583,11 @@ TEST_F(Csr, SimpleApplySparseToSparseCsrMatrixIsEquivalentToRef)
     auto dmtx2 = gko::clone(exec, mtx2);
 
     mtx1->apply(mtx2, square_mtx);
-    dmtx1->apply(dmtx2, square_dmtx);
+    dmtx1->apply(dmtx2, dsquare_mtx);
 
-    GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx);
-    GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, r<value_type>::value);
-    ASSERT_TRUE(square_dmtx->is_sorted_by_column_index());
+    GKO_ASSERT_MTX_EQ_SPARSITY(dsquare_mtx, square_mtx);
+    GKO_ASSERT_MTX_NEAR(dsquare_mtx, square_mtx, r<value_type>::value);
+    ASSERT_TRUE(dsquare_mtx->is_sorted_by_column_index());
 }
 
 
@@ -581,11 +604,11 @@ TEST_F(Csr, SimpleApplyToEmptyCsrMatrixIsEquivalentToRef)
     dmtx2->copy_from(mtx2);
 
     mtx->apply(mtx2, square_mtx);
-    dmtx->apply(dmtx2, square_dmtx);
+    dmtx->apply(dmtx2, dsquare_mtx);
 
-    GKO_ASSERT_MTX_EQ_SPARSITY(square_dmtx, square_mtx);
-    GKO_ASSERT_MTX_NEAR(square_dmtx, square_mtx, r<value_type>::value);
-    ASSERT_TRUE(square_dmtx->is_sorted_by_column_index());
+    GKO_ASSERT_MTX_EQ_SPARSITY(dsquare_mtx, square_mtx);
+    GKO_ASSERT_MTX_NEAR(dsquare_mtx, square_mtx, r<value_type>::value);
+    ASSERT_TRUE(dsquare_mtx->is_sorted_by_column_index());
 }
 
 
@@ -673,7 +696,7 @@ TEST_F(Csr, ConjugateTransposeIsEquivalentToRef)
     set_up_apply_complex_data<ComplexMtx::classical>();
 
     auto trans = gko::as<ComplexMtx>(complex_mtx->conj_transpose());
-    auto d_trans = gko::as<ComplexMtx>(complex_dmtx->conj_transpose());
+    auto d_trans = gko::as<ComplexMtx>(dcomplex_mtx->conj_transpose());
 
     GKO_ASSERT_MTX_NEAR(d_trans, trans, 0.0);
     ASSERT_TRUE(d_trans->is_sorted_by_column_index());
@@ -868,12 +891,176 @@ TEST_F(Csr, MoveToHybridIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, IsGenericPermutable)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data<Mtx::classical>();
+
+    for (auto mode :
+         {permute_mode::none, permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+        auto permuted = square_mtx->permute(rpermutation, mode);
+        auto dpermuted = dsquare_mtx->permute(rpermutation, mode);
+
+        GKO_ASSERT_MTX_NEAR(permuted, dpermuted, 0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(permuted, dpermuted);
+        ASSERT_TRUE(dpermuted->is_sorted_by_column_index());
+    }
+}
+
+
+TEST_F(Csr, IsColPermutableHypersparse)
+{
+    using gko::matrix::permute_mode;
+    auto hypersparse_mtx = gko::initialize<Mtx>(
+        {{0.0, 0.0, 0.0}, {1.0, 0.0, 0.0}, {0.0, 0.0, 2.0}}, ref);
+    auto dhypersparse_mtx = hypersparse_mtx->clone();
+    auto perm3 = Perm::create(ref, gko::array<index_type>{ref, {1, 2, 0}});
+
+    for (auto mode : {permute_mode::columns, permute_mode::inverse_columns}) {
+        SCOPED_TRACE(mode);
+        auto permuted = hypersparse_mtx->permute(perm3, mode);
+        auto dpermuted = dhypersparse_mtx->permute(perm3, mode);
+
+        GKO_ASSERT_MTX_NEAR(permuted, dpermuted, 0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(permuted, dpermuted);
+        ASSERT_TRUE(dpermuted->is_sorted_by_column_index());
+    }
+}
+
+
+TEST_F(Csr, IsGenericPermutableRectangular)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data<Mtx::classical>();
+
+    for (auto mode :
+         {permute_mode::rows, permute_mode::columns, permute_mode::inverse_rows,
+          permute_mode::inverse_columns}) {
+        SCOPED_TRACE(mode);
+        auto perm = (mode & permute_mode::rows) == permute_mode::rows
+                        ? rpermutation.get()
+                        : cpermutation.get();
+
+        auto permuted = mtx->permute(perm, mode);
+        auto dpermuted = dmtx->permute(perm, mode);
+
+        GKO_ASSERT_MTX_NEAR(permuted, dpermuted, 0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(permuted, dpermuted);
+        ASSERT_TRUE(dpermuted->is_sorted_by_column_index());
+    }
+}
+
+
+TEST_F(Csr, IsNonsymmPermutable)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data<Mtx::classical>();
+
+    for (auto invert : {false, true}) {
+        SCOPED_TRACE(invert);
+        auto permuted = mtx->permute(rpermutation, cpermutation, invert);
+        auto dpermuted = dmtx->permute(rpermutation, cpermutation, invert);
+
+        GKO_ASSERT_MTX_NEAR(permuted, dpermuted, 0);
+        GKO_ASSERT_MTX_EQ_SPARSITY(permuted, dpermuted);
+        ASSERT_TRUE(dpermuted->is_sorted_by_column_index());
+    }
+}
+
+
+TEST_F(Csr, IsGenericScalePermutable)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data<Mtx::classical>();
+
+    for (auto mode :
+         {permute_mode::none, permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+        auto permuted = square_mtx->scale_permute(srpermutation, mode);
+        auto dpermuted = dsquare_mtx->scale_permute(srpermutation, mode);
+
+        GKO_EXPECT_MTX_NEAR(permuted, dpermuted, r<value_type>::value);
+        GKO_EXPECT_MTX_EQ_SPARSITY(permuted, dpermuted);
+        EXPECT_TRUE(dpermuted->is_sorted_by_column_index());
+    }
+}
+
+
+TEST_F(Csr, IsColScalePermutableHypersparse)
+{
+    using gko::matrix::permute_mode;
+    auto hypersparse_mtx = gko::initialize<Mtx>(
+        {{0.0, 0.0, 0.0}, {1.0, 0.0, 0.0}, {0.0, 0.0, 2.0}}, ref);
+    auto dhypersparse_mtx = hypersparse_mtx->clone();
+    auto perm3 =
+        ScaledPerm::create(ref, gko::array<value_type>{ref, {1.0, 2.0, 4.0}},
+                           gko::array<index_type>{ref, {1, 2, 0}});
+
+    for (auto mode : {permute_mode::columns, permute_mode::inverse_columns}) {
+        SCOPED_TRACE(mode);
+        auto permuted = hypersparse_mtx->scale_permute(perm3, mode);
+        auto dpermuted = dhypersparse_mtx->scale_permute(perm3, mode);
+
+        GKO_ASSERT_MTX_NEAR(permuted, dpermuted, r<value_type>::value);
+        GKO_ASSERT_MTX_EQ_SPARSITY(permuted, dpermuted);
+        ASSERT_TRUE(dpermuted->is_sorted_by_column_index());
+    }
+}
+
+
+TEST_F(Csr, IsGenericScalePermutableRectangular)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data<Mtx::classical>();
+
+    for (auto mode :
+         {permute_mode::rows, permute_mode::columns, permute_mode::inverse_rows,
+          permute_mode::inverse_columns}) {
+        SCOPED_TRACE(mode);
+        auto perm = (mode & permute_mode::rows) == permute_mode::rows
+                        ? srpermutation.get()
+                        : scpermutation.get();
+
+        auto permuted = mtx->scale_permute(perm, mode);
+        auto dpermuted = dmtx->scale_permute(perm, mode);
+
+        GKO_ASSERT_MTX_NEAR(permuted, dpermuted, r<value_type>::value);
+        GKO_ASSERT_MTX_EQ_SPARSITY(permuted, dpermuted);
+        ASSERT_TRUE(dpermuted->is_sorted_by_column_index());
+    }
+}
+
+
+TEST_F(Csr, IsNonsymmScalePermutable)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data<Mtx::classical>();
+
+    for (auto invert : {false, true}) {
+        SCOPED_TRACE(invert);
+        auto permuted =
+            mtx->scale_permute(srpermutation, scpermutation, invert);
+        auto dpermuted =
+            dmtx->scale_permute(srpermutation, scpermutation, invert);
+
+        GKO_EXPECT_MTX_NEAR(permuted, dpermuted, r<value_type>::value);
+        GKO_EXPECT_MTX_EQ_SPARSITY(permuted, dpermuted);
+        EXPECT_TRUE(dpermuted->is_sorted_by_column_index());
+    }
+}
+
+
 TEST_F(Csr, IsPermutable)
 {
     set_up_apply_data<Mtx::classical>();
 
     auto permuted = gko::as<Mtx>(square_mtx->permute(rpermute_idxs.get()));
-    auto dpermuted = gko::as<Mtx>(square_dmtx->permute(rpermute_idxs.get()));
+    auto dpermuted = gko::as<Mtx>(dsquare_mtx->permute(rpermute_idxs.get()));
 
     GKO_ASSERT_MTX_EQ_SPARSITY(permuted, dpermuted);
     GKO_ASSERT_MTX_NEAR(permuted, dpermuted, 0);
@@ -887,7 +1074,7 @@ TEST_F(Csr, IsInversePermutable)
     auto permuted =
         gko::as<Mtx>(square_mtx->inverse_permute(rpermute_idxs.get()));
     auto dpermuted =
-        gko::as<Mtx>(square_dmtx->inverse_permute(rpermute_idxs.get()));
+        gko::as<Mtx>(dsquare_mtx->inverse_permute(rpermute_idxs.get()));
 
     GKO_ASSERT_MTX_EQ_SPARSITY(permuted, dpermuted);
     GKO_ASSERT_MTX_NEAR(permuted, dpermuted, 0);
@@ -1141,9 +1328,9 @@ TEST_F(Csr, InplaceAbsoluteComplexMatrixIsEquivalentToRef)
     set_up_apply_complex_data<ComplexMtx::classical>();
 
     complex_mtx->compute_absolute_inplace();
-    complex_dmtx->compute_absolute_inplace();
+    dcomplex_mtx->compute_absolute_inplace();
 
-    GKO_ASSERT_MTX_NEAR(complex_mtx, complex_dmtx, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(complex_mtx, dcomplex_mtx, r<value_type>::value);
 }
 
 
@@ -1152,7 +1339,7 @@ TEST_F(Csr, OutplaceAbsoluteComplexMatrixIsEquivalentToRef)
     set_up_apply_complex_data<ComplexMtx::classical>();
 
     auto abs_mtx = complex_mtx->compute_absolute();
-    auto dabs_mtx = complex_dmtx->compute_absolute();
+    auto dabs_mtx = dcomplex_mtx->compute_absolute();
 
     GKO_ASSERT_MTX_NEAR(abs_mtx, dabs_mtx, r<value_type>::value);
 }
@@ -1311,17 +1498,16 @@ TEST_F(Csr, CreateSubMatrixIsEquivalentToRef)
 }
 
 
-#ifndef GKO_COMPILING_DPCPP
-
-
 TEST_F(Csr, CanDetectMissingDiagonalEntry)
 {
     using T = double;
     using Csr = Mtx;
-    auto ref_mtx = gen_mtx<Csr>(103, 98, 10);
+    auto ref_mtx = gen_mtx<Csr>(103, 104, 10);
     const auto rowptrs = ref_mtx->get_row_ptrs();
     const auto colidxs = ref_mtx->get_col_idxs();
-    const int testrow = 15;
+    gko::utils::ensure_all_diagonal_entries(ref_mtx.get());
+    // Choose the last row to ensure that kernel assign enough work
+    const int testrow = 102;
     gko::utils::remove_diagonal_entry_from_row(ref_mtx.get(), testrow);
     auto mtx = gko::clone(exec, ref_mtx);
     bool has_diags = true;
@@ -1359,6 +1545,3 @@ TEST_F(Csr, AddScaledIdentityToNonSquare)
 
     GKO_ASSERT_MTX_NEAR(mtx, dmtx, r<value_type>::value);
 }
-
-
-#endif  // GKO_COMPILING_DPCPP
diff --git a/test/matrix/dense_kernels.cpp b/test/matrix/dense_kernels.cpp
index e9449ee9262..c289afed41b 100644
--- a/test/matrix/dense_kernels.cpp
+++ b/test/matrix/dense_kernels.cpp
@@ -50,6 +50,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/diagonal.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
 #include <ginkgo/core/matrix/hybrid.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
+#include <ginkgo/core/matrix/scaled_permutation.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
@@ -70,6 +72,9 @@ class Dense : public CommonTestFixture {
     using ComplexMtx = gko::matrix::Dense<std::complex<value_type>>;
     using Diagonal = gko::matrix::Diagonal<value_type>;
     using MixedComplexMtx = gko::matrix::Dense<std::complex<mixed_type>>;
+    using Permutation = gko::matrix::Permutation<index_type>;
+    using ScaledPermutation =
+        gko::matrix::ScaledPermutation<value_type, index_type>;
 
     Dense() : rand_engine(15) {}
 
@@ -145,16 +150,37 @@ class Dense : public CommonTestFixture {
         std::iota(tmp2.begin(), tmp2.end(), 0);
         std::shuffle(tmp2.begin(), tmp2.end(), rng);
         std::vector<int> tmp3(x->get_size()[0] / 10);
+        std::vector<value_type> scale_factors(tmp.size());
+        std::vector<value_type> scale_factors2(tmp2.size());
         std::uniform_int_distribution<int> row_dist(0, x->get_size()[0] - 1);
+        std::uniform_real_distribution<value_type> scale_dist{1, 2};
         for (auto& i : tmp3) {
             i = row_dist(rng);
         }
+        for (auto& s : scale_factors) {
+            s = scale_dist(rng);
+        }
+        for (auto& s : scale_factors2) {
+            s = scale_dist(rng);
+        }
         rpermute_idxs =
             std::unique_ptr<Arr>(new Arr{ref, tmp.begin(), tmp.end()});
         cpermute_idxs =
             std::unique_ptr<Arr>(new Arr{ref, tmp2.begin(), tmp2.end()});
         rgather_idxs =
             std::unique_ptr<Arr>(new Arr{ref, tmp3.begin(), tmp3.end()});
+        rpermutation = Permutation::create(ref, *rpermute_idxs);
+        cpermutation = Permutation::create(ref, *cpermute_idxs);
+        rspermutation = ScaledPermutation::create(
+            ref,
+            gko::array<value_type>{ref, scale_factors.begin(),
+                                   scale_factors.end()},
+            *rpermute_idxs);
+        cspermutation = ScaledPermutation::create(
+            ref,
+            gko::array<value_type>{ref, scale_factors2.begin(),
+                                   scale_factors2.end()},
+            *cpermute_idxs);
     }
 
     template <typename ConvertedType, typename InputType>
@@ -187,6 +213,10 @@ class Dense : public CommonTestFixture {
     std::unique_ptr<Mtx> dsquare;
     std::unique_ptr<Arr> rpermute_idxs;
     std::unique_ptr<Arr> cpermute_idxs;
+    std::unique_ptr<Permutation> rpermutation;
+    std::unique_ptr<Permutation> cpermutation;
+    std::unique_ptr<ScaledPermutation> rspermutation;
+    std::unique_ptr<ScaledPermutation> cspermutation;
     std::unique_ptr<Arr> rgather_idxs;
 };
 
@@ -1278,6 +1308,190 @@ TEST_F(Dense, CanAdvancedGatherRowsIntoMixedDenseCrossExecutor)
 }
 
 
+TEST_F(Dense, IsGenericPermutable)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data();
+
+    for (auto mode :
+         {permute_mode::none, permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+        auto permuted = square->permute(rpermutation, mode);
+        auto dpermuted = dsquare->permute(rpermutation, mode);
+
+        GKO_ASSERT_MTX_NEAR(permuted, dpermuted, 0);
+    }
+}
+
+
+TEST_F(Dense, IsGenericPermutableRectangular)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data();
+
+    for (auto mode :
+         {permute_mode::rows, permute_mode::columns, permute_mode::inverse_rows,
+          permute_mode::inverse_columns}) {
+        SCOPED_TRACE(mode);
+        auto perm = (mode & permute_mode::rows) == permute_mode::rows
+                        ? rpermutation.get()
+                        : cpermutation.get();
+
+        auto permuted = x->permute(perm, mode);
+        auto dpermuted = dx->permute(perm, mode);
+
+        GKO_ASSERT_MTX_NEAR(permuted, dpermuted, 0);
+    }
+}
+
+
+TEST_F(Dense, IsGenericPermutableIntoDenseCrossExecutor)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data();
+
+    for (auto mode :
+         {permute_mode::none, permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+        auto host_permuted = square->clone();
+
+        auto ref_permuted = square->permute(rpermutation, mode);
+        dsquare->permute(rpermutation, host_permuted, mode);
+
+        GKO_ASSERT_MTX_NEAR(ref_permuted, host_permuted, 0);
+    }
+}
+
+
+TEST_F(Dense, IsNonsymmPermutable)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data();
+
+    for (auto invert : {false, true}) {
+        SCOPED_TRACE(invert);
+        auto permuted = x->permute(rpermutation, cpermutation, invert);
+        auto dpermuted = dx->permute(rpermutation, cpermutation, invert);
+
+        GKO_ASSERT_MTX_NEAR(permuted, dpermuted, 0);
+    }
+}
+
+
+TEST_F(Dense, IsNonsymmPermutableIntoDenseCrossExecutor)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data();
+
+    for (auto invert : {false, true}) {
+        SCOPED_TRACE(invert);
+        auto host_permuted = dx->clone();
+
+        auto ref_permuted = x->permute(rpermutation, cpermutation, invert);
+        dx->permute(rpermutation, cpermutation, host_permuted, invert);
+
+        GKO_ASSERT_MTX_NEAR(ref_permuted, host_permuted, 0);
+    }
+}
+
+
+TEST_F(Dense, IsGenericScalePermutable)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data();
+
+    for (auto mode :
+         {permute_mode::none, permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+        auto permuted = square->scale_permute(rspermutation, mode);
+        auto dpermuted = dsquare->scale_permute(rspermutation, mode);
+
+        GKO_ASSERT_MTX_NEAR(permuted, dpermuted, r<value_type>::value);
+    }
+}
+
+
+TEST_F(Dense, IsGenericScalePermutableRectangular)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data();
+
+    for (auto mode :
+         {permute_mode::rows, permute_mode::columns, permute_mode::inverse_rows,
+          permute_mode::inverse_columns}) {
+        SCOPED_TRACE(mode);
+        auto perm = (mode & permute_mode::rows) == permute_mode::rows
+                        ? rspermutation.get()
+                        : cspermutation.get();
+
+        auto permuted = x->scale_permute(perm, mode);
+        auto dpermuted = dx->scale_permute(perm, mode);
+
+        GKO_ASSERT_MTX_NEAR(permuted, dpermuted, r<value_type>::value);
+    }
+}
+
+
+TEST_F(Dense, IsGenericScalePermutableIntoDenseCrossExecutor)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data();
+
+    for (auto mode :
+         {permute_mode::none, permute_mode::rows, permute_mode::columns,
+          permute_mode::symmetric, permute_mode::inverse_rows,
+          permute_mode::inverse_columns, permute_mode::inverse_symmetric}) {
+        SCOPED_TRACE(mode);
+        auto host_permuted = square->clone();
+
+        auto ref_permuted = square->scale_permute(rspermutation, mode);
+        dsquare->scale_permute(rspermutation, host_permuted, mode);
+
+        GKO_ASSERT_MTX_NEAR(ref_permuted, host_permuted, r<value_type>::value);
+    }
+}
+
+
+TEST_F(Dense, IsNonsymmScalePermutable)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data();
+
+    for (auto invert : {false, true}) {
+        SCOPED_TRACE(invert);
+        auto permuted = x->scale_permute(rspermutation, cspermutation, invert);
+        auto dpermuted =
+            dx->scale_permute(rspermutation, cspermutation, invert);
+
+        GKO_ASSERT_MTX_NEAR(permuted, dpermuted, r<value_type>::value);
+    }
+}
+
+
+TEST_F(Dense, IsNonsymmScalePermutableIntoDenseCrossExecutor)
+{
+    using gko::matrix::permute_mode;
+    set_up_apply_data();
+
+    for (auto invert : {false, true}) {
+        SCOPED_TRACE(invert);
+        auto host_permuted = dx->clone();
+
+        auto ref_permuted =
+            x->scale_permute(rspermutation, cspermutation, invert);
+        dx->scale_permute(rspermutation, cspermutation, host_permuted, invert);
+
+        GKO_ASSERT_MTX_NEAR(ref_permuted, host_permuted, r<value_type>::value);
+    }
+}
+
+
 TEST_F(Dense, IsPermutable)
 {
     set_up_apply_data();
diff --git a/test/matrix/fft_kernels.cpp b/test/matrix/fft_kernels.cpp
index 59d2d2de68e..fd9dda821c0 100644
--- a/test/matrix/fft_kernels.cpp
+++ b/test/matrix/fft_kernels.cpp
@@ -138,6 +138,9 @@ TYPED_TEST(Fft, Apply1DIsEqualToReference)
 
 TYPED_TEST(Fft, ApplyStrided1DIsEqualToReference)
 {
+#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
+    GTEST_SKIP() << "rocFFT has a bug related to strided 1D FFT";
+#endif
     using T = typename TestFixture::value_type;
 
     this->fft->apply(this->data_strided, this->out_strided);
@@ -160,6 +163,9 @@ TYPED_TEST(Fft, Apply1DInverseIsEqualToReference)
 
 TYPED_TEST(Fft, ApplyStrided1DInverseIsEqualToReference)
 {
+#if defined(GKO_COMPILING_HIP) && GINKGO_HIP_PLATFORM_HCC
+    GTEST_SKIP() << "rocFFT has a bug related to strided 1D FFT";
+#endif
     using T = typename TestFixture::value_type;
 
     this->ifft->apply(this->data_strided, this->out_strided);
diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp
index 9192b2eeebe..9b78ae21d6c 100644
--- a/test/matrix/matrix.cpp
+++ b/test/matrix/matrix.cpp
@@ -155,7 +155,7 @@ struct CsrWithDefaultStrategy : CsrBase {
         CsrBase::assert_empty_state(mtx);
         auto first_strategy = mtx->create_default()->get_strategy();
         auto second_strategy = mtx->get_strategy();
-        ASSERT_EQ(typeid(*first_strategy), typeid(*second_strategy));
+        GKO_ASSERT_DYNAMIC_TYPE_EQ(first_strategy, second_strategy);
     }
 };
 
diff --git a/test/matrix/permutation_kernels.cpp b/test/matrix/permutation_kernels.cpp
new file mode 100644
index 00000000000..a1013d7b401
--- /dev/null
+++ b/test/matrix/permutation_kernels.cpp
@@ -0,0 +1,132 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <algorithm>
+#include <numeric>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/matrix/dense.hpp>
+#include <ginkgo/core/matrix/permutation.hpp>
+
+
+#include "core/test/utils.hpp"
+#include "test/utils/executor.hpp"
+
+
+class Permutation : public CommonTestFixture {
+protected:
+    using Perm = gko::matrix::Permutation<index_type>;
+    using Mtx = gko::matrix::Dense<value_type>;
+
+    Permutation() : rand_engine(42)
+    {
+        std::vector<int> tmp(1000, 0);
+        std::iota(tmp.begin(), tmp.end(), 0);
+        auto tmp2 = tmp;
+        std::shuffle(tmp.begin(), tmp.end(), rand_engine);
+        std::shuffle(tmp2.begin(), tmp2.end(), rand_engine);
+        permutation = Perm::create(
+            ref, gko::array<index_type>(ref, tmp.begin(), tmp.end()));
+        permutation2 = Perm::create(
+            ref, gko::array<index_type>(ref, tmp2.begin(), tmp2.end()));
+        dpermutation = permutation->clone(exec);
+
+        mtx = gko::test::generate_random_matrix<Mtx>(
+            tmp.size(), 4, std::uniform_int_distribution<>(4, 4),
+            std::normal_distribution<gko::remove_complex<value_type>>(-1.0,
+                                                                      1.0),
+            rand_engine, ref);
+        mtx2 = gko::test::generate_random_matrix<Mtx>(
+            tmp.size(), 4, std::uniform_int_distribution<>(4, 4),
+            std::normal_distribution<gko::remove_complex<value_type>>(-1.0,
+                                                                      1.0),
+            rand_engine, ref);
+        alpha = gko::initialize<Mtx>({2.0}, ref);
+        beta = gko::initialize<Mtx>({-3.0}, ref);
+        dmtx = mtx->clone();
+    }
+
+    std::default_random_engine rand_engine;
+    std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Mtx> mtx2;
+    std::unique_ptr<Mtx> dmtx;
+    std::unique_ptr<Mtx> alpha;
+    std::unique_ptr<Mtx> beta;
+    std::unique_ptr<Perm> permutation;
+    std::unique_ptr<Perm> permutation2;
+    std::unique_ptr<Perm> dpermutation;
+};
+
+
+TEST_F(Permutation, InvertIsEquivalentToRef)
+{
+    auto inv = permutation->compute_inverse();
+    auto dinv = dpermutation->compute_inverse();
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(inv, dinv);
+}
+
+
+TEST_F(Permutation, ApplyIsEquivalentToRef)
+{
+    auto out = mtx->clone();
+    auto dout = dmtx->clone();
+
+    permutation->apply(mtx, out);
+    dpermutation->apply(dmtx, dout);
+
+    GKO_ASSERT_MTX_NEAR(out, dout, 0.0);
+}
+
+
+TEST_F(Permutation, AdvancedApplyIsEquivalentToRef)
+{
+    auto out = mtx->clone();
+    auto dout = dmtx->clone();
+
+    permutation->apply(alpha, mtx, beta, out);
+    dpermutation->apply(alpha, dmtx, beta, dout);
+
+    GKO_ASSERT_MTX_NEAR(out, dout, r<value_type>::value);
+}
+
+
+TEST_F(Permutation, CombineIsEquivalentToRef)
+{
+    auto combined = permutation->compose(permutation2);
+    auto dcombined = dpermutation->compose(permutation2);
+
+    GKO_ASSERT_MTX_EQ_SPARSITY(combined, dcombined);
+}
diff --git a/test/matrix/scaled_permutation_kernels.cpp b/test/matrix/scaled_permutation_kernels.cpp
new file mode 100644
index 00000000000..85ea72071ce
--- /dev/null
+++ b/test/matrix/scaled_permutation_kernels.cpp
@@ -0,0 +1,139 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <algorithm>
+#include <numeric>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/matrix/scaled_permutation.hpp>
+
+
+#include "core/test/utils.hpp"
+#include "test/utils/executor.hpp"
+
+
+class ScaledPermutation : public CommonTestFixture {
+protected:
+    using ScaledPerm = gko::matrix::ScaledPermutation<value_type, index_type>;
+    using Mtx = gko::matrix::Dense<value_type>;
+
+    ScaledPermutation() : rand_engine(42)
+    {
+        std::vector<int> tmp(1000, 0);
+        std::iota(tmp.begin(), tmp.end(), 0);
+        auto tmp2 = tmp;
+        std::shuffle(tmp.begin(), tmp.end(), rand_engine);
+        std::shuffle(tmp2.begin(), tmp2.end(), rand_engine);
+        std::vector<value_type> scale(tmp.size());
+        std::vector<value_type> scale2(tmp2.size());
+        std::uniform_real_distribution<value_type> dist(1, 2);
+        auto gen = [&] { return dist(rand_engine); };
+        std::generate(scale.begin(), scale.end(), gen);
+        std::generate(scale2.begin(), scale2.end(), gen);
+        permutation = ScaledPerm::create(
+            ref, gko::array<value_type>(ref, scale.begin(), scale.end()),
+            gko::array<index_type>(ref, tmp.begin(), tmp.end()));
+        permutation2 = ScaledPerm::create(
+            ref, gko::array<value_type>(ref, scale2.begin(), scale2.end()),
+            gko::array<index_type>(ref, tmp2.begin(), tmp2.end()));
+        dpermutation = permutation->clone(exec);
+
+        mtx = gko::test::generate_random_matrix<Mtx>(
+            tmp.size(), 4, std::uniform_int_distribution<>(4, 4),
+            std::normal_distribution<gko::remove_complex<value_type>>(-1.0,
+                                                                      1.0),
+            rand_engine, ref);
+        mtx2 = gko::test::generate_random_matrix<Mtx>(
+            tmp.size(), 4, std::uniform_int_distribution<>(4, 4),
+            std::normal_distribution<gko::remove_complex<value_type>>(-1.0,
+                                                                      1.0),
+            rand_engine, ref);
+        alpha = gko::initialize<Mtx>({2.0}, ref);
+        beta = gko::initialize<Mtx>({-3.0}, ref);
+        dmtx = mtx->clone();
+    }
+
+    std::default_random_engine rand_engine;
+    std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Mtx> mtx2;
+    std::unique_ptr<Mtx> dmtx;
+    std::unique_ptr<Mtx> alpha;
+    std::unique_ptr<Mtx> beta;
+    std::unique_ptr<ScaledPerm> permutation;
+    std::unique_ptr<ScaledPerm> permutation2;
+    std::unique_ptr<ScaledPerm> dpermutation;
+};
+
+
+TEST_F(ScaledPermutation, InvertIsEquivalentToRef)
+{
+    auto inv = permutation->compute_inverse();
+    auto dinv = dpermutation->compute_inverse();
+
+    GKO_ASSERT_MTX_NEAR(inv, dinv, r<value_type>::value);
+}
+
+
+TEST_F(ScaledPermutation, ApplyIsEquivalentToRef)
+{
+    auto out = mtx->clone();
+    auto dout = dmtx->clone();
+
+    permutation->apply(mtx, out);
+    dpermutation->apply(dmtx, dout);
+
+    GKO_ASSERT_MTX_NEAR(out, dout, r<value_type>::value);
+}
+
+
+TEST_F(ScaledPermutation, AdvancedApplyIsEquivalentToRef)
+{
+    auto out = mtx->clone();
+    auto dout = dmtx->clone();
+
+    permutation->apply(alpha, mtx, beta, out);
+    dpermutation->apply(alpha, dmtx, beta, dout);
+
+    GKO_ASSERT_MTX_NEAR(out, dout, r<value_type>::value);
+}
+
+
+TEST_F(ScaledPermutation, CombineIsEquivalentToRef)
+{
+    auto combined = permutation->compose(permutation2);
+    auto dcombined = dpermutation->compose(permutation2);
+
+    GKO_ASSERT_MTX_NEAR(combined, dcombined, r<value_type>::value);
+}
diff --git a/test/matrix/sparsity_csr_kernels.cpp b/test/matrix/sparsity_csr_kernels.cpp
index b137ce72ca8..d865570b6d0 100644
--- a/test/matrix/sparsity_csr_kernels.cpp
+++ b/test/matrix/sparsity_csr_kernels.cpp
@@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/test/utils.hpp"
 #include "core/test/utils/assertions.hpp"
 #include "core/test/utils/matrix_generator.hpp"
+#include "core/test/utils/unsort_matrix.hpp"
 #include "test/utils/executor.hpp"
 
 
@@ -59,6 +60,7 @@ namespace {
 class SparsityCsr : public CommonTestFixture {
 protected:
     using Mtx = gko::matrix::SparsityCsr<value_type, index_type>;
+    using Mtx64 = gko::matrix::SparsityCsr<value_type, gko::int64>;
 
     SparsityCsr() : rng{9312}
     {
@@ -145,4 +147,99 @@ TEST_F(SparsityCsr, ConvertToDenseIsEquivalentToRef)
 }
 
 
+TEST_F(SparsityCsr, SortSortedMatrixIsEquivalentToRef)
+{
+    mtx->sort_by_column_index();
+    dmtx->sort_by_column_index();
+
+    auto cols_view =
+        gko::make_array_view(ref, mtx->get_num_nonzeros(), mtx->get_col_idxs());
+    auto dcols_view = gko::make_array_view(exec, dmtx->get_num_nonzeros(),
+                                           dmtx->get_col_idxs());
+    GKO_ASSERT_ARRAY_EQ(cols_view, dcols_view);
+}
+
+
+TEST_F(SparsityCsr, SortSortedMatrix64IsEquivalentToRef)
+{
+    auto mtx64 = Mtx64::create(ref);
+    auto dmtx64 = Mtx64::create(exec);
+    gko::matrix_data<value_type, index_type> data;
+    gko::matrix_data<value_type, gko::int64> data64;
+    mtx->sort_by_column_index();
+    mtx->write(data);
+    data64.size = data.size;
+    for (auto entry : data.nonzeros) {
+        data64.nonzeros.emplace_back(entry.row, entry.column, entry.value);
+    }
+    mtx64->read(data64);
+    dmtx64->read(data64);
+
+    mtx64->sort_by_column_index();
+    dmtx64->sort_by_column_index();
+
+    auto cols_view = gko::make_array_view(ref, mtx64->get_num_nonzeros(),
+                                          mtx64->get_col_idxs());
+    auto dcols_view = gko::make_array_view(exec, dmtx64->get_num_nonzeros(),
+                                           dmtx64->get_col_idxs());
+    GKO_ASSERT_ARRAY_EQ(cols_view, dcols_view);
+}
+
+
+TEST_F(SparsityCsr, SortUnsortedMatrixIsEquivalentToRef)
+{
+    gko::test::unsort_matrix(mtx, rng);
+    dmtx->copy_from(mtx);
+
+    mtx->sort_by_column_index();
+    dmtx->sort_by_column_index();
+
+    auto cols_view =
+        gko::make_array_view(ref, mtx->get_num_nonzeros(), mtx->get_col_idxs());
+    auto dcols_view = gko::make_array_view(exec, dmtx->get_num_nonzeros(),
+                                           dmtx->get_col_idxs());
+    GKO_ASSERT_ARRAY_EQ(cols_view, dcols_view);
+}
+
+
+TEST_F(SparsityCsr, SortUnsortedMatrix64IsEquivalentToRef)
+{
+    gko::test::unsort_matrix(mtx, rng);
+    auto mtx64 = Mtx64::create(ref);
+    auto dmtx64 = Mtx64::create(exec);
+    gko::matrix_data<value_type, index_type> data;
+    gko::matrix_data<value_type, gko::int64> data64;
+    mtx->write(data);
+    data64.size = data.size;
+    for (auto entry : data.nonzeros) {
+        data64.nonzeros.emplace_back(entry.row, entry.column, entry.value);
+    }
+    mtx64->read(data64);
+    dmtx64->read(data64);
+
+    mtx64->sort_by_column_index();
+    dmtx64->sort_by_column_index();
+
+    auto cols_view = gko::make_array_view(ref, mtx64->get_num_nonzeros(),
+                                          mtx64->get_col_idxs());
+    auto dcols_view = gko::make_array_view(exec, dmtx64->get_num_nonzeros(),
+                                           dmtx64->get_col_idxs());
+    GKO_ASSERT_ARRAY_EQ(cols_view, dcols_view);
+}
+
+
+TEST_F(SparsityCsr, RecognizesUnsortedMatrix)
+{
+    gko::test::unsort_matrix(dmtx, rng);
+
+    ASSERT_FALSE(dmtx->is_sorted_by_column_index());
+}
+
+
+TEST_F(SparsityCsr, RecognizesSortedMatrix)
+{
+    ASSERT_TRUE(dmtx->is_sorted_by_column_index());
+}
+
+
 }  // namespace
diff --git a/test/mpi/CMakeLists.txt b/test/mpi/CMakeLists.txt
index f715ea482ec..fc0aec8138a 100644
--- a/test/mpi/CMakeLists.txt
+++ b/test/mpi/CMakeLists.txt
@@ -1,2 +1,6 @@
-add_subdirectory(distributed)
+ginkgo_create_common_and_reference_test(matrix MPI_SIZE 3)
+ginkgo_create_common_and_reference_test(partition_helpers MPI_SIZE 3)
+ginkgo_create_common_and_reference_test(vector MPI_SIZE 3)
+
+add_subdirectory(preconditioner)
 add_subdirectory(solver)
diff --git a/test/mpi/distributed/CMakeLists.txt b/test/mpi/distributed/CMakeLists.txt
deleted file mode 100644
index a92e0ef4f70..00000000000
--- a/test/mpi/distributed/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-ginkgo_create_common_and_reference_test(matrix MPI_SIZE 3)
-ginkgo_create_common_and_reference_test(vector MPI_SIZE 3)
-
-add_subdirectory(preconditioner)
diff --git a/test/mpi/distributed/preconditioner/CMakeLists.txt b/test/mpi/distributed/preconditioner/CMakeLists.txt
deleted file mode 100644
index 681bbec3bc9..00000000000
--- a/test/mpi/distributed/preconditioner/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-ginkgo_create_common_and_reference_test(schwarz MPI_SIZE 3)
diff --git a/test/mpi/distributed/matrix.cpp b/test/mpi/matrix.cpp
similarity index 100%
rename from test/mpi/distributed/matrix.cpp
rename to test/mpi/matrix.cpp
diff --git a/test/mpi/partition_helpers.cpp b/test/mpi/partition_helpers.cpp
new file mode 100644
index 00000000000..de0b897fd13
--- /dev/null
+++ b/test/mpi/partition_helpers.cpp
@@ -0,0 +1,135 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/distributed/partition.hpp>
+#include <ginkgo/core/distributed/partition_helpers.hpp>
+
+
+#include "core/test/utils.hpp"
+#include "test/utils/mpi/executor.hpp"
+
+
+using comm_index_type = gko::experimental::distributed::comm_index_type;
+
+
+template <typename IndexType>
+class PartitionHelpers : public CommonMpiTestFixture {
+protected:
+    using index_type = IndexType;
+};
+
+TYPED_TEST_SUITE(PartitionHelpers, gko::test::IndexTypes,
+                 TypenameNameGenerator);
+
+
+TYPED_TEST(PartitionHelpers, CanBuildFromLocalRanges)
+{
+    using itype = typename TestFixture::index_type;
+    gko::span local_range[] = {{0u, 4u}, {4u, 9u}, {9u, 11u}};
+    gko::array<itype> expects_ranges{this->exec, {0, 4, 9, 11}};
+    gko::array<comm_index_type> expects_pid{this->exec, {0, 1, 2}};
+
+    auto part =
+        gko::experimental::distributed::build_partition_from_local_range<
+            gko::int32, itype>(this->exec, this->comm,
+                               local_range[this->comm.rank()]);
+
+    GKO_ASSERT_ARRAY_EQ(
+        expects_ranges,
+        gko::make_const_array_view(this->exec, expects_ranges.get_num_elems(),
+                                   part->get_range_bounds()));
+    GKO_ASSERT_ARRAY_EQ(
+        expects_pid,
+        gko::make_const_array_view(this->exec, expects_pid.get_num_elems(),
+                                   part->get_part_ids()));
+}
+
+
+TYPED_TEST(PartitionHelpers, CanBuildFromLocalRangesUnsorted)
+{
+    using itype = typename TestFixture::index_type;
+    gko::span local_range[] = {{4u, 9u}, {9u, 11u}, {0u, 4u}};
+    gko::array<itype> expects_ranges{this->exec, {0, 4, 9, 11}};
+    gko::array<comm_index_type> expects_pid{this->exec, {2, 0, 1}};
+
+    auto part =
+        gko::experimental::distributed::build_partition_from_local_range<
+            gko::int32, itype>(this->exec, this->comm,
+                               local_range[this->comm.rank()]);
+
+    GKO_ASSERT_ARRAY_EQ(
+        expects_ranges,
+        gko::make_const_array_view(this->exec, expects_ranges.get_num_elems(),
+                                   part->get_range_bounds()));
+    GKO_ASSERT_ARRAY_EQ(
+        expects_pid,
+        gko::make_const_array_view(this->exec, expects_pid.get_num_elems(),
+                                   part->get_part_ids()));
+}
+
+
+TYPED_TEST(PartitionHelpers, CanBuildFromLocalRangesThrowsOnGap)
+{
+    using itype = typename TestFixture::index_type;
+    gko::span local_range[] = {{4u, 6u}, {9u, 11u}, {0u, 4u}};
+    // Hack because of multiple template arguments in macro
+    auto build_from_local_ranges = [](auto... args) {
+        return gko::experimental::distributed::build_partition_from_local_range<
+            gko::int32, itype>(args...);
+    };
+
+    ASSERT_THROW(build_from_local_ranges(this->exec, this->comm,
+                                         local_range[this->comm.rank()]),
+                 gko::InvalidStateError);
+}
+
+
+TYPED_TEST(PartitionHelpers, CanBuildFromLocalSize)
+{
+    using itype = typename TestFixture::index_type;
+    gko::size_type local_range[] = {4, 5, 3};
+    gko::array<itype> expects_ranges{this->exec, {0, 4, 9, 12}};
+    gko::array<comm_index_type> expects_pid{this->exec, {0, 1, 2}};
+
+    auto part = gko::experimental::distributed::build_partition_from_local_size<
+        gko::int32, itype>(this->exec, this->comm,
+                           local_range[this->comm.rank()]);
+
+    GKO_ASSERT_ARRAY_EQ(
+        expects_ranges,
+        gko::make_const_array_view(this->exec, expects_ranges.get_num_elems(),
+                                   part->get_range_bounds()));
+    GKO_ASSERT_ARRAY_EQ(
+        expects_pid,
+        gko::make_const_array_view(this->exec, expects_pid.get_num_elems(),
+                                   part->get_part_ids()));
+}
diff --git a/test/mpi/preconditioner/CMakeLists.txt b/test/mpi/preconditioner/CMakeLists.txt
new file mode 100644
index 00000000000..4f734d21df8
--- /dev/null
+++ b/test/mpi/preconditioner/CMakeLists.txt
@@ -0,0 +1 @@
+ginkgo_create_common_and_reference_test(schwarz MPI_SIZE 3 DISABLE_EXECUTORS dpcpp)
diff --git a/test/mpi/distributed/preconditioner/schwarz.cpp b/test/mpi/preconditioner/schwarz.cpp
similarity index 74%
rename from test/mpi/distributed/preconditioner/schwarz.cpp
rename to test/mpi/preconditioner/schwarz.cpp
index 95bfe3f66b4..3c6dbf33a52 100644
--- a/test/mpi/distributed/preconditioner/schwarz.cpp
+++ b/test/mpi/preconditioner/schwarz.cpp
@@ -101,14 +101,14 @@ class SchwarzPreconditioner : public CommonMpiTestFixture {
 
 
     SchwarzPreconditioner()
-        : size{8, 8}, mat_input{size, {{0, 0, 2}, {0, 1, -1}, {1, 0, -1},
-                                       {1, 1, 2}, {1, 2, -1}, {2, 1, -1},
-                                       {2, 2, 2}, {2, 3, -1}, {3, 2, -1},
-                                       {3, 3, 2}, {3, 4, -1}, {4, 3, -1},
-                                       {4, 4, 2}, {4, 5, -1}, {5, 4, -1},
-                                       {5, 5, 2}, {5, 6, -1}, {6, 5, -1},
-                                       {6, 6, 2}, {6, 7, -1}, {7, 6, -1},
-                                       {7, 7, 2}}}
+        : CommonMpiTestFixture(),
+          size{8, 8},
+          mat_input{size,
+                    {{0, 0, 2},  {0, 1, -1}, {1, 0, -1}, {1, 1, 2},  {1, 2, -1},
+                     {2, 1, -1}, {2, 2, 2},  {2, 3, -1}, {3, 2, -1}, {3, 3, 2},
+                     {3, 4, -1}, {4, 3, -1}, {4, 4, 2},  {4, 5, -1}, {5, 4, -1},
+                     {5, 5, 2},  {5, 6, -1}, {6, 5, -1}, {6, 6, 2},  {6, 7, -1},
+                     {7, 6, -1}, {7, 7, 2}}}
     {
         row_part = Partition::build_from_contiguous(
             exec, gko::array<global_index_type>(
@@ -162,13 +162,14 @@ class SchwarzPreconditioner : public CommonMpiTestFixture {
         std::shared_ptr<dist_vec_type> dist_vec,
         std::shared_ptr<local_vec_type> local_vec)
     {
+        auto host_row_part = row_part->clone(ref);
         auto l_dist_vec = dist_vec->get_local_vector();
         auto vec_view = local_vec_type::create_const(
             exec, l_dist_vec->get_size(),
             gko::array<value_type>::const_view(
                 exec, l_dist_vec->get_size()[0],
                 local_vec->get_const_values() +
-                    row_part->get_range_bounds()[comm.rank()]),
+                    host_row_part->get_range_bounds()[comm.rank()]),
             l_dist_vec->get_size()[1]);
         GKO_ASSERT_MTX_NEAR(l_dist_vec, vec_view.get(), r<value_type>::value);
     }
@@ -177,6 +178,36 @@ class SchwarzPreconditioner : public CommonMpiTestFixture {
 TYPED_TEST_SUITE(SchwarzPreconditioner, gko::test::ValueLocalGlobalIndexTypes,
                  TupleTypenameNameGenerator);
 
+TYPED_TEST(SchwarzPreconditioner, GenerateFailsIfInvalidState)
+{
+    using value_type = typename TestFixture::value_type;
+    using local_index_type = typename TestFixture::local_index_type;
+    using local_prec_type =
+        gko::preconditioner::Jacobi<value_type, local_index_type>;
+    using prec = typename TestFixture::dist_prec_type;
+
+    auto local_solver = gko::share(local_prec_type::build()
+                                       .with_max_block_size(1u)
+                                       .on(this->exec)
+                                       ->generate(this->non_dist_mat));
+    auto schwarz = prec::build()
+                       .with_local_solver(this->local_solver_factory)
+                       .with_generated_local_solver(local_solver)
+                       .on(this->exec);
+
+    ASSERT_THROW(schwarz->generate(this->dist_mat), gko::InvalidStateError);
+}
+
+
+TYPED_TEST(SchwarzPreconditioner, GenerateFailsIfNoSolverProvided)
+{
+    using prec = typename TestFixture::dist_prec_type;
+    auto schwarz_no_solver = prec::build().on(this->exec);
+
+    ASSERT_THROW(schwarz_no_solver->generate(this->dist_mat),
+                 gko::InvalidStateError);
+}
+
 
 TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolver)
 {
@@ -196,7 +227,7 @@ TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolver)
         cg::build()
             .with_preconditioner(
                 prec::build()
-                    .with_local_solver_factory(this->local_solver_factory)
+                    .with_local_solver(this->local_solver_factory)
                     .on(this->exec))
             .with_criteria(iter_stop, tol_stop)
             .on(this->exec);
@@ -217,17 +248,49 @@ TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolver)
 }
 
 
-TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditioner)
+TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditionedSolverWithPregenSolver)
 {
     using value_type = typename TestFixture::value_type;
+    using local_index_type = typename TestFixture::local_index_type;
+    using local_prec_type =
+        gko::preconditioner::Jacobi<value_type, local_index_type>;
     using csr = typename TestFixture::local_matrix_type;
     using cg = typename TestFixture::solver_type;
     using prec = typename TestFixture::dist_prec_type;
 
-    auto precond_factory =
-        prec::build()
-            .with_local_solver_factory(this->local_solver_factory)
-            .on(this->exec);
+    auto local_solver =
+        gko::share(local_prec_type::build()
+                       .with_max_block_size(1u)
+                       .on(this->exec)
+                       ->generate(this->dist_mat->get_local_matrix()));
+    auto precond = prec::build()
+                       .with_local_solver(this->local_solver_factory)
+                       .on(this->exec)
+                       ->generate(this->dist_mat);
+    auto precond_pregen = prec::build()
+                              .with_generated_local_solver(local_solver)
+                              .on(this->exec)
+                              ->generate(this->dist_mat);
+    auto dist_x = gko::share(this->dist_x->clone());
+    auto dist_x_pregen = gko::share(this->dist_x->clone());
+
+    precond->apply(this->dist_b.get(), dist_x.get());
+    precond_pregen->apply(this->dist_b.get(), dist_x_pregen.get());
+
+    GKO_ASSERT_MTX_NEAR(dist_x->get_local_vector(),
+                        dist_x_pregen->get_local_vector(),
+                        r<value_type>::value);
+}
+
+
+TYPED_TEST(SchwarzPreconditioner, CanApplyPreconditioner)
+{
+    using value_type = typename TestFixture::value_type;
+    using prec = typename TestFixture::dist_prec_type;
+
+    auto precond_factory = prec::build()
+                               .with_local_solver(this->local_solver_factory)
+                               .on(this->exec);
     auto local_precond =
         this->local_solver_factory->generate(this->non_dist_mat);
     auto precond = precond_factory->generate(this->dist_mat);
@@ -248,10 +311,9 @@ TYPED_TEST(SchwarzPreconditioner, CanAdvancedApplyPreconditioner)
     using cg = typename TestFixture::solver_type;
     using prec = typename TestFixture::dist_prec_type;
 
-    auto precond_factory =
-        prec::build()
-            .with_local_solver_factory(this->local_solver_factory)
-            .on(this->exec);
+    auto precond_factory = prec::build()
+                               .with_local_solver(this->local_solver_factory)
+                               .on(this->exec);
     auto local_precond =
         this->local_solver_factory->generate(this->non_dist_mat);
     auto precond = precond_factory->generate(this->dist_mat);
diff --git a/test/mpi/solver/CMakeLists.txt b/test/mpi/solver/CMakeLists.txt
index 43a2d870d3f..bffd7b5ab10 100644
--- a/test/mpi/solver/CMakeLists.txt
+++ b/test/mpi/solver/CMakeLists.txt
@@ -1 +1 @@
-ginkgo_create_common_and_reference_test(solver MPI_SIZE 3)
+ginkgo_create_common_and_reference_test(solver MPI_SIZE 3 DISABLE_EXECUTORS dpcpp)
diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp
index 59462a9be59..f53b2784124 100644
--- a/test/mpi/solver/solver.cpp
+++ b/test/mpi/solver/solver.cpp
@@ -107,9 +107,7 @@ struct SimpleSolverTest {
         std::shared_ptr<const gko::Executor> exec)
     {
         return solver_type::build().with_criteria(
-            gko::stop::Iteration::build()
-                .with_max_iters(iteration_count())
-                .on(exec),
+            gko::stop::Iteration::build().with_max_iters(iteration_count()),
             gko::stop::ResidualNorm<value_type>::build()
                 .with_baseline(gko::stop::mode::absolute)
                 .with_reduction_factor(reduction_factor())
@@ -164,17 +162,11 @@ struct Ir : SimpleSolverTest<gko::solver::Ir<solver_value_type>> {
         std::shared_ptr<const gko::Executor> exec)
     {
         return SimpleSolverTest<gko::solver::Ir<solver_value_type>>::build(exec)
-            .with_solver(
-                gko::solver::Cg<value_type>::build()
-                    .with_criteria(
-                        gko::stop::Iteration::build()
-                            .with_max_iters(iteration_count())
-                            .on(exec),
-                        gko::stop::ResidualNorm<value_type>::build()
-                            .with_baseline(gko::stop::mode::absolute)
-                            .with_reduction_factor(2 * reduction_factor())
-                            .on(exec))
-                    .on(exec))
+            .with_solver(gko::solver::Cg<value_type>::build().with_criteria(
+                gko::stop::Iteration::build().with_max_iters(iteration_count()),
+                gko::stop::ResidualNorm<value_type>::build()
+                    .with_baseline(gko::stop::mode::absolute)
+                    .with_reduction_factor(2 * reduction_factor())))
             .with_relaxation_factor(0.9);
     }
 };
diff --git a/test/mpi/distributed/vector.cpp b/test/mpi/vector.cpp
similarity index 98%
rename from test/mpi/distributed/vector.cpp
rename to test/mpi/vector.cpp
index a7ad735458c..ac75a461465 100644
--- a/test/mpi/distributed/vector.cpp
+++ b/test/mpi/vector.cpp
@@ -676,6 +676,30 @@ TYPED_TEST(VectorReductions, ComputeSquaredNorm2WithTmpIsSameAsDense)
 }
 
 
+TYPED_TEST(VectorReductions, ComputesMeanIsSameAsDense)
+{
+    using value_type = typename TestFixture::value_type;
+    this->init_result();
+
+    this->x->compute_mean(this->res);
+    this->dense_x->compute_mean(this->dense_res);
+
+    GKO_ASSERT_MTX_NEAR(this->res, this->dense_res, r<value_type>::value);
+}
+
+
+TYPED_TEST(VectorReductions, ComputesMeanWithTmpIsSameAsDense)
+{
+    using value_type = typename TestFixture::value_type;
+    this->init_result();
+
+    this->x->compute_mean(this->res, this->tmp);
+    this->dense_x->compute_mean(this->dense_res, this->dense_tmp);
+
+    GKO_ASSERT_MTX_NEAR(this->res, this->dense_res, r<value_type>::value);
+}
+
+
 TYPED_TEST(VectorReductions, ComputeDotCopiesToHostOnlyIfNecessary)
 {
     this->init_result();
diff --git a/test/reorder/CMakeLists.txt b/test/reorder/CMakeLists.txt
index d87d5f8313c..c9f3980e8bf 100644
--- a/test/reorder/CMakeLists.txt
+++ b/test/reorder/CMakeLists.txt
@@ -1,4 +1,5 @@
 ginkgo_create_common_test(amd DISABLE_EXECUTORS dpcpp)
+ginkgo_create_common_test(mc64 DISABLE_EXECUTORS dpcpp)
 if (GINKGO_HAVE_METIS)
     ginkgo_create_common_test(nested_dissection)
 endif()
diff --git a/test/reorder/mc64.cpp b/test/reorder/mc64.cpp
new file mode 100644
index 00000000000..51e3a143c88
--- /dev/null
+++ b/test/reorder/mc64.cpp
@@ -0,0 +1,96 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/composition.hpp>
+#include <ginkgo/core/reorder/mc64.hpp>
+
+
+#include "core/test/utils/assertions.hpp"
+#include "test/utils/executor.hpp"
+
+
+namespace {
+
+
+class Mc64 : public CommonTestFixture {
+protected:
+    using v_type = double;
+    using i_type = int;
+    using CsrMtx = gko::matrix::Csr<v_type, i_type>;
+    using reorder_type = gko::experimental::reorder::Mc64<v_type, i_type>;
+    using result_type = gko::Composition<v_type>;
+    using perm_type = gko::matrix::ScaledPermutation<v_type, i_type>;
+
+    Mc64()
+        : mtx(gko::initialize<CsrMtx>({{1.0, 2.0, 0.0, -1.3, 2.1},
+                                       {2.0, 5.0, 1.5, 0.0, 0.0},
+                                       {0.0, 1.5, 1.5, 1.1, 0.0},
+                                       {-1.3, 0.0, 1.1, 2.0, 0.0},
+                                       {2.1, 0.0, 0.0, 0.0, 1.0}},
+                                      ref)),
+          dmtx(mtx->clone(exec)),
+          mc64_factory(reorder_type::build().on(ref)),
+          dmc64_factory(reorder_type::build().on(exec))
+    {}
+
+    std::pair<std::shared_ptr<const perm_type>,
+              std::shared_ptr<const perm_type>>
+    unpack(const result_type* result)
+    {
+        GKO_ASSERT_EQ(result->get_operators().size(), 2);
+        return std::make_pair(gko::as<perm_type>(result->get_operators()[0]),
+                              gko::as<perm_type>(result->get_operators()[1]));
+    }
+
+    std::unique_ptr<reorder_type> mc64_factory;
+    std::unique_ptr<reorder_type> dmc64_factory;
+    std::shared_ptr<CsrMtx> mtx;
+    std::shared_ptr<CsrMtx> dmtx;
+};
+
+
+TEST_F(Mc64, IsEquivalentToReference)
+{
+    auto perm = mc64_factory->generate(mtx);
+    auto dperm = dmc64_factory->generate(dmtx);
+
+    auto ops = unpack(perm.get());
+    auto dops = unpack(dperm.get());
+    GKO_ASSERT_MTX_NEAR(ops.first, dops.first, 0.0);
+    GKO_ASSERT_MTX_NEAR(ops.second, dops.second, 0.0);
+}
+
+
+}  // namespace
diff --git a/test/solver/CMakeLists.txt b/test/solver/CMakeLists.txt
index 4cec6b05d22..00c78eb93a0 100644
--- a/test/solver/CMakeLists.txt
+++ b/test/solver/CMakeLists.txt
@@ -1,3 +1,4 @@
+ginkgo_create_common_test(batch_bicgstab_kernels)
 ginkgo_create_common_test(bicg_kernels)
 ginkgo_create_common_test(bicgstab_kernels)
 ginkgo_create_common_test(cb_gmres_kernels)
@@ -13,3 +14,6 @@ ginkgo_create_common_test(lower_trs_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(multigrid_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(solver DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(upper_trs_kernels DISABLE_EXECUTORS dpcpp)
+if(GINKGO_BUILD_SYCL) 
+    gko_add_sycl_to_target(TARGET test_solver_idr_kernels_dpcpp SOURCES idr_kernels.cpp)
+endif()
diff --git a/test/solver/batch_bicgstab_kernels.cpp b/test/solver/batch_bicgstab_kernels.cpp
new file mode 100644
index 00000000000..4bec19a165f
--- /dev/null
+++ b/test/solver/batch_bicgstab_kernels.cpp
@@ -0,0 +1,262 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/solver/batch_bicgstab_kernels.hpp"
+
+
+#include <memory>
+#include <random>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/log/batch_logger.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+#include <ginkgo/core/matrix/batch_ell.hpp>
+#include <ginkgo/core/solver/batch_bicgstab.hpp>
+
+
+#include "core/base/batch_utilities.hpp"
+#include "core/matrix/batch_dense_kernels.hpp"
+#include "core/test/utils.hpp"
+#include "core/test/utils/batch_helpers.hpp"
+#include "test/utils/executor.hpp"
+
+
+class BatchBicgstab : public CommonTestFixture {
+protected:
+    using real_type = gko::remove_complex<value_type>;
+    using solver_type = gko::batch::solver::Bicgstab<value_type>;
+    using Mtx = gko::batch::matrix::Dense<value_type>;
+    using EllMtx = gko::batch::matrix::Ell<value_type>;
+    using MVec = gko::batch::MultiVector<value_type>;
+    using RealMVec = gko::batch::MultiVector<real_type>;
+    using Settings = gko::kernels::batch_bicgstab::settings<real_type>;
+    using LogData = gko::batch::log::detail::log_data<real_type>;
+    using Logger = gko::batch::log::BatchConvergence<real_type>;
+
+    BatchBicgstab() {}
+
+    template <typename MatrixType>
+    gko::test::LinearSystem<MatrixType> setup_linsys_and_solver(
+        std::shared_ptr<const MatrixType> mat, const int num_rhs,
+        const real_type tol, const int max_iters)
+    {
+        auto executor = exec;
+        solve_lambda = [executor](const Settings settings,
+                                  const gko::batch::BatchLinOp* prec,
+                                  const Mtx* mtx, const MVec* b, MVec* x,
+                                  LogData& log_data) {
+            gko::kernels::EXEC_NAMESPACE::batch_bicgstab::apply<
+                typename Mtx::value_type>(executor, settings, mtx, prec, b, x,
+                                          log_data);
+        };
+        solver_settings = Settings{max_iters, tol,
+                                   gko::batch::stop::tolerance_type::relative};
+
+        solver_factory =
+            solver_type::build()
+                .with_max_iterations(max_iters)
+                .with_tolerance(tol)
+                .with_tolerance_type(gko::batch::stop::tolerance_type::relative)
+                .on(exec);
+        return gko::test::generate_batch_linear_system(mat, num_rhs);
+    }
+
+    std::function<void(const Settings, const gko::batch::BatchLinOp*,
+                       const Mtx*, const MVec*, MVec*, LogData&)>
+        solve_lambda;
+    Settings solver_settings{};
+    std::shared_ptr<solver_type::Factory> solver_factory;
+};
+
+
+TEST_F(BatchBicgstab, SolvesStencilSystem)
+{
+    const int num_batch_items = 2;
+    const int num_rows = 33;
+    const int num_rhs = 1;
+    const real_type tol = 1e-5;
+    const int max_iters = 100;
+    auto mat = gko::share(gko::test::generate_3pt_stencil_batch_matrix<Mtx>(
+        exec, num_batch_items, num_rows));
+    auto linear_system = setup_linsys_and_solver(mat, num_rhs, tol, max_iters);
+
+    auto res = gko::test::solve_linear_system(exec, solve_lambda,
+                                              solver_settings, linear_system);
+
+    for (size_t i = 0; i < num_batch_items; i++) {
+        ASSERT_LE(res.host_res_norm->get_const_values()[i] /
+                      linear_system.host_rhs_norm->get_const_values()[i],
+                  solver_settings.residual_tol);
+    }
+    GKO_ASSERT_BATCH_MTX_NEAR(res.x, linear_system.exact_sol, tol);
+}
+
+
+TEST_F(BatchBicgstab, StencilSystemLoggerLogsResidual)
+{
+    const int num_batch_items = 2;
+    const int num_rows = 33;
+    const int num_rhs = 1;
+    const real_type tol = 1e-5;
+    const int max_iters = 100;
+    auto mat = gko::share(gko::test::generate_3pt_stencil_batch_matrix<Mtx>(
+        exec, num_batch_items, num_rows));
+    auto linear_system = setup_linsys_and_solver(mat, num_rhs, tol, max_iters);
+
+    auto res = gko::test::solve_linear_system(exec, solve_lambda,
+                                              solver_settings, linear_system);
+
+    auto res_log_array = res.log_data->res_norms.get_const_data();
+    for (size_t i = 0; i < num_batch_items; i++) {
+        ASSERT_LE(res_log_array[i] / linear_system.host_rhs_norm->at(i, 0, 0),
+                  solver_settings.residual_tol);
+        ASSERT_NEAR(res_log_array[i], res.host_res_norm->get_const_values()[i],
+                    10 * tol);
+    }
+}
+
+
+TEST_F(BatchBicgstab, StencilSystemLoggerLogsIterations)
+{
+    const int num_batch_items = 2;
+    const int num_rows = 33;
+    const int num_rhs = 1;
+    const int ref_iters = 5;
+    auto mat = gko::share(gko::test::generate_3pt_stencil_batch_matrix<Mtx>(
+        exec, num_batch_items, num_rows));
+    auto linear_system = setup_linsys_and_solver(mat, num_rhs, 0, ref_iters);
+
+    auto res = gko::test::solve_linear_system(exec, solve_lambda,
+                                              solver_settings, linear_system);
+
+    auto iter_array = res.log_data->iter_counts.get_const_data();
+    for (size_t i = 0; i < num_batch_items; i++) {
+        ASSERT_EQ(iter_array[i], ref_iters);
+    }
+}
+
+
+TEST_F(BatchBicgstab, CanSolve3ptStencilSystem)
+{
+    const int num_batch_items = 8;
+    const int num_rows = 100;
+    const int num_rhs = 1;
+    const real_type tol = 1e-5;
+    const int max_iters = 500;
+    auto mat = gko::share(gko::test::generate_3pt_stencil_batch_matrix<Mtx>(
+        exec, num_batch_items, num_rows));
+    auto linear_system = setup_linsys_and_solver(mat, num_rhs, tol, max_iters);
+    auto solver = gko::share(solver_factory->generate(linear_system.matrix));
+
+    auto res = gko::test::solve_linear_system(exec, linear_system, solver);
+
+    GKO_ASSERT_BATCH_MTX_NEAR(res.x, linear_system.exact_sol, tol * 10);
+    for (size_t i = 0; i < num_batch_items; i++) {
+        auto comp_res_norm = res.host_res_norm->get_const_values()[i] /
+                             linear_system.host_rhs_norm->get_const_values()[i];
+        ASSERT_LE(comp_res_norm, tol);
+    }
+}
+
+
+TEST_F(BatchBicgstab, CanSolveLargeBatchSizeHpdSystem)
+{
+    const int num_batch_items = 100;
+    const int num_rows = 102;
+    const int num_rhs = 1;
+    const real_type tol = 1e-5;
+    const int max_iters = num_rows * 2;
+    std::shared_ptr<Logger> logger = Logger::create();
+    auto mat = gko::share(gko::test::generate_diag_dominant_batch_matrix<Mtx>(
+        exec, num_batch_items, num_rows, true));
+    auto linear_system = setup_linsys_and_solver(mat, num_rhs, tol, max_iters);
+    auto solver = gko::share(solver_factory->generate(linear_system.matrix));
+    solver->add_logger(logger);
+
+    auto res = gko::test::solve_linear_system(exec, linear_system, solver);
+
+    solver->remove_logger(logger);
+    auto iter_counts = gko::make_temporary_clone(exec->get_master(),
+                                                 &logger->get_num_iterations());
+    auto res_norm = gko::make_temporary_clone(exec->get_master(),
+                                              &logger->get_residual_norm());
+    GKO_ASSERT_BATCH_MTX_NEAR(res.x, linear_system.exact_sol, tol * 500);
+    for (size_t i = 0; i < num_batch_items; i++) {
+        auto comp_res_norm = res.host_res_norm->get_const_values()[i] /
+                             linear_system.host_rhs_norm->get_const_values()[i];
+        ASSERT_LE(iter_counts->get_const_data()[i], max_iters);
+        EXPECT_LE(res_norm->get_const_data()[i] /
+                      linear_system.host_rhs_norm->get_const_values()[i],
+                  tol);
+        EXPECT_GT(res_norm->get_const_data()[i], real_type{0.0});
+        ASSERT_LE(comp_res_norm, tol * 10);
+    }
+}
+
+
+TEST_F(BatchBicgstab, CanSolveLargeMatrixSizeHpdSystem)
+{
+    const int num_batch_items = 12;
+    const int num_rows = 1025;
+    const int num_rhs = 1;
+    const real_type tol = 1e-5;
+    const int max_iters = num_rows * 2;
+    std::shared_ptr<Logger> logger = Logger::create();
+    auto mat = gko::share(gko::test::generate_diag_dominant_batch_matrix<Mtx>(
+        exec, num_batch_items, num_rows, true));
+    auto linear_system = setup_linsys_and_solver(mat, num_rhs, tol, max_iters);
+    auto solver = gko::share(solver_factory->generate(linear_system.matrix));
+    solver->add_logger(logger);
+
+    auto res = gko::test::solve_linear_system(exec, linear_system, solver);
+
+    solver->remove_logger(logger);
+    auto iter_counts = gko::make_temporary_clone(exec->get_master(),
+                                                 &logger->get_num_iterations());
+    auto res_norm = gko::make_temporary_clone(exec->get_master(),
+                                              &logger->get_residual_norm());
+    GKO_ASSERT_BATCH_MTX_NEAR(res.x, linear_system.exact_sol, tol * 500);
+    for (size_t i = 0; i < num_batch_items; i++) {
+        auto comp_res_norm = res.host_res_norm->get_const_values()[i] /
+                             linear_system.host_rhs_norm->get_const_values()[i];
+        ASSERT_LE(iter_counts->get_const_data()[i], max_iters);
+        EXPECT_LE(res_norm->get_const_data()[i] /
+                      linear_system.host_rhs_norm->get_const_values()[i],
+                  tol);
+        EXPECT_GT(res_norm->get_const_data()[i], real_type{0.0});
+        ASSERT_LE(comp_res_norm, tol * 10);
+    }
+}
diff --git a/test/solver/bicg_kernels.cpp b/test/solver/bicg_kernels.cpp
index a62ab3f6d72..d35e6de227d 100644
--- a/test/solver/bicg_kernels.cpp
+++ b/test/solver/bicg_kernels.cpp
@@ -239,19 +239,15 @@ TEST_F(Bicg, ApplyWithSpdMatrixIsEquivalentToRef)
     auto d_b = gko::clone(exec, b);
     auto bicg_factory =
         gko::solver::Bicg<value_type>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(50u).on(ref),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(::r<value_type>::value)
-                    .on(ref))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(50u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(::r<value_type>::value))
             .on(ref);
     auto d_bicg_factory =
         gko::solver::Bicg<value_type>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(50u).on(exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(::r<value_type>::value)
-                    .on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(50u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(::r<value_type>::value))
             .on(exec);
     auto solver = bicg_factory->generate(std::move(mtx));
     auto d_solver = d_bicg_factory->generate(std::move(d_mtx));
@@ -271,19 +267,15 @@ TEST_F(Bicg, ApplyWithSuiteSparseMatrixIsEquivalentToRef)
     auto d_b = gko::clone(exec, b);
     auto bicg_factory =
         gko::solver::Bicg<value_type>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(50u).on(ref),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(::r<value_type>::value)
-                    .on(ref))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(50u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(::r<value_type>::value))
             .on(ref);
     auto d_bicg_factory =
         gko::solver::Bicg<value_type>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(50u).on(exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(::r<value_type>::value)
-                    .on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(50u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(::r<value_type>::value))
             .on(exec);
     auto solver = bicg_factory->generate(std::move(mtx_ani));
     auto d_solver = d_bicg_factory->generate(std::move(d_mtx_ani));
diff --git a/test/solver/bicgstab_kernels.cpp b/test/solver/bicgstab_kernels.cpp
index 15eda2a74cb..422d51c86ad 100644
--- a/test/solver/bicgstab_kernels.cpp
+++ b/test/solver/bicgstab_kernels.cpp
@@ -71,19 +71,17 @@ class Bicgstab : public CommonTestFixture {
         exec_bicgstab_factory =
             Solver::build()
                 .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(246u).on(exec),
+                    gko::stop::Iteration::build().with_max_iters(246u),
                     gko::stop::ResidualNorm<value_type>::build()
-                        .with_reduction_factor(::r<value_type>::value)
-                        .on(exec))
+                        .with_reduction_factor(::r<value_type>::value))
                 .on(exec);
 
         ref_bicgstab_factory =
             Solver::build()
                 .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(246u).on(ref),
+                    gko::stop::Iteration::build().with_max_iters(246u),
                     gko::stop::ResidualNorm<value_type>::build()
-                        .with_reduction_factor(::r<value_type>::value)
-                        .on(ref))
+                        .with_reduction_factor(::r<value_type>::value))
                 .on(ref);
     }
 
diff --git a/test/solver/cg_kernels.cpp b/test/solver/cg_kernels.cpp
index a51ac48c59b..dcb4b0147f6 100644
--- a/test/solver/cg_kernels.cpp
+++ b/test/solver/cg_kernels.cpp
@@ -203,19 +203,15 @@ TEST_F(Cg, ApplyIsEquivalentToRef)
     auto d_b = gko::clone(exec, b);
     auto cg_factory =
         gko::solver::Cg<value_type>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(50u).on(ref),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(::r<value_type>::value)
-                    .on(ref))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(50u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(::r<value_type>::value))
             .on(ref);
     auto d_cg_factory =
         gko::solver::Cg<value_type>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(50u).on(exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(::r<value_type>::value)
-                    .on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(50u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(::r<value_type>::value))
             .on(exec);
     auto solver = cg_factory->generate(std::move(mtx));
     auto d_solver = d_cg_factory->generate(std::move(d_mtx));
diff --git a/test/solver/cgs_kernels.cpp b/test/solver/cgs_kernels.cpp
index b1b124ed420..35914d4afa6 100644
--- a/test/solver/cgs_kernels.cpp
+++ b/test/solver/cgs_kernels.cpp
@@ -70,18 +70,16 @@ class Cgs : public CommonTestFixture {
         exec_cgs_factory =
             Solver::build()
                 .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(246u).on(exec),
+                    gko::stop::Iteration::build().with_max_iters(246u),
                     gko::stop::ResidualNorm<value_type>::build()
-                        .with_reduction_factor(::r<value_type>::value)
-                        .on(exec))
+                        .with_reduction_factor(::r<value_type>::value))
                 .on(exec);
         ref_cgs_factory =
             Solver::build()
                 .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(246u).on(ref),
+                    gko::stop::Iteration::build().with_max_iters(246u),
                     gko::stop::ResidualNorm<value_type>::build()
-                        .with_reduction_factor(::r<value_type>::value)
-                        .on(ref))
+                        .with_reduction_factor(::r<value_type>::value))
                 .on(ref);
     }
 
diff --git a/test/solver/direct.cpp b/test/solver/direct.cpp
index 0a30f7ba67f..8ab66bee1d6 100644
--- a/test/solver/direct.cpp
+++ b/test/solver/direct.cpp
@@ -94,9 +94,10 @@ class Direct : public CommonTestFixture {
         dmtx = gko::clone(exec, mtx);
         const auto num_rows = mtx->get_size()[0];
         factory = solver_type::build()
-                      .with_factorization(factorization_type::build()
-                                              .with_symmetric_sparsity(true)
-                                              .on(ref))
+                      .with_factorization(
+                          factorization_type::build().with_symbolic_algorithm(
+                              gko::experimental::factorization::symbolic_type::
+                                  symmetric))
                       .with_num_rhs(static_cast<gko::size_type>(nrhs))
                       .on(ref);
         alpha = gen_mtx(1, 1);
@@ -104,9 +105,10 @@ class Direct : public CommonTestFixture {
         input = gen_mtx(num_rows, nrhs);
         output = gen_mtx(num_rows, nrhs);
         dfactory = solver_type::build()
-                       .with_factorization(factorization_type::build()
-                                               .with_symmetric_sparsity(true)
-                                               .on(exec))
+                       .with_factorization(
+                           factorization_type::build().with_symbolic_algorithm(
+                               gko::experimental::factorization::symbolic_type::
+                                   symmetric))
                        .with_num_rhs(static_cast<gko::size_type>(nrhs))
                        .on(exec);
         dalpha = gko::clone(exec, alpha);
diff --git a/test/solver/fcg_kernels.cpp b/test/solver/fcg_kernels.cpp
index 0d1ced86f85..d8a3a1ef9b2 100644
--- a/test/solver/fcg_kernels.cpp
+++ b/test/solver/fcg_kernels.cpp
@@ -212,19 +212,15 @@ TEST_F(Fcg, ApplyIsEquivalentToRef)
     auto d_b = gko::clone(exec, b);
     auto fcg_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(50u).on(ref),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(::r<value_type>::value)
-                    .on(ref))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(50u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(::r<value_type>::value))
             .on(ref);
     auto d_fcg_factory =
         Solver::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(50u).on(exec),
-                gko::stop::ResidualNorm<value_type>::build()
-                    .with_reduction_factor(::r<value_type>::value)
-                    .on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(50u),
+                           gko::stop::ResidualNorm<value_type>::build()
+                               .with_reduction_factor(::r<value_type>::value))
             .on(exec);
     auto solver = fcg_factory->generate(std::move(mtx));
     auto d_solver = d_fcg_factory->generate(std::move(d_mtx));
diff --git a/test/solver/gcr_kernels.cpp b/test/solver/gcr_kernels.cpp
index 8f02c431f98..8db5570a6f0 100644
--- a/test/solver/gcr_kernels.cpp
+++ b/test/solver/gcr_kernels.cpp
@@ -74,19 +74,17 @@ class Gcr : public CommonTestFixture {
         exec_gcr_factory =
             Solver::build()
                 .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(246u).on(exec),
+                    gko::stop::Iteration::build().with_max_iters(246u),
                     gko::stop::ResidualNorm<value_type>::build()
-                        .with_reduction_factor(value_type{1e-15})
-                        .on(exec))
+                        .with_reduction_factor(value_type{1e-15}))
                 .on(exec);
 
         ref_gcr_factory =
             Solver::build()
                 .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(246u).on(ref),
+                    gko::stop::Iteration::build().with_max_iters(246u),
                     gko::stop::ResidualNorm<value_type>::build()
-                        .with_reduction_factor(value_type{1e-15})
-                        .on(ref))
+                        .with_reduction_factor(value_type{1e-15}))
                 .on(ref);
     }
 
diff --git a/test/solver/gmres_kernels.cpp b/test/solver/gmres_kernels.cpp
index 5c2541da1a7..7752ff4dda6 100644
--- a/test/solver/gmres_kernels.cpp
+++ b/test/solver/gmres_kernels.cpp
@@ -70,19 +70,17 @@ class Gmres : public CommonTestFixture {
         exec_gmres_factory =
             Solver::build()
                 .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(246u).on(exec),
+                    gko::stop::Iteration::build().with_max_iters(246u),
                     gko::stop::ResidualNorm<value_type>::build()
-                        .with_reduction_factor(value_type{1e-15})
-                        .on(exec))
+                        .with_reduction_factor(value_type{1e-15}))
                 .on(exec);
 
         ref_gmres_factory =
             Solver::build()
                 .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(246u).on(ref),
+                    gko::stop::Iteration::build().with_max_iters(246u),
                     gko::stop::ResidualNorm<value_type>::build()
-                        .with_reduction_factor(value_type{1e-15})
-                        .on(ref))
+                        .with_reduction_factor(value_type{1e-15}))
                 .on(ref);
     }
 
diff --git a/test/solver/idr_kernels.cpp b/test/solver/idr_kernels.cpp
index f7191483615..0019c05b9d4 100644
--- a/test/solver/idr_kernels.cpp
+++ b/test/solver/idr_kernels.cpp
@@ -40,6 +40,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
+#ifdef GKO_COMPILING_DPCPP
+#include <CL/sycl.hpp>
+#endif
+
+
 #include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/executor.hpp>
@@ -71,15 +76,13 @@ class Idr : public CommonTestFixture {
         exec_idr_factory =
             Solver::build()
                 .with_deterministic(true)
-                .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(1u).on(exec))
+                .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
                 .on(exec);
 
         ref_idr_factory =
             Solver::build()
                 .with_deterministic(true)
-                .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(1u).on(ref))
+                .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
                 .on(ref);
     }
 
@@ -290,15 +293,13 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef)
         Solver::build()
             .with_deterministic(true)
             .with_complex_subspace(true)
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(1u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
             .on(exec);
     ref_idr_factory =
         Solver::build()
             .with_deterministic(true)
             .with_complex_subspace(true)
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(1u).on(ref))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
             .on(ref);
     auto ref_solver = ref_idr_factory->generate(mtx);
     auto exec_solver = exec_idr_factory->generate(d_mtx);
@@ -332,15 +333,13 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef)
         Solver::build()
             .with_deterministic(true)
             .with_complex_subspace(true)
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(1u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
             .on(exec);
     ref_idr_factory =
         Solver::build()
             .with_deterministic(true)
             .with_complex_subspace(true)
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(1u).on(ref))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
             .on(ref);
     auto exec_solver = exec_idr_factory->generate(d_mtx);
     auto ref_solver = ref_idr_factory->generate(mtx);
diff --git a/test/solver/ir_kernels.cpp b/test/solver/ir_kernels.cpp
index c21f6da3f66..9374b7867ce 100644
--- a/test/solver/ir_kernels.cpp
+++ b/test/solver/ir_kernels.cpp
@@ -105,13 +105,11 @@ TEST_F(Ir, ApplyIsEquivalentToRef)
     // both executors
     auto ir_factory =
         gko::solver::Ir<value_type>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(2u).on(ref))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(2u))
             .on(ref);
     auto d_ir_factory =
         gko::solver::Ir<value_type>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(2u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(2u))
             .on(exec);
     auto solver = ir_factory->generate(std::move(mtx));
     auto d_solver = d_ir_factory->generate(std::move(d_mtx));
@@ -134,25 +132,15 @@ TEST_F(Ir, ApplyWithIterativeInnerSolverIsEquivalentToRef)
 
     auto ir_factory =
         gko::solver::Ir<value_type>::build()
-            .with_solver(
-                gko::solver::Gmres<value_type>::build()
-                    .with_criteria(
-                        gko::stop::Iteration::build().with_max_iters(1u).on(
-                            ref))
-                    .on(ref))
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(2u).on(ref))
+            .with_solver(gko::solver::Gmres<value_type>::build().with_criteria(
+                gko::stop::Iteration::build().with_max_iters(1u)))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(2u))
             .on(ref);
     auto d_ir_factory =
         gko::solver::Ir<value_type>::build()
-            .with_solver(
-                gko::solver::Gmres<value_type>::build()
-                    .with_criteria(
-                        gko::stop::Iteration::build().with_max_iters(1u).on(
-                            exec))
-                    .on(exec))
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(2u).on(exec))
+            .with_solver(gko::solver::Gmres<value_type>::build().with_criteria(
+                gko::stop::Iteration::build().with_max_iters(1u)))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(2u))
             .on(exec);
     auto solver = ir_factory->generate(std::move(mtx));
     auto d_solver = d_ir_factory->generate(std::move(d_mtx));
@@ -180,14 +168,12 @@ TEST_F(Ir, RichardsonApplyIsEquivalentToRef)
     // both executors
     auto ir_factory =
         gko::solver::Ir<value_type>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(2u).on(ref))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(2u))
             .with_relaxation_factor(value_type{0.9})
             .on(ref);
     auto d_ir_factory =
         gko::solver::Ir<value_type>::build()
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(2u).on(exec))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(2u))
             .with_relaxation_factor(value_type{0.9})
             .on(exec);
     auto solver = ir_factory->generate(std::move(mtx));
@@ -210,26 +196,16 @@ TEST_F(Ir, RichardsonApplyWithIterativeInnerSolverIsEquivalentToRef)
     auto d_b = clone(exec, b);
     auto ir_factory =
         gko::solver::Ir<value_type>::build()
-            .with_solver(
-                gko::solver::Gmres<value_type>::build()
-                    .with_criteria(
-                        gko::stop::Iteration::build().with_max_iters(1u).on(
-                            ref))
-                    .on(ref))
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(2u).on(ref))
+            .with_solver(gko::solver::Gmres<value_type>::build().with_criteria(
+                gko::stop::Iteration::build().with_max_iters(1u)))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(2u))
             .with_relaxation_factor(value_type{0.9})
             .on(ref);
     auto d_ir_factory =
         gko::solver::Ir<value_type>::build()
-            .with_solver(
-                gko::solver::Gmres<value_type>::build()
-                    .with_criteria(
-                        gko::stop::Iteration::build().with_max_iters(1u).on(
-                            exec))
-                    .on(exec))
-            .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(2u).on(exec))
+            .with_solver(gko::solver::Gmres<value_type>::build().with_criteria(
+                gko::stop::Iteration::build().with_max_iters(1u)))
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(2u))
             .with_relaxation_factor(value_type{0.9})
             .on(exec);
     auto solver = ir_factory->generate(std::move(mtx));
@@ -258,14 +234,12 @@ TEST_F(Ir, ApplyWithGivenInitialGuessModeIsEquivalentToRef)
         auto d_x = clone(exec, x);
         auto ir_factory =
             gko::solver::Ir<value_type>::build()
-                .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(2u).on(ref))
+                .with_criteria(gko::stop::Iteration::build().with_max_iters(2u))
                 .with_default_initial_guess(guess)
                 .on(ref);
         auto d_ir_factory =
             gko::solver::Ir<value_type>::build()
-                .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(2u).on(exec))
+                .with_criteria(gko::stop::Iteration::build().with_max_iters(2u))
                 .with_default_initial_guess(guess)
                 .on(exec);
         auto solver = ir_factory->generate(mtx);
diff --git a/test/solver/solver.cpp b/test/solver/solver.cpp
index b6f228c13f5..6bb59507f17 100644
--- a/test/solver/solver.cpp
+++ b/test/solver/solver.cpp
@@ -101,9 +101,7 @@ struct SimpleSolverTest {
         gko::size_type iteration_count, bool check_residual = true)
     {
         return solver_type::build().with_criteria(
-            gko::stop::Iteration::build()
-                .with_max_iters(iteration_count)
-                .on(exec),
+            gko::stop::Iteration::build().with_max_iters(iteration_count),
             check_residual ? gko::stop::ResidualNorm<value_type>::build()
                                  .with_baseline(gko::stop::mode::absolute)
                                  .with_reduction_factor(1e-30)
@@ -116,8 +114,7 @@ struct SimpleSolverTest {
         gko::size_type iteration_count, bool check_residual = true)
     {
         return build(exec, iteration_count, check_residual)
-            .with_preconditioner(
-                precond_type::build().with_max_block_size(1u).on(exec));
+            .with_preconditioner(precond_type::build().with_max_block_size(1u));
     }
 
     static const gko::LinOp* get_preconditioner(
@@ -185,8 +182,7 @@ struct Idr : SimpleSolverTest<gko::solver::Idr<solver_value_type>> {
         gko::size_type iteration_count, bool check_residual = true)
     {
         return build(exec, iteration_count, check_residual)
-            .with_preconditioner(
-                precond_type::build().with_max_block_size(1u).on(exec));
+            .with_preconditioner(precond_type::build().with_max_block_size(1u));
     }
 };
 
@@ -200,8 +196,7 @@ struct Ir : SimpleSolverTest<gko::solver::Ir<solver_value_type>> {
     {
         return SimpleSolverTest<gko::solver::Ir<solver_value_type>>::build(
                    exec, iteration_count, check_residual)
-            .with_solver(
-                precond_type::build().with_max_block_size(1u).on(exec));
+            .with_solver(precond_type::build().with_max_block_size(1u));
     }
 
     static const gko::LinOp* get_preconditioner(
@@ -232,8 +227,7 @@ struct CbGmres : SimpleSolverTest<gko::solver::CbGmres<solver_value_type>> {
         gko::size_type iteration_count, bool check_residual = true)
     {
         return build(exec, iteration_count, check_residual)
-            .with_preconditioner(
-                precond_type::build().with_max_block_size(1u).on(exec));
+            .with_preconditioner(precond_type::build().with_max_block_size(1u));
     }
 };
 
@@ -254,8 +248,7 @@ struct Gmres : SimpleSolverTest<gko::solver::Gmres<solver_value_type>> {
         gko::size_type iteration_count, bool check_residual = true)
     {
         return build(exec, iteration_count, check_residual)
-            .with_preconditioner(
-                precond_type::build().with_max_block_size(1u).on(exec));
+            .with_preconditioner(precond_type::build().with_max_block_size(1u));
     }
 };
 
@@ -277,8 +270,7 @@ struct FGmres : SimpleSolverTest<gko::solver::Gmres<solver_value_type>> {
         gko::size_type iteration_count, bool check_residual = true)
     {
         return build(exec, iteration_count, check_residual)
-            .with_preconditioner(
-                precond_type::build().with_max_block_size(1u).on(exec))
+            .with_preconditioner(precond_type::build().with_max_block_size(1u))
             .with_flexible(true);
     }
 };
@@ -300,8 +292,7 @@ struct Gcr : SimpleSolverTest<gko::solver::Gcr<solver_value_type>> {
         gko::size_type iteration_count, bool check_residual = true)
     {
         return build(exec, iteration_count, check_residual)
-            .with_preconditioner(
-                precond_type::build().with_max_block_size(1u).on(exec));
+            .with_preconditioner(precond_type::build().with_max_block_size(1u));
     }
 };
 
diff --git a/test/stop/residual_norm_kernels.cpp b/test/stop/residual_norm_kernels.cpp
index 50b8ae19df1..5377f42eb71 100644
--- a/test/stop/residual_norm_kernels.cpp
+++ b/test/stop/residual_norm_kernels.cpp
@@ -65,6 +65,7 @@ class ResidualNorm : public CommonTestFixture {
 protected:
     using Mtx = gko::matrix::Dense<T>;
     using NormVector = gko::matrix::Dense<gko::remove_complex<T>>;
+    using ValueType = T;
 
     ResidualNorm()
     {
@@ -110,6 +111,40 @@ TYPED_TEST(ResidualNorm, CanIgorneResidualNorm)
                  gko::NotSupported);
 }
 
+TYPED_TEST(ResidualNorm, CheckIfResZeroConverges)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using NormVector = typename TestFixture::NormVector;
+    using T = typename TestFixture::ValueType;
+    using mode = gko::stop::mode;
+    std::shared_ptr<gko::LinOp> mtx = gko::initialize<Mtx>({1.0}, this->exec);
+    std::shared_ptr<gko::LinOp> rhs = gko::initialize<Mtx>({0.0}, this->exec);
+    std::shared_ptr<gko::LinOp> x = gko::initialize<Mtx>({0.0}, this->exec);
+    std::shared_ptr<gko::LinOp> res_norm =
+        gko::initialize<NormVector>({0.0}, this->exec);
+
+    for (auto baseline :
+         {mode::rhs_norm, mode::initial_resnorm, mode::absolute}) {
+        gko::remove_complex<T> factor =
+            (baseline == mode::absolute) ? 0.0 : r<T>::value;
+        auto criterion = gko::stop::ResidualNorm<T>::build()
+                             .with_reduction_factor(factor)
+                             .with_baseline(baseline)
+                             .on(this->exec)
+                             ->generate(mtx, rhs, x.get(), nullptr);
+        constexpr gko::uint8 RelativeStoppingId{1};
+        bool one_changed{};
+        gko::array<gko::stopping_status> stop_status(this->ref, 1);
+        stop_status.get_data()[0].reset();
+        stop_status.set_executor(this->exec);
+
+        EXPECT_TRUE(criterion->update().residual_norm(res_norm).check(
+            RelativeStoppingId, true, &stop_status, &one_changed));
+        stop_status.set_executor(this->ref);
+        EXPECT_TRUE(stop_status.get_data()[0].has_converged());
+        EXPECT_TRUE(one_changed);
+    }
+}
 
 TYPED_TEST(ResidualNorm, WaitsTillResidualGoal)
 {
@@ -522,6 +557,7 @@ class ImplicitResidualNorm : public CommonTestFixture {
 protected:
     using Mtx = gko::matrix::Dense<T>;
     using NormVector = gko::matrix::Dense<gko::remove_complex<T>>;
+    using ValueType = T;
 
     ImplicitResidualNorm()
     {
@@ -538,6 +574,42 @@ TYPED_TEST_SUITE(ImplicitResidualNorm, gko::test::ValueTypes,
                  TypenameNameGenerator);
 
 
+TYPED_TEST(ImplicitResidualNorm, CheckIfResZeroConverges)
+{
+    using Mtx = typename TestFixture::Mtx;
+    using T = typename TestFixture::ValueType;
+    using mode = typename gko::stop::mode;
+    std::shared_ptr<gko::LinOp> mtx = gko::initialize<Mtx>({1.0}, this->exec);
+    std::shared_ptr<gko::LinOp> rhs = gko::initialize<Mtx>({0.0}, this->exec);
+    std::shared_ptr<gko::LinOp> x = gko::initialize<Mtx>({0.0}, this->exec);
+    std::shared_ptr<gko::LinOp> implicit_sq_res_norm =
+        gko::initialize<Mtx>({0.0}, this->exec);
+
+    for (auto baseline :
+         {mode::rhs_norm, mode::initial_resnorm, mode::absolute}) {
+        gko::remove_complex<T> factor =
+            (baseline == mode::absolute) ? 0.0 : r<T>::value;
+        auto criterion = gko::stop::ImplicitResidualNorm<T>::build()
+                             .with_reduction_factor(factor)
+                             .with_baseline(baseline)
+                             .on(this->exec)
+                             ->generate(mtx, rhs, x.get(), nullptr);
+        constexpr gko::uint8 RelativeStoppingId{1};
+        bool one_changed{};
+        gko::array<gko::stopping_status> stop_status(this->ref, 1);
+        stop_status.get_data()[0].reset();
+        stop_status.set_executor(this->exec);
+
+        EXPECT_TRUE(
+            criterion->update()
+                .implicit_sq_residual_norm(implicit_sq_res_norm)
+                .check(RelativeStoppingId, true, &stop_status, &one_changed));
+        stop_status.set_executor(this->ref);
+        EXPECT_TRUE(stop_status.get_data()[0].has_converged());
+        EXPECT_TRUE(one_changed);
+    }
+}
+
 TYPED_TEST(ImplicitResidualNorm, WaitsTillResidualGoal)
 {
     using T = TypeParam;
diff --git a/test/test_exportbuild/CMakeLists.txt b/test/test_exportbuild/CMakeLists.txt
index 52a8d3851cd..71633b91c35 100644
--- a/test/test_exportbuild/CMakeLists.txt
+++ b/test/test_exportbuild/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(GinkgoExportBuildTest LANGUAGES CXX)
 
 find_package(Ginkgo REQUIRED)
diff --git a/test/test_install/CMakeLists.txt b/test/test_install/CMakeLists.txt
index a36a936e867..513af67e923 100644
--- a/test/test_install/CMakeLists.txt
+++ b/test/test_install/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 
 project(TestInstall LANGUAGES CXX)
 
@@ -38,12 +38,6 @@ if(GINKGO_BUILD_CUDA)
     enable_language(CUDA)
     configure_file(test_install.cpp test_install.cu COPYONLY)
     add_executable(test_install_cuda ${CMAKE_CURRENT_BINARY_DIR}/test_install.cu)
-    target_compile_options(test_install_cuda
-        PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${GINKGO_CUDA_ARCH_FLAGS}>")
-    # we handle CUDA architecture flags for now, disable CMake handling
-    if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
-        set_target_properties(test_install_cuda PROPERTIES CUDA_ARCHITECTURES OFF)
-    endif()
     target_compile_definitions(test_install_cuda PRIVATE HAS_CUDA=1)
     target_compile_definitions(test_install_cuda PRIVATE HAS_REFERENCE=${HAS_REFERENCE})
     target_link_libraries(test_install_cuda PRIVATE Ginkgo::ginkgo)
@@ -60,16 +54,13 @@ if(GINKGO_BUILD_HIP)
     else()
         set (GINKGO_PIC_OPTION "$<$<CONFIG:Debug>:-fPIC>")
     endif()
-    if (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
-        set(TESTINSTALL_CUDA_ARCH_FLAGS "${GINKGO_CUDA_ARCH_FLAGS}")
-    endif()
     if (CMAKE_CUDA_HOST_COMPILER)
         set(TESTINSTALL_CUDA_HOST_COMPILER "-ccbin=${CMAKE_CUDA_HOST_COMPILER}")
     endif()
     hip_add_executable(test_install_hip test_install.cpp
         HIPCC_OPTIONS "-std=c++14"
         CLANG_OPTIONS "${GINKGO_PIC_OPTION}"
-        NVCC_OPTIONS "${GINKGO_CUDA_PIC_OPTION}" "${TESTINSTALL_CUDA_ARCH_FLAGS}" "${TESTINSTALL_CUDA_HOST_COMPILER}")
+        NVCC_OPTIONS "${GINKGO_CUDA_PIC_OPTION}" "${TESTINSTALL_CUDA_HOST_COMPILER}")
 
     target_link_libraries(test_install_hip PRIVATE Ginkgo::ginkgo)
     target_compile_definitions(test_install_hip PRIVATE HAS_HIP=1)
diff --git a/test/test_install/test_install.cpp b/test/test_install/test_install.cpp
index 2016f00dade..49e82865857 100644
--- a/test/test_install/test_install.cpp
+++ b/test/test_install/test_install.cpp
@@ -104,11 +104,9 @@ void check_solver(std::shared_ptr<gko::Executor> exec,
     auto solver_gen =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(num_iters).on(
-                    exec),
-                gko::stop::ResidualNorm<>::build()
-                    .with_reduction_factor(reduction_factor)
-                    .on(exec))
+                gko::stop::Iteration::build().with_max_iters(num_iters),
+                gko::stop::ResidualNorm<>::build().with_reduction_factor(
+                    reduction_factor))
             .on(exec);
 #if HAS_REFERENCE
     A->read(A_raw);
@@ -126,11 +124,9 @@ void check_solver(std::shared_ptr<gko::Executor> exec,
     auto solver_gen_ref =
         Solver::build()
             .with_criteria(
-                gko::stop::Iteration::build().with_max_iters(num_iters).on(
-                    exec_ref),
-                gko::stop::ResidualNorm<>::build()
-                    .with_reduction_factor(reduction_factor)
-                    .on(exec_ref))
+                gko::stop::Iteration::build().with_max_iters(num_iters),
+                gko::stop::ResidualNorm<>::build().with_reduction_factor(
+                    reduction_factor))
             .on(exec_ref);
     auto x_ref = gko::clone(exec_ref, x);
     solver_gen->generate(A_ref)->apply(b, x_ref);
@@ -210,6 +206,19 @@ int main()
         array_type test;
     }
 
+    // core/base/batch_dim.hpp
+    {
+        using type1 = int;
+        auto test = gko::batch_dim<2, type1>{};
+    }
+
+    // core/base/batch_multi_vector.hpp
+    {
+        using type1 = float;
+        using batch_multi_vector_type = gko::batch::MultiVector<type1>;
+        auto test = batch_multi_vector_type::create(exec);
+    }
+
     // core/base/combination.hpp
     {
         using type1 = int;
@@ -351,6 +360,20 @@ int main()
     }
 #endif  // GKO_HAVE_PAPI_SDE
 
+    // core/matrix/batch_dense.hpp
+    {
+        using type1 = float;
+        using batch_dense_type = gko::batch::matrix::Dense<type1>;
+        auto test = batch_dense_type::create(exec);
+    }
+
+    // core/matrix/batch_ell.hpp
+    {
+        using type1 = float;
+        using batch_ell_type = gko::batch::matrix::Ell<type1>;
+        auto test = batch_ell_type::create(exec);
+    }
+
     // core/matrix/coo.hpp
     {
         using Mtx = gko::matrix::Coo<>;
@@ -433,6 +456,12 @@ int main()
         auto test = Bj::build().with_max_block_size(1u).on(exec);
     }
 
+    // core/solver/batch_bicgstab.hpp
+    {
+        using Solver = gko::batch::solver::Bicgstab<>;
+        auto test = Solver::build().with_max_iterations(5).on(exec);
+    }
+
     // core/solver/bicgstab.hpp
     {
         using Solver = gko::solver::Bicgstab<>;
@@ -480,8 +509,7 @@ int main()
         using Solver = gko::solver::Ir<>;
         auto test =
             Solver::build()
-                .with_criteria(
-                    gko::stop::Iteration::build().with_max_iters(1u).on(exec))
+                .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
                 .on(exec);
     }
 
diff --git a/test/test_pkgconfig/CMakeLists.txt b/test/test_pkgconfig/CMakeLists.txt
index 883ad134f05..e904f997f26 100644
--- a/test/test_pkgconfig/CMakeLists.txt
+++ b/test/test_pkgconfig/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.16)
 project(GinkgoExportBuildWithPkgConfigTest LANGUAGES CXX)
 
 find_package(PkgConfig REQUIRED)
diff --git a/test/test_subdir/CMakeLists.txt b/test/test_subdir/CMakeLists.txt
index 2017b69366f..dcf846f4adc 100644
--- a/test/test_subdir/CMakeLists.txt
+++ b/test/test_subdir/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.14)
+cmake_minimum_required(VERSION 3.16)
 project(GinkgoSubdirTest LANGUAGES CXX)
 file(CREATE_LINK "${CMAKE_CURRENT_SOURCE_DIR}/../.." "${CMAKE_CURRENT_BINARY_DIR}/ginkgo" SYMBOLIC)
 
diff --git a/test/tools/CMakeLists.txt b/test/tools/CMakeLists.txt
new file mode 100644
index 00000000000..21a7a5fc695
--- /dev/null
+++ b/test/tools/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(resource_file_generator resource_file_generator.cpp)
+target_link_libraries(resource_file_generator Ginkgo::ginkgo)
diff --git a/test/tools/resource_file_generator.cpp b/test/tools/resource_file_generator.cpp
new file mode 100644
index 00000000000..ca7b09288e8
--- /dev/null
+++ b/test/tools/resource_file_generator.cpp
@@ -0,0 +1,107 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <iomanip>
+#include <thread>
+
+
+#include <ginkgo/core/base/executor.hpp>
+
+
+std::vector<std::string> split(const std::string& s, char delimiter = ',')
+{
+    std::istringstream iss(s);
+    std::vector<std::string> tokens;
+    std::string token;
+    while (std::getline(iss, token, delimiter)) {
+        tokens.push_back(token);
+    }
+    return tokens;
+}
+
+
+std::string create_json(const std::string& resources)
+{
+    std::string json;
+    json.append(R"({
+  "version": {
+    "major": 1,
+    "minor": 0
+  },
+  "local": [
+    {
+)");
+    for (const auto& line : split(resources, '\n')) {
+        json.append(R"(      )");
+        json.append(line);
+        json.append("\n");
+    }
+    json.append(R"(    }
+  ]
+})");
+    return json;
+}
+
+
+int main()
+{
+    auto num_cpu_threads = gko::OmpExecutor::get_num_omp_threads();
+    auto num_cuda_gpus = gko::CudaExecutor::get_num_devices();
+    auto num_hip_gpus = gko::HipExecutor::get_num_devices();
+    auto num_sycl_gpus = gko::DpcppExecutor::get_num_devices("gpu");
+
+    std::string cpus = R"("cpu": [{"id": "0", "slots": )" +
+                       std::to_string(num_cpu_threads) + "}]";
+
+    std::string gpus = "";
+    auto add_devices = [&](int num_devices, const std::string& name) {
+        if (num_devices) {
+            gpus.append(",\n");
+            gpus += '"' + name + "\": [\n";
+        }
+        for (int i = 0; i < num_devices; i++) {
+            if (i > 0) {
+                gpus.append(",\n");
+            }
+            gpus += R"(  {"id": ")" + std::to_string(i) + R"(", "slots": 1})";
+        }
+        if (num_devices) {
+            gpus.append("\n]");
+        }
+    };
+    add_devices(num_cuda_gpus, "cudagpu");
+    add_devices(num_hip_gpus, "hipgpu");
+    // SYCL GPUs, fall back to CPU
+    add_devices(std::max(1, num_sycl_gpus), "sycl");
+
+    std::cout << create_json(cpus + gpus) << std::endl;
+}
diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp
index 25482cf18c8..ea2aef157fd 100644
--- a/test/utils/executor.hpp
+++ b/test/utils/executor.hpp
@@ -44,34 +44,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
-#ifdef GKO_COMPILING_CUDA
-
-#include "cuda/base/device.hpp"
-
-class CudaEnvironment : public ::testing::Environment {
-public:
-    void TearDown() override { gko::kernels::cuda::reset_device(0); }
-};
+#include <ginkgo/core/base/stream.hpp>
 
-testing::Environment* cuda_env =
-    testing::AddGlobalTestEnvironment(new CudaEnvironment);
-
-#endif
 
-
-#ifdef GKO_COMPILING_HIP
-
-#include "hip/base/device.hpp"
-
-class HipEnvironment : public ::testing::Environment {
-public:
-    void TearDown() override { gko::kernels::hip::reset_device(0); }
-};
-
-testing::Environment* hip_env =
-    testing::AddGlobalTestEnvironment(new HipEnvironment);
-
-#endif
+#include "core/test/gtest/resources.hpp"
 
 
 #if GINKGO_COMMON_SINGLE_MODE
@@ -106,8 +82,9 @@ inline void init_executor(std::shared_ptr<gko::ReferenceExecutor> ref,
         if (gko::CudaExecutor::get_num_devices() == 0) {
             throw std::runtime_error{"No suitable CUDA devices"};
         }
-        exec = gko::CudaExecutor::create(0, ref, false,
-                                         gko::default_cuda_alloc_mode, stream);
+        exec = gko::CudaExecutor::create(
+            ResourceEnvironment::cuda_device_id, ref,
+            std::make_shared<gko::CudaAllocator>(), stream);
     }
 }
 
@@ -119,8 +96,9 @@ inline void init_executor(std::shared_ptr<gko::ReferenceExecutor> ref,
     if (gko::HipExecutor::get_num_devices() == 0) {
         throw std::runtime_error{"No suitable HIP devices"};
     }
-    exec = gko::HipExecutor::create(0, ref, false, gko::default_hip_alloc_mode,
-                                    stream);
+    exec =
+        gko::HipExecutor::create(ResourceEnvironment::hip_device_id, ref,
+                                 std::make_shared<gko::HipAllocator>(), stream);
 }
 
 
@@ -128,7 +106,8 @@ inline void init_executor(std::shared_ptr<gko::ReferenceExecutor> ref,
                           std::shared_ptr<gko::DpcppExecutor>& exec)
 {
     if (gko::DpcppExecutor::get_num_devices("gpu") > 0) {
-        exec = gko::DpcppExecutor::create(0, ref, "gpu");
+        exec = gko::DpcppExecutor::create(ResourceEnvironment::sycl_device_id,
+                                          ref, "gpu");
     } else if (gko::DpcppExecutor::get_num_devices("cpu") > 0) {
         exec = gko::DpcppExecutor::create(0, ref, "cpu");
     } else {
@@ -146,14 +125,24 @@ class CommonTestFixture : public ::testing::Test {
 #endif
     using index_type = int;
 
-    CommonTestFixture() : ref{gko::ReferenceExecutor::create()}
+    CommonTestFixture()
+        :
+#if defined(GKO_TEST_NONDEFAULT_STREAM) && defined(GKO_COMPILING_CUDA)
+          stream(ResourceEnvironment::cuda_device_id),
+#endif
+#if defined(GKO_TEST_NONDEFAULT_STREAM) && defined(GKO_COMPILING_HIP)
+          stream(ResourceEnvironment::hip_device_id),
+#endif
+          ref{gko::ReferenceExecutor::create()}
     {
-#if defined(GKO_TEST_NONDEFAULT_STREAM) && \
-    (defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP))
+#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)
         init_executor(ref, exec, stream.get());
 #else
         init_executor(ref, exec);
 #endif
+        // set device-id test-wide since some test call device
+        // kernels directly
+        guard = exec->get_scoped_device_id_guard();
     }
 
     void TearDown() final
@@ -163,16 +152,15 @@ class CommonTestFixture : public ::testing::Test {
         }
     }
 
-#ifdef GKO_TEST_NONDEFAULT_STREAM
 #ifdef GKO_COMPILING_CUDA
     gko::cuda_stream stream;
 #endif
 #ifdef GKO_COMPILING_HIP
     gko::hip_stream stream;
-#endif
 #endif
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<gko::EXEC_TYPE> exec;
+    gko::scoped_device_id_guard guard;
 };
 
 
diff --git a/test/utils/mpi/executor.hpp b/test/utils/mpi/executor.hpp
index 59c3f1e3f3b..504fc5d761c 100644
--- a/test/utils/mpi/executor.hpp
+++ b/test/utils/mpi/executor.hpp
@@ -46,69 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/mpi.hpp>
 
 
-inline void init_executor(std::shared_ptr<gko::ReferenceExecutor>,
-                          std::shared_ptr<gko::ReferenceExecutor>& exec)
-{
-    exec = gko::ReferenceExecutor::create();
-}
-
-
-inline void init_executor(std::shared_ptr<gko::ReferenceExecutor>,
-                          std::shared_ptr<gko::OmpExecutor>& exec)
-{
-    exec = gko::OmpExecutor::create();
-}
-
-
-inline void init_executor(std::shared_ptr<gko::ReferenceExecutor> ref,
-                          std::shared_ptr<gko::CudaExecutor>& exec,
-                          CUstream_st* stream = nullptr)
-{
-    {
-        if (gko::CudaExecutor::get_num_devices() == 0) {
-            throw std::runtime_error{"No suitable CUDA devices"};
-        }
-        exec = gko::CudaExecutor::create(
-            gko::experimental::mpi::map_rank_to_device_id(
-                MPI_COMM_WORLD, gko::CudaExecutor::get_num_devices()),
-            ref, false, gko::default_cuda_alloc_mode, stream);
-    }
-}
-
-
-inline void init_executor(std::shared_ptr<gko::ReferenceExecutor> ref,
-                          std::shared_ptr<gko::HipExecutor>& exec,
-                          GKO_HIP_STREAM_STRUCT* stream = nullptr)
-{
-    if (gko::HipExecutor::get_num_devices() == 0) {
-        throw std::runtime_error{"No suitable HIP devices"};
-    }
-    exec = gko::HipExecutor::create(
-        gko::experimental::mpi::map_rank_to_device_id(
-            MPI_COMM_WORLD, gko::HipExecutor::get_num_devices()),
-        ref, false, gko::default_hip_alloc_mode, stream);
-}
-
-
-inline void init_executor(std::shared_ptr<gko::ReferenceExecutor> ref,
-                          std::shared_ptr<gko::DpcppExecutor>& exec)
-{
-    auto num_gpu_devices = gko::DpcppExecutor::get_num_devices("gpu");
-    auto num_cpu_devices = gko::DpcppExecutor::get_num_devices("cpu");
-    if (num_gpu_devices > 0) {
-        exec = gko::DpcppExecutor::create(
-            gko::experimental::mpi::map_rank_to_device_id(MPI_COMM_WORLD,
-                                                          num_gpu_devices),
-            ref, "gpu");
-    } else if (num_cpu_devices > 0) {
-        exec = gko::DpcppExecutor::create(
-            gko::experimental::mpi::map_rank_to_device_id(MPI_COMM_WORLD,
-                                                          num_cpu_devices),
-            ref, "cpu");
-    } else {
-        throw std::runtime_error{"No suitable DPC++ devices"};
-    }
-}
+#include "test/utils/executor.hpp"
 
 
 class CommonMpiTestFixture : public ::testing::Test {
@@ -122,20 +60,20 @@ class CommonMpiTestFixture : public ::testing::Test {
 
     CommonMpiTestFixture()
         : comm(MPI_COMM_WORLD),
-#if defined(GKO_TEST_NONDEFAULT_STREAM) && \
-    (defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP))
-
-          stream(gko::experimental::mpi::map_rank_to_device_id(
-              comm.get(), gko::EXEC_TYPE::get_num_devices())),
+#ifdef GKO_COMPILING_CUDA
+          stream(ResourceEnvironment::cuda_device_id),
+#endif
+#ifdef GKO_COMPILING_HIP
+          stream(ResourceEnvironment::hip_device_id),
 #endif
           ref{gko::ReferenceExecutor::create()}
     {
-#if defined(GKO_TEST_NONDEFAULT_STREAM) && \
-    (defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP))
+#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)
         init_executor(ref, exec, stream.get());
 #else
         init_executor(ref, exec);
 #endif
+        guard = exec->get_scoped_device_id_guard();
     }
 
     void TearDown() final
@@ -147,17 +85,15 @@ class CommonMpiTestFixture : public ::testing::Test {
 
     gko::experimental::mpi::communicator comm;
 
-#ifdef GKO_TEST_NONDEFAULT_STREAM
 #ifdef GKO_COMPILING_CUDA
     gko::cuda_stream stream;
 #endif
 #ifdef GKO_COMPILING_HIP
     gko::hip_stream stream;
 #endif
-#endif
-
     std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<gko::EXEC_TYPE> exec;
+    gko::scoped_device_id_guard guard;
 };
 
 
diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
index c714a51c187..828f95bc8ca 100644
--- a/third_party/CMakeLists.txt
+++ b/third_party/CMakeLists.txt
@@ -3,10 +3,6 @@ if(GINKGO_BUILD_TESTS AND (NOT GTest_FOUND))
     add_subdirectory(gtest)
 endif()
 
-if(GINKGO_BUILD_HWLOC AND (NOT HWLOC_FOUND))
-    add_subdirectory(hwloc)
-endif()
-
 if(GINKGO_DEVEL_TOOLS)
     set(GCF_IGNORE_LIST "third_party" CACHE STRING "Ignore directories for GCF")
     add_subdirectory(git-cmake-format)
@@ -18,8 +14,8 @@ if(GINKGO_BUILD_BENCHMARKS)
     if (NOT gflags_FOUND)
         add_subdirectory(gflags)
     endif()
-    if (NOT RapidJSON_FOUND)
-        add_subdirectory(rapidjson)
+    if (NOT nlohmann_json_FOUND)
+        add_subdirectory(nlohmann_json)
     endif()
 endif()
 
diff --git a/third_party/gtest/CMakeLists.txt b/third_party/gtest/CMakeLists.txt
index 45b564dbfbf..378a7cdc705 100644
--- a/third_party/gtest/CMakeLists.txt
+++ b/third_party/gtest/CMakeLists.txt
@@ -22,7 +22,7 @@ set_target_properties(gtest gtest_main PROPERTIES
     LIBRARY_OUTPUT_DIRECTORY "${GINKGO_LIBRARY_PATH}")
 # If CXX compiler is dpcpp, use -ffp-model=precise
 # Otherwise, it will throw src/gtest.cc:1583:8: error: comparison with NaN always evaluates to false in fast floating point modes
-if(CMAKE_CXX_COMPILER MATCHES "dpcpp")
+if(CMAKE_CXX_COMPILER MATCHES "dpcpp|icpx")
     target_compile_options(gtest PRIVATE "-ffp-model=precise")
     target_compile_options(gtest_main PRIVATE "-ffp-model=precise")
 endif()
diff --git a/third_party/hwloc/CMakeLists.txt b/third_party/hwloc/CMakeLists.txt
deleted file mode 100644
index 9cbbb46482e..00000000000
--- a/third_party/hwloc/CMakeLists.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-message(STATUS "Configuring and building HWLOC")
-set(TPL_HWLOC_PATH "${PROJECT_BINARY_DIR}/third_party/hwloc")
-ginkgo_load_and_configure_package(hwloc_external "https://download.open-mpi.org/release/hwloc/v2.4/hwloc-2.4.1.tar.gz"
-    "SHA1=b94950e8958e1125ca75ecac0bc0259ee3d108c4"
-    "${TPL_HWLOC_PATH}/src/configure" "--disable-nvml" "--disable-cuda" "--disable-rsmi"
-    )
-
-add_library(hwloc SHARED IMPORTED GLOBAL)
-add_dependencies(hwloc hwloc_external)
-file(MAKE_DIRECTORY ${TPL_HWLOC_PATH}/lib/)
-file(GLOB HWLOC_LIBS "${TPL_HWLOC_PATH}/build/hwloc/.libs/libhwloc.so*")
-configure_file("${TPL_HWLOC_PATH}/build/include/hwloc/autogen/config.h" "${TPL_HWLOC_PATH}/src/include/hwloc/autogen/config.h" COPYONLY)
-foreach(lib ${HWLOC_LIBS})
-    get_filename_component(lib_name ${lib} NAME)
-    configure_file("${lib}" "${TPL_HWLOC_PATH}/lib/${lib_name}" COPYONLY)
-endforeach()
-# NOTE: if changing this (e.g. to `.a`), please update the special case in
-# `cmake/information_helpers.cmake`
-set(HWLOC_LIBRARIES "${TPL_HWLOC_PATH}/lib/libhwloc.so" CACHE FILEPATH "The path to HWLOC library libhwloc.so" FORCE)
-set(HWLOC_INCLUDE_DIRS "${TPL_HWLOC_PATH}/src/include" CACHE PATH "The directory containing the hwloc header, hwloc.h" FORCE)
-set_target_properties(hwloc PROPERTIES IMPORTED_LOCATION ${HWLOC_LIBRARIES})
-set_target_properties(hwloc PROPERTIES INTERFACE_LINK_LIBRARIES ${HWLOC_LIBRARIES})
-set_target_properties(hwloc PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${HWLOC_INCLUDE_DIRS}")
diff --git a/third_party/identify_stream_usage/identify_stream_usage.cpp b/third_party/identify_stream_usage/identify_stream_usage.cpp
index a88de4ee427..5cdd4d30b09 100644
--- a/third_party/identify_stream_usage/identify_stream_usage.cpp
+++ b/third_party/identify_stream_usage/identify_stream_usage.cpp
@@ -124,14 +124,10 @@ DEFINE_OVERLOAD(cudaLaunchCooperativeKernel,
                     size_t sharedMem, cudaStream_t stream),
                 ARG(func, gridDim, blockDim, args, sharedMem, stream));
 
-#if CUDA_VERSION >= 10000
-
 DEFINE_OVERLOAD(cudaLaunchHostFunc,
                 ARG(cudaStream_t stream, cudaHostFn_t fn, void* userData),
                 ARG(stream, fn, userData));
 
-#endif
-
 // Memory transfer APIS:
 // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY
 DEFINE_OVERLOAD(cudaMemPrefetchAsync,
diff --git a/third_party/nlohmann_json/CMakeLists.txt b/third_party/nlohmann_json/CMakeLists.txt
new file mode 100644
index 00000000000..6f413e458b9
--- /dev/null
+++ b/third_party/nlohmann_json/CMakeLists.txt
@@ -0,0 +1,10 @@
+message(STATUS "Fetching external nlohmann_json")
+include(FetchContent)
+FetchContent_Declare(
+    nlohmann_json
+    GIT_REPOSITORY https://github.com/nlohmann/json.git
+    GIT_TAG        v3.9.1
+)
+set(JSON_BuildTests OFF CACHE INTERNAL "")
+set(JSON_Install OFF CACHE INTERNAL "")
+FetchContent_MakeAvailable(nlohmann_json)
diff --git a/third_party/papi_sde/papi_sde_interface.h b/third_party/papi_sde/papi_sde_interface.h
deleted file mode 100644
index 6a28d0089a3..00000000000
--- a/third_party/papi_sde/papi_sde_interface.h
+++ /dev/null
@@ -1,113 +0,0 @@
-#ifndef PAPI_SDE_INTERFACE_H
-#define PAPI_SDE_INTERFACE_H
-
-#include <stdint.h>
-#include <stdlib.h>
-
-#define PAPI_SDE_RO 0x00
-#define PAPI_SDE_RW 0x01
-#define PAPI_SDE_DELTA 0x00
-#define PAPI_SDE_INSTANT 0x10
-
-#define PAPI_SDE_long_long 0x0
-#define PAPI_SDE_int 0x1
-#define PAPI_SDE_double 0x2
-#define PAPI_SDE_float 0x3
-
-#define PAPI_SDE_SUM 0x0
-#define PAPI_SDE_MAX 0x1
-#define PAPI_SDE_MIN 0x2
-
-
-#define GET_FLOAT_SDE(x) *((float *)&x)
-#define GET_DOUBLE_SDE(x) *((double *)&x)
-/*
- * GET_SDE_RECORDER_ADDRESS() USAGE EXAMPLE:
- * If SDE recorder logs values of type 'double':
- *     double *ptr = GET_SDE_RECORDER_ADDRESS(papi_event_value[6], double);
- *     for (j=0; j<CNT; j++)
- *        printf("    %d: %.4e\n",j, ptr[j]);
- */
-#define GET_SDE_RECORDER_ADDRESS(x, rcrd_type) ((rcrd_type *)x)
-
-
-typedef long long int (*papi_sde_fptr_t)(void *);
-typedef int (*papi_sde_cmpr_fptr_t)(void *);
-typedef void *papi_handle_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-typedef struct papi_sde_fptr_struct_s {
-    papi_handle_t (*init)(const char *lib_name);
-    int (*register_counter)(void *handle, const char *event_name, int mode,
-                            int type, void *counter);
-    int (*register_fp_counter)(void *handle, const char *event_name, int mode,
-                               int type, papi_sde_fptr_t fp_counter,
-                               void *param);
-    int (*unregister_counter)(void *handle, const char *event_name);
-    int (*describe_counter)(void *handle, const char *event_name,
-                            const char *event_description);
-    int (*add_counter_to_group)(void *handle, const char *event_name,
-                                const char *group_name, uint32_t group_flags);
-    int (*create_counter)(papi_handle_t handle, const char *event_name,
-                          int cntr_type, void **cntr_handle);
-    int (*inc_counter)(papi_handle_t cntr_handle, long long int increment);
-    int (*create_recorder)(papi_handle_t handle, const char *event_name,
-                           size_t typesize,
-                           int (*cmpr_func_ptr)(const void *p1, const void *p2),
-                           void **record_handle);
-    int (*record)(void *record_handle, size_t typesize, void *value);
-    int (*reset_recorder)(void *record_handle);
-    int (*reset_counter)(void *cntr_handle);
-} papi_sde_fptr_struct_t;
-
-papi_handle_t papi_sde_init(const char *name_of_library);
-int papi_sde_register_counter(papi_handle_t handle, const char *event_name,
-                              int cntr_mode, int cntr_type, void *counter);
-int papi_sde_register_fp_counter(papi_handle_t handle, const char *event_name,
-                                 int cntr_mode, int cntr_type,
-                                 papi_sde_fptr_t func_ptr, void *param);
-int papi_sde_unregister_counter(void *handle, const char *event_name);
-int papi_sde_describe_counter(papi_handle_t handle, const char *event_name,
-                              const char *event_description);
-int papi_sde_add_counter_to_group(papi_handle_t handle, const char *event_name,
-                                  const char *group_name, uint32_t group_flags);
-int papi_sde_create_counter(papi_handle_t handle, const char *event_name,
-                            int cntr_type, void **cntr_handle);
-int papi_sde_inc_counter(void *cntr_handle, long long int increment);
-int papi_sde_create_recorder(
-    papi_handle_t handle, const char *event_name, size_t typesize,
-    int (*cmpr_func_ptr)(const void *p1, const void *p2), void **record_handle);
-int papi_sde_record(void *record_handle, size_t typesize, void *value);
-int papi_sde_reset_recorder(void *record_handle);
-int papi_sde_reset_counter(void *cntr_handle);
-void *papi_sde_get_counter_handle(papi_handle_t handle, const char *event_name);
-
-int papi_sde_compare_long_long(const void *p1, const void *p2);
-int papi_sde_compare_int(const void *p1, const void *p2);
-int papi_sde_compare_double(const void *p1, const void *p2);
-int papi_sde_compare_float(const void *p1, const void *p2);
-
-papi_handle_t papi_sde_hook_list_events(papi_sde_fptr_struct_t *fptr_struct);
-#ifdef __cplusplus
-}
-#endif
-
-#define POPULATE_SDE_FPTR_STRUCT(_A_)                             \
-    do {                                                          \
-        _A_.init = papi_sde_init;                                 \
-        _A_.register_counter = papi_sde_register_counter;         \
-        _A_.register_fp_counter = papi_sde_register_fp_counter;   \
-        _A_.unregister_counter = papi_sde_unregister_counter;     \
-        _A_.describe_counter = papi_sde_describe_counter;         \
-        _A_.add_counter_to_group = papi_sde_add_counter_to_group; \
-        _A_.create_counter = papi_sde_create_counter;             \
-        _A_.inc_counter = papi_sde_inc_counter;                   \
-        _A_.create_recorder = papi_sde_create_recorder;           \
-        _A_.record = papi_sde_record;                             \
-        _A_.reset_recorder = papi_sde_reset_recorder;             \
-        _A_.reset_counter = papi_sde_reset_counter;               \
-    } while (0)
-
-#endif
diff --git a/third_party/rapidjson/CMakeLists.txt b/third_party/rapidjson/CMakeLists.txt
deleted file mode 100644
index a96b90cb882..00000000000
--- a/third_party/rapidjson/CMakeLists.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-message(STATUS "Fetching external RapidJSON")
-include(FetchContent)
-FetchContent_Declare(
-    rapidjson
-    GIT_REPOSITORY https://github.com/Tencent/rapidjson.git
-    GIT_TAG        27c3a8dc0e2c9218fe94986d249a12b5ed838f1d
-)
-FetchContent_GetProperties(rapidjson)
-if(NOT rapidjson_POPULATED)
-    FetchContent_Populate(rapidjson)
-endif()
-set(RapidJSON_INCLUDE_DIR "${rapidjson_SOURCE_DIR}/include")
-add_library(rapidjson INTERFACE)
-set_target_properties(rapidjson PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${RapidJSON_INCLUDE_DIR}")