From 1448e12d7093450825f1688c2da48cf971a0bb85 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Thu, 30 Mar 2023 12:20:20 +0200
Subject: [PATCH 01/14] add allocator support to all executors

---
 benchmark/utils/general.hpp                   |   4 +-
 core/CMakeLists.txt                           |   1 +
 core/base/memory.cpp                          |  59 ++++
 core/device_hooks/cuda_hooks.cpp              |  44 +++
 core/device_hooks/dpcpp_hooks.cpp             |  18 ++
 core/device_hooks/hip_hooks.cpp               |  19 ++
 core/test/base/executor.cpp                   |  79 +-----
 cuda/CMakeLists.txt                           |   4 +
 cuda/base/device.cpp                          |  65 +++++
 cuda/base/executor.cpp                        | 173 ++----------
 cuda/base/memory.cpp                          | 168 +++++++++++
 cuda/base/nvtx.cpp                            |  96 +++++++
 .../stream.cpp}                               |  53 ++--
 cuda/test/base/CMakeLists.txt                 |   2 +-
 cuda/test/base/cuda_executor.cu               |  15 +-
 cuda/test/base/memory.cpp                     | 126 +++++++++
 cuda/test/utils.hpp                           |   4 +-
 devices/cuda/executor.cpp                     |  27 --
 devices/hip/executor.cpp                      |  34 ---
 devices/omp/executor.cpp                      |   7 +-
 dpcpp/base/executor.dp.cpp                    |  33 +++
 .../base/memory.dp.cpp                        |  47 ++--
 dpcpp/test/base/CMakeLists.txt                |   1 +
 dpcpp/test/base/memory.dp.cpp                 |  98 +++++++
 .../adaptiveprecision-blockjacobi.cpp         |   7 +-
 examples/cb-gmres/cb-gmres.cpp                |   7 +-
 examples/custom-logger/custom-logger.cpp      |   7 +-
 .../custom-matrix-format.cpp                  |   7 +-
 .../custom-stopping-criterion.cpp             |   7 +-
 .../ilu-preconditioned-solver.cpp             |   7 +-
 .../inverse-iteration/inverse-iteration.cpp   |   7 +-
 .../ir-ilu-preconditioned-solver.cpp          |   7 +-
 .../iterative-refinement.cpp                  |   7 +-
 .../minimal-cuda-solver.cpp                   |   2 +-
 .../mixed-multigrid-preconditioned-solver.cpp |   7 +-
 .../mixed-multigrid-solver.cpp                |   7 +-
 .../mixed-precision-ir/mixed-precision-ir.cpp |   7 +-
 examples/mixed-spmv/mixed-spmv.cpp            |   7 +-
 ...igrid-preconditioned-solver-customized.cpp |   7 +-
 .../multigrid-preconditioned-solver.cpp       |   7 +-
 .../nine-pt-stencil-solver.cpp                |   7 +-
 examples/papi-logging/papi-logging.cpp        |   7 +-
 .../performance-debugging.cpp                 |   7 +-
 examples/poisson-solver/poisson-solver.cpp    |   7 +-
 .../preconditioned-solver.cpp                 |   7 +-
 .../simple-solver-logging.cpp                 |   7 +-
 examples/simple-solver/simple-solver.cpp      |   7 +-
 .../three-pt-stencil-solver.cpp               |   7 +-
 hip/CMakeLists.txt                            |   4 +
 hip/base/device.hip.cpp                       |  67 +++++
 hip/base/executor.hip.cpp                     | 137 +--------
 hip/base/memory.hip.cpp                       |  97 +++++++
 hip/base/roctx.hip.cpp                        |  70 +++++
 hip/base/stream.hip.cpp                       |  78 +++++
 hip/test/base/CMakeLists.txt                  |   1 -
 hip/test/base/hip_executor.hip.cpp            |  14 +-
 hip/test/utils.hip.hpp                        |   4 +-
 include/ginkgo/core/base/executor.hpp         | 266 +++++-------------
 include/ginkgo/core/base/fwd_defs.hpp         |  90 ++++++
 include/ginkgo/core/base/memory.hpp           | 211 ++++++++++++++
 include/ginkgo/core/base/stream.hpp           | 124 ++++++++
 include/ginkgo/ginkgo.hpp                     |   3 +
 test/utils/executor.hpp                       |  19 +-
 test/utils/mpi/executor.hpp                   |   5 +-
 64 files changed, 1755 insertions(+), 775 deletions(-)
 create mode 100644 core/base/memory.cpp
 create mode 100644 cuda/base/device.cpp
 create mode 100644 cuda/base/memory.cpp
 create mode 100644 cuda/base/nvtx.cpp
 rename cuda/{test/base/cuda_executor_reset.cpp => base/stream.cpp} (62%)
 create mode 100644 cuda/test/base/memory.cpp
 rename hip/test/base/hip_executor_reset.cpp => dpcpp/base/memory.dp.cpp (63%)
 create mode 100644 dpcpp/test/base/memory.dp.cpp
 create mode 100644 hip/base/device.hip.cpp
 create mode 100644 hip/base/memory.hip.cpp
 create mode 100644 hip/base/roctx.hip.cpp
 create mode 100644 hip/base/stream.hip.cpp
 create mode 100644 include/ginkgo/core/base/fwd_defs.hpp
 create mode 100644 include/ginkgo/core/base/memory.hpp
 create mode 100644 include/ginkgo/core/base/stream.hpp

diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp
index 92c3e5c9b13..35077f66d4b 100644
--- a/benchmark/utils/general.hpp
+++ b/benchmark/utils/general.hpp
@@ -337,12 +337,12 @@ const std::map<std::string, std::function<std::shared_ptr<gko::Executor>(bool)>>
         {"cuda",
          [](bool) {
              return gko::CudaExecutor::create(FLAGS_device_id,
-                                              gko::OmpExecutor::create(), true);
+                                              gko::OmpExecutor::create());
          }},
         {"hip",
          [](bool) {
              return gko::HipExecutor::create(FLAGS_device_id,
-                                             gko::OmpExecutor::create(), true);
+                                             gko::OmpExecutor::create());
          }},
         {"dpcpp", [](bool use_gpu_timer) {
              auto property = dpcpp_queue_property::in_order;
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 2f9643115c9..49cf89b66d6 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -10,6 +10,7 @@ target_sources(ginkgo
     base/device_matrix_data.cpp
     base/executor.cpp
     base/index_set.cpp
+    base/memory.cpp
     base/mpi.cpp
     base/mtx_io.cpp
     base/perturbation.cpp
diff --git a/core/base/memory.cpp b/core/base/memory.cpp
new file mode 100644
index 00000000000..88d97bcc765
--- /dev/null
+++ b/core/base/memory.cpp
@@ -0,0 +1,59 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/memory.hpp>
+
+
+#include <new>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+namespace gko {
+
+
+void* CpuAllocator::allocate(size_type num_bytes) const
+{
+    auto ptr = ::operator new (num_bytes, std::nothrow_t{});
+    GKO_ENSURE_ALLOCATED(ptr, "cpu", num_bytes);
+    return ptr;
+}
+
+
+void CpuAllocator::deallocate(void* ptr) const
+{
+    ::operator delete (ptr, std::nothrow_t{});
+}
+
+
+}  // namespace gko
\ No newline at end of file
diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp
index dd4c3f19f7c..cdecf735a9d 100644
--- a/core/device_hooks/cuda_hooks.cpp
+++ b/core/device_hooks/cuda_hooks.cpp
@@ -35,6 +35,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/memory.hpp>
+#include <ginkgo/core/base/stream.hpp>
 #include <ginkgo/core/base/timer.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/version.hpp>
@@ -52,6 +54,45 @@ version version_info::get_cuda_version() noexcept
 }
 
 
+void* CudaAllocator::allocate(size_type num_bytes) const GKO_NOT_COMPILED(cuda);
+
+
+void CudaAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda);
+
+
+CudaAsyncAllocator::CudaAsyncAllocator(CUstream_st* stream)
+    GKO_NOT_COMPILED(cuda);
+
+
+void* CudaAsyncAllocator::allocate(size_type num_bytes) const
+    GKO_NOT_COMPILED(cuda);
+
+
+void CudaAsyncAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda);
+
+
+CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id, unsigned int flags)
+    GKO_NOT_COMPILED(cuda);
+
+
+void* CudaUnifiedAllocator::allocate(size_type num_bytes) const
+    GKO_NOT_COMPILED(cuda);
+
+
+void CudaUnifiedAllocator::deallocate(void* dev_ptr) const
+    GKO_NOT_COMPILED(cuda);
+
+
+CudaHostAllocator::CudaHostAllocator(int device_id) GKO_NOT_COMPILED(cuda);
+
+
+void* CudaHostAllocator::allocate(size_type num_bytes) const
+    GKO_NOT_COMPILED(cuda);
+
+
+void CudaHostAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda);
+
+
 std::shared_ptr<CudaExecutor> CudaExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
     allocation_mode alloc_mode, CUstream_st* stream)
@@ -154,6 +195,9 @@ scoped_device_id_guard::scoped_device_id_guard(const CudaExecutor* exec,
     GKO_NOT_COMPILED(cuda);
 
 
+cuda_stream::cuda_stream() GKO_NOT_COMPILED(cuda);
+
+
 cuda_stream::cuda_stream(int device_id) GKO_NOT_COMPILED(cuda);
 
 
diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp
index a08f1f608fb..0ee3e6f289f 100644
--- a/core/device_hooks/dpcpp_hooks.cpp
+++ b/core/device_hooks/dpcpp_hooks.cpp
@@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/memory.hpp>
 #include <ginkgo/core/base/timer.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/version.hpp>
@@ -52,6 +53,23 @@ version version_info::get_dpcpp_version() noexcept
 }
 
 
+void* DpcppAllocator::allocate_impl(sycl::queue* queue, size_type size) const
+    GKO_NOT_COMPILED(dpcpp);
+
+
+void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const
+    GKO_NOT_COMPILED(dpcpp);
+
+
+void* DpcppUnifiedAllocator::allocate_impl(sycl::queue* queue,
+                                           size_type size) const
+    GKO_NOT_COMPILED(dpcpp);
+
+
+void DpcppUnifiedAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const
+    GKO_NOT_COMPILED(dpcpp);
+
+
 std::shared_ptr<DpcppExecutor> DpcppExecutor::create(
     int device_id, std::shared_ptr<Executor> master, std::string device_type,
     dpcpp_queue_property property)
diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp
index 50637f7b3f0..739dac39f08 100644
--- a/core/device_hooks/hip_hooks.cpp
+++ b/core/device_hooks/hip_hooks.cpp
@@ -53,6 +53,22 @@ version version_info::get_hip_version() noexcept
 }
 
 
+void* HipAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
+
+
+void HipAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
+
+
+HipAsyncAllocator::HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream)
+    GKO_NOT_COMPILED(hip);
+
+
+void* HipAsyncAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
+
+
+void HipAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
+
+
 std::shared_ptr<HipExecutor> HipExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
     allocation_mode alloc_mode, GKO_HIP_STREAM_STRUCT* stream)
@@ -155,6 +171,9 @@ scoped_device_id_guard::scoped_device_id_guard(const HipExecutor* exec,
     GKO_NOT_COMPILED(hip);
 
 
+hip_stream::hip_stream() GKO_NOT_COMPILED(hip);
+
+
 hip_stream::hip_stream(int device_id) GKO_NOT_COMPILED(hip);
 
 
diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp
index 0d64dfcf3cf..71064cf01d2 100644
--- a/core/test/base/executor.cpp
+++ b/core/test/base/executor.cpp
@@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <thread>
 #include <type_traits>
+#include "ginkgo/core/base/memory.hpp"
 
 
 #if defined(__unix__) || defined(__APPLE__)
@@ -263,35 +264,6 @@ TEST(CudaExecutor, KnowsItsDeviceId)
 }
 
 
-TEST(CudaExecutor, CanGetDeviceResetBoolean)
-{
-    auto omp = gko::OmpExecutor::create();
-    auto cuda = gko::CudaExecutor::create(0, omp);
-
-    ASSERT_EQ(false, cuda->get_device_reset());
-}
-
-
-TEST(CudaExecutor, CanSetDefaultDeviceResetBoolean)
-{
-    auto omp = gko::OmpExecutor::create();
-    auto cuda = gko::CudaExecutor::create(0, omp, true);
-
-    ASSERT_EQ(true, cuda->get_device_reset());
-}
-
-
-TEST(CudaExecutor, CanSetDeviceResetBoolean)
-{
-    auto omp = gko::OmpExecutor::create();
-    auto cuda = gko::CudaExecutor::create(0, omp);
-
-    cuda->set_device_reset(true);
-
-    ASSERT_EQ(true, cuda->get_device_reset());
-}
-
-
 TEST(HipExecutor, KnowsItsMaster)
 {
     auto omp = gko::OmpExecutor::create();
@@ -310,35 +282,6 @@ TEST(HipExecutor, KnowsItsDeviceId)
 }
 
 
-TEST(HipExecutor, CanGetDeviceResetBoolean)
-{
-    auto omp = gko::OmpExecutor::create();
-    auto hip = gko::HipExecutor::create(0, omp);
-
-    ASSERT_EQ(false, hip->get_device_reset());
-}
-
-
-TEST(HipExecutor, CanSetDefaultDeviceResetBoolean)
-{
-    auto omp = gko::OmpExecutor::create();
-    auto hip = gko::HipExecutor::create(0, omp, true);
-
-    ASSERT_EQ(true, hip->get_device_reset());
-}
-
-
-TEST(HipExecutor, CanSetDeviceResetBoolean)
-{
-    auto omp = gko::OmpExecutor::create();
-    auto hip = gko::HipExecutor::create(0, omp);
-
-    hip->set_device_reset(true);
-
-    ASSERT_EQ(true, hip->get_device_reset());
-}
-
-
 TEST(DpcppExecutor, KnowsItsMaster)
 {
     auto omp = gko::OmpExecutor::create();
@@ -442,20 +385,11 @@ TEST(Executor, CanVerifyMemory)
 }
 
 
-template <typename T>
-struct mock_free : T {
-    /**
-     * @internal Due to a bug with gcc 5.3, the constructor needs to be called
-     * with `()` operator instead of `{}`.
-     */
-    template <typename... Params>
-    mock_free(Params&&... params) : T(std::forward<Params>(params)...)
-    {}
-
-    void raw_free(void* ptr) const noexcept override
+struct MockAllocator : gko::CpuAllocator {
+    void deallocate(void* ptr) const noexcept override
     {
         called_free = true;
-        T::raw_free(ptr);
+        CpuAllocator::deallocate(ptr);
     }
 
     mutable bool called_free{false};
@@ -464,12 +398,13 @@ struct mock_free : T {
 
 TEST(ExecutorDeleter, DeletesObject)
 {
-    auto ref = std::make_shared<mock_free<gko::ReferenceExecutor>>();
+    auto alloc = std::make_shared<MockAllocator>();
+    auto ref = gko::ReferenceExecutor::create(alloc);
     auto x = ref->alloc<int>(5);
 
     gko::executor_deleter<int>{ref}(x);
 
-    ASSERT_TRUE(ref->called_free);
+    ASSERT_TRUE(alloc->called_free);
 }
 
 
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index bbe7a953dbd..aecf4e1c2f2 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -1,11 +1,15 @@
 add_library(ginkgo_cuda $<TARGET_OBJECTS:ginkgo_cuda_device> "")
 target_sources(ginkgo_cuda
     PRIVATE
+    base/device.cpp
     base/device_matrix_data_kernels.cu
     base/exception.cpp
     base/executor.cpp
     base/index_set_kernels.cpp
+    base/memory.cpp
+    base/nvtx.cpp
     base/scoped_device_id.cpp
+    base/stream.cpp
     base/timer.cpp
     base/version.cpp
     components/prefix_sum_kernels.cu
diff --git a/cuda/base/device.cpp b/cuda/base/device.cpp
new file mode 100644
index 00000000000..31ab5bcde63
--- /dev/null
+++ b/cuda/base/device.cpp
@@ -0,0 +1,65 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "cuda/base/device.hpp"
+
+
+#include <cuda_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "cuda/base/scoped_device_id.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+
+
+void reset_device(int device_id)
+{
+    gko::detail::cuda_scoped_device_id_guard guard{device_id};
+    cudaDeviceReset();
+}
+
+
+void destroy_event(CUevent_st* event)
+{
+    GKO_ASSERT_NO_CUDA_ERRORS(cudaEventDestroy(event));
+}
+
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index e474d9c9f49..f6e838dd2dd 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -39,18 +39,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <cuda_runtime.h>
-#ifdef GKO_LEGACY_NVTX
-#include <nvToolsExt.h>
-#else
-#include <nvtx3/nvToolsExt.h>
-#endif
 
 
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/device.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/log/profiler_hook.hpp>
+#include <ginkgo/core/base/memory.hpp>
 
 
 #include "cuda/base/config.hpp"
@@ -65,25 +60,38 @@ namespace gko {
 #include "common/cuda_hip/base/executor.hpp.inc"
 
 
+std::unique_ptr<CudaAllocatorBase> allocator_from_mode(int device_id,
+                                                       allocation_mode mode)
+{
+    switch (mode) {
+    case allocation_mode::device:
+        return std::make_unique<CudaAllocator>();
+    case allocation_mode::unified_global:
+        return std::make_unique<CudaUnifiedAllocator>(device_id);
+    case allocation_mode::unified_host:
+        return std::make_unique<CudaUnifiedAllocator>(device_id);
+    default:
+        GKO_NOT_SUPPORTED(mode);
+    }
+}
+
+
 std::shared_ptr<CudaExecutor> CudaExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
     allocation_mode alloc_mode, cudaStream_t stream)
 {
     return std::shared_ptr<CudaExecutor>(
-        new CudaExecutor(device_id, std::move(master), device_reset, alloc_mode,
-                         stream),
-        [device_id](CudaExecutor* exec) {
-            auto device_reset = exec->get_device_reset();
-            std::lock_guard<std::mutex> guard(
-                nvidia_device::get_mutex(device_id));
-            delete exec;
-            auto& num_execs = nvidia_device::get_num_execs(device_id);
-            num_execs--;
-            if (!num_execs && device_reset) {
-                detail::cuda_scoped_device_id_guard g(device_id);
-                cudaDeviceReset();
-            }
-        });
+        new CudaExecutor(device_id, std::move(master),
+                         allocator_from_mode(device_id, alloc_mode), stream));
+}
+
+
+std::shared_ptr<CudaExecutor> CudaExecutor::create(
+    int device_id, std::shared_ptr<Executor> master,
+    std::shared_ptr<CudaAllocatorBase> alloc, cudaStream_t stream)
+{
+    return std::shared_ptr<CudaExecutor>(new CudaExecutor(
+        device_id, std::move(master), std::move(alloc), stream));
 }
 
 
@@ -123,41 +131,14 @@ void OmpExecutor::raw_copy_to(const CudaExecutor* dest, size_type num_bytes,
 void CudaExecutor::raw_free(void* ptr) const noexcept
 {
     detail::cuda_scoped_device_id_guard g(this->get_device_id());
-    auto error_code = cudaFree(ptr);
-    if (error_code != cudaSuccess) {
-#if GKO_VERBOSE_LEVEL >= 1
-        // Unfortunately, if memory free fails, there's not much we can do
-        std::cerr << "Unrecoverable CUDA error on device "
-                  << this->get_device_id() << " in " << __func__ << ": "
-                  << cudaGetErrorName(error_code) << ": "
-                  << cudaGetErrorString(error_code) << std::endl
-                  << "Exiting program" << std::endl;
-#endif  // GKO_VERBOSE_LEVEL >= 1
-        std::exit(error_code);
-    }
+    alloc_->deallocate(ptr);
 }
 
 
 void* CudaExecutor::raw_alloc(size_type num_bytes) const
 {
-    void* dev_ptr = nullptr;
     detail::cuda_scoped_device_id_guard g(this->get_device_id());
-    int error_code = 0;
-    if (this->alloc_mode_ == allocation_mode::unified_host) {
-        error_code = cudaMallocManaged(&dev_ptr, num_bytes, cudaMemAttachHost);
-    } else if (this->alloc_mode_ == allocation_mode::unified_global) {
-        error_code =
-            cudaMallocManaged(&dev_ptr, num_bytes, cudaMemAttachGlobal);
-    } else if (this->alloc_mode_ == allocation_mode::device) {
-        error_code = cudaMalloc(&dev_ptr, num_bytes);
-    } else {
-        GKO_NOT_SUPPORTED(this->alloc_mode_);
-    }
-    if (error_code != cudaErrorMemoryAllocation) {
-        GKO_ASSERT_NO_CUDA_ERRORS(error_code);
-    }
-    GKO_ENSURE_ALLOCATED(dev_ptr, "cuda", num_bytes);
-    return dev_ptr;
+    return alloc_->allocate(num_bytes);
 }
 
 
@@ -298,98 +279,4 @@ void CudaExecutor::init_handles()
 }
 
 
-cuda_stream::cuda_stream(int device_id) : stream_{}, device_id_(device_id)
-{
-    detail::cuda_scoped_device_id_guard g(device_id_);
-    GKO_ASSERT_NO_CUDA_ERRORS(cudaStreamCreate(&stream_));
-}
-
-
-cuda_stream::~cuda_stream()
-{
-    if (stream_) {
-        detail::cuda_scoped_device_id_guard g(device_id_);
-        cudaStreamDestroy(stream_);
-    }
-}
-
-
-cuda_stream::cuda_stream(cuda_stream&& other)
-    : stream_{std::exchange(other.stream_, nullptr)},
-      device_id_(std::exchange(other.device_id_, -1))
-{}
-
-
-CUstream_st* cuda_stream::get() const { return stream_; }
-
-
-namespace log {
-
-
-// "GKO" in ASCII to avoid collision with other application's categories
-constexpr static uint32 category_magic_offset = 0x676B6FU;
-
-
-void init_nvtx()
-{
-#define NAMED_CATEGORY(_name)                                             \
-    nvtxNameCategory(static_cast<uint32>(profile_event_category::_name) + \
-                         category_magic_offset,                           \
-                     "gko::" #_name)
-    NAMED_CATEGORY(memory);
-    NAMED_CATEGORY(operation);
-    NAMED_CATEGORY(object);
-    NAMED_CATEGORY(linop);
-    NAMED_CATEGORY(factory);
-    NAMED_CATEGORY(solver);
-    NAMED_CATEGORY(criterion);
-    NAMED_CATEGORY(user);
-    NAMED_CATEGORY(internal);
-#undef NAMED_CATEGORY
-}
-
-
-std::function<void(const char*, profile_event_category)> begin_nvtx_fn(
-    uint32_t color_argb)
-{
-    return [color_argb](const char* name, profile_event_category category) {
-        nvtxEventAttributes_t attr{};
-        attr.version = NVTX_VERSION;
-        attr.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
-        attr.category = static_cast<uint32>(category) + category_magic_offset;
-        attr.colorType = NVTX_COLOR_ARGB;
-        attr.color = color_argb;
-        attr.payloadType = NVTX_PAYLOAD_UNKNOWN;
-        attr.messageType = NVTX_MESSAGE_TYPE_ASCII;
-        attr.message.ascii = name;
-        nvtxRangePushEx(&attr);
-    };
-}
-
-
-void end_nvtx(const char* name, profile_event_category) { nvtxRangePop(); }
-
-
-}  // namespace log
-
-
-namespace kernels {
-namespace cuda {
-
-
-void reset_device(int device_id)
-{
-    gko::detail::cuda_scoped_device_id_guard guard{device_id};
-    cudaDeviceReset();
-}
-
-
-void destroy_event(CUevent_st* event)
-{
-    GKO_ASSERT_NO_CUDA_ERRORS(cudaEventDestroy(event));
-}
-
-
-}  // namespace cuda
-}  // namespace kernels
 }  // namespace gko
diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp
new file mode 100644
index 00000000000..11dee81ad42
--- /dev/null
+++ b/cuda/base/memory.cpp
@@ -0,0 +1,168 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/memory.hpp>
+
+
+#include <cuda_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "cuda/base/scoped_device_id.hpp"
+
+
+namespace gko {
+
+
+#define GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(_operation, _size)       \
+    {                                                                 \
+        auto error_code = _operation;                                 \
+        if (error_code == cudaErrorMemoryAllocation) {                \
+            throw AllocationError(__FILE__, __LINE__, "cuda", _size); \
+        } else {                                                      \
+            GKO_ASSERT_NO_CUDA_ERRORS(error_code);                    \
+        }                                                             \
+    }
+
+
+#if GKO_VERBOSE_LEVEL >= 1
+#define GKO_EXIT_ON_CUDA_ERROR(_operation)                                  \
+    {                                                                       \
+        const auto error_code = _operation;                                 \
+        if (error_code != cudaSuccess) {                                    \
+            int device_id{-1};                                              \
+            cudaGetDevice(&device_id);                                      \
+            std::cerr << "Unrecoverable CUDA error on device " << device_id \
+                      << " in " << __func__ << ":" << __LINE__ << ": "      \
+                      << cudaGetErrorName(error_code) << ": "               \
+                      << cudaGetErrorString(error_code) << std::endl        \
+                      << "Exiting program" << std::endl;                    \
+            std::exit(error_code);                                          \
+        }                                                                   \
+    }
+#else
+#define GKO_EXIT_ON_CUDA_ERROR(_operation)  \
+    {                                       \
+        const auto error_code = _operation; \
+        if (error_code != cudaSuccess) {    \
+            std::exit(error_code);          \
+        }                                   \
+    }
+#endif
+
+
+void* CudaAllocator::allocate(size_type num_bytes) const
+{
+    void* ptr{};
+    GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMalloc(&ptr, num_bytes),
+                                         num_bytes);
+    return ptr;
+}
+
+
+void CudaAllocator::deallocate(void* ptr) const
+{
+    GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr));
+}
+
+
+CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} {}
+
+
+void* CudaAsyncAllocator::allocate(size_type num_bytes) const
+{
+    void* ptr{};
+    GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(
+        cudaMallocAsync(&ptr, num_bytes, stream_), num_bytes);
+    return ptr;
+}
+
+void CudaAsyncAllocator::deallocate(void* ptr) const
+{
+    GKO_EXIT_ON_CUDA_ERROR(cudaFreeAsync(ptr, stream_));
+}
+
+
+CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id)
+    : CudaUnifiedAllocator{device_id, cudaMemAttachGlobal}
+{}
+
+
+CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id, unsigned int flags)
+    : device_id_{device_id}, flags_{flags}
+{}
+
+
+void* CudaUnifiedAllocator::allocate(size_type num_bytes) const
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::cuda_scoped_device_id_guard g(device_id_);
+    void* ptr{};
+    GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(
+        cudaMallocManaged(&ptr, num_bytes, flags_), num_bytes);
+    return ptr;
+}
+
+
+void CudaUnifiedAllocator::deallocate(void* ptr) const
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::cuda_scoped_device_id_guard g(device_id_);
+    GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr));
+}
+
+
+CudaHostAllocator::CudaHostAllocator(int device_id) : device_id_{device_id} {}
+
+
+void* CudaHostAllocator::allocate(size_type num_bytes) const
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::cuda_scoped_device_id_guard g(device_id_);
+    void* ptr{};
+    GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMallocHost(&ptr, num_bytes),
+                                         num_bytes);
+    return ptr;
+}
+
+
+void CudaHostAllocator::deallocate(void* ptr) const
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::cuda_scoped_device_id_guard g(device_id_);
+    GKO_EXIT_ON_CUDA_ERROR(cudaFreeHost(ptr));
+}
+
+
+}  // namespace gko
\ No newline at end of file
diff --git a/cuda/base/nvtx.cpp b/cuda/base/nvtx.cpp
new file mode 100644
index 00000000000..e313c110ea2
--- /dev/null
+++ b/cuda/base/nvtx.cpp
@@ -0,0 +1,96 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/config.hpp>
+
+
+#include <cuda_runtime.h>
+#ifdef GKO_LEGACY_NVTX
+#include <nvToolsExt.h>
+#else
+#include <nvtx3/nvToolsExt.h>
+#endif
+
+
+#include <ginkgo/core/log/profiler_hook.hpp>
+
+
+namespace gko {
+namespace log {
+
+
+// "GKO" in ASCII to avoid collision with other application's categories
+constexpr static uint32 category_magic_offset = 0x676B6FU;
+
+
+void init_nvtx()
+{
+#define NAMED_CATEGORY(_name)                                             \
+    nvtxNameCategory(static_cast<uint32>(profile_event_category::_name) + \
+                         category_magic_offset,                           \
+                     "gko::" #_name)
+    NAMED_CATEGORY(memory);
+    NAMED_CATEGORY(operation);
+    NAMED_CATEGORY(object);
+    NAMED_CATEGORY(linop);
+    NAMED_CATEGORY(factory);
+    NAMED_CATEGORY(solver);
+    NAMED_CATEGORY(criterion);
+    NAMED_CATEGORY(user);
+    NAMED_CATEGORY(internal);
+#undef NAMED_CATEGORY
+}
+
+
+std::function<void(const char*, profile_event_category)> begin_nvtx_fn(
+    uint32_t color_argb)
+{
+    return [color_argb](const char* name, profile_event_category category) {
+        nvtxEventAttributes_t attr{};
+        attr.version = NVTX_VERSION;
+        attr.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+        attr.category = static_cast<uint32>(category) + category_magic_offset;
+        attr.colorType = NVTX_COLOR_ARGB;
+        attr.color = color_argb;
+        attr.payloadType = NVTX_PAYLOAD_UNKNOWN;
+        attr.messageType = NVTX_MESSAGE_TYPE_ASCII;
+        attr.message.ascii = name;
+        nvtxRangePushEx(&attr);
+    };
+}
+
+
+void end_nvtx(const char* name, profile_event_category) { nvtxRangePop(); }
+
+
+}  // namespace log
+}  // namespace gko
diff --git a/cuda/test/base/cuda_executor_reset.cpp b/cuda/base/stream.cpp
similarity index 62%
rename from cuda/test/base/cuda_executor_reset.cpp
rename to cuda/base/stream.cpp
index c8159b9c4d7..8c6aa92c28b 100644
--- a/cuda/test/base/cuda_executor_reset.cpp
+++ b/cuda/base/stream.cpp
@@ -30,58 +30,45 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include <ginkgo/core/base/executor.hpp>
+#include <cuda_runtime.h>
 
 
-#include <thread>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/stream.hpp>
 
 
-#include <gtest/gtest.h>
+#include "cuda/base/scoped_device_id.hpp"
 
 
-namespace {
+namespace gko {
 
 
-#define GTEST_ASSERT_NO_EXIT(statement) \
-    ASSERT_EXIT({ {statement} exit(0); }, ::testing::ExitedWithCode(0), "")
+cuda_stream::cuda_stream() : stream_{}, device_id_{-1} {}
 
 
-TEST(DeviceReset, HipCuda)
+cuda_stream::cuda_stream(int device_id) : stream_{}, device_id_(device_id)
 {
-    GTEST_ASSERT_NO_EXIT({
-        auto ref = gko::ReferenceExecutor::create();
-        auto hip = gko::HipExecutor::create(0, ref, true);
-        auto cuda = gko::CudaExecutor::create(0, ref, true);
-    });
+    detail::cuda_scoped_device_id_guard g(device_id_);
+    GKO_ASSERT_NO_CUDA_ERRORS(cudaStreamCreate(&stream_));
 }
 
 
-TEST(DeviceReset, CudaHip)
+cuda_stream::~cuda_stream()
 {
-    GTEST_ASSERT_NO_EXIT({
-        auto ref = gko::ReferenceExecutor::create();
-        auto cuda = gko::CudaExecutor::create(0, ref, true);
-        auto hip = gko::HipExecutor::create(0, ref, true);
-    });
+    if (stream_) {
+        detail::cuda_scoped_device_id_guard g(device_id_);
+        cudaStreamDestroy(stream_);
+    }
 }
 
 
-void func()
-{
-    auto ref = gko::ReferenceExecutor::create();
-    auto exec = gko::CudaExecutor::create(0, ref, true);
-}
+cuda_stream::cuda_stream(cuda_stream&& other)
+    : stream_{std::exchange(other.stream_, nullptr)},
+      device_id_(std::exchange(other.device_id_, -1))
+{}
 
 
-TEST(DeviceReset, CudaCuda)
-{
-    GTEST_ASSERT_NO_EXIT({
-        std::thread t1(func);
-        std::thread t2(func);
-        t1.join();
-        t2.join();
-    });
-}
+CUstream_st* cuda_stream::get() const { return stream_; }
 
 
-}  // namespace
+}  // namespace gko
diff --git a/cuda/test/base/CMakeLists.txt b/cuda/test/base/CMakeLists.txt
index 9be3caf9faa..a213e65277a 100644
--- a/cuda/test/base/CMakeLists.txt
+++ b/cuda/test/base/CMakeLists.txt
@@ -1,7 +1,6 @@
 ginkgo_create_cuda_test(array)
 ginkgo_create_cuda_test(cuda_executor)
 ginkgo_create_test(index_set)
-ginkgo_create_test(cuda_executor_reset ADDITIONAL_LIBRARIES Threads::Threads)
 if(GINKGO_HAVE_HWLOC)
     find_package(NUMA REQUIRED)
     ginkgo_create_cuda_test(cuda_executor_topology ADDITIONAL_LIBRARIES NUMA::NUMA)
@@ -10,4 +9,5 @@ ginkgo_create_cuda_test(exception_helpers)
 ginkgo_create_cuda_test(kernel_launch)
 ginkgo_create_cuda_test(lin_op)
 ginkgo_create_cuda_test(math)
+ginkgo_create_test(memory)
 ginkgo_create_cuda_test(scoped_device_id)
diff --git a/cuda/test/base/cuda_executor.cu b/cuda/test/base/cuda_executor.cu
index 5f489ac22f0..afb23c06186 100644
--- a/cuda/test/base/cuda_executor.cu
+++ b/cuda/test/base/cuda_executor.cu
@@ -42,6 +42,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/stream.hpp>
+
 
 #include "common/cuda_hip/base/executor.hpp.inc"
 #include "cuda/base/scoped_device_id.hpp"
@@ -103,18 +105,19 @@ protected:
         ASSERT_GT(gko::CudaExecutor::get_num_devices(), 0);
 #ifdef GKO_TEST_NONDEFAULT_STREAM
         cuda = gko::CudaExecutor::create(
-            0, omp, false, gko::default_cuda_alloc_mode, stream.get());
+            0, omp, std::make_shared<gko::CudaAllocator>(), stream.get());
         cuda2 = gko::CudaExecutor::create(
-            gko::CudaExecutor::get_num_devices() - 1, omp, false,
-            gko::default_cuda_alloc_mode, other_stream.get());
+            gko::CudaExecutor::get_num_devices() - 1, omp,
+            std::make_shared<gko::CudaAllocator>(), other_stream.get());
         cuda3 = gko::CudaExecutor::create(
-            0, omp, false, gko::allocation_mode::unified_global, stream.get());
+            0, omp, std::make_shared<gko::CudaUnifiedAllocator>(0),
+            stream.get());
 #else
         cuda = gko::CudaExecutor::create(0, omp);
         cuda2 = gko::CudaExecutor::create(
             gko::CudaExecutor::get_num_devices() - 1, omp);
-        cuda3 = gko::CudaExecutor::create(0, omp, false,
-                                          gko::allocation_mode::unified_global);
+        cuda3 = gko::CudaExecutor::create(
+            0, omp, std::make_shared<gko::CudaUnifiedAllocator>(0));
 #endif
     }
 
diff --git a/cuda/test/base/memory.cpp b/cuda/test/base/memory.cpp
new file mode 100644
index 00000000000..a329817f4af
--- /dev/null
+++ b/cuda/test/base/memory.cpp
@@ -0,0 +1,126 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/memory.hpp>
+
+
+#include <memory>
+#include <type_traits>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "cuda/test/utils.hpp"
+
+
+namespace {
+
+
+class Memory : public CudaTestFixture {
+protected:
+    Memory()
+        : host_exec_with_pinned{gko::OmpExecutor::create(
+              std::make_shared<gko::CudaHostAllocator>(0))},
+          host_exec_with_unified{gko::OmpExecutor::create(
+              std::make_shared<gko::CudaUnifiedAllocator>(0))},
+          exec_with_normal{gko::CudaExecutor::create(
+              0, ref, std::make_shared<gko::CudaAllocator>(),
+              exec->get_stream())},
+          exec_with_async{gko::CudaExecutor::create(
+              0, host_exec_with_pinned,
+              std::make_shared<gko::CudaAsyncAllocator>(exec->get_stream()),
+              exec->get_stream())},
+          exec_with_unified{gko::CudaExecutor::create(
+              0, host_exec_with_unified,
+              std::make_shared<gko::CudaUnifiedAllocator>(0),
+              exec->get_stream())}
+    {}
+
+    std::shared_ptr<gko::OmpExecutor> host_exec_with_pinned;
+    std::shared_ptr<gko::OmpExecutor> host_exec_with_unified;
+    std::shared_ptr<gko::CudaExecutor> exec_with_normal;
+    std::shared_ptr<gko::CudaExecutor> exec_with_async;
+    std::shared_ptr<gko::CudaExecutor> exec_with_unified;
+};
+
+
+TEST_F(Memory, DeviceAllocationWorks)
+{
+    gko::array<int> data{exec_with_normal, {1, 2}};
+
+    GKO_ASSERT_ARRAY_EQ(data, I<int>({1, 2}));
+}
+
+
+TEST_F(Memory, AsyncDeviceAllocationWorks)
+{
+    gko::array<int> data{exec_with_async, {1, 2}};
+
+    GKO_ASSERT_ARRAY_EQ(data, I<int>({1, 2}));
+}
+
+
+TEST_F(Memory, UnifiedDeviceAllocationWorks)
+{
+    gko::array<int> data{exec_with_unified, {1, 2}};
+    exec->synchronize();
+
+    ASSERT_EQ(data.get_const_data()[0], 1);
+    ASSERT_EQ(data.get_const_data()[1], 2);
+}
+
+
+TEST_F(Memory, HostUnifiedAllocationWorks)
+{
+    gko::array<int> data{host_exec_with_unified, {1, 2}};
+
+    ASSERT_EQ(data.get_const_data()[0], 1);
+    ASSERT_EQ(data.get_const_data()[1], 2);
+}
+
+
+TEST_F(Memory, HostPinnedAllocationWorks)
+{
+    gko::array<int> data{host_exec_with_pinned, {1, 2}};
+
+    ASSERT_EQ(data.get_const_data()[0], 1);
+    ASSERT_EQ(data.get_const_data()[1], 2);
+}
+
+
+}  // namespace
diff --git a/cuda/test/utils.hpp b/cuda/test/utils.hpp
index 814405ba0d9..e1156b91903 100644
--- a/cuda/test/utils.hpp
+++ b/cuda/test/utils.hpp
@@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/stream.hpp>
 
 
 #include "cuda/base/device.hpp"
@@ -60,8 +61,9 @@ class CudaTestFixture : public ::testing::Test {
     CudaTestFixture()
         : ref(gko::ReferenceExecutor::create()),
 #ifdef GKO_TEST_NONDEFAULT_STREAM
+          stream(0),
           exec(gko::CudaExecutor::create(
-              0, ref, false, gko::default_cuda_alloc_mode, stream.get()))
+              0, ref, std::make_shared<gko::CudaAllocator>(), stream.get()))
 #else
           exec(gko::CudaExecutor::create(0, ref))
 #endif
diff --git a/devices/cuda/executor.cpp b/devices/cuda/executor.cpp
index d066d272f81..3789274c3f3 100644
--- a/devices/cuda/executor.cpp
+++ b/devices/cuda/executor.cpp
@@ -64,31 +64,4 @@ bool CudaExecutor::verify_memory_to(const HipExecutor* dest_exec) const
 }
 
 
-void CudaExecutor::increase_num_execs(unsigned device_id)
-{
-#ifdef GKO_COMPILING_CUDA_DEVICE
-    // increase the Cuda Device count only when ginkgo build cuda
-    std::lock_guard<std::mutex> guard(nvidia_device::get_mutex(device_id));
-    nvidia_device::get_num_execs(device_id)++;
-#endif  // GKO_COMPILING_CUDA_DEVICE
-}
-
-
-void CudaExecutor::decrease_num_execs(unsigned device_id)
-{
-#ifdef GKO_COMPILING_CUDA_DEVICE
-    // increase the Cuda Device count only when ginkgo build cuda
-    std::lock_guard<std::mutex> guard(nvidia_device::get_mutex(device_id));
-    nvidia_device::get_num_execs(device_id)--;
-#endif  // GKO_COMPILING_CUDA_DEVICE
-}
-
-
-unsigned CudaExecutor::get_num_execs(unsigned device_id)
-{
-    std::lock_guard<std::mutex> guard(nvidia_device::get_mutex(device_id));
-    return nvidia_device::get_num_execs(device_id);
-}
-
-
 }  // namespace gko
diff --git a/devices/hip/executor.cpp b/devices/hip/executor.cpp
index 60efb4c53a3..b044074c19e 100644
--- a/devices/hip/executor.cpp
+++ b/devices/hip/executor.cpp
@@ -61,38 +61,4 @@ bool HipExecutor::verify_memory_to(const CudaExecutor* dest_exec) const
 }
 
 
-#if (GINKGO_HIP_PLATFORM_NVCC == 1)
-using hip_device_class = nvidia_device;
-#else
-using hip_device_class = amd_device;
-#endif
-
-
-void HipExecutor::increase_num_execs(int device_id)
-{
-#ifdef GKO_COMPILING_HIP_DEVICE
-    // increase the HIP Device count only when ginkgo build hip
-    std::lock_guard<std::mutex> guard(hip_device_class::get_mutex(device_id));
-    hip_device_class::get_num_execs(device_id)++;
-#endif  // GKO_COMPILING_HIP_DEVICE
-}
-
-
-void HipExecutor::decrease_num_execs(int device_id)
-{
-#ifdef GKO_COMPILING_HIP_DEVICE
-    // increase the HIP Device count only when ginkgo build hip
-    std::lock_guard<std::mutex> guard(hip_device_class::get_mutex(device_id));
-    hip_device_class::get_num_execs(device_id)--;
-#endif  // GKO_COMPILING_HIP_DEVICE
-}
-
-
-int HipExecutor::get_num_execs(int device_id)
-{
-    std::lock_guard<std::mutex> guard(hip_device_class::get_mutex(device_id));
-    return hip_device_class::get_num_execs(device_id);
-}
-
-
 }  // namespace gko
diff --git a/devices/omp/executor.cpp b/devices/omp/executor.cpp
index 352216f7633..f8e700bc2d5 100644
--- a/devices/omp/executor.cpp
+++ b/devices/omp/executor.cpp
@@ -55,7 +55,10 @@ void OmpExecutor::populate_exec_info(const machine_topology* mach_topo)
 }
 
 
-void OmpExecutor::raw_free(void* ptr) const noexcept { std::free(ptr); }
+void OmpExecutor::raw_free(void* ptr) const noexcept
+{
+    return alloc_->deallocate(ptr);
+}
 
 
 std::shared_ptr<Executor> OmpExecutor::get_master() noexcept
@@ -72,7 +75,7 @@ std::shared_ptr<const Executor> OmpExecutor::get_master() const noexcept
 
 void* OmpExecutor::raw_alloc(size_type num_bytes) const
 {
-    return GKO_ENSURE_ALLOCATED(std::malloc(num_bytes), "OMP", num_bytes);
+    return alloc_->allocate(num_bytes);
 }
 
 
diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp
index c2015c8664c..d668331a43b 100644
--- a/dpcpp/base/executor.dp.cpp
+++ b/dpcpp/base/executor.dp.cpp
@@ -51,6 +51,39 @@ namespace gko {
 namespace detail {
 
 
+DpcppAllocator::DpcppAllocator(sycl::queue* queue) : queue_{queue} {}
+
+
+void* DpcppAllocator::allocate(size_type size)
+{
+    return sycl::malloc_device(size, *queue_);
+}
+
+
+void DpcppAllocator::deallocate(void* ptr)
+{
+    queue_->wait_and_throw();
+    sycl::free(ptr, queue_->get_context());
+}
+
+
+DpcppUnifiedAllocator::DpcppUnifiedAllocator(sycl::queue* queue) : queue_{queue}
+{}
+
+
+void* DpcppUnifiedAllocator::allocate(size_type size)
+{
+    return sycl::malloc_shared(size, *queue_);
+}
+
+
+void DpcppUnifiedAllocator::deallocate(void* ptr)
+{
+    queue_->wait_and_throw();
+    sycl::free(ptr, queue_->get_context());
+}
+
+
 const std::vector<sycl::device> get_devices(std::string device_type)
 {
     std::map<std::string, sycl::info::device_type> device_type_map{
diff --git a/hip/test/base/hip_executor_reset.cpp b/dpcpp/base/memory.dp.cpp
similarity index 63%
rename from hip/test/base/hip_executor_reset.cpp
rename to dpcpp/base/memory.dp.cpp
index 39e3252e053..b1ccd007dea 100644
--- a/hip/test/base/hip_executor_reset.cpp
+++ b/dpcpp/base/memory.dp.cpp
@@ -30,58 +30,43 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/memory.hpp>
 
 
-#include <thread>
+#include <CL/sycl.hpp>
 
 
-#include <gtest/gtest.h>
+namespace gko {
 
 
-namespace {
+DpcppAllocatorBase::DpcppAllocatorBase(sycl::queue* queue) : queue_{queue} {}
 
 
-#define GTEST_ASSERT_NO_EXIT(statement) \
-    ASSERT_EXIT({ {statement} exit(0); }, ::testing::ExitedWithCode(0), "")
-
-
-TEST(DeviceReset, HipCuda)
+void* DpcppAllocator::allocate_impl(sycl::queue* queue,
+                                    size_type num_bytes) const
 {
-    GTEST_ASSERT_NO_EXIT({
-        auto ref = gko::ReferenceExecutor::create();
-        auto hip = gko::HipExecutor::create(0, ref, true);
-        auto cuda = gko::CudaExecutor::create(0, ref, true);
-    });
+    return sycl::malloc_device(size, *queue);
 }
 
 
-TEST(DeviceReset, CudaHip)
+void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const
 {
-    GTEST_ASSERT_NO_EXIT({
-        auto ref = gko::ReferenceExecutor::create();
-        auto cuda = gko::CudaExecutor::create(0, ref, true);
-        auto hip = gko::HipExecutor::create(0, ref, true);
-    });
+    queue->wait_and_throw();
+    sycl::free(ptr, queue->get_context());
 }
 
 
-void func()
+void* DpcppUnifiedAllocator::allocate(size_type num_bytes)
 {
-    auto ref = gko::ReferenceExecutor::create();
-    auto exec = gko::HipExecutor::create(0, ref, true);
+    return sycl::malloc_shared(size, *queue_);
 }
 
 
-TEST(DeviceReset, HipHip)
+void DpcppUnifiedAllocator::deallocate(void* ptr)
 {
-    GTEST_ASSERT_NO_EXIT({
-        std::thread t1(func);
-        std::thread t2(func);
-        t1.join();
-        t2.join();
-    });
+    queue_->wait_and_throw();
+    sycl::free(ptr, queue_->get_context());
 }
 
 
-}  // namespace
+}  // namespace gko
diff --git a/dpcpp/test/base/CMakeLists.txt b/dpcpp/test/base/CMakeLists.txt
index bb9c8a75050..5c0ca601f04 100644
--- a/dpcpp/test/base/CMakeLists.txt
+++ b/dpcpp/test/base/CMakeLists.txt
@@ -3,3 +3,4 @@ ginkgo_create_dpcpp_test(dim3)
 ginkgo_create_dpcpp_test(kernel_launch)
 # set correct flags for kernel_launch.hpp
 target_compile_definitions(dpcpp_test_base_kernel_launch PRIVATE GKO_COMPILING_DPCPP)
+ginkgo_create_dpcpp_test(memory)
\ No newline at end of file
diff --git a/dpcpp/test/base/memory.dp.cpp b/dpcpp/test/base/memory.dp.cpp
new file mode 100644
index 00000000000..e587660cde3
--- /dev/null
+++ b/dpcpp/test/base/memory.dp.cpp
@@ -0,0 +1,98 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/memory.hpp>
+
+
+#include <memory>
+#include <type_traits>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "dpcpp/test/utils.hpp"
+
+
+namespace {
+
+
+class Memory : public ::testing::Test {
+protected:
+    Memory()
+        : exec{gko::DpcppExecutor::create(0, gko::OmpExecutor::create())},
+          host_exec_with_unified{gko::OmpExecutor::create(
+              std::make_shared<gko::DpcppUnifiedAllocator>(exec->get_queue()))},
+          exec_with_unified{gko::DpcppExecutor::create(
+              exec->get_queue(), host_exec_with_unified,
+              std::make_shared<gko::DpcppUnifiedAllocator>(exec->get_queue()))}
+    {}
+
+    std::shared_ptr<gko::DpcppExecutor> exec;
+    std::shared_ptr<gko::OmpExecutor> host_exec_with_unified;
+    std::shared_ptr<gko::DpcppExecutor> exec_with_unified;
+};
+
+
+TEST_F(Memory, DeviceAllocationWorks)
+{
+    gko::array<int> data{exec, {1, 2}};
+
+    GKO_ASSERT_ARRAY_EQ(data, I<int>({1, 2}));
+}
+
+
+TEST_F(Memory, UnifiedDeviceAllocationWorks)
+{
+    gko::array<int> data{exec_with_unified, {1, 2}};
+    exec->synchronize();
+
+    ASSERT_EQ(data.get_const_data()[0], 1);
+    ASSERT_EQ(data.get_const_data()[1], 2);
+}
+
+
+TEST_F(Memory, HostUnifiedAllocationWorks)
+{
+    gko::array<int> data{host_exec_with_unified, {1, 2}};
+
+    ASSERT_EQ(data.get_const_data()[0], 1);
+    ASSERT_EQ(data.get_const_data()[1], 2);
+}
+
+
+}  // namespace
diff --git a/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp b/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp
index b300292e9a3..79b197aacc8 100644
--- a/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp
+++ b/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp
@@ -68,13 +68,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/cb-gmres/cb-gmres.cpp b/examples/cb-gmres/cb-gmres.cpp
index c0235f75e55..b096e48c71a 100644
--- a/examples/cb-gmres/cb-gmres.cpp
+++ b/examples/cb-gmres/cb-gmres.cpp
@@ -108,13 +108,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/custom-logger/custom-logger.cpp b/examples/custom-logger/custom-logger.cpp
index c2270cadb0d..7e6cf531edd 100644
--- a/examples/custom-logger/custom-logger.cpp
+++ b/examples/custom-logger/custom-logger.cpp
@@ -249,13 +249,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/custom-matrix-format/custom-matrix-format.cpp b/examples/custom-matrix-format/custom-matrix-format.cpp
index af08dbdf226..4610413fe9c 100644
--- a/examples/custom-matrix-format/custom-matrix-format.cpp
+++ b/examples/custom-matrix-format/custom-matrix-format.cpp
@@ -255,13 +255,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp
index 9389f86cc45..e07f1bf92fb 100644
--- a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp
+++ b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp
@@ -158,13 +158,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp b/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp
index aa32e0e879a..33946b7de44 100644
--- a/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp
+++ b/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp
@@ -68,13 +68,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/inverse-iteration/inverse-iteration.cpp b/examples/inverse-iteration/inverse-iteration.cpp
index 5d8270f1ca1..460370b7e00 100644
--- a/examples/inverse-iteration/inverse-iteration.cpp
+++ b/examples/inverse-iteration/inverse-iteration.cpp
@@ -72,13 +72,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp b/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp
index e676e15cc6d..407a083e548 100644
--- a/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp
+++ b/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp
@@ -71,13 +71,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/iterative-refinement/iterative-refinement.cpp b/examples/iterative-refinement/iterative-refinement.cpp
index cbd2156be60..14384eaab52 100644
--- a/examples/iterative-refinement/iterative-refinement.cpp
+++ b/examples/iterative-refinement/iterative-refinement.cpp
@@ -68,13 +68,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/minimal-cuda-solver/minimal-cuda-solver.cpp b/examples/minimal-cuda-solver/minimal-cuda-solver.cpp
index 7182bc9ad8c..5a7a8c086af 100644
--- a/examples/minimal-cuda-solver/minimal-cuda-solver.cpp
+++ b/examples/minimal-cuda-solver/minimal-cuda-solver.cpp
@@ -36,7 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 int main()
 {
     // Instantiate a CUDA executor
-    auto gpu = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true);
+    auto gpu = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
     // Read data
     auto A = gko::read<gko::matrix::Csr<>>(std::cin, gpu);
     auto b = gko::read<gko::matrix::Dense<>>(std::cin, gpu);
diff --git a/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp b/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp
index 6f1600d2805..9edd7ff29a1 100644
--- a/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp
+++ b/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp
@@ -71,13 +71,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp b/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp
index d3f45cda916..cbecbbbdc02 100644
--- a/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp
+++ b/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp
@@ -69,13 +69,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/mixed-precision-ir/mixed-precision-ir.cpp b/examples/mixed-precision-ir/mixed-precision-ir.cpp
index 3510a2163e1..0882d755cdc 100644
--- a/examples/mixed-precision-ir/mixed-precision-ir.cpp
+++ b/examples/mixed-precision-ir/mixed-precision-ir.cpp
@@ -76,13 +76,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/mixed-spmv/mixed-spmv.cpp b/examples/mixed-spmv/mixed-spmv.cpp
index 78461de39ef..6b327c1c708 100644
--- a/examples/mixed-spmv/mixed-spmv.cpp
+++ b/examples/mixed-spmv/mixed-spmv.cpp
@@ -170,13 +170,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp b/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp
index 6f75ca29630..a455ca2e8ed 100644
--- a/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp
+++ b/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp
@@ -64,13 +64,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp b/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp
index 7f47d039072..75c03259c67 100644
--- a/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp
+++ b/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp
@@ -62,13 +62,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp
index 51fdf97d4a4..05ee0503a5f 100644
--- a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp
+++ b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp
@@ -230,13 +230,12 @@ void solve_system(const std::string& executor_string,
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/papi-logging/papi-logging.cpp b/examples/papi-logging/papi-logging.cpp
index 0d81ef65909..1ae2ae9ec08 100644
--- a/examples/papi-logging/papi-logging.cpp
+++ b/examples/papi-logging/papi-logging.cpp
@@ -151,13 +151,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/performance-debugging/performance-debugging.cpp b/examples/performance-debugging/performance-debugging.cpp
index f357a8d4619..5f036728924 100644
--- a/examples/performance-debugging/performance-debugging.cpp
+++ b/examples/performance-debugging/performance-debugging.cpp
@@ -371,13 +371,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/poisson-solver/poisson-solver.cpp b/examples/poisson-solver/poisson-solver.cpp
index 7602600a514..e16f0b26968 100644
--- a/examples/poisson-solver/poisson-solver.cpp
+++ b/examples/poisson-solver/poisson-solver.cpp
@@ -144,13 +144,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/preconditioned-solver/preconditioned-solver.cpp b/examples/preconditioned-solver/preconditioned-solver.cpp
index 37963f205cc..b64b588c4ef 100644
--- a/examples/preconditioned-solver/preconditioned-solver.cpp
+++ b/examples/preconditioned-solver/preconditioned-solver.cpp
@@ -69,13 +69,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/simple-solver-logging/simple-solver-logging.cpp b/examples/simple-solver-logging/simple-solver-logging.cpp
index 6aa85462605..02318dd7784 100644
--- a/examples/simple-solver-logging/simple-solver-logging.cpp
+++ b/examples/simple-solver-logging/simple-solver-logging.cpp
@@ -85,13 +85,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/simple-solver/simple-solver.cpp b/examples/simple-solver/simple-solver.cpp
index 8f665f98496..81dc9ee6d74 100644
--- a/examples/simple-solver/simple-solver.cpp
+++ b/examples/simple-solver/simple-solver.cpp
@@ -89,13 +89,12 @@ int main(int argc, char* argv[])
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp
index 6bf3cc21a8a..63adfaa5571 100644
--- a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp
+++ b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp
@@ -165,13 +165,12 @@ void solve_system(const std::string& executor_string,
             {"omp", [] { return gko::OmpExecutor::create(); }},
             {"cuda",
              [] {
-                 return gko::CudaExecutor::create(0, gko::OmpExecutor::create(),
-                                                  true);
+                 return gko::CudaExecutor::create(0,
+                                                  gko::OmpExecutor::create());
              }},
             {"hip",
              [] {
-                 return gko::HipExecutor::create(0, gko::OmpExecutor::create(),
-                                                 true);
+                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
              }},
             {"dpcpp",
              [] {
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 6c6fc235f45..61b06ad4058 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -1,9 +1,13 @@
 set(GINKGO_HIP_SOURCES
+    base/device.hip.cpp
     base/device_matrix_data_kernels.hip.cpp
     base/exception.hip.cpp
     base/executor.hip.cpp
     base/index_set_kernels.hip.cpp
+    base/memory.hip.cpp
+    base/roctx.hip.cpp
     base/scoped_device_id.hip.cpp
+    base/stream.hip.cpp
     base/timer.hip.cpp
     base/version.hip.cpp
     components/prefix_sum_kernels.hip.cpp
diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp
new file mode 100644
index 00000000000..b5ec1bec6d6
--- /dev/null
+++ b/hip/base/device.hip.cpp
@@ -0,0 +1,67 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/stream.hpp>
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/base/device.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "hip/base/scoped_device_id.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+void reset_device(int device_id)
+{
+    gko::detail::hip_scoped_device_id_guard guard{device_id};
+    hipDeviceReset();
+}
+
+
+void destroy_event(GKO_HIP_EVENT_STRUCT* event)
+{
+    GKO_ASSERT_NO_HIP_ERRORS(hipEventDestroy(event));
+}
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index cd8a485c19d..6b4b0fd5ddc 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -37,15 +37,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <hip/hip_runtime.h>
-#if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX
-#include <roctx.h>
-#endif
 
 
 #include <ginkgo/config.hpp>
 #include <ginkgo/core/base/device.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/log/profiler_hook.hpp>
 
 
 #include "hip/base/config.hip.hpp"
@@ -60,32 +56,22 @@ namespace gko {
 #include "common/cuda_hip/base/executor.hpp.inc"
 
 
-#if (GINKGO_HIP_PLATFORM_NVCC == 1)
-using hip_device_class = nvidia_device;
-#else
-using hip_device_class = amd_device;
-#endif
-
-
 std::shared_ptr<HipExecutor> HipExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
     allocation_mode alloc_mode, hipStream_t stream)
 {
     return std::shared_ptr<HipExecutor>(
-        new HipExecutor(device_id, std::move(master), device_reset, alloc_mode,
-                        stream),
-        [device_id](HipExecutor* exec) {
-            auto device_reset = exec->get_device_reset();
-            std::lock_guard<std::mutex> guard(
-                hip_device_class::get_mutex(device_id));
-            delete exec;
-            auto& num_execs = hip_device_class::get_num_execs(device_id);
-            num_execs--;
-            if (!num_execs && device_reset) {
-                detail::hip_scoped_device_id_guard g(device_id);
-                hipDeviceReset();
-            }
-        });
+        new HipExecutor(device_id, std::move(master),
+                        std::make_shared<HipAllocator>(), stream));
+}
+
+
+std::shared_ptr<HipExecutor> HipExecutor::create(
+    int device_id, std::shared_ptr<Executor> master,
+    std::shared_ptr<HipAllocatorBase> alloc, hipStream_t stream)
+{
+    return std::shared_ptr<HipExecutor>(new HipExecutor(
+        device_id, std::move(master), std::move(alloc), stream));
 }
 
 
@@ -125,42 +111,14 @@ void OmpExecutor::raw_copy_to(const HipExecutor* dest, size_type num_bytes,
 void HipExecutor::raw_free(void* ptr) const noexcept
 {
     detail::hip_scoped_device_id_guard g(this->get_device_id());
-    auto error_code = hipFree(ptr);
-    if (error_code != hipSuccess) {
-#if GKO_VERBOSE_LEVEL >= 1
-        // Unfortunately, if memory free fails, there's not much we can do
-        std::cerr << "Unrecoverable HIP error on device "
-                  << this->get_device_id() << " in " << __func__ << ": "
-                  << hipGetErrorName(error_code) << ": "
-                  << hipGetErrorString(error_code) << std::endl
-                  << "Exiting program" << std::endl;
-#endif  // GKO_VERBOSE_LEVEL >= 1
-        std::exit(error_code);
-    }
+    alloc_->deallocate(ptr);
 }
 
 
 void* HipExecutor::raw_alloc(size_type num_bytes) const
 {
-    void* dev_ptr = nullptr;
     detail::hip_scoped_device_id_guard g(this->get_device_id());
-    int error_code = 0;
-    if (this->alloc_mode_ == allocation_mode::device) {
-        error_code = hipMalloc(&dev_ptr, num_bytes);
-#if !(GKO_HIP_PLATFORM_HCC == 1)
-    } else if (this->alloc_mode_ == allocation_mode::unified_global) {
-        error_code = hipMallocManaged(&dev_ptr, num_bytes, hipMemAttachGlobal);
-    } else if (this->alloc_mode_ == allocation_mode::unified_host) {
-        error_code = hipMallocManaged(&dev_ptr, num_bytes, hipMemAttachHost);
-#endif
-    } else {
-        GKO_NOT_SUPPORTED(this->alloc_mode_);
-    }
-    if (error_code != hipErrorMemoryAllocation) {
-        GKO_ASSERT_NO_HIP_ERRORS(error_code);
-    }
-    GKO_ENSURE_ALLOCATED(dev_ptr, "hip", num_bytes);
-    return dev_ptr;
+    return alloc_->allocate(num_bytes);
 }
 
 
@@ -309,73 +267,4 @@ void HipExecutor::init_handles()
 }
 
 
-hip_stream::hip_stream(int device_id) : stream_{}, device_id_(device_id)
-{
-    detail::hip_scoped_device_id_guard g(device_id_);
-    GKO_ASSERT_NO_HIP_ERRORS(hipStreamCreate(&stream_));
-}
-
-
-hip_stream::~hip_stream()
-{
-    if (stream_) {
-        detail::hip_scoped_device_id_guard g(device_id_);
-        hipStreamDestroy(stream_);
-    }
-}
-
-
-hip_stream::hip_stream(hip_stream&& other)
-    : stream_{std::exchange(other.stream_, nullptr)},
-      device_id_{std::exchange(other.device_id_, -1)}
-{}
-
-
-GKO_HIP_STREAM_STRUCT* hip_stream::get() const { return stream_; }
-
-
-namespace log {
-
-
-#if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX
-
-void begin_roctx(const char* name, profile_event_category)
-{
-    roctxRangePush(name);
-}
-
-void end_roctx(const char*, profile_event_category) { roctxRangePop(); }
-
-#else
-
-void begin_roctx(const char* name, profile_event_category)
-    GKO_NOT_COMPILED(roctx);
-
-void end_roctx(const char*, profile_event_category) GKO_NOT_COMPILED(roctx);
-
-#endif
-
-
-}  // namespace log
-
-
-namespace kernels {
-namespace hip {
-
-
-void reset_device(int device_id)
-{
-    gko::detail::hip_scoped_device_id_guard guard{device_id};
-    hipDeviceReset();
-}
-
-
-void destroy_event(GKO_HIP_EVENT_STRUCT* event)
-{
-    GKO_ASSERT_NO_HIP_ERRORS(hipEventDestroy(event));
-}
-
-
-}  // namespace hip
-}  // namespace kernels
 }  // namespace gko
diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp
new file mode 100644
index 00000000000..f2a8977525f
--- /dev/null
+++ b/hip/base/memory.hip.cpp
@@ -0,0 +1,97 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/memory.hpp>
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+namespace gko {
+
+
+#define GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(_operation, _size)       \
+    {                                                                \
+        auto error_code = _operation;                                \
+        if (error_code == hipErrorMemoryAllocation) {                \
+            throw AllocationError(__FILE__, __LINE__, "hip", _size); \
+        } else {                                                     \
+            GKO_ASSERT_NO_HIP_ERRORS(error_code);                    \
+        }                                                            \
+    }
+
+
+#if GKO_VERBOSE_LEVEL >= 1
+#define GKO_EXIT_ON_HIP_ERROR(_operation)                                  \
+    {                                                                      \
+        const auto error_code = _operation;                                \
+        if (error_code != hipSuccess) {                                    \
+            int device_id{-1};                                             \
+            hipGetDevice(&device_id);                                      \
+            std::cerr << "Unrecoverable HIP error on device " << device_id \
+                      << " in " << __func__ << ": "                        \
+                      << hipGetErrorName(error_code) << ": "               \
+                      << hipGetErrorString(error_code) << std::endl        \
+                      << "Exiting program" << std::endl;                   \
+            std::exit(error_code);                                         \
+        }                                                                  \
+    }
+#else
+#define GKO_EXIT_ON_HIP_ERROR(_operation)   \
+    {                                       \
+        const auto error_code = _operation; \
+        if (error_code != hipSuccess) {     \
+            std::exit(error_code);          \
+        }                                   \
+    }
+#endif
+
+
+void* HipAllocator::allocate(size_type num_bytes) const
+{
+    void* dev_ptr{};
+    GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(hipMalloc(&dev_ptr, num_bytes),
+                                        num_bytes);
+    return dev_ptr;
+}
+
+
+void HipAllocator::deallocate(void* dev_ptr) const
+{
+    GKO_EXIT_ON_HIP_ERROR(hipFree(dev_ptr));
+}
+
+
+}  // namespace gko
diff --git a/hip/base/roctx.hip.cpp b/hip/base/roctx.hip.cpp
new file mode 100644
index 00000000000..9f309b93362
--- /dev/null
+++ b/hip/base/roctx.hip.cpp
@@ -0,0 +1,70 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/config.hpp>
+
+
+#include <hip/hip_runtime.h>
+#if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX
+#include <roctx.h>
+#endif
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/log/profiler_hook.hpp>
+
+
+namespace gko {
+namespace log {
+
+
+#if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX
+
+void begin_roctx(const char* name, profile_event_category)
+{
+    roctxRangePush(name);
+}
+
+void end_roctx(const char*, profile_event_category) { roctxRangePop(); }
+
+#else
+
+void begin_roctx(const char* name, profile_event_category)
+    GKO_NOT_COMPILED(roctx);
+
+void end_roctx(const char*, profile_event_category) GKO_NOT_COMPILED(roctx);
+
+#endif
+
+
+}  // namespace log
+}  // namespace gko
diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp
new file mode 100644
index 00000000000..e5817eb9ebd
--- /dev/null
+++ b/hip/base/stream.hip.cpp
@@ -0,0 +1,78 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/stream.hpp>
+
+
+#include <hip/hip_runtime.h>
+
+
+#include <ginkgo/config.hpp>
+#include <ginkgo/core/base/device.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+
+
+#include "hip/base/scoped_device_id.hip.hpp"
+
+
+namespace gko {
+
+
+hip_stream::hip_stream() : stream_{}, device_id_{-1} {}
+
+
+hip_stream::hip_stream(int device_id) : stream_{}, device_id_(device_id)
+{
+    detail::hip_scoped_device_id_guard g(device_id_);
+    GKO_ASSERT_NO_HIP_ERRORS(hipStreamCreate(&stream_));
+}
+
+
+hip_stream::~hip_stream()
+{
+    if (stream_) {
+        detail::hip_scoped_device_id_guard g(device_id_);
+        hipStreamDestroy(stream_);
+    }
+}
+
+
+hip_stream::hip_stream(hip_stream&& other)
+    : stream_{std::exchange(other.stream_, nullptr)},
+      device_id_{std::exchange(other.device_id_, -1)}
+{}
+
+
+GKO_HIP_STREAM_STRUCT* hip_stream::get() const { return stream_; }
+
+
+}  // namespace gko
diff --git a/hip/test/base/CMakeLists.txt b/hip/test/base/CMakeLists.txt
index 7ed0d2ceb52..f597a3d6e3d 100644
--- a/hip/test/base/CMakeLists.txt
+++ b/hip/test/base/CMakeLists.txt
@@ -1,6 +1,5 @@
 ginkgo_create_hip_test(hip_executor)
 ginkgo_create_test(index_set)
-ginkgo_create_test(hip_executor_reset ADDITIONAL_LIBRARIES Threads::Threads)
 if(GINKGO_HAVE_HWLOC)
     find_package(NUMA REQUIRED)
     ginkgo_create_hip_test(hip_executor_topology ADDITIONAL_LIBRARIES NUMA::NUMA)
diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp
index d27dd58d132..e531fa739e6 100644
--- a/hip/test/base/hip_executor.hip.cpp
+++ b/hip/test/base/hip_executor.hip.cpp
@@ -109,18 +109,18 @@ class HipExecutor : public ::testing::Test {
         ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
 #ifdef GKO_TEST_NONDEFAULT_STREAM
         hip = gko::HipExecutor::create(
-            0, omp, false, gko::default_hip_alloc_mode, stream.get());
-        hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1,
-                                        omp, false, gko::default_hip_alloc_mode,
-                                        other_stream.get());
+            0, omp, std::make_shared<gko::HipAllocator>(), stream.get());
+        hip2 = gko::HipExecutor::create(
+            gko::HipExecutor::get_num_devices() - 1, omp,
+            std::make_shared<gko::HipAllocator>(), other_stream.get());
         hip3 = gko::HipExecutor::create(
-            0, omp, false, gko::allocation_mode::unified_global, stream.get());
+            0, omp, std::make_shared<gko::HipAllocator>(), stream.get());
 #else
         hip = gko::HipExecutor::create(0, omp);
         hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1,
                                         omp);
-        hip3 = gko::HipExecutor::create(0, omp, false,
-                                        gko::allocation_mode::unified_global);
+        hip3 = gko::HipExecutor::create(0, omp,
+                                        std::make_shared<gko::HipAllocator>());
 #endif
     }
 
diff --git a/hip/test/utils.hip.hpp b/hip/test/utils.hip.hpp
index 9337da14139..bf7073cf9a1 100644
--- a/hip/test/utils.hip.hpp
+++ b/hip/test/utils.hip.hpp
@@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/stream.hpp>
 
 
 #include "hip/base/device.hpp"
@@ -60,8 +61,9 @@ class HipTestFixture : public ::testing::Test {
     HipTestFixture()
         : ref(gko::ReferenceExecutor::create()),
 #ifdef GKO_TEST_NONDEFAULT_STREAM
+          stream(0),
           exec(gko::HipExecutor::create(
-              0, ref, false, gko::default_hip_alloc_mode, stream.get()))
+              0, ref, std::make_shared<gko::HipAllocator>(), stream.get()))
 #else
           exec(gko::HipExecutor::create(0, ref))
 #endif
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 965cd562bff..4545b216f86 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -47,7 +47,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/base/device.hpp>
+#include <ginkgo/core/base/fwd_defs.hpp>
 #include <ginkgo/core/base/machine_topology.hpp>
+#include <ginkgo/core/base/memory.hpp>
 #include <ginkgo/core/base/scoped_device_id_guard.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/log/logger.hpp>
@@ -121,33 +123,6 @@ constexpr allocation_mode default_hip_alloc_mode =
 }  // namespace gko
 
 
-// after intel/llvm September'22 release, which uses major version 6, they
-// introduce another inline namespace _V1.
-#if GINKGO_DPCPP_MAJOR_VERSION >= 6
-namespace sycl {
-inline namespace _V1 {
-
-
-class queue;
-class event;
-
-
-}  // namespace _V1
-}  // namespace sycl
-#else  // GINKGO_DPCPP_MAJOR_VERSION < 6
-inline namespace cl {
-namespace sycl {
-
-
-class queue;
-class event;
-
-
-}  // namespace sycl
-}  // namespace cl
-#endif
-
-
 /**
  * The enum class is for the dpcpp queue property. It's legal to use a binary
  * or(|) operation to combine several properties.
@@ -172,29 +147,6 @@ GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
 }
 
 
-struct cublasContext;
-
-struct cusparseContext;
-
-struct CUstream_st;
-
-struct CUevent_st;
-
-struct hipblasContext;
-
-struct hipsparseContext;
-
-#if GINKGO_HIP_PLATFORM_HCC
-struct ihipStream_t;
-struct ihipEvent_t;
-#define GKO_HIP_STREAM_STRUCT ihipStream_t
-#define GKO_HIP_EVENT_STRUCT ihipEvent_t
-#else
-#define GKO_HIP_STREAM_STRUCT CUstream_st
-#define GKO_HIP_EVENT_STRUCT CUevent_st
-#endif
-
-
 namespace gko {
 
 
@@ -1355,26 +1307,14 @@ class EnableDeviceReset {
      *
      * @param device_reset  whether to allow a device reset or not
      */
-    void set_device_reset(bool device_reset) { device_reset_ = device_reset; }
+    void set_device_reset(bool device_reset) {}
 
     /**
      * Returns the current status of the device reset boolean for this executor.
      *
      * @return the current status of the device reset boolean for this executor.
      */
-    bool get_device_reset() { return device_reset_; }
-
-protected:
-    /**
-     * Instantiate an EnableDeviceReset class
-     *
-     * @param device_reset  the starting device_reset status. Defaults to false.
-     */
-    EnableDeviceReset(bool device_reset = false) : device_reset_{device_reset}
-    {}
-
-private:
-    bool device_reset_{};
+    bool get_device_reset() { return false; }
 };
 
 
@@ -1411,9 +1351,11 @@ class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
     /**
      * Creates a new OmpExecutor.
      */
-    static std::shared_ptr<OmpExecutor> create()
+    static std::shared_ptr<OmpExecutor> create(
+        std::shared_ptr<CpuAllocatorBase> alloc =
+            std::make_shared<CpuAllocator>())
     {
-        return std::shared_ptr<OmpExecutor>(new OmpExecutor());
+        return std::shared_ptr<OmpExecutor>(new OmpExecutor(std::move(alloc)));
     }
 
     std::shared_ptr<Executor> get_master() noexcept override;
@@ -1435,7 +1377,8 @@ class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
     scoped_device_id_guard get_scoped_device_id_guard() const override;
 
 protected:
-    OmpExecutor()
+    OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
+        : alloc_{std::move(alloc)}
     {
         this->OmpExecutor::populate_exec_info(machine_topology::get_instance());
     }
@@ -1457,6 +1400,8 @@ class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
 
     bool verify_memory_to(const DpcppExecutor* dest_exec) const override;
+
+    std::shared_ptr<CpuAllocatorBase> alloc_;
 };
 
 
@@ -1476,9 +1421,12 @@ using DefaultExecutor = OmpExecutor;
  */
 class ReferenceExecutor : public OmpExecutor {
 public:
-    static std::shared_ptr<ReferenceExecutor> create()
+    static std::shared_ptr<ReferenceExecutor> create(
+        std::shared_ptr<CpuAllocatorBase> alloc =
+            std::make_shared<CpuAllocator>())
     {
-        return std::shared_ptr<ReferenceExecutor>(new ReferenceExecutor());
+        return std::shared_ptr<ReferenceExecutor>(
+            new ReferenceExecutor(std::move(alloc)));
     }
 
     scoped_device_id_guard get_scoped_device_id_guard() const override
@@ -1495,7 +1443,8 @@ class ReferenceExecutor : public OmpExecutor {
     }
 
 protected:
-    ReferenceExecutor()
+    ReferenceExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
+        : OmpExecutor{std::move(alloc)}
     {
         this->ReferenceExecutor::populate_exec_info(
             machine_topology::get_instance());
@@ -1550,15 +1499,32 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
      * @param device_id  the CUDA device id of this device
      * @param master  an executor on the host that is used to invoke the device
      * kernels
-     * @param device_reset  whether to reset the device after the object exits
-     *                      the scope.
+     * @param device_reset  this option no longer has any effect.
      * @param alloc_mode  the allocation mode that the executor should operate
      *                    on. See @allocation_mode for more details
+     * @param stream  the stream to execute operations on.
+     */
+    [[deprecated(
+        "device_reset is deprecated entirely, call cudaDeviceReset directly. "
+        "alloc_mode was replaced by the Allocator type "
+        "hierarchy.")]] static std::shared_ptr<CudaExecutor>
+    create(int device_id, std::shared_ptr<Executor> master, bool device_reset,
+           allocation_mode alloc_mode = default_cuda_alloc_mode,
+           CUstream_st* stream = nullptr);
+
+    /**
+     * Creates a new CudaExecutor with a custom allocator and device stream.
+     *
+     * @param device_id  the CUDA device id of this device
+     * @param master  an executor on the host that is used to invoke the device
+     *                kernels.
+     * @param alloc  the allocator to use for device memory allocations.
+     * @param stream  the stream to execute operations on.
      */
     static std::shared_ptr<CudaExecutor> create(
         int device_id, std::shared_ptr<Executor> master,
-        bool device_reset = false,
-        allocation_mode alloc_mode = default_cuda_alloc_mode,
+        std::shared_ptr<CudaAllocatorBase> alloc =
+            std::make_shared<CudaAllocator>(),
         CUstream_st* stream = nullptr);
 
     std::shared_ptr<Executor> get_master() noexcept override;
@@ -1679,26 +1645,15 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
     void init_handles();
 
     CudaExecutor(int device_id, std::shared_ptr<Executor> master,
-                 bool device_reset = false,
-                 allocation_mode alloc_mode = default_cuda_alloc_mode,
-                 CUstream_st* stream = nullptr)
-        : EnableDeviceReset{device_reset},
-          master_(master),
-          alloc_mode_{alloc_mode},
-          stream_{stream}
+                 std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
+        : alloc_{std::move(alloc)}, master_(master), stream_{stream}
     {
         this->get_exec_info().device_id = device_id;
         this->get_exec_info().num_computing_units = 0;
         this->get_exec_info().num_pu_per_cu = 0;
         this->CudaExecutor::populate_exec_info(
             machine_topology::get_instance());
-
-        // it only gets attribute from device, so it should not be affected by
-        // DeviceReset.
         this->set_gpu_property();
-        // increase the number of executor before any operations may be affected
-        // by DeviceReset.
-        increase_num_execs(this->get_exec_info().device_id);
         this->init_handles();
     }
 
@@ -1718,12 +1673,6 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
 
     bool verify_memory_to(const CudaExecutor* dest_exec) const override;
 
-    static void increase_num_execs(unsigned device_id);
-
-    static void decrease_num_execs(unsigned device_id);
-
-    static unsigned get_num_execs(unsigned device_id);
-
     void populate_exec_info(const machine_topology* mach_topo) override;
 
 private:
@@ -1733,45 +1682,8 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
     using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
     handle_manager<cublasContext> cublas_handle_;
     handle_manager<cusparseContext> cusparse_handle_;
+    std::shared_ptr<CudaAllocatorBase> alloc_;
     CUstream_st* stream_;
-
-    allocation_mode alloc_mode_;
-};
-
-
-/**
- * An RAII wrapper for a custom CUDA stream.
- * The stream will be created on construction and destroyed when the lifetime of
- * the wrapper ends.
- */
-class cuda_stream {
-public:
-    /** Creates a new custom CUDA stream. */
-    cuda_stream(int device_id = 0);
-
-    /** Destroys the custom CUDA stream, if it wasn't moved-from already. */
-    ~cuda_stream();
-
-    cuda_stream(const cuda_stream&) = delete;
-
-    /** Move-constructs from an existing stream, which will be emptied. */
-    cuda_stream(cuda_stream&&);
-
-    cuda_stream& operator=(const cuda_stream&) = delete;
-
-    /** Move-assigns from an existing stream, which will be emptied. */
-    cuda_stream& operator=(cuda_stream&&) = delete;
-
-    /**
-     * Returns the native CUDA stream handle.
-     * In a moved-from cuda_stream, this will return nullptr.
-     */
-    CUstream_st* get() const;
-
-private:
-    CUstream_st* stream_;
-
-    int device_id_;
 };
 
 
@@ -1805,10 +1717,15 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
      * @param alloc_mode  the allocation mode that the executor should operate
      *                    on. See @allocation_mode for more details
      */
+    [[deprecated("")]] static std::shared_ptr<HipExecutor> create(
+        int device_id, std::shared_ptr<Executor> master, bool device_reset,
+        allocation_mode alloc_mode = default_hip_alloc_mode,
+        GKO_HIP_STREAM_STRUCT* stream = nullptr);
+
     static std::shared_ptr<HipExecutor> create(
         int device_id, std::shared_ptr<Executor> master,
-        bool device_reset = false,
-        allocation_mode alloc_mode = default_hip_alloc_mode,
+        std::shared_ptr<HipAllocatorBase> alloc =
+            std::make_shared<HipAllocator>(),
         GKO_HIP_STREAM_STRUCT* stream = nullptr);
 
     std::shared_ptr<Executor> get_master() noexcept override;
@@ -1923,25 +1840,15 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
     void init_handles();
 
     HipExecutor(int device_id, std::shared_ptr<Executor> master,
-                bool device_reset = false,
-                allocation_mode alloc_mode = default_hip_alloc_mode,
-                GKO_HIP_STREAM_STRUCT* stream = nullptr)
-        : EnableDeviceReset{device_reset},
-          master_(master),
-          alloc_mode_(alloc_mode),
-          stream_{stream}
+                std::shared_ptr<HipAllocatorBase> alloc,
+                GKO_HIP_STREAM_STRUCT* stream)
+        : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream}
     {
         this->get_exec_info().device_id = device_id;
         this->get_exec_info().num_computing_units = 0;
         this->get_exec_info().num_pu_per_cu = 0;
         this->HipExecutor::populate_exec_info(machine_topology::get_instance());
-
-        // it only gets attribute from device, so it should not be affected by
-        // DeviceReset.
         this->set_gpu_property();
-        // increase the number of executor before any operations may be affected
-        // by DeviceReset.
-        increase_num_execs(this->get_exec_info().device_id);
         this->init_handles();
     }
 
@@ -1961,12 +1868,6 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
 
     bool verify_memory_to(const HipExecutor* dest_exec) const override;
 
-    static void increase_num_execs(int device_id);
-
-    static void decrease_num_execs(int device_id);
-
-    static int get_num_execs(int device_id);
-
     void populate_exec_info(const machine_topology* mach_topo) override;
 
 private:
@@ -1976,48 +1877,11 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
     using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
     handle_manager<hipblasContext> hipblas_handle_;
     handle_manager<hipsparseContext> hipsparse_handle_;
-
-    allocation_mode alloc_mode_;
+    std::shared_ptr<HipAllocatorBase> alloc_;
     GKO_HIP_STREAM_STRUCT* stream_;
 };
 
 
-/**
- * An RAII wrapper for a custom HIP stream.
- * The stream will be created on construction and destroyed when the lifetime of
- * the wrapper ends.
- */
-class hip_stream {
-public:
-    /** Creates a new custom HIP stream. */
-    hip_stream(int device_id = 0);
-
-    /** Destroys the custom HIP stream, if it wasn't moved-from already. */
-    ~hip_stream();
-
-    hip_stream(const hip_stream&) = delete;
-
-    /** Move-constructs from an existing stream, which will be emptied. */
-    hip_stream(hip_stream&&);
-
-    hip_stream& operator=(const hip_stream&) = delete;
-
-    /** Move-assigns from an existing stream, which will be emptied. */
-    hip_stream& operator=(hip_stream&&) = delete;
-
-    /**
-     * Returns the native HIP stream handle.
-     * In a moved-from hip_stream, this will return nullptr.
-     */
-    GKO_HIP_STREAM_STRUCT* get() const;
-
-private:
-    GKO_HIP_STREAM_STRUCT* stream_;
-
-    int device_id_;
-};
-
-
 namespace kernels {
 namespace hip {
 using DefaultExecutor = HipExecutor;
@@ -2050,6 +1914,28 @@ class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
         std::string device_type = "all",
         dpcpp_queue_property property = dpcpp_queue_property::in_order);
 
+    /**
+     * Creates a new DpcppExecutor from an existing SYCL queue.
+     *
+     * @param queue  the DPCPP device id of this device
+     * @param master  an executor on the host that is used to invoke the device
+     *                kernels
+     */
+    static std::shared_ptr<DpcppExecutor> create(
+        sycl::queue* queue, std::shared_ptr<Executor> master);
+
+    /**
+     * Creates a new DpcppExecutor from an existing SYCL queue.
+     *
+     * @param queue  the DPCPP device id of this device
+     * @param master  an executor on the host that is used to invoke the device
+     *                kernels
+     * @param alloc  the allocator used for memory allocation
+     */
+    static std::shared_ptr<DpcppExecutor> create(
+        sycl::queue* queue, std::shared_ptr<Executor> master,
+        std::shared_ptr<DpcppAllocatorBase> alloc);
+
     std::shared_ptr<Executor> get_master() noexcept override;
 
     std::shared_ptr<const Executor> get_master() const noexcept override;
diff --git a/include/ginkgo/core/base/fwd_defs.hpp b/include/ginkgo/core/base/fwd_defs.hpp
new file mode 100644
index 00000000000..5f0cbd9d960
--- /dev/null
+++ b/include/ginkgo/core/base/fwd_defs.hpp
@@ -0,0 +1,90 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_BASE_FWD_DEFS_HPP_
+#define GKO_PUBLIC_CORE_BASE_FWD_DEFS_HPP_
+
+
+#include <ginkgo/config.hpp>
+
+
+struct cublasContext;
+
+struct cusparseContext;
+
+struct CUstream_st;
+
+struct CUevent_st;
+
+struct hipblasContext;
+
+struct hipsparseContext;
+
+#if GINKGO_HIP_PLATFORM_HCC
+struct ihipStream_t;
+struct ihipEvent_t;
+#define GKO_HIP_STREAM_STRUCT ihipStream_t
+#define GKO_HIP_EVENT_STRUCT ihipEvent_t
+#else
+#define GKO_HIP_STREAM_STRUCT CUstream_st
+#define GKO_HIP_EVENT_STRUCT CUevent_st
+#endif
+
+
+// after intel/llvm September'22 release, which uses major version 6, they
+// introduce another inline namespace _V1.
+#if GINKGO_DPCPP_MAJOR_VERSION >= 6
+namespace sycl {
+inline namespace _V1 {
+
+
+class queue;
+class event;
+
+
+}  // namespace _V1
+}  // namespace sycl
+#else  // GINKGO_DPCPP_MAJOR_VERSION < 6
+inline namespace cl {
+namespace sycl {
+
+
+class queue;
+class event;
+
+
+}  // namespace sycl
+}  // namespace cl
+#endif
+
+
+#endif  // GKO_PUBLIC_CORE_BASE_FWD_DEFS_HPP_
diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp
new file mode 100644
index 00000000000..ec25920dcea
--- /dev/null
+++ b/include/ginkgo/core/base/memory.hpp
@@ -0,0 +1,211 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_BASE_MEMORY_HPP_
+#define GKO_PUBLIC_CORE_BASE_MEMORY_HPP_
+
+
+#include <ginkgo/core/base/fwd_defs.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+
+
+/**
+ * Provides generic allocation and deallocation functionality to be used by an
+ * Executor.
+ */
+class Allocator {
+public:
+    virtual ~Allocator() = default;
+
+    virtual void* allocate(size_type num_bytes) const = 0;
+
+    virtual void deallocate(void* ptr) const = 0;
+};
+
+
+/**
+ * Implement this interface to provide an allocator for OmpExecutor or
+ * ReferenceExecutor.
+ */
+class CpuAllocatorBase : public Allocator {};
+
+
+/**
+ * Implement this interface to provide an allocator for CudaExecutor.
+ */
+class CudaAllocatorBase : public Allocator {};
+
+
+/**
+ * Implement this interface to provide an allocator for HipExecutor.
+ */
+class HipAllocatorBase : public Allocator {};
+
+
+/**
+ * Implement this interface to provide an allocator for DpcppExecutor.
+ */
+class DpcppAllocatorBase : public Allocator {
+public:
+    DpcppAllocatorBase(sycl::queue* queue);
+
+protected:
+    virtual void* allocate_impl(sycl::queue* queue,
+                                size_type num_bytes) const = 0;
+
+    virtual void deallocate_impl(sycl::queue* queue, void* ptr) const = 0;
+
+private:
+    sycl::queue* queue_;
+};
+
+
+/**
+ * Allocator using new/delete.
+ */
+class CpuAllocator : public CpuAllocatorBase {
+public:
+    void* allocate(size_type num_bytes) const override;
+
+    void deallocate(void* ptr) const override;
+};
+
+
+/**
+ * Allocator using cudaMalloc.
+ */
+class CudaAllocator : public CudaAllocatorBase {
+public:
+    void* allocate(size_type num_bytes) const override;
+
+    void deallocate(void* ptr) const override;
+};
+
+
+/*
+ * Allocator using cudaMallocAsync.
+ */
+class CudaAsyncAllocator : public CudaAllocatorBase {
+public:
+    void* allocate(size_type num_bytes) const override;
+
+    void deallocate(void* ptr) const override;
+
+    CudaAsyncAllocator(CUstream_st* stream);
+
+private:
+    CUstream_st* stream_;
+};
+
+
+/*
+ * Allocator using cudaMallocManaged
+ */
+class CudaUnifiedAllocator : public CudaAllocatorBase, public CpuAllocatorBase {
+public:
+    void* allocate(size_type num_bytes) const override;
+
+    void deallocate(void* ptr) const override;
+
+    CudaUnifiedAllocator(int device_id);
+
+    CudaUnifiedAllocator(int device_id, unsigned int flags);
+
+private:
+    int device_id_;
+    unsigned int flags_;
+};
+
+
+/*
+ * Allocator using cudaMallocHost.
+ */
+class CudaHostAllocator : public CudaAllocatorBase, public CpuAllocatorBase {
+public:
+    void* allocate(size_type num_bytes) const override;
+
+    void deallocate(void* ptr) const override;
+
+    CudaHostAllocator(int device_id);
+
+private:
+    int device_id_;
+};
+
+
+/*
+ * Allocator using hipMalloc.
+ */
+class HipAllocator : public HipAllocatorBase {
+public:
+    void* allocate(size_type num_bytes) const override;
+
+    void deallocate(void* ptr) const override;
+};
+
+
+/*
+ * Allocator using sycl::malloc_device.
+ */
+class DpcppAllocator : public DpcppAllocatorBase {
+public:
+    using DpcppAllocatorBase::DpcppAllocatorBase;
+
+protected:
+    void* allocate_impl(sycl::queue* queue, size_type num_bytes) const override;
+
+    void deallocate_impl(sycl::queue* queue, void* ptr) const override;
+};
+
+
+/*
+ * Allocator using sycl::malloc_shared.
+ */
+class DpcppUnifiedAllocator : public DpcppAllocatorBase,
+                              public CpuAllocatorBase {
+public:
+    using DpcppAllocatorBase::DpcppAllocatorBase;
+
+protected:
+    void* allocate_impl(sycl::queue* queue, size_type num_bytes) const override;
+
+    void deallocate_impl(sycl::queue* queue, void* ptr) const override;
+};
+
+
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_BASE_MEMORY_HPP_
diff --git a/include/ginkgo/core/base/stream.hpp b/include/ginkgo/core/base/stream.hpp
new file mode 100644
index 00000000000..4bb4aeecf9e
--- /dev/null
+++ b/include/ginkgo/core/base/stream.hpp
@@ -0,0 +1,124 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_PUBLIC_CORE_BASE_STREAM_HPP_
+#define GKO_PUBLIC_CORE_BASE_STREAM_HPP_
+
+
+#include <ginkgo/core/base/executor.hpp>
+
+
+namespace gko {
+
+
+/**
+ * An RAII wrapper for a custom CUDA stream.
+ * The stream will be created on construction and destroyed when the lifetime of
+ * the wrapper ends.
+ */
+class cuda_stream {
+public:
+    /** Creates an empty stream wrapper, representing the default stream. */
+    cuda_stream();
+
+    /** Creates a new custom CUDA stream. */
+    cuda_stream(int device_id);
+
+    /** Destroys the custom CUDA stream, if it isn't empty. */
+    ~cuda_stream();
+
+    cuda_stream(const cuda_stream&) = delete;
+
+    /** Move-constructs from an existing stream, which will be emptied. */
+    cuda_stream(cuda_stream&&);
+
+    cuda_stream& operator=(const cuda_stream&) = delete;
+
+    /** Move-assigns from an existing stream, which will be emptied. */
+    cuda_stream& operator=(cuda_stream&&) = delete;
+
+    /**
+     * Returns the native CUDA stream handle.
+     * In an empty cuda_stream, this will return nullptr.
+     */
+    CUstream_st* get() const;
+
+private:
+    CUstream_st* stream_;
+
+    int device_id_;
+};
+
+
+/**
+ * An RAII wrapper for a custom HIP stream.
+ * The stream will be created on construction and destroyed when the lifetime of
+ * the wrapper ends.
+ */
+class hip_stream {
+public:
+    /** Creates an empty stream wrapper, representing the default stream. */
+    hip_stream();
+
+    /** Creates a new custom HIP stream. */
+    hip_stream(int device_id);
+
+    /** Destroys the custom HIP stream, if it isn't empty. */
+    ~hip_stream();
+
+    hip_stream(const hip_stream&) = delete;
+
+    /** Move-constructs from an existing stream, which will be emptied. */
+    hip_stream(hip_stream&&);
+
+    hip_stream& operator=(const hip_stream&) = delete;
+
+    /** Move-assigns from an existing stream, which will be emptied. */
+    hip_stream& operator=(hip_stream&&) = delete;
+
+    /**
+     * Returns the native HIP stream handle.
+     * In an empty hip_stream, this will return nullptr.
+     */
+    GKO_HIP_STREAM_STRUCT* get() const;
+
+private:
+    GKO_HIP_STREAM_STRUCT* stream_;
+
+    int device_id_;
+};
+
+
+}  // namespace gko
+
+
+#endif  // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp
index 93663b02290..d73bf669700 100644
--- a/include/ginkgo/ginkgo.hpp
+++ b/include/ginkgo/ginkgo.hpp
@@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/fwd_defs.hpp>
 #include <ginkgo/core/base/index_set.hpp>
 #include <ginkgo/core/base/intrinsics.hpp>
 #include <ginkgo/core/base/lin_op.hpp>
@@ -55,6 +56,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/matrix_assembly_data.hpp>
 #include <ginkgo/core/base/matrix_data.hpp>
+#include <ginkgo/core/base/memory.hpp>
 #include <ginkgo/core/base/mpi.hpp>
 #include <ginkgo/core/base/mtx_io.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
@@ -65,6 +67,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/range_accessors.hpp>
 #include <ginkgo/core/base/scoped_device_id_guard.hpp>
 #include <ginkgo/core/base/std_extensions.hpp>
+#include <ginkgo/core/base/stream.hpp>
 #include <ginkgo/core/base/temporary_clone.hpp>
 #include <ginkgo/core/base/temporary_conversion.hpp>
 #include <ginkgo/core/base/timer.hpp>
diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp
index 25482cf18c8..33e6258fbbd 100644
--- a/test/utils/executor.hpp
+++ b/test/utils/executor.hpp
@@ -44,6 +44,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <gtest/gtest.h>
 
 
+#include <ginkgo/core/base/stream.hpp>
+
+
 #ifdef GKO_COMPILING_CUDA
 
 #include "cuda/base/device.hpp"
@@ -106,8 +109,8 @@ inline void init_executor(std::shared_ptr<gko::ReferenceExecutor> ref,
         if (gko::CudaExecutor::get_num_devices() == 0) {
             throw std::runtime_error{"No suitable CUDA devices"};
         }
-        exec = gko::CudaExecutor::create(0, ref, false,
-                                         gko::default_cuda_alloc_mode, stream);
+        exec = gko::CudaExecutor::create(
+            0, ref, std::make_shared<gko::CudaAsyncAllocator>(stream), stream);
     }
 }
 
@@ -119,8 +122,8 @@ inline void init_executor(std::shared_ptr<gko::ReferenceExecutor> ref,
     if (gko::HipExecutor::get_num_devices() == 0) {
         throw std::runtime_error{"No suitable HIP devices"};
     }
-    exec = gko::HipExecutor::create(0, ref, false, gko::default_hip_alloc_mode,
-                                    stream);
+    exec = gko::HipExecutor::create(
+        0, ref, std::make_shared<gko::HipAllocator>(), stream);
 }
 
 
@@ -146,7 +149,13 @@ class CommonTestFixture : public ::testing::Test {
 #endif
     using index_type = int;
 
-    CommonTestFixture() : ref{gko::ReferenceExecutor::create()}
+    CommonTestFixture()
+        :
+#if defined(GKO_TEST_NONDEFAULT_STREAM) && \
+    (defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP))
+          stream{0},
+#endif
+          ref{gko::ReferenceExecutor::create()}
     {
 #if defined(GKO_TEST_NONDEFAULT_STREAM) && \
     (defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP))
diff --git a/test/utils/mpi/executor.hpp b/test/utils/mpi/executor.hpp
index 59c3f1e3f3b..d8c94e01804 100644
--- a/test/utils/mpi/executor.hpp
+++ b/test/utils/mpi/executor.hpp
@@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/base/mpi.hpp>
+#include <ginkgo/core/base/stream.hpp>
 
 
 inline void init_executor(std::shared_ptr<gko::ReferenceExecutor>,
@@ -71,7 +72,7 @@ inline void init_executor(std::shared_ptr<gko::ReferenceExecutor> ref,
         exec = gko::CudaExecutor::create(
             gko::experimental::mpi::map_rank_to_device_id(
                 MPI_COMM_WORLD, gko::CudaExecutor::get_num_devices()),
-            ref, false, gko::default_cuda_alloc_mode, stream);
+            ref, std::make_shared<gko::CudaAllocator>(), stream);
     }
 }
 
@@ -86,7 +87,7 @@ inline void init_executor(std::shared_ptr<gko::ReferenceExecutor> ref,
     exec = gko::HipExecutor::create(
         gko::experimental::mpi::map_rank_to_device_id(
             MPI_COMM_WORLD, gko::HipExecutor::get_num_devices()),
-        ref, false, gko::default_hip_alloc_mode, stream);
+        ref, std::make_shared<gko::HipAllocator>(), stream);
 }
 
 

From e046562667c920addcda2b9c91e560f0ee037c8a Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Thu, 30 Mar 2023 12:23:58 +0200
Subject: [PATCH 02/14] reset to default CUDA allocator

---
 test/utils/executor.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp
index 33e6258fbbd..c588ac74260 100644
--- a/test/utils/executor.hpp
+++ b/test/utils/executor.hpp
@@ -110,7 +110,7 @@ inline void init_executor(std::shared_ptr<gko::ReferenceExecutor> ref,
             throw std::runtime_error{"No suitable CUDA devices"};
         }
         exec = gko::CudaExecutor::create(
-            0, ref, std::make_shared<gko::CudaAsyncAllocator>(stream), stream);
+            0, ref, std::make_shared<gko::CudaAllocator>(stream), stream);
     }
 }
 

From 84ec96035657b62fe9fb3278873afe5b571a8f42 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Thu, 30 Mar 2023 16:18:42 +0200
Subject: [PATCH 03/14] fix some compilation issues

---
 core/device_hooks/cuda_hooks.cpp    | 12 ++++++++++-
 core/device_hooks/dpcpp_hooks.cpp   | 15 +++++++++++--
 core/device_hooks/hip_hooks.cpp     | 25 +++++++++-------------
 cuda/base/memory.cpp                | 28 ++++++++++++++++++++++++
 dpcpp/CMakeLists.txt                |  1 +
 dpcpp/base/executor.dp.cpp          | 33 -----------------------------
 dpcpp/base/memory.dp.cpp            | 25 ++++++++++++++++------
 include/ginkgo/core/base/memory.hpp |  4 ++++
 8 files changed, 86 insertions(+), 57 deletions(-)

diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp
index cdecf735a9d..f8489908cc9 100644
--- a/core/device_hooks/cuda_hooks.cpp
+++ b/core/device_hooks/cuda_hooks.cpp
@@ -96,9 +96,19 @@ void CudaHostAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda);
 std::shared_ptr<CudaExecutor> CudaExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
     allocation_mode alloc_mode, CUstream_st* stream)
+{
+    return std::shared_ptr<CudaExecutor>(
+        new CudaExecutor(device_id, std::move(master),
+                         std::make_shared<CudaAllocator>(), stream));
+}
+
+
+std::shared_ptr<CudaExecutor> CudaExecutor::create(
+    int device_id, std::shared_ptr<Executor> master,
+    std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
 {
     return std::shared_ptr<CudaExecutor>(new CudaExecutor(
-        device_id, std::move(master), device_reset, alloc_mode, stream));
+        device_id, std::move(master), std::move(alloc), stream));
 }
 
 
diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp
index 0ee3e6f289f..1981c712872 100644
--- a/core/device_hooks/dpcpp_hooks.cpp
+++ b/core/device_hooks/dpcpp_hooks.cpp
@@ -53,7 +53,18 @@ version version_info::get_dpcpp_version() noexcept
 }
 
 
-void* DpcppAllocator::allocate_impl(sycl::queue* queue, size_type size) const
+DpcppAllocatorBase::DpcppAllocatorBase(sycl::queue*) GKO_NOT_COMPILED(dpcpp);
+
+
+void* DpcppAllocatorBase::allocate(size_type num_bytes) const
+    GKO_NOT_COMPILED(dpcpp);
+
+
+void DpcppAllocatorBase::deallocate(void* ptr) const GKO_NOT_COMPILED(dpcpp);
+
+
+void* DpcppAllocator::allocate_impl(sycl::queue* queue,
+                                    size_type num_bytes) const
     GKO_NOT_COMPILED(dpcpp);
 
 
@@ -62,7 +73,7 @@ void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const
 
 
 void* DpcppUnifiedAllocator::allocate_impl(sycl::queue* queue,
-                                           size_type size) const
+                                           size_type num_bytes) const
     GKO_NOT_COMPILED(dpcpp);
 
 
diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp
index 739dac39f08..54486cc4e74 100644
--- a/core/device_hooks/hip_hooks.cpp
+++ b/core/device_hooks/hip_hooks.cpp
@@ -36,10 +36,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/stream.hpp>
 #include <ginkgo/core/base/timer.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/version.hpp>
 #include <ginkgo/core/log/profiler_hook.hpp>
+#include "ginkgo/core/base/memory.hpp"
 
 
 namespace gko {
@@ -53,29 +55,22 @@ version version_info::get_hip_version() noexcept
 }
 
 
-void* HipAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
+void* HipAllocator::allocate(size_type num_bytes) const GKO_NOT_COMPILED(hip);
 
 
-void HipAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
+void HipAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(hip);
 
 
-HipAsyncAllocator::HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream)
+std::shared_ptr<HipExecutor> HipExecutor::create(
+    int device_id, std::shared_ptr<Executor> master, bool device_reset,
+    allocation_mode alloc_mode, GKO_HIP_STREAM_STRUCT* stream)
     GKO_NOT_COMPILED(hip);
 
 
-void* HipAsyncAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
-
-
-void HipAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
-
-
 std::shared_ptr<HipExecutor> HipExecutor::create(
-    int device_id, std::shared_ptr<Executor> master, bool device_reset,
-    allocation_mode alloc_mode, GKO_HIP_STREAM_STRUCT* stream)
-{
-    return std::shared_ptr<HipExecutor>(new HipExecutor(
-        device_id, std::move(master), device_reset, alloc_mode, stream));
-}
+    int device_id, std::shared_ptr<Executor> master,
+    std::shared_ptr<HipAllocatorBase> alloc, GKO_HIP_STREAM_STRUCT* stream)
+    GKO_NOT_COMPILED(hip);
 
 
 void HipExecutor::populate_exec_info(const machine_topology* mach_topo)
diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp
index 11dee81ad42..c1b0a5d517f 100644
--- a/cuda/base/memory.cpp
+++ b/cuda/base/memory.cpp
@@ -97,6 +97,9 @@ void CudaAllocator::deallocate(void* ptr) const
 }
 
 
+#if CUDA_VERSION >= 11020
+
+
 CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} {}
 
 
@@ -108,12 +111,37 @@ void* CudaAsyncAllocator::allocate(size_type num_bytes) const
     return ptr;
 }
 
+
 void CudaAsyncAllocator::deallocate(void* ptr) const
 {
     GKO_EXIT_ON_CUDA_ERROR(cudaFreeAsync(ptr, stream_));
 }
 
 
+#else  // Fall back to regular allocation
+
+
+CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{} {}
+
+
+void* CudaAsyncAllocator::allocate(size_type num_bytes) const
+{
+    void* ptr{};
+    GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMalloc(&ptr, num_bytes),
+                                         num_bytes);
+    return ptr;
+}
+
+
+void CudaAsyncAllocator::deallocate(void* ptr) const
+{
+    GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr));
+}
+
+
+#endif
+
+
 CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id)
     : CudaUnifiedAllocator{device_id, cudaMemAttachGlobal}
 {}
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 31b5e0543ba..55763ca5525 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -10,6 +10,7 @@ target_sources(ginkgo_dpcpp
     base/executor.dp.cpp
     base/helper.dp.cpp
     base/index_set_kernels.dp.cpp
+    base/memory.dp.cpp
     base/scoped_device_id.dp.cpp
     base/timer.dp.cpp
     base/version.dp.cpp
diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp
index d668331a43b..c2015c8664c 100644
--- a/dpcpp/base/executor.dp.cpp
+++ b/dpcpp/base/executor.dp.cpp
@@ -51,39 +51,6 @@ namespace gko {
 namespace detail {
 
 
-DpcppAllocator::DpcppAllocator(sycl::queue* queue) : queue_{queue} {}
-
-
-void* DpcppAllocator::allocate(size_type size)
-{
-    return sycl::malloc_device(size, *queue_);
-}
-
-
-void DpcppAllocator::deallocate(void* ptr)
-{
-    queue_->wait_and_throw();
-    sycl::free(ptr, queue_->get_context());
-}
-
-
-DpcppUnifiedAllocator::DpcppUnifiedAllocator(sycl::queue* queue) : queue_{queue}
-{}
-
-
-void* DpcppUnifiedAllocator::allocate(size_type size)
-{
-    return sycl::malloc_shared(size, *queue_);
-}
-
-
-void DpcppUnifiedAllocator::deallocate(void* ptr)
-{
-    queue_->wait_and_throw();
-    sycl::free(ptr, queue_->get_context());
-}
-
-
 const std::vector<sycl::device> get_devices(std::string device_type)
 {
     std::map<std::string, sycl::info::device_type> device_type_map{
diff --git a/dpcpp/base/memory.dp.cpp b/dpcpp/base/memory.dp.cpp
index b1ccd007dea..2582fa331a0 100644
--- a/dpcpp/base/memory.dp.cpp
+++ b/dpcpp/base/memory.dp.cpp
@@ -42,10 +42,22 @@ namespace gko {
 DpcppAllocatorBase::DpcppAllocatorBase(sycl::queue* queue) : queue_{queue} {}
 
 
+void* DpcppAllocatorBase::allocate(size_type num_bytes) const
+{
+    return this->allocate_impl(queue_, num_bytes);
+}
+
+
+void DpcppAllocatorBase::deallocate(void* ptr) const
+{
+    this->deallocate_impl(queue_, ptr);
+}
+
+
 void* DpcppAllocator::allocate_impl(sycl::queue* queue,
                                     size_type num_bytes) const
 {
-    return sycl::malloc_device(size, *queue);
+    return sycl::malloc_device(num_bytes, *queue);
 }
 
 
@@ -56,16 +68,17 @@ void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const
 }
 
 
-void* DpcppUnifiedAllocator::allocate(size_type num_bytes)
+void* DpcppUnifiedAllocator::allocate_impl(sycl::queue* queue,
+                                           size_type num_bytes)
 {
-    return sycl::malloc_shared(size, *queue_);
+    return sycl::malloc_shared(num_bytes, *queue);
 }
 
 
-void DpcppUnifiedAllocator::deallocate(void* ptr)
+void DpcppUnifiedAllocator::deallocate_impl(sycl::queue* queue, void* ptr)
 {
-    queue_->wait_and_throw();
-    sycl::free(ptr, queue_->get_context());
+    queue->wait_and_throw();
+    sycl::free(ptr, queue->get_context());
 }
 
 
diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp
index ec25920dcea..872a25a9a33 100644
--- a/include/ginkgo/core/base/memory.hpp
+++ b/include/ginkgo/core/base/memory.hpp
@@ -81,6 +81,10 @@ class DpcppAllocatorBase : public Allocator {
 public:
     DpcppAllocatorBase(sycl::queue* queue);
 
+    void* allocate(size_type num_bytes) const final;
+
+    void deallocate(void* ptr) const final;
+
 protected:
     virtual void* allocate_impl(sycl::queue* queue,
                                 size_type num_bytes) const = 0;

From ec67b1990838217d82ad987103a66d609a18d207 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Wed, 31 May 2023 14:08:44 +0200
Subject: [PATCH 04/14] formatting

---
 core/base/memory.cpp                | 2 +-
 core/device_hooks/hip_hooks.cpp     | 2 +-
 core/test/base/executor.cpp         | 2 +-
 cuda/base/device.cpp                | 4 +---
 cuda/base/memory.cpp                | 2 +-
 cuda/base/nvtx.cpp                  | 4 +++-
 cuda/base/stream.cpp                | 4 +++-
 cuda/test/base/cuda_executor.cu     | 1 -
 hip/base/device.hip.cpp             | 4 ++--
 hip/base/roctx.hip.cpp              | 4 +++-
 include/ginkgo/core/base/stream.hpp | 2 +-
 11 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/core/base/memory.cpp b/core/base/memory.cpp
index 88d97bcc765..4e9f0b7e24a 100644
--- a/core/base/memory.cpp
+++ b/core/base/memory.cpp
@@ -56,4 +56,4 @@ void CpuAllocator::deallocate(void* ptr) const
 }
 
 
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp
index 54486cc4e74..4dbe6409c01 100644
--- a/core/device_hooks/hip_hooks.cpp
+++ b/core/device_hooks/hip_hooks.cpp
@@ -36,12 +36,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/memory.hpp>
 #include <ginkgo/core/base/stream.hpp>
 #include <ginkgo/core/base/timer.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/version.hpp>
 #include <ginkgo/core/log/profiler_hook.hpp>
-#include "ginkgo/core/base/memory.hpp"
 
 
 namespace gko {
diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp
index 71064cf01d2..94e7bc02d79 100644
--- a/core/test/base/executor.cpp
+++ b/core/test/base/executor.cpp
@@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <thread>
 #include <type_traits>
-#include "ginkgo/core/base/memory.hpp"
 
 
 #if defined(__unix__) || defined(__APPLE__)
@@ -47,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/memory.hpp>
 
 
 namespace {
diff --git a/cuda/base/device.cpp b/cuda/base/device.cpp
index 31ab5bcde63..2db0876ca95 100644
--- a/cuda/base/device.cpp
+++ b/cuda/base/device.cpp
@@ -30,15 +30,13 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include "cuda/base/device.hpp"
-
-
 #include <cuda_runtime.h>
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "cuda/base/device.hpp"
 #include "cuda/base/scoped_device_id.hpp"
 
 
diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp
index c1b0a5d517f..afc1f9f62fa 100644
--- a/cuda/base/memory.cpp
+++ b/cuda/base/memory.cpp
@@ -193,4 +193,4 @@ void CudaHostAllocator::deallocate(void* ptr) const
 }
 
 
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/cuda/base/nvtx.cpp b/cuda/base/nvtx.cpp
index e313c110ea2..3cbc59299b0 100644
--- a/cuda/base/nvtx.cpp
+++ b/cuda/base/nvtx.cpp
@@ -30,10 +30,12 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
+#include <cuda_runtime.h>
+
+
 #include <ginkgo/config.hpp>
 
 
-#include <cuda_runtime.h>
 #ifdef GKO_LEGACY_NVTX
 #include <nvToolsExt.h>
 #else
diff --git a/cuda/base/stream.cpp b/cuda/base/stream.cpp
index 8c6aa92c28b..0bbc9b1cc83 100644
--- a/cuda/base/stream.cpp
+++ b/cuda/base/stream.cpp
@@ -30,11 +30,13 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
+#include <ginkgo/core/base/stream.hpp>
+
+
 #include <cuda_runtime.h>
 
 
 #include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/stream.hpp>
 
 
 #include "cuda/base/scoped_device_id.hpp"
diff --git a/cuda/test/base/cuda_executor.cu b/cuda/test/base/cuda_executor.cu
index afb23c06186..c81799e0dae 100644
--- a/cuda/test/base/cuda_executor.cu
+++ b/cuda/test/base/cuda_executor.cu
@@ -44,7 +44,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/stream.hpp>
 
-
 #include "common/cuda_hip/base/executor.hpp.inc"
 #include "cuda/base/scoped_device_id.hpp"
 #include "cuda/test/utils.hpp"
diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp
index b5ec1bec6d6..9a01d6aacee 100644
--- a/hip/base/device.hip.cpp
+++ b/hip/base/device.hip.cpp
@@ -30,15 +30,15 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#include <ginkgo/core/base/stream.hpp>
+#include <ginkgo/core/base/device.hpp>
 
 
 #include <hip/hip_runtime.h>
 
 
 #include <ginkgo/config.hpp>
-#include <ginkgo/core/base/device.hpp>
 #include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/stream.hpp>
 
 
 #include "hip/base/scoped_device_id.hip.hpp"
diff --git a/hip/base/roctx.hip.cpp b/hip/base/roctx.hip.cpp
index 9f309b93362..a01bc11dc47 100644
--- a/hip/base/roctx.hip.cpp
+++ b/hip/base/roctx.hip.cpp
@@ -30,10 +30,12 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
+#include <hip/hip_runtime.h>
+
+
 #include <ginkgo/config.hpp>
 
 
-#include <hip/hip_runtime.h>
 #if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX
 #include <roctx.h>
 #endif
diff --git a/include/ginkgo/core/base/stream.hpp b/include/ginkgo/core/base/stream.hpp
index 4bb4aeecf9e..8ee8333e41a 100644
--- a/include/ginkgo/core/base/stream.hpp
+++ b/include/ginkgo/core/base/stream.hpp
@@ -121,4 +121,4 @@ class hip_stream {
 }  // namespace gko
 
 
-#endif  // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
+#endif  // GKO_PUBLIC_CORE_BASE_STREAM_HPP_

From b985d6349c189a2f5bffba100db36e09c755ab2a Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Thu, 1 Jun 2023 12:00:18 +0200
Subject: [PATCH 05/14] fix compilation

---
 core/device_hooks/hip_hooks.cpp | 12 ++++++++++--
 test/utils/executor.hpp         |  2 +-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp
index 4dbe6409c01..ba7563f1ef0 100644
--- a/core/device_hooks/hip_hooks.cpp
+++ b/core/device_hooks/hip_hooks.cpp
@@ -64,13 +64,21 @@ void HipAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(hip);
 std::shared_ptr<HipExecutor> HipExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
     allocation_mode alloc_mode, GKO_HIP_STREAM_STRUCT* stream)
-    GKO_NOT_COMPILED(hip);
+{
+    return std::shared_ptr<HipExecutor>(
+        new HipExecutor(device_id, std::move(master),
+                        std::make_shared<HipAllocator>(), stream));
+}
 
 
 std::shared_ptr<HipExecutor> HipExecutor::create(
     int device_id, std::shared_ptr<Executor> master,
     std::shared_ptr<HipAllocatorBase> alloc, GKO_HIP_STREAM_STRUCT* stream)
-    GKO_NOT_COMPILED(hip);
+{
+    return std::shared_ptr<HipExecutor>(
+        new HipExecutor(device_id, std::move(master),
+                        std::make_shared<HipAllocator>(), stream));
+}
 
 
 void HipExecutor::populate_exec_info(const machine_topology* mach_topo)
diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp
index c588ac74260..200f4652644 100644
--- a/test/utils/executor.hpp
+++ b/test/utils/executor.hpp
@@ -110,7 +110,7 @@ inline void init_executor(std::shared_ptr<gko::ReferenceExecutor> ref,
             throw std::runtime_error{"No suitable CUDA devices"};
         }
         exec = gko::CudaExecutor::create(
-            0, ref, std::make_shared<gko::CudaAllocator>(stream), stream);
+            0, ref, std::make_shared<gko::CudaAllocator>(), stream);
     }
 }
 

From 51e8399fb5e1b708c61356af5bbd76bf54135774 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Thu, 6 Jul 2023 15:45:46 +0200
Subject: [PATCH 06/14] update interface and remove DPC++ allocator

---
 core/base/memory.cpp                  |   4 +-
 core/device_hooks/cuda_hooks.cpp      |  32 ++++--
 core/device_hooks/dpcpp_hooks.cpp     |  28 -----
 core/device_hooks/hip_hooks.cpp       |  51 ++++++++-
 core/test/base/executor.cpp           |   2 +-
 cuda/base/executor.cpp                |   9 +-
 cuda/base/memory.cpp                  |  43 ++++++--
 dpcpp/CMakeLists.txt                  |   1 -
 dpcpp/base/memory.dp.cpp              |  85 ---------------
 dpcpp/test/base/CMakeLists.txt        |   1 -
 dpcpp/test/base/memory.dp.cpp         |  98 -----------------
 hip/base/executor.hip.cpp             |   9 +-
 hip/base/memory.hip.cpp               | 123 ++++++++++++++++++++-
 include/ginkgo/core/base/executor.hpp |  33 ++----
 include/ginkgo/core/base/memory.hpp   | 151 ++++++++++++++++++--------
 15 files changed, 348 insertions(+), 322 deletions(-)
 delete mode 100644 dpcpp/base/memory.dp.cpp
 delete mode 100644 dpcpp/test/base/memory.dp.cpp

diff --git a/core/base/memory.cpp b/core/base/memory.cpp
index 4e9f0b7e24a..b6c6f8f265c 100644
--- a/core/base/memory.cpp
+++ b/core/base/memory.cpp
@@ -42,7 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace gko {
 
 
-void* CpuAllocator::allocate(size_type num_bytes) const
+void* CpuAllocator::allocate(size_type num_bytes)
 {
     auto ptr = ::operator new (num_bytes, std::nothrow_t{});
     GKO_ENSURE_ALLOCATED(ptr, "cpu", num_bytes);
@@ -50,7 +50,7 @@ void* CpuAllocator::allocate(size_type num_bytes) const
 }
 
 
-void CpuAllocator::deallocate(void* ptr) const
+void CpuAllocator::deallocate(void* ptr)
 {
     ::operator delete (ptr, std::nothrow_t{});
 }
diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp
index f8489908cc9..03ab12deb46 100644
--- a/core/device_hooks/cuda_hooks.cpp
+++ b/core/device_hooks/cuda_hooks.cpp
@@ -54,43 +54,55 @@ version version_info::get_cuda_version() noexcept
 }
 
 
-void* CudaAllocator::allocate(size_type num_bytes) const GKO_NOT_COMPILED(cuda);
+void* CudaAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(cuda);
 
 
-void CudaAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda);
+void CudaAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda);
 
 
 CudaAsyncAllocator::CudaAsyncAllocator(CUstream_st* stream)
     GKO_NOT_COMPILED(cuda);
 
 
-void* CudaAsyncAllocator::allocate(size_type num_bytes) const
-    GKO_NOT_COMPILED(cuda);
+void* CudaAsyncAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(cuda);
+
 
+void CudaAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda);
 
-void CudaAsyncAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda);
+
+bool CudaAsyncAllocator::check_environment(int device_id,
+                                           CUstream_st* stream) const
+    GKO_NOT_COMPILED(cuda);
 
 
 CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id, unsigned int flags)
     GKO_NOT_COMPILED(cuda);
 
 
-void* CudaUnifiedAllocator::allocate(size_type num_bytes) const
+void* CudaUnifiedAllocator::allocate(size_type num_bytes)
     GKO_NOT_COMPILED(cuda);
 
 
-void CudaUnifiedAllocator::deallocate(void* dev_ptr) const
+void CudaUnifiedAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda);
+
+
+bool CudaUnifiedAllocator::check_environment(int device_id,
+                                             CUstream_st* stream) const
     GKO_NOT_COMPILED(cuda);
 
 
 CudaHostAllocator::CudaHostAllocator(int device_id) GKO_NOT_COMPILED(cuda);
 
 
-void* CudaHostAllocator::allocate(size_type num_bytes) const
-    GKO_NOT_COMPILED(cuda);
+void* CudaHostAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(cuda);
+
 
+void CudaHostAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda);
 
-void CudaHostAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda);
+
+bool CudaHostAllocator::check_environment(int device_id,
+                                          CUstream_st* stream) const
+    GKO_NOT_COMPILED(cuda);
 
 
 std::shared_ptr<CudaExecutor> CudaExecutor::create(
diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp
index 1981c712872..532e9c55bbe 100644
--- a/core/device_hooks/dpcpp_hooks.cpp
+++ b/core/device_hooks/dpcpp_hooks.cpp
@@ -53,34 +53,6 @@ version version_info::get_dpcpp_version() noexcept
 }
 
 
-DpcppAllocatorBase::DpcppAllocatorBase(sycl::queue*) GKO_NOT_COMPILED(dpcpp);
-
-
-void* DpcppAllocatorBase::allocate(size_type num_bytes) const
-    GKO_NOT_COMPILED(dpcpp);
-
-
-void DpcppAllocatorBase::deallocate(void* ptr) const GKO_NOT_COMPILED(dpcpp);
-
-
-void* DpcppAllocator::allocate_impl(sycl::queue* queue,
-                                    size_type num_bytes) const
-    GKO_NOT_COMPILED(dpcpp);
-
-
-void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const
-    GKO_NOT_COMPILED(dpcpp);
-
-
-void* DpcppUnifiedAllocator::allocate_impl(sycl::queue* queue,
-                                           size_type num_bytes) const
-    GKO_NOT_COMPILED(dpcpp);
-
-
-void DpcppUnifiedAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const
-    GKO_NOT_COMPILED(dpcpp);
-
-
 std::shared_ptr<DpcppExecutor> DpcppExecutor::create(
     int device_id, std::shared_ptr<Executor> master, std::string device_type,
     dpcpp_queue_property property)
diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp
index ba7563f1ef0..dec1de15933 100644
--- a/core/device_hooks/hip_hooks.cpp
+++ b/core/device_hooks/hip_hooks.cpp
@@ -55,10 +55,54 @@ version version_info::get_hip_version() noexcept
 }
 
 
-void* HipAllocator::allocate(size_type num_bytes) const GKO_NOT_COMPILED(hip);
+void* HipAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
 
 
-void HipAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(hip);
+void HipAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
+
+
+HipAsyncAllocator::HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream)
+    GKO_NOT_COMPILED(hip);
+
+
+void* HipAsyncAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
+
+
+void HipAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
+
+
+bool HipAsyncAllocator::check_environment(int device_id,
+                                          GKO_HIP_STREAM_STRUCT* stream) const
+    GKO_NOT_COMPILED(hip);
+
+
+HipUnifiedAllocator::HipUnifiedAllocator(int device_id, unsigned int flags)
+    GKO_NOT_COMPILED(hip);
+
+
+void* HipUnifiedAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
+
+
+void HipUnifiedAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
+
+
+bool HipUnifiedAllocator::check_environment(int device_id,
+                                            GKO_HIP_STREAM_STRUCT* stream) const
+    GKO_NOT_COMPILED(hip);
+
+
+HipHostAllocator::HipHostAllocator(int device_id) GKO_NOT_COMPILED(hip);
+
+
+void* HipHostAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
+
+
+void HipHostAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
+
+
+bool HipHostAllocator::check_environment(int device_id,
+                                         GKO_HIP_STREAM_STRUCT* stream) const
+    GKO_NOT_COMPILED(hip);
 
 
 std::shared_ptr<HipExecutor> HipExecutor::create(
@@ -76,8 +120,7 @@ std::shared_ptr<HipExecutor> HipExecutor::create(
     std::shared_ptr<HipAllocatorBase> alloc, GKO_HIP_STREAM_STRUCT* stream)
 {
     return std::shared_ptr<HipExecutor>(
-        new HipExecutor(device_id, std::move(master),
-                        std::make_shared<HipAllocator>(), stream));
+        new HipExecutor(device_id, std::move(master), alloc, stream));
 }
 
 
diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp
index 94e7bc02d79..13cba09e2b6 100644
--- a/core/test/base/executor.cpp
+++ b/core/test/base/executor.cpp
@@ -386,7 +386,7 @@ TEST(Executor, CanVerifyMemory)
 
 
 struct MockAllocator : gko::CpuAllocator {
-    void deallocate(void* ptr) const noexcept override
+    void deallocate(void* ptr) noexcept override
     {
         called_free = true;
         CpuAllocator::deallocate(ptr);
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index f6e838dd2dd..faf90037a0f 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -80,9 +80,8 @@ std::shared_ptr<CudaExecutor> CudaExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
     allocation_mode alloc_mode, cudaStream_t stream)
 {
-    return std::shared_ptr<CudaExecutor>(
-        new CudaExecutor(device_id, std::move(master),
-                         allocator_from_mode(device_id, alloc_mode), stream));
+    return create(device_id, master, allocator_from_mode(device_id, alloc_mode),
+                  stream);
 }
 
 
@@ -90,6 +89,10 @@ std::shared_ptr<CudaExecutor> CudaExecutor::create(
     int device_id, std::shared_ptr<Executor> master,
     std::shared_ptr<CudaAllocatorBase> alloc, cudaStream_t stream)
 {
+    if (!alloc->check_environment(device_id, stream)) {
+        throw Error{__FILE__, __LINE__,
+                    "Allocator uses incorrect stream or device ID."};
+    }
     return std::shared_ptr<CudaExecutor>(new CudaExecutor(
         device_id, std::move(master), std::move(alloc), stream));
 }
diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp
index afc1f9f62fa..08c64c0ba05 100644
--- a/cuda/base/memory.cpp
+++ b/cuda/base/memory.cpp
@@ -82,7 +82,7 @@ namespace gko {
 #endif
 
 
-void* CudaAllocator::allocate(size_type num_bytes) const
+void* CudaAllocator::allocate(size_type num_bytes)
 {
     void* ptr{};
     GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMalloc(&ptr, num_bytes),
@@ -91,7 +91,7 @@ void* CudaAllocator::allocate(size_type num_bytes) const
 }
 
 
-void CudaAllocator::deallocate(void* ptr) const
+void CudaAllocator::deallocate(void* ptr)
 {
     GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr));
 }
@@ -103,7 +103,7 @@ void CudaAllocator::deallocate(void* ptr) const
 CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} {}
 
 
-void* CudaAsyncAllocator::allocate(size_type num_bytes) const
+void* CudaAsyncAllocator::allocate(size_type num_bytes)
 {
     void* ptr{};
     GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(
@@ -112,7 +112,7 @@ void* CudaAsyncAllocator::allocate(size_type num_bytes) const
 }
 
 
-void CudaAsyncAllocator::deallocate(void* ptr) const
+void CudaAsyncAllocator::deallocate(void* ptr)
 {
     GKO_EXIT_ON_CUDA_ERROR(cudaFreeAsync(ptr, stream_));
 }
@@ -121,10 +121,10 @@ void CudaAsyncAllocator::deallocate(void* ptr) const
 #else  // Fall back to regular allocation
 
 
-CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{} {}
+CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} {}
 
 
-void* CudaAsyncAllocator::allocate(size_type num_bytes) const
+void* CudaAsyncAllocator::allocate(size_type num_bytes)
 {
     void* ptr{};
     GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMalloc(&ptr, num_bytes),
@@ -133,7 +133,7 @@ void* CudaAsyncAllocator::allocate(size_type num_bytes) const
 }
 
 
-void CudaAsyncAllocator::deallocate(void* ptr) const
+void CudaAsyncAllocator::deallocate(void* ptr)
 {
     GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr));
 }
@@ -142,6 +142,13 @@ void CudaAsyncAllocator::deallocate(void* ptr) const
 #endif
 
 
+bool CudaAsyncAllocator::check_environment(int device_id,
+                                           CUstream_st* stream) const
+{
+    return stream == stream_;
+}
+
+
 CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id)
     : CudaUnifiedAllocator{device_id, cudaMemAttachGlobal}
 {}
@@ -152,7 +159,7 @@ CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id, unsigned int flags)
 {}
 
 
-void* CudaUnifiedAllocator::allocate(size_type num_bytes) const
+void* CudaUnifiedAllocator::allocate(size_type num_bytes)
 {
     // we need to set the device ID in case this gets used in a host executor
     detail::cuda_scoped_device_id_guard g(device_id_);
@@ -163,7 +170,7 @@ void* CudaUnifiedAllocator::allocate(size_type num_bytes) const
 }
 
 
-void CudaUnifiedAllocator::deallocate(void* ptr) const
+void CudaUnifiedAllocator::deallocate(void* ptr)
 {
     // we need to set the device ID in case this gets used in a host executor
     detail::cuda_scoped_device_id_guard g(device_id_);
@@ -171,10 +178,17 @@ void CudaUnifiedAllocator::deallocate(void* ptr) const
 }
 
 
+bool CudaUnifiedAllocator::check_environment(int device_id,
+                                             CUstream_st* stream) const
+{
+    return device_id == device_id_;
+}
+
+
 CudaHostAllocator::CudaHostAllocator(int device_id) : device_id_{device_id} {}
 
 
-void* CudaHostAllocator::allocate(size_type num_bytes) const
+void* CudaHostAllocator::allocate(size_type num_bytes)
 {
     // we need to set the device ID in case this gets used in a host executor
     detail::cuda_scoped_device_id_guard g(device_id_);
@@ -185,7 +199,7 @@ void* CudaHostAllocator::allocate(size_type num_bytes) const
 }
 
 
-void CudaHostAllocator::deallocate(void* ptr) const
+void CudaHostAllocator::deallocate(void* ptr)
 {
     // we need to set the device ID in case this gets used in a host executor
     detail::cuda_scoped_device_id_guard g(device_id_);
@@ -193,4 +207,11 @@ void CudaHostAllocator::deallocate(void* ptr) const
 }
 
 
+bool CudaHostAllocator::check_environment(int device_id,
+                                          CUstream_st* stream) const
+{
+    return device_id == device_id_;
+}
+
+
 }  // namespace gko
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 55763ca5525..31b5e0543ba 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -10,7 +10,6 @@ target_sources(ginkgo_dpcpp
     base/executor.dp.cpp
     base/helper.dp.cpp
     base/index_set_kernels.dp.cpp
-    base/memory.dp.cpp
     base/scoped_device_id.dp.cpp
     base/timer.dp.cpp
     base/version.dp.cpp
diff --git a/dpcpp/base/memory.dp.cpp b/dpcpp/base/memory.dp.cpp
deleted file mode 100644
index 2582fa331a0..00000000000
--- a/dpcpp/base/memory.dp.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2023, the Ginkgo authors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-******************************<GINKGO LICENSE>*******************************/
-
-#include <ginkgo/core/base/memory.hpp>
-
-
-#include <CL/sycl.hpp>
-
-
-namespace gko {
-
-
-DpcppAllocatorBase::DpcppAllocatorBase(sycl::queue* queue) : queue_{queue} {}
-
-
-void* DpcppAllocatorBase::allocate(size_type num_bytes) const
-{
-    return this->allocate_impl(queue_, num_bytes);
-}
-
-
-void DpcppAllocatorBase::deallocate(void* ptr) const
-{
-    this->deallocate_impl(queue_, ptr);
-}
-
-
-void* DpcppAllocator::allocate_impl(sycl::queue* queue,
-                                    size_type num_bytes) const
-{
-    return sycl::malloc_device(num_bytes, *queue);
-}
-
-
-void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const
-{
-    queue->wait_and_throw();
-    sycl::free(ptr, queue->get_context());
-}
-
-
-void* DpcppUnifiedAllocator::allocate_impl(sycl::queue* queue,
-                                           size_type num_bytes)
-{
-    return sycl::malloc_shared(num_bytes, *queue);
-}
-
-
-void DpcppUnifiedAllocator::deallocate_impl(sycl::queue* queue, void* ptr)
-{
-    queue->wait_and_throw();
-    sycl::free(ptr, queue->get_context());
-}
-
-
-}  // namespace gko
diff --git a/dpcpp/test/base/CMakeLists.txt b/dpcpp/test/base/CMakeLists.txt
index 5c0ca601f04..bb9c8a75050 100644
--- a/dpcpp/test/base/CMakeLists.txt
+++ b/dpcpp/test/base/CMakeLists.txt
@@ -3,4 +3,3 @@ ginkgo_create_dpcpp_test(dim3)
 ginkgo_create_dpcpp_test(kernel_launch)
 # set correct flags for kernel_launch.hpp
 target_compile_definitions(dpcpp_test_base_kernel_launch PRIVATE GKO_COMPILING_DPCPP)
-ginkgo_create_dpcpp_test(memory)
\ No newline at end of file
diff --git a/dpcpp/test/base/memory.dp.cpp b/dpcpp/test/base/memory.dp.cpp
deleted file mode 100644
index e587660cde3..00000000000
--- a/dpcpp/test/base/memory.dp.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2023, the Ginkgo authors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-******************************<GINKGO LICENSE>*******************************/
-
-#include <ginkgo/core/base/memory.hpp>
-
-
-#include <memory>
-#include <type_traits>
-
-
-#include <gtest/gtest.h>
-
-
-#include <ginkgo/core/base/exception.hpp>
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/executor.hpp>
-
-
-#include "dpcpp/test/utils.hpp"
-
-
-namespace {
-
-
-class Memory : public ::testing::Test {
-protected:
-    Memory()
-        : exec{gko::DpcppExecutor::create(0, gko::OmpExecutor::create())},
-          host_exec_with_unified{gko::OmpExecutor::create(
-              std::make_shared<gko::DpcppUnifiedAllocator>(exec->get_queue()))},
-          exec_with_unified{gko::DpcppExecutor::create(
-              exec->get_queue(), host_exec_with_unified,
-              std::make_shared<gko::DpcppUnifiedAllocator>(exec->get_queue()))}
-    {}
-
-    std::shared_ptr<gko::DpcppExecutor> exec;
-    std::shared_ptr<gko::OmpExecutor> host_exec_with_unified;
-    std::shared_ptr<gko::DpcppExecutor> exec_with_unified;
-};
-
-
-TEST_F(Memory, DeviceAllocationWorks)
-{
-    gko::array<int> data{exec, {1, 2}};
-
-    GKO_ASSERT_ARRAY_EQ(data, I<int>({1, 2}));
-}
-
-
-TEST_F(Memory, UnifiedDeviceAllocationWorks)
-{
-    gko::array<int> data{exec_with_unified, {1, 2}};
-    exec->synchronize();
-
-    ASSERT_EQ(data.get_const_data()[0], 1);
-    ASSERT_EQ(data.get_const_data()[1], 2);
-}
-
-
-TEST_F(Memory, HostUnifiedAllocationWorks)
-{
-    gko::array<int> data{host_exec_with_unified, {1, 2}};
-
-    ASSERT_EQ(data.get_const_data()[0], 1);
-    ASSERT_EQ(data.get_const_data()[1], 2);
-}
-
-
-}  // namespace
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index 6b4b0fd5ddc..2df5c9a4847 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -60,9 +60,8 @@ std::shared_ptr<HipExecutor> HipExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
     allocation_mode alloc_mode, hipStream_t stream)
 {
-    return std::shared_ptr<HipExecutor>(
-        new HipExecutor(device_id, std::move(master),
-                        std::make_shared<HipAllocator>(), stream));
+    return create(device_id, std::move(master),
+                  std::make_shared<HipAllocator>(), stream);
 }
 
 
@@ -70,6 +69,10 @@ std::shared_ptr<HipExecutor> HipExecutor::create(
     int device_id, std::shared_ptr<Executor> master,
     std::shared_ptr<HipAllocatorBase> alloc, hipStream_t stream)
 {
+    if (!alloc->check_environment(device_id, stream)) {
+        throw Error{__FILE__, __LINE__,
+                    "Allocator uses incorrect stream or device ID."};
+    }
     return std::shared_ptr<HipExecutor>(new HipExecutor(
         device_id, std::move(master), std::move(alloc), stream));
 }
diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp
index f2a8977525f..7acb208173a 100644
--- a/hip/base/memory.hip.cpp
+++ b/hip/base/memory.hip.cpp
@@ -39,6 +39,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "hip/base/scoped_device_id.hip.hpp"
+
+
 namespace gko {
 
 
@@ -79,7 +82,7 @@ namespace gko {
 #endif
 
 
-void* HipAllocator::allocate(size_type num_bytes) const
+void* HipAllocator::allocate(size_type num_bytes)
 {
     void* dev_ptr{};
     GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(hipMalloc(&dev_ptr, num_bytes),
@@ -88,10 +91,126 @@ void* HipAllocator::allocate(size_type num_bytes) const
 }
 
 
-void HipAllocator::deallocate(void* dev_ptr) const
+void HipAllocator::deallocate(void* dev_ptr)
 {
     GKO_EXIT_ON_HIP_ERROR(hipFree(dev_ptr));
 }
 
 
+#if HIP_VERSION_MAJOR >= 5
+
+
+HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream} {}
+
+
+void* HipAsyncAllocator::allocate(size_type num_bytes)
+{
+    void* ptr{};
+    GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(
+        hipMallocAsync(&ptr, num_bytes, stream_), num_bytes);
+    return ptr;
+}
+
+
+void HipAsyncAllocator::deallocate(void* ptr)
+{
+    GKO_EXIT_ON_HIP_ERROR(hipFreeAsync(ptr, stream_));
+}
+
+
+#else  // Fall back to regular allocation
+
+
+HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream} {}
+
+
+void* HipAsyncAllocator::allocate(size_type num_bytes)
+{
+    void* ptr{};
+    GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(hipMalloc(&ptr, num_bytes), num_bytes);
+    return ptr;
+}
+
+
+void HipAsyncAllocator::deallocate(void* ptr)
+{
+    GKO_EXIT_ON_HIP_ERROR(hipFree(ptr));
+}
+
+
+#endif
+
+
+bool HipAsyncAllocator::check_environment(int device_id,
+                                          hipStream_t stream) const
+{
+    return stream == stream_;
+}
+
+
+HipUnifiedAllocator::HipUnifiedAllocator(int device_id)
+    : HipUnifiedAllocator{device_id, hipMemAttachGlobal}
+{}
+
+
+HipUnifiedAllocator::HipUnifiedAllocator(int device_id, unsigned int flags)
+    : device_id_{device_id}, flags_{flags}
+{}
+
+
+void* HipUnifiedAllocator::allocate(size_type num_bytes)
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::hip_scoped_device_id_guard g(device_id_);
+    void* ptr{};
+    GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(
+        hipMallocManaged(&ptr, num_bytes, flags_), num_bytes);
+    return ptr;
+}
+
+
+void HipUnifiedAllocator::deallocate(void* ptr)
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::hip_scoped_device_id_guard g(device_id_);
+    GKO_EXIT_ON_HIP_ERROR(hipFree(ptr));
+}
+
+
+bool HipUnifiedAllocator::check_environment(int device_id,
+                                            hipStream_t stream) const
+{
+    return device_id == device_id_;
+}
+
+
+HipHostAllocator::HipHostAllocator(int device_id) : device_id_{device_id} {}
+
+
+void* HipHostAllocator::allocate(size_type num_bytes)
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::hip_scoped_device_id_guard g(device_id_);
+    void* ptr{};
+    GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(hipHostMalloc(&ptr, num_bytes),
+                                        num_bytes);
+    return ptr;
+}
+
+
+void HipHostAllocator::deallocate(void* ptr)
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::hip_scoped_device_id_guard g(device_id_);
+    GKO_EXIT_ON_HIP_ERROR(hipFreeHost(ptr));
+}
+
+
+bool HipHostAllocator::check_environment(int device_id,
+                                         hipStream_t stream) const
+{
+    return device_id == device_id_;
+}
+
+
 }  // namespace gko
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 4545b216f86..f033873e392 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -1717,10 +1717,13 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
      * @param alloc_mode  the allocation mode that the executor should operate
      *                    on. See @allocation_mode for more details
      */
-    [[deprecated("")]] static std::shared_ptr<HipExecutor> create(
-        int device_id, std::shared_ptr<Executor> master, bool device_reset,
-        allocation_mode alloc_mode = default_hip_alloc_mode,
-        GKO_HIP_STREAM_STRUCT* stream = nullptr);
+    [[deprecated(
+        "device_reset is deprecated entirely, call hipDeviceReset directly. "
+        "alloc_mode was replaced by the Allocator type "
+        "hierarchy.")]] static std::shared_ptr<HipExecutor>
+    create(int device_id, std::shared_ptr<Executor> master, bool device_reset,
+           allocation_mode alloc_mode = default_hip_alloc_mode,
+           GKO_HIP_STREAM_STRUCT* stream = nullptr);
 
     static std::shared_ptr<HipExecutor> create(
         int device_id, std::shared_ptr<Executor> master,
@@ -1914,28 +1917,6 @@ class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
         std::string device_type = "all",
         dpcpp_queue_property property = dpcpp_queue_property::in_order);
 
-    /**
-     * Creates a new DpcppExecutor from an existing SYCL queue.
-     *
-     * @param queue  the DPCPP device id of this device
-     * @param master  an executor on the host that is used to invoke the device
-     *                kernels
-     */
-    static std::shared_ptr<DpcppExecutor> create(
-        sycl::queue* queue, std::shared_ptr<Executor> master);
-
-    /**
-     * Creates a new DpcppExecutor from an existing SYCL queue.
-     *
-     * @param queue  the DPCPP device id of this device
-     * @param master  an executor on the host that is used to invoke the device
-     *                kernels
-     * @param alloc  the allocator used for memory allocation
-     */
-    static std::shared_ptr<DpcppExecutor> create(
-        sycl::queue* queue, std::shared_ptr<Executor> master,
-        std::shared_ptr<DpcppAllocatorBase> alloc);
-
     std::shared_ptr<Executor> get_master() noexcept override;
 
     std::shared_ptr<const Executor> get_master() const noexcept override;
diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp
index 872a25a9a33..1086c9aacb4 100644
--- a/include/ginkgo/core/base/memory.hpp
+++ b/include/ginkgo/core/base/memory.hpp
@@ -49,9 +49,9 @@ class Allocator {
 public:
     virtual ~Allocator() = default;
 
-    virtual void* allocate(size_type num_bytes) const = 0;
+    virtual void* allocate(size_type num_bytes) = 0;
 
-    virtual void deallocate(void* ptr) const = 0;
+    virtual void deallocate(void* ptr) = 0;
 };
 
 
@@ -65,34 +65,49 @@ class CpuAllocatorBase : public Allocator {};
 /**
  * Implement this interface to provide an allocator for CudaExecutor.
  */
-class CudaAllocatorBase : public Allocator {};
+class CudaAllocatorBase : public Allocator {
+    friend class CudaExecutor;
 
-
-/**
- * Implement this interface to provide an allocator for HipExecutor.
- */
-class HipAllocatorBase : public Allocator {};
+protected:
+    /**
+     * Checks if the allocator can be used safely with the provided device ID
+     * and stream. The check is necessary to ensure safe usage of stream-ordered
+     * allocators and unified shared memory allocators.
+     *
+     * @param device_id the device ID the allocator will be used in.
+     * @param stream the stream the allocator will be used with.
+     * @return true if and only if the allocator can be used by CudaExecutor in
+     *         the given environment.
+     */
+    virtual bool check_environment(int device_id, CUstream_st* stream) const
+    {
+        return true;
+    }
+};
 
 
 /**
- * Implement this interface to provide an allocator for DpcppExecutor.
+ * Implement this interface to provide an allocator for HipExecutor.
  */
-class DpcppAllocatorBase : public Allocator {
-public:
-    DpcppAllocatorBase(sycl::queue* queue);
-
-    void* allocate(size_type num_bytes) const final;
-
-    void deallocate(void* ptr) const final;
+class HipAllocatorBase : public Allocator {
+    friend class HipExecutor;
 
 protected:
-    virtual void* allocate_impl(sycl::queue* queue,
-                                size_type num_bytes) const = 0;
-
-    virtual void deallocate_impl(sycl::queue* queue, void* ptr) const = 0;
-
-private:
-    sycl::queue* queue_;
+    /**
+     * Checks if the allocator can be used safely with the provided device ID
+     * and stream. The check is necessary to ensure safe usage of stream-ordered
+     * allocators and unified shared memory allocators.
+     *
+     * @param device_id the device ID the allocator will be used in.
+     * @param stream the stream the allocator will be used with.
+     * @return true if and only if the allocator can be used by CudaExecutor in
+     *         the given environment.
+     */
+    virtual bool check_environment(int device_id,
+                                   GKO_HIP_STREAM_STRUCT* stream) const
+    {
+        return true;
+    }
 };
 
 
@@ -101,9 +116,9 @@ class DpcppAllocatorBase : public Allocator {
  */
 class CpuAllocator : public CpuAllocatorBase {
 public:
-    void* allocate(size_type num_bytes) const override;
+    void* allocate(size_type num_bytes) override;
 
-    void deallocate(void* ptr) const override;
+    void deallocate(void* ptr) override;
 };
 
 
@@ -112,9 +127,9 @@ class CpuAllocator : public CpuAllocatorBase {
  */
 class CudaAllocator : public CudaAllocatorBase {
 public:
-    void* allocate(size_type num_bytes) const override;
+    void* allocate(size_type num_bytes) override;
 
-    void deallocate(void* ptr) const override;
+    void deallocate(void* ptr) override;
 };
 
 
@@ -123,12 +138,14 @@ class CudaAllocator : public CudaAllocatorBase {
  */
 class CudaAsyncAllocator : public CudaAllocatorBase {
 public:
-    void* allocate(size_type num_bytes) const override;
+    void* allocate(size_type num_bytes) override;
 
-    void deallocate(void* ptr) const override;
+    void deallocate(void* ptr) override;
 
     CudaAsyncAllocator(CUstream_st* stream);
 
+    bool check_environment(int device_id, CUstream_st* stream) const override;
+
 private:
     CUstream_st* stream_;
 };
@@ -139,14 +156,17 @@ class CudaAsyncAllocator : public CudaAllocatorBase {
  */
 class CudaUnifiedAllocator : public CudaAllocatorBase, public CpuAllocatorBase {
 public:
-    void* allocate(size_type num_bytes) const override;
+    void* allocate(size_type num_bytes) override;
 
-    void deallocate(void* ptr) const override;
+    void deallocate(void* ptr) override;
 
     CudaUnifiedAllocator(int device_id);
 
     CudaUnifiedAllocator(int device_id, unsigned int flags);
 
+protected:
+    bool check_environment(int device_id, CUstream_st* stream) const override;
+
 private:
     int device_id_;
     unsigned int flags_;
@@ -154,16 +174,19 @@ class CudaUnifiedAllocator : public CudaAllocatorBase, public CpuAllocatorBase {
 
 
 /*
- * Allocator using cudaMallocHost.
+ * Allocator using cudaHostMalloc.
  */
 class CudaHostAllocator : public CudaAllocatorBase, public CpuAllocatorBase {
 public:
-    void* allocate(size_type num_bytes) const override;
+    void* allocate(size_type num_bytes) override;
 
-    void deallocate(void* ptr) const override;
+    void deallocate(void* ptr) override;
 
     CudaHostAllocator(int device_id);
 
+protected:
+    bool check_environment(int device_id, CUstream_st* stream) const override;
+
 private:
     int device_id_;
 };
@@ -174,38 +197,72 @@ class CudaHostAllocator : public CudaAllocatorBase, public CpuAllocatorBase {
  */
 class HipAllocator : public HipAllocatorBase {
 public:
-    void* allocate(size_type num_bytes) const override;
+    void* allocate(size_type num_bytes) override;
+
+    void deallocate(void* ptr) override;
+};
+
+
+/*
+ * Allocator using hipMallocAsync.
+ */
+class HipAsyncAllocator : public HipAllocatorBase {
+public:
+    void* allocate(size_type num_bytes) override;
 
-    void deallocate(void* ptr) const override;
+    void deallocate(void* ptr) override;
+
+    HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream);
+
+protected:
+    bool check_environment(int device_id,
+                           GKO_HIP_STREAM_STRUCT* stream) const override;
+
+private:
+    GKO_HIP_STREAM_STRUCT* stream_;
 };
 
 
 /*
- * Allocator using sycl::malloc_device.
+ * Allocator using hipMallocManaged
  */
-class DpcppAllocator : public DpcppAllocatorBase {
+class HipUnifiedAllocator : public HipAllocatorBase, public CpuAllocatorBase {
 public:
-    using DpcppAllocatorBase::DpcppAllocatorBase;
+    void* allocate(size_type num_bytes) override;
+
+    void deallocate(void* ptr) override;
+
+    HipUnifiedAllocator(int device_id);
+
+    HipUnifiedAllocator(int device_id, unsigned int flags);
 
 protected:
-    void* allocate_impl(sycl::queue* queue, size_type num_bytes) const override;
+    bool check_environment(int device_id,
+                           GKO_HIP_STREAM_STRUCT* stream) const override;
 
-    void deallocate_impl(sycl::queue* queue, void* ptr) const override;
+private:
+    int device_id_;
+    unsigned int flags_;
 };
 
 
 /*
- * Allocator using sycl::malloc_shared.
+ * Allocator using hipHostAlloc.
  */
-class DpcppUnifiedAllocator : public DpcppAllocatorBase,
-                              public CpuAllocatorBase {
+class HipHostAllocator : public HipAllocatorBase, public CpuAllocatorBase {
 public:
-    using DpcppAllocatorBase::DpcppAllocatorBase;
+    void* allocate(size_type num_bytes) override;
+
+    void deallocate(void* ptr) override;
+
+    HipHostAllocator(int device_id);
 
 protected:
-    void* allocate_impl(sycl::queue* queue, size_type num_bytes) const override;
+    bool check_environment(int device_id,
+                           GKO_HIP_STREAM_STRUCT* stream) const override;
 
-    void deallocate_impl(sycl::queue* queue, void* ptr) const override;
+private:
+    int device_id_;
 };
 
 

From 39b3b3f288b6bdc3de081b33d08644d2ac6adecc Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 7 Jul 2023 11:49:57 +0200
Subject: [PATCH 07/14] add HIP allocator tests

---
 benchmark/utils/general.hpp  |   9 ++-
 hip/base/memory.hip.cpp      |   2 +-
 hip/test/base/CMakeLists.txt |   1 +
 hip/test/base/memory.hip.cpp | 126 +++++++++++++++++++++++++++++++++++
 4 files changed, 132 insertions(+), 6 deletions(-)
 create mode 100644 hip/test/base/memory.hip.cpp

diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp
index 35077f66d4b..19c71b74a1a 100644
--- a/benchmark/utils/general.hpp
+++ b/benchmark/utils/general.hpp
@@ -368,16 +368,15 @@ const std::map<std::string,
          [](MPI_Comm comm) {
              FLAGS_device_id = gko::experimental::mpi::map_rank_to_device_id(
                  comm, gko::CudaExecutor::get_num_devices());
-             return gko::CudaExecutor::create(
-                 FLAGS_device_id, gko::ReferenceExecutor::create(), false,
-                 gko::allocation_mode::device);
+             return gko::CudaExecutor::create(FLAGS_device_id,
+                                              gko::ReferenceExecutor::create());
          }},
         {"hip",
          [](MPI_Comm comm) {
              FLAGS_device_id = gko::experimental::mpi::map_rank_to_device_id(
                  comm, gko::HipExecutor::get_num_devices());
-             return gko::HipExecutor::create(
-                 FLAGS_device_id, gko::ReferenceExecutor::create(), true);
+             return gko::HipExecutor::create(FLAGS_device_id,
+                                             gko::ReferenceExecutor::create());
          }},
         {"dpcpp", [](MPI_Comm comm) {
              if (gko::DpcppExecutor::get_num_devices("gpu")) {
diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp
index 7acb208173a..86ac31c3154 100644
--- a/hip/base/memory.hip.cpp
+++ b/hip/base/memory.hip.cpp
@@ -202,7 +202,7 @@ void HipHostAllocator::deallocate(void* ptr)
 {
     // we need to set the device ID in case this gets used in a host executor
     detail::hip_scoped_device_id_guard g(device_id_);
-    GKO_EXIT_ON_HIP_ERROR(hipFreeHost(ptr));
+    GKO_EXIT_ON_HIP_ERROR(hipHostFree(ptr));
 }
 
 
diff --git a/hip/test/base/CMakeLists.txt b/hip/test/base/CMakeLists.txt
index f597a3d6e3d..486fca294c2 100644
--- a/hip/test/base/CMakeLists.txt
+++ b/hip/test/base/CMakeLists.txt
@@ -8,6 +8,7 @@ ginkgo_create_hip_test(kernel_launch)
 # correct flags for kernel_launch.hpp are set in GINKGO_HIPCC_OPTIONS
 ginkgo_create_hip_test(lin_op)
 ginkgo_create_hip_test(math)
+ginkgo_create_hip_test(memory)
 # Only hcc needs the libraries. nvcc only requires the headers.
 if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}")
     ginkgo_create_hip_test(exception_helpers ADDITIONAL_LIBRARIES roc::hipblas roc::hipsparse hip::hiprand roc::rocrand)
diff --git a/hip/test/base/memory.hip.cpp b/hip/test/base/memory.hip.cpp
new file mode 100644
index 00000000000..2dc0a3aa337
--- /dev/null
+++ b/hip/test/base/memory.hip.cpp
@@ -0,0 +1,126 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/memory.hpp>
+
+
+#include <memory>
+#include <type_traits>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "hip/test/utils.hip.hpp"
+
+
+namespace {
+
+
+class Memory : public HipTestFixture {
+protected:
+    Memory()
+        : host_exec_with_pinned{gko::OmpExecutor::create(
+              std::make_shared<gko::HipHostAllocator>(0))},
+          host_exec_with_unified{gko::OmpExecutor::create(
+              std::make_shared<gko::HipUnifiedAllocator>(0))},
+          exec_with_normal{gko::HipExecutor::create(
+              0, ref, std::make_shared<gko::HipAllocator>(),
+              exec->get_stream())},
+          exec_with_async{gko::HipExecutor::create(
+              0, host_exec_with_pinned,
+              std::make_shared<gko::HipAsyncAllocator>(exec->get_stream()),
+              exec->get_stream())},
+          exec_with_unified{gko::HipExecutor::create(
+              0, host_exec_with_unified,
+              std::make_shared<gko::HipUnifiedAllocator>(0),
+              exec->get_stream())}
+    {}
+
+    std::shared_ptr<gko::OmpExecutor> host_exec_with_pinned;
+    std::shared_ptr<gko::OmpExecutor> host_exec_with_unified;
+    std::shared_ptr<gko::HipExecutor> exec_with_normal;
+    std::shared_ptr<gko::HipExecutor> exec_with_async;
+    std::shared_ptr<gko::HipExecutor> exec_with_unified;
+};
+
+
+TEST_F(Memory, DeviceAllocationWorks)
+{
+    gko::array<int> data{exec_with_normal, {1, 2}};
+
+    GKO_ASSERT_ARRAY_EQ(data, I<int>({1, 2}));
+}
+
+
+TEST_F(Memory, AsyncDeviceAllocationWorks)
+{
+    gko::array<int> data{exec_with_async, {1, 2}};
+
+    GKO_ASSERT_ARRAY_EQ(data, I<int>({1, 2}));
+}
+
+
+TEST_F(Memory, UnifiedDeviceAllocationWorks)
+{
+    gko::array<int> data{exec_with_unified, {1, 2}};
+    exec->synchronize();
+
+    ASSERT_EQ(data.get_const_data()[0], 1);
+    ASSERT_EQ(data.get_const_data()[1], 2);
+}
+
+
+TEST_F(Memory, HostUnifiedAllocationWorks)
+{
+    gko::array<int> data{host_exec_with_unified, {1, 2}};
+
+    ASSERT_EQ(data.get_const_data()[0], 1);
+    ASSERT_EQ(data.get_const_data()[1], 2);
+}
+
+
+TEST_F(Memory, HostPinnedAllocationWorks)
+{
+    gko::array<int> data{host_exec_with_pinned, {1, 2}};
+
+    ASSERT_EQ(data.get_const_data()[0], 1);
+    ASSERT_EQ(data.get_const_data()[1], 2);
+}
+
+
+}  // namespace

From 37524940da45380511987f3791cf3bf6eeb617b9 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Fri, 7 Jul 2023 13:22:33 +0200
Subject: [PATCH 08/14] review updates

* honor allocation_mode for HIP
* use correct allocation flags for cudaMallocManaged allocation_mode
* use valid device_id in moved-from stream wrapper
* add more deprecation warnings for device_reset functionality
* documentation

Co-authored-by: Yuhsiang M. Tsai <yhmtsai@gmail.com>
Co-authored-by: Marcel Koch <marcel.koch@kit.edu>
---
 cuda/base/executor.cpp                |  6 ++++--
 cuda/base/stream.cpp                  |  4 ++--
 hip/base/executor.hip.cpp             | 20 ++++++++++++++++++-
 hip/base/roctx.hip.cpp                |  2 ++
 hip/base/stream.hip.cpp               |  4 ++--
 include/ginkgo/core/base/executor.hpp | 28 +++++++++++++++++++++++++--
 include/ginkgo/core/base/stream.hpp   | 12 ++++++++++--
 7 files changed, 65 insertions(+), 11 deletions(-)

diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index faf90037a0f..fd16815456a 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -67,9 +67,11 @@ std::unique_ptr<CudaAllocatorBase> allocator_from_mode(int device_id,
     case allocation_mode::device:
         return std::make_unique<CudaAllocator>();
     case allocation_mode::unified_global:
-        return std::make_unique<CudaUnifiedAllocator>(device_id);
+        return std::make_unique<CudaUnifiedAllocator>(device_id,
+                                                      cudaMemAttachGlobal);
     case allocation_mode::unified_host:
-        return std::make_unique<CudaUnifiedAllocator>(device_id);
+        return std::make_unique<CudaUnifiedAllocator>(device_id,
+                                                      cudaMemAttachHost);
     default:
         GKO_NOT_SUPPORTED(mode);
     }
diff --git a/cuda/base/stream.cpp b/cuda/base/stream.cpp
index 0bbc9b1cc83..76027bd51e2 100644
--- a/cuda/base/stream.cpp
+++ b/cuda/base/stream.cpp
@@ -45,7 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace gko {
 
 
-cuda_stream::cuda_stream() : stream_{}, device_id_{-1} {}
+cuda_stream::cuda_stream() : stream_{nullptr}, device_id_{} {}
 
 
 cuda_stream::cuda_stream(int device_id) : stream_{}, device_id_(device_id)
@@ -66,7 +66,7 @@ cuda_stream::~cuda_stream()
 
 cuda_stream::cuda_stream(cuda_stream&& other)
     : stream_{std::exchange(other.stream_, nullptr)},
-      device_id_(std::exchange(other.device_id_, -1))
+      device_id_(std::exchange(other.device_id_, 0))
 {}
 
 
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index 2df5c9a4847..a89e765becb 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -56,12 +56,30 @@ namespace gko {
 #include "common/cuda_hip/base/executor.hpp.inc"
 
 
+std::unique_ptr<HipAllocatorBase> allocator_from_mode(int device_id,
+                                                      allocation_mode mode)
+{
+    switch (mode) {
+    case allocation_mode::device:
+        return std::make_unique<HipAllocator>();
+    case allocation_mode::unified_global:
+        return std::make_unique<HipUnifiedAllocator>(device_id,
+                                                     hipMemAttachGlobal);
+    case allocation_mode::unified_host:
+        return std::make_unique<HipUnifiedAllocator>(device_id,
+                                                     hipMemAttachHost);
+    default:
+        GKO_NOT_SUPPORTED(mode);
+    }
+}
+
+
 std::shared_ptr<HipExecutor> HipExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
     allocation_mode alloc_mode, hipStream_t stream)
 {
     return create(device_id, std::move(master),
-                  std::make_shared<HipAllocator>(), stream);
+                  allocator_from_mode(device_id, alloc_mode), stream);
 }
 
 
diff --git a/hip/base/roctx.hip.cpp b/hip/base/roctx.hip.cpp
index a01bc11dc47..23b07e60254 100644
--- a/hip/base/roctx.hip.cpp
+++ b/hip/base/roctx.hip.cpp
@@ -56,6 +56,7 @@ void begin_roctx(const char* name, profile_event_category)
     roctxRangePush(name);
 }
 
+
 void end_roctx(const char*, profile_event_category) { roctxRangePop(); }
 
 #else
@@ -63,6 +64,7 @@ void end_roctx(const char*, profile_event_category) { roctxRangePop(); }
 void begin_roctx(const char* name, profile_event_category)
     GKO_NOT_COMPILED(roctx);
 
+
 void end_roctx(const char*, profile_event_category) GKO_NOT_COMPILED(roctx);
 
 #endif
diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp
index e5817eb9ebd..dc2d99b8b17 100644
--- a/hip/base/stream.hip.cpp
+++ b/hip/base/stream.hip.cpp
@@ -47,7 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace gko {
 
 
-hip_stream::hip_stream() : stream_{}, device_id_{-1} {}
+hip_stream::hip_stream() : stream_{}, device_id_{} {}
 
 
 hip_stream::hip_stream(int device_id) : stream_{}, device_id_(device_id)
@@ -68,7 +68,7 @@ hip_stream::~hip_stream()
 
 hip_stream::hip_stream(hip_stream&& other)
     : stream_{std::exchange(other.stream_, nullptr)},
-      device_id_{std::exchange(other.device_id_, -1)}
+      device_id_{std::exchange(other.device_id_, 0)}
 {}
 
 
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index f033873e392..4f476b9286d 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -1307,14 +1307,38 @@ class EnableDeviceReset {
      *
      * @param device_reset  whether to allow a device reset or not
      */
-    void set_device_reset(bool device_reset) {}
+    [[deprecated(
+        "device_reset is no longer supported, call "
+        "cudaDeviceReset/hipDeviceReset manually")]] void
+    set_device_reset(bool device_reset)
+    {}
 
     /**
      * Returns the current status of the device reset boolean for this executor.
      *
      * @return the current status of the device reset boolean for this executor.
      */
-    bool get_device_reset() { return false; }
+    [[deprecated(
+        "device_reset is no longer supported, call "
+        "cudaDeviceReset/hipDeviceReset manually")]] bool
+    get_device_reset()
+    {
+        return false;
+    }
+
+protected:
+    /**
+     * Instantiate an EnableDeviceReset class
+     *
+     * @param device_reset  the starting device_reset status. Defaults to false.
+     */
+    EnableDeviceReset() {}
+
+    [[deprecated(
+        "device_reset is no longer supported, call "
+        "cudaDeviceReset/hipDeviceReset manually")]] EnableDeviceReset(bool
+                                                                           device_reset)
+    {}
 };
 
 
diff --git a/include/ginkgo/core/base/stream.hpp b/include/ginkgo/core/base/stream.hpp
index 8ee8333e41a..f7d45f59c5a 100644
--- a/include/ginkgo/core/base/stream.hpp
+++ b/include/ginkgo/core/base/stream.hpp
@@ -50,7 +50,11 @@ class cuda_stream {
     /** Creates an empty stream wrapper, representing the default stream. */
     cuda_stream();
 
-    /** Creates a new custom CUDA stream. */
+    /**
+     * Creates a new custom CUDA stream on the given device.
+     *
+     * @param device_id  the device ID to create the stream on.
+     */
     cuda_stream(int device_id);
 
     /** Destroys the custom CUDA stream, if it isn't empty. */
@@ -89,7 +93,11 @@ class hip_stream {
     /** Creates an empty stream wrapper, representing the default stream. */
     hip_stream();
 
-    /** Creates a new custom HIP stream. */
+    /**
+     * Creates a new custom HIP stream on the given device.
+     *
+     * @param device_id  the device ID to create the stream on.
+     */
     hip_stream(int device_id);
 
     /** Destroys the custom HIP stream, if it isn't empty. */

From a5d82f1494b13a35a1cb1a2d6c4e42d8777f274e Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 10 Jul 2023 14:23:11 +0000
Subject: [PATCH 09/14] avoid creating OmpExecutor in tests

---
 core/test/base/executor.cpp                 | 54 ++++++++++-----------
 cuda/test/base/cuda_executor.cu             | 28 +++++------
 cuda/test/base/cuda_executor_topology.cu    | 18 +++----
 hip/test/base/hip_executor.hip.cpp          | 28 +++++------
 hip/test/base/hip_executor_topology.hip.cpp | 19 ++++----
 5 files changed, 74 insertions(+), 73 deletions(-)

diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp
index 13cba09e2b6..a331d8f3485 100644
--- a/core/test/base/executor.cpp
+++ b/core/test/base/executor.cpp
@@ -248,17 +248,17 @@ TEST(ReferenceExecutor, IsItsOwnMaster)
 
 TEST(CudaExecutor, KnowsItsMaster)
 {
-    auto omp = gko::OmpExecutor::create();
-    exec_ptr cuda = gko::CudaExecutor::create(0, omp);
+    auto ref = gko::ReferenceExecutor::create();
+    exec_ptr cuda = gko::CudaExecutor::create(0, ref);
 
-    ASSERT_EQ(omp, cuda->get_master());
+    ASSERT_EQ(ref, cuda->get_master());
 }
 
 
 TEST(CudaExecutor, KnowsItsDeviceId)
 {
-    auto omp = gko::OmpExecutor::create();
-    auto cuda = gko::CudaExecutor::create(0, omp);
+    auto ref = gko::ReferenceExecutor::create();
+    auto cuda = gko::CudaExecutor::create(0, ref);
 
     ASSERT_EQ(0, cuda->get_device_id());
 }
@@ -266,17 +266,17 @@ TEST(CudaExecutor, KnowsItsDeviceId)
 
 TEST(HipExecutor, KnowsItsMaster)
 {
-    auto omp = gko::OmpExecutor::create();
-    exec_ptr hip = gko::HipExecutor::create(0, omp);
+    auto ref = gko::ReferenceExecutor::create();
+    exec_ptr hip = gko::HipExecutor::create(0, ref);
 
-    ASSERT_EQ(omp, hip->get_master());
+    ASSERT_EQ(ref, hip->get_master());
 }
 
 
 TEST(HipExecutor, KnowsItsDeviceId)
 {
-    auto omp = gko::OmpExecutor::create();
-    auto hip = gko::HipExecutor::create(0, omp);
+    auto ref = gko::ReferenceExecutor::create();
+    auto hip = gko::HipExecutor::create(0, ref);
 
     ASSERT_EQ(0, hip->get_device_id());
 }
@@ -284,17 +284,17 @@ TEST(HipExecutor, KnowsItsDeviceId)
 
 TEST(DpcppExecutor, KnowsItsMaster)
 {
-    auto omp = gko::OmpExecutor::create();
-    exec_ptr dpcpp = gko::DpcppExecutor::create(0, omp);
+    auto ref = gko::ReferenceExecutor::create();
+    exec_ptr dpcpp = gko::DpcppExecutor::create(0, ref);
 
-    ASSERT_EQ(omp, dpcpp->get_master());
+    ASSERT_EQ(ref, dpcpp->get_master());
 }
 
 
 TEST(DpcppExecutor, KnowsItsDeviceId)
 {
-    auto omp = gko::OmpExecutor::create();
-    auto dpcpp = gko::DpcppExecutor::create(0, omp);
+    auto ref = gko::ReferenceExecutor::create();
+    auto dpcpp = gko::DpcppExecutor::create(0, ref);
 
     ASSERT_EQ(0, dpcpp->get_device_id());
 }
@@ -304,13 +304,13 @@ TEST(Executor, CanVerifyMemory)
 {
     auto ref = gko::ReferenceExecutor::create();
     auto omp = gko::OmpExecutor::create();
-    auto hip = gko::HipExecutor::create(0, omp);
-    auto cuda = gko::CudaExecutor::create(0, omp);
+    auto hip = gko::HipExecutor::create(0, ref);
+    auto cuda = gko::CudaExecutor::create(0, ref);
     auto omp2 = gko::OmpExecutor::create();
-    auto hip2 = gko::HipExecutor::create(0, omp);
-    auto cuda2 = gko::CudaExecutor::create(0, omp);
-    auto hip_1 = gko::HipExecutor::create(1, omp);
-    auto cuda_1 = gko::CudaExecutor::create(1, omp);
+    auto hip2 = gko::HipExecutor::create(0, ref);
+    auto cuda2 = gko::CudaExecutor::create(0, ref);
+    auto hip_1 = gko::HipExecutor::create(1, ref);
+    auto cuda_1 = gko::CudaExecutor::create(1, ref);
     std::shared_ptr<gko::DpcppExecutor> host_dpcpp;
     std::shared_ptr<gko::DpcppExecutor> cpu_dpcpp;
     std::shared_ptr<gko::DpcppExecutor> gpu_dpcpp;
@@ -318,16 +318,16 @@ TEST(Executor, CanVerifyMemory)
     std::shared_ptr<gko::DpcppExecutor> cpu_dpcpp_dup;
     std::shared_ptr<gko::DpcppExecutor> gpu_dpcpp_dup;
     if (gko::DpcppExecutor::get_num_devices("host")) {
-        host_dpcpp = gko::DpcppExecutor::create(0, omp, "host");
-        host_dpcpp_dup = gko::DpcppExecutor::create(0, omp, "host");
+        host_dpcpp = gko::DpcppExecutor::create(0, ref, "host");
+        host_dpcpp_dup = gko::DpcppExecutor::create(0, ref, "host");
     }
     if (gko::DpcppExecutor::get_num_devices("cpu")) {
-        cpu_dpcpp = gko::DpcppExecutor::create(0, omp, "cpu");
-        cpu_dpcpp_dup = gko::DpcppExecutor::create(0, omp, "cpu");
+        cpu_dpcpp = gko::DpcppExecutor::create(0, ref, "cpu");
+        cpu_dpcpp_dup = gko::DpcppExecutor::create(0, ref, "cpu");
     }
     if (gko::DpcppExecutor::get_num_devices("gpu")) {
-        gpu_dpcpp = gko::DpcppExecutor::create(0, omp, "gpu");
-        gpu_dpcpp_dup = gko::DpcppExecutor::create(0, omp, "gpu");
+        gpu_dpcpp = gko::DpcppExecutor::create(0, ref, "gpu");
+        gpu_dpcpp_dup = gko::DpcppExecutor::create(0, ref, "gpu");
     }
 
     ASSERT_EQ(false, ref->memory_accessible(omp));
diff --git a/cuda/test/base/cuda_executor.cu b/cuda/test/base/cuda_executor.cu
index c81799e0dae..83cfd1827ad 100644
--- a/cuda/test/base/cuda_executor.cu
+++ b/cuda/test/base/cuda_executor.cu
@@ -93,7 +93,7 @@ protected:
           stream(0),
           other_stream(gko::CudaExecutor::get_num_devices() - 1),
 #endif
-          omp(gko::OmpExecutor::create()),
+          ref(gko::ReferenceExecutor::create()),
           cuda(nullptr),
           cuda2(nullptr),
           cuda3(nullptr)
@@ -104,19 +104,19 @@ protected:
         ASSERT_GT(gko::CudaExecutor::get_num_devices(), 0);
 #ifdef GKO_TEST_NONDEFAULT_STREAM
         cuda = gko::CudaExecutor::create(
-            0, omp, std::make_shared<gko::CudaAllocator>(), stream.get());
+            0, ref, std::make_shared<gko::CudaAllocator>(), stream.get());
         cuda2 = gko::CudaExecutor::create(
-            gko::CudaExecutor::get_num_devices() - 1, omp,
+            gko::CudaExecutor::get_num_devices() - 1, ref,
             std::make_shared<gko::CudaAllocator>(), other_stream.get());
         cuda3 = gko::CudaExecutor::create(
-            0, omp, std::make_shared<gko::CudaUnifiedAllocator>(0),
+            0, ref, std::make_shared<gko::CudaUnifiedAllocator>(0),
             stream.get());
 #else
-        cuda = gko::CudaExecutor::create(0, omp);
+        cuda = gko::CudaExecutor::create(0, ref);
         cuda2 = gko::CudaExecutor::create(
-            gko::CudaExecutor::get_num_devices() - 1, omp);
+            gko::CudaExecutor::get_num_devices() - 1, ref);
         cuda3 = gko::CudaExecutor::create(
-            0, omp, std::make_shared<gko::CudaUnifiedAllocator>(0));
+            0, ref, std::make_shared<gko::CudaUnifiedAllocator>(0));
 #endif
     }
 
@@ -132,7 +132,7 @@ protected:
     gko::cuda_stream stream;
     gko::cuda_stream other_stream;
 #endif
-    std::shared_ptr<gko::Executor> omp;
+    std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<gko::CudaExecutor> cuda;
     std::shared_ptr<gko::CudaExecutor> cuda2;
     std::shared_ptr<gko::CudaExecutor> cuda3;
@@ -141,8 +141,8 @@ protected:
 
 TEST_F(CudaExecutor, CanInstantiateTwoExecutorsOnOneDevice)
 {
-    auto cuda = gko::CudaExecutor::create(0, omp);
-    auto cuda2 = gko::CudaExecutor::create(0, omp);
+    auto cuda = gko::CudaExecutor::create(0, ref);
+    auto cuda2 = gko::CudaExecutor::create(0, ref);
 
     // We want automatic deinitialization to not create any error
 }
@@ -197,7 +197,7 @@ TEST_F(CudaExecutor, CopiesDataToCuda)
     int orig[] = {3, 8};
     auto* copy = cuda->alloc<int>(2);
 
-    cuda->copy_from(omp, 2, orig, copy);
+    cuda->copy_from(ref, 2, orig, copy);
 
     check_data<<<1, 1, 0, cuda->get_stream()>>>(copy);
     ASSERT_NO_THROW(cuda->synchronize());
@@ -218,7 +218,7 @@ TEST_F(CudaExecutor, CanAllocateOnUnifiedMemory)
     int orig[] = {3, 8};
     auto* copy = cuda3->alloc<int>(2);
 
-    cuda3->copy_from(omp, 2, orig, copy);
+    cuda3->copy_from(ref, 2, orig, copy);
 
     check_data<<<1, 1, 0, cuda3->get_stream()>>>(copy);
     ASSERT_NO_THROW(cuda3->synchronize());
@@ -240,7 +240,7 @@ TEST_F(CudaExecutor, CopiesDataFromCuda)
     auto orig = cuda->alloc<int>(2);
     init_data<<<1, 1, 0, cuda->get_stream()>>>(orig);
 
-    omp->copy_from(cuda, 2, orig, copy);
+    ref->copy_from(cuda, 2, orig, copy);
 
     EXPECT_EQ(3, copy[0]);
     ASSERT_EQ(8, copy[1]);
@@ -293,7 +293,7 @@ TEST_F(CudaExecutor, CopiesDataFromCudaToCuda)
     cuda2->run(ExampleOperation(value));
     ASSERT_EQ(value, cuda2->get_device_id());
     // Put the results on OpenMP and run CPU side assertions
-    omp->copy_from(cuda2, 2, copy_cuda2, copy);
+    ref->copy_from(cuda2, 2, copy_cuda2, copy);
     EXPECT_EQ(3, copy[0]);
     ASSERT_EQ(8, copy[1]);
     cuda2->free(copy_cuda2);
diff --git a/cuda/test/base/cuda_executor_topology.cu b/cuda/test/base/cuda_executor_topology.cu
index a0ee6826ded..3b91cc7941a 100644
--- a/cuda/test/base/cuda_executor_topology.cu
+++ b/cuda/test/base/cuda_executor_topology.cu
@@ -60,15 +60,15 @@ namespace {
 class CudaExecutor : public ::testing::Test {
 protected:
     CudaExecutor()
-        : omp(gko::OmpExecutor::create()), cuda(nullptr), cuda2(nullptr)
+        : ref(gko::ReferenceExecutor::create()), cuda(nullptr), cuda2(nullptr)
     {}
 
     void SetUp()
     {
         ASSERT_GT(gko::CudaExecutor::get_num_devices(), 0);
-        cuda = gko::CudaExecutor::create(0, omp);
+        cuda = gko::CudaExecutor::create(0, ref);
         cuda2 = gko::CudaExecutor::create(
-            gko::CudaExecutor::get_num_devices() - 1, omp);
+            gko::CudaExecutor::get_num_devices() - 1, ref);
     }
 
     void TearDown()
@@ -79,7 +79,7 @@ protected:
         }
     }
 
-    std::shared_ptr<gko::Executor> omp;
+    std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::CudaExecutor> cuda;
     std::shared_ptr<const gko::CudaExecutor> cuda2;
 };
@@ -102,7 +102,7 @@ inline int get_core_os_id(int log_id)
 
 TEST_F(CudaExecutor, CanBindToSinglePu)
 {
-    cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+    cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create());
 
     const int bind_pu = 1;
     gko::machine_topology::get_instance()->bind_to_pu(bind_pu);
@@ -114,7 +114,7 @@ TEST_F(CudaExecutor, CanBindToSinglePu)
 
 TEST_F(CudaExecutor, CanBindToPus)
 {
-    cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+    cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create());
 
     std::vector<int> bind_pus = {1, 3};
     gko::machine_topology::get_instance()->bind_to_pus(bind_pus);
@@ -126,7 +126,7 @@ TEST_F(CudaExecutor, CanBindToPus)
 
 TEST_F(CudaExecutor, CanBindToCores)
 {
-    cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+    cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create());
 
     std::vector<int> bind_cores = {1, 3};
     gko::machine_topology::get_instance()->bind_to_cores(bind_cores);
@@ -138,7 +138,7 @@ TEST_F(CudaExecutor, CanBindToCores)
 
 TEST_F(CudaExecutor, ClosestCpusIsPopulated)
 {
-    cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+    cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create());
     auto close_cpus = cuda->get_closest_pus();
     if (close_cpus.size() == 0) {
         GTEST_SKIP();
@@ -150,7 +150,7 @@ TEST_F(CudaExecutor, ClosestCpusIsPopulated)
 
 TEST_F(CudaExecutor, KnowsItsNuma)
 {
-    cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create());
+    cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create());
     auto numa0 = cuda->get_closest_numa();
     auto close_cpus = cuda->get_closest_pus();
     if (close_cpus.size() == 0) {
diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp
index e531fa739e6..e63543ef77c 100644
--- a/hip/test/base/hip_executor.hip.cpp
+++ b/hip/test/base/hip_executor.hip.cpp
@@ -98,7 +98,7 @@ class HipExecutor : public ::testing::Test {
           stream(0),
           other_stream(gko::HipExecutor::get_num_devices() - 1),
 #endif
-          omp(gko::OmpExecutor::create()),
+          ref(gko::ReferenceExecutor::create()),
           hip(nullptr),
           hip2(nullptr),
           hip3(nullptr)
@@ -109,17 +109,17 @@ class HipExecutor : public ::testing::Test {
         ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
 #ifdef GKO_TEST_NONDEFAULT_STREAM
         hip = gko::HipExecutor::create(
-            0, omp, std::make_shared<gko::HipAllocator>(), stream.get());
+            0, ref, std::make_shared<gko::HipAllocator>(), stream.get());
         hip2 = gko::HipExecutor::create(
-            gko::HipExecutor::get_num_devices() - 1, omp,
+            gko::HipExecutor::get_num_devices() - 1, ref,
             std::make_shared<gko::HipAllocator>(), other_stream.get());
         hip3 = gko::HipExecutor::create(
-            0, omp, std::make_shared<gko::HipAllocator>(), stream.get());
+            0, ref, std::make_shared<gko::HipAllocator>(), stream.get());
 #else
-        hip = gko::HipExecutor::create(0, omp);
+        hip = gko::HipExecutor::create(0, ref);
         hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1,
-                                        omp);
-        hip3 = gko::HipExecutor::create(0, omp,
+                                        ref);
+        hip3 = gko::HipExecutor::create(0, ref,
                                         std::make_shared<gko::HipAllocator>());
 #endif
     }
@@ -136,7 +136,7 @@ class HipExecutor : public ::testing::Test {
     gko::hip_stream stream;
     gko::hip_stream other_stream;
 #endif
-    std::shared_ptr<gko::Executor> omp;
+    std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<gko::HipExecutor> hip;
     std::shared_ptr<gko::HipExecutor> hip2;
     std::shared_ptr<gko::HipExecutor> hip3;
@@ -145,8 +145,8 @@ class HipExecutor : public ::testing::Test {
 
 TEST_F(HipExecutor, CanInstantiateTwoExecutorsOnOneDevice)
 {
-    auto hip = gko::HipExecutor::create(0, omp);
-    auto hip2 = gko::HipExecutor::create(0, omp);
+    auto hip = gko::HipExecutor::create(0, ref);
+    auto hip2 = gko::HipExecutor::create(0, ref);
 
     // We want automatic deinitialization to not create any error
 }
@@ -204,7 +204,7 @@ TEST_F(HipExecutor, CopiesDataToHip)
     int orig[] = {3, 8};
     auto* copy = hip->alloc<int>(2);
 
-    hip->copy_from(omp, 2, orig, copy);
+    hip->copy_from(ref, 2, orig, copy);
 
     check_data<<<1, 1, 0, hip->get_stream()>>>(copy);
     ASSERT_NO_THROW(hip->synchronize());
@@ -232,7 +232,7 @@ TEST_F(HipExecutor, CanAllocateOnUnifiedMemory)
     int orig[] = {3, 8};
     auto* copy = hip3->alloc<int>(2);
 
-    hip3->copy_from(omp, 2, orig, copy);
+    hip3->copy_from(ref, 2, orig, copy);
 
     check_data<<<1, 1, 0, hip3->get_stream()>>>(copy);
     ASSERT_NO_THROW(hip3->synchronize());
@@ -257,7 +257,7 @@ TEST_F(HipExecutor, CopiesDataFromHip)
     auto orig = hip->alloc<int>(2);
     init_data<<<1, 1, 0, hip->get_stream()>>>(orig);
 
-    omp->copy_from(hip, 2, orig, copy);
+    ref->copy_from(hip, 2, orig, copy);
 
     EXPECT_EQ(3, copy[0]);
     ASSERT_EQ(8, copy[1]);
@@ -310,7 +310,7 @@ TEST_F(HipExecutor, CopiesDataFromHipToHip)
     hip2->run(ExampleOperation(value));
     ASSERT_EQ(value, hip2->get_device_id());
     // Put the results on OpenMP and run CPU side assertions
-    omp->copy_from(hip2, 2, copy_hip2, copy);
+    ref->copy_from(hip2, 2, copy_hip2, copy);
     EXPECT_EQ(3, copy[0]);
     ASSERT_EQ(8, copy[1]);
     hip2->free(copy_hip2);
diff --git a/hip/test/base/hip_executor_topology.hip.cpp b/hip/test/base/hip_executor_topology.hip.cpp
index 394b2776319..3d6e3f2bddc 100644
--- a/hip/test/base/hip_executor_topology.hip.cpp
+++ b/hip/test/base/hip_executor_topology.hip.cpp
@@ -65,15 +65,16 @@ namespace {
 
 class HipExecutor : public ::testing::Test {
 protected:
-    HipExecutor() : omp(gko::OmpExecutor::create()), hip(nullptr), hip2(nullptr)
+    HipExecutor()
+        : ref(gko::ReferenceExecutor::create()), hip(nullptr), hip2(nullptr)
     {}
 
     void SetUp()
     {
         ASSERT_GT(gko::HipExecutor::get_num_devices(), 0);
-        hip = gko::HipExecutor::create(0, omp);
+        hip = gko::HipExecutor::create(0, ref);
         hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1,
-                                        omp);
+                                        ref);
     }
 
     void TearDown()
@@ -84,7 +85,7 @@ class HipExecutor : public ::testing::Test {
         }
     }
 
-    std::shared_ptr<gko::Executor> omp;
+    std::shared_ptr<gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::HipExecutor> hip;
     std::shared_ptr<const gko::HipExecutor> hip2;
 };
@@ -107,7 +108,7 @@ inline int get_core_os_id(int log_id)
 
 TEST_F(HipExecutor, CanBindToSinglePu)
 {
-    hip = gko::HipExecutor::create(0, gko::OmpExecutor::create());
+    hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create());
 
     const int bind_pu = 1;
     gko::machine_topology::get_instance()->bind_to_pu(bind_pu);
@@ -119,7 +120,7 @@ TEST_F(HipExecutor, CanBindToSinglePu)
 
 TEST_F(HipExecutor, CanBindToPus)
 {
-    hip = gko::HipExecutor::create(0, gko::OmpExecutor::create());
+    hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create());
 
     std::vector<int> bind_pus = {1, 3};
     gko::machine_topology::get_instance()->bind_to_pus(bind_pus);
@@ -131,7 +132,7 @@ TEST_F(HipExecutor, CanBindToPus)
 
 TEST_F(HipExecutor, CanBindToCores)
 {
-    hip = gko::HipExecutor::create(0, gko::OmpExecutor::create());
+    hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create());
 
     std::vector<int> bind_cores = {1, 3};
     gko::machine_topology::get_instance()->bind_to_cores(bind_cores);
@@ -143,7 +144,7 @@ TEST_F(HipExecutor, CanBindToCores)
 
 TEST_F(HipExecutor, ClosestCpusIsPopulated)
 {
-    hip = gko::HipExecutor::create(0, gko::OmpExecutor::create());
+    hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create());
     auto close_cpus = hip->get_closest_pus();
     if (close_cpus.size() == 0) {
         GTEST_SKIP();
@@ -155,7 +156,7 @@ TEST_F(HipExecutor, ClosestCpusIsPopulated)
 
 TEST_F(HipExecutor, KnowsItsNuma)
 {
-    hip = gko::HipExecutor::create(0, gko::OmpExecutor::create());
+    hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create());
     auto numa0 = hip->get_closest_numa();
     auto close_cpus = hip->get_closest_pus();
     if (close_cpus.size() == 0) {

From d925a80faf15a7a84ff9a7771faf4b435c52942c Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 10 Jul 2023 14:24:15 +0000
Subject: [PATCH 10/14] warn if using unsupported allocator

---
 cuda/base/memory.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp
index 08c64c0ba05..f605d9135ea 100644
--- a/cuda/base/memory.cpp
+++ b/cuda/base/memory.cpp
@@ -121,7 +121,13 @@ void CudaAsyncAllocator::deallocate(void* ptr)
 #else  // Fall back to regular allocation
 
 
-CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} {}
+CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream}
+{
+#if GKO_VERBOSE_LEVEL >= 1
+    std::cerr << "This version of CUDA does not support cudaMallocAsync, "
+                 "please use CudaAllocator instead of CudaAsyncAllocator.\n";
+#endif
+}
 
 
 void* CudaAsyncAllocator::allocate(size_type num_bytes)

From f61f9122012348f56996ce4073972388c9921271 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <mail@ribizel.de>
Date: Mon, 10 Jul 2023 14:27:48 +0000
Subject: [PATCH 11/14] improve documentation

Co-authored-by: Yuhsiang M. Tsai <yhmtsai@gmail.com>
Co-authored-by: Pratik Nayak <pratik.nayak@kit.edu>
---
 include/ginkgo/core/base/memory.hpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp
index 1086c9aacb4..f421abf7da4 100644
--- a/include/ginkgo/core/base/memory.hpp
+++ b/include/ginkgo/core/base/memory.hpp
@@ -76,6 +76,7 @@ class CudaAllocatorBase : public Allocator {
      *
      * @param device_id the device ID the allocator will be used in.
      * @param stream the stream the allocator will be used with.
+     *
      * @return true if and only if the allocator can be used by CudaExecutor in
      *         the given environment.
      */
@@ -100,7 +101,8 @@ class HipAllocatorBase : public Allocator {
      *
      * @param device_id the device ID the allocator will be used in.
      * @param stream the stream the allocator will be used with.
-     * @return true if and only if the allocator can be used by CudaExecutor in
+     *
+     * @return true if and only if the allocator can be used by HipExecutor in
      *         the given environment.
      */
     virtual bool check_environment(int device_id,

From 580219a908fbf06b66a0caa6790043553bc93f83 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Wed, 19 Jul 2023 11:46:47 +0200
Subject: [PATCH 12/14] fix HIP requirements for stream-ordered allocation

---
 hip/base/memory.hip.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp
index 86ac31c3154..be795bb3397 100644
--- a/hip/base/memory.hip.cpp
+++ b/hip/base/memory.hip.cpp
@@ -97,7 +97,7 @@ void HipAllocator::deallocate(void* dev_ptr)
 }
 
 
-#if HIP_VERSION_MAJOR >= 5
+#if HIP_VERSION >= 50200000
 
 
 HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream} {}
@@ -121,7 +121,13 @@ void HipAsyncAllocator::deallocate(void* ptr)
 #else  // Fall back to regular allocation
 
 
-HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream} {}
+HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream}
+{
+#if GKO_VERBOSE_LEVEL >= 1
+    std::cerr << "This version of HIP does not support hipMallocAsync, "
+                 "please use HipAllocator instead of HipAsyncAllocator.\n";
+#endif
+}
 
 
 void* HipAsyncAllocator::allocate(size_type num_bytes)

From 54ccdafb954f0240a7eb7101326eedf404a85055 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Wed, 19 Jul 2023 12:13:05 +0200
Subject: [PATCH 13/14] use unified allocator in some HIP tests

---
 hip/test/base/hip_executor.hip.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp
index e63543ef77c..42499384704 100644
--- a/hip/test/base/hip_executor.hip.cpp
+++ b/hip/test/base/hip_executor.hip.cpp
@@ -114,13 +114,14 @@ class HipExecutor : public ::testing::Test {
             gko::HipExecutor::get_num_devices() - 1, ref,
             std::make_shared<gko::HipAllocator>(), other_stream.get());
         hip3 = gko::HipExecutor::create(
-            0, ref, std::make_shared<gko::HipAllocator>(), stream.get());
+            0, ref, std::make_shared<gko::HipUnifiedAllocator>(0),
+            stream.get());
 #else
         hip = gko::HipExecutor::create(0, ref);
         hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1,
                                         ref);
-        hip3 = gko::HipExecutor::create(0, ref,
-                                        std::make_shared<gko::HipAllocator>());
+        hip3 = gko::HipExecutor::create(
+            0, ref, std::make_shared<gko::HipUnifiedAllocator>(0));
 #endif
     }
 

From 33e29c3390ac2e4c53a90ccada3b4ad6cde68d1f Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Wed, 19 Jul 2023 13:52:49 +0200
Subject: [PATCH 14/14] resolve ambiguous symbol

---
 cuda/base/executor.cpp    | 8 ++++----
 hip/base/executor.hip.cpp | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index fd16815456a..f296fb9da86 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -60,8 +60,8 @@ namespace gko {
 #include "common/cuda_hip/base/executor.hpp.inc"
 
 
-std::unique_ptr<CudaAllocatorBase> allocator_from_mode(int device_id,
-                                                       allocation_mode mode)
+std::unique_ptr<CudaAllocatorBase> cuda_allocator_from_mode(
+    int device_id, allocation_mode mode)
 {
     switch (mode) {
     case allocation_mode::device:
@@ -82,8 +82,8 @@ std::shared_ptr<CudaExecutor> CudaExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
     allocation_mode alloc_mode, cudaStream_t stream)
 {
-    return create(device_id, master, allocator_from_mode(device_id, alloc_mode),
-                  stream);
+    return create(device_id, master,
+                  cuda_allocator_from_mode(device_id, alloc_mode), stream);
 }
 
 
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index a89e765becb..8d175c0e424 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -56,8 +56,8 @@ namespace gko {
 #include "common/cuda_hip/base/executor.hpp.inc"
 
 
-std::unique_ptr<HipAllocatorBase> allocator_from_mode(int device_id,
-                                                      allocation_mode mode)
+std::unique_ptr<HipAllocatorBase> hip_allocator_from_mode(int device_id,
+                                                          allocation_mode mode)
 {
     switch (mode) {
     case allocation_mode::device:
@@ -79,7 +79,7 @@ std::shared_ptr<HipExecutor> HipExecutor::create(
     allocation_mode alloc_mode, hipStream_t stream)
 {
     return create(device_id, std::move(master),
-                  allocator_from_mode(device_id, alloc_mode), stream);
+                  hip_allocator_from_mode(device_id, alloc_mode), stream);
 }