From 9fd6482ce850497e84cabc7d61f423d024624ba0 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Thu, 6 Jul 2023 15:45:46 +0200
Subject: [PATCH] update interface and remove DPC++ allocator

---
 core/base/memory.cpp                  |   4 +-
 core/device_hooks/cuda_hooks.cpp      |  32 ++++--
 core/device_hooks/dpcpp_hooks.cpp     |  28 -----
 core/device_hooks/hip_hooks.cpp       |  51 ++++++++-
 core/test/base/executor.cpp           |   2 +-
 cuda/base/executor.cpp                |   9 +-
 cuda/base/memory.cpp                  |  43 ++++++--
 dpcpp/CMakeLists.txt                  |   1 -
 dpcpp/base/memory.dp.cpp              |  85 ---------------
 dpcpp/test/base/CMakeLists.txt        |   1 -
 dpcpp/test/base/memory.dp.cpp         |  98 -----------------
 hip/base/executor.hip.cpp             |   9 +-
 hip/base/memory.hip.cpp               | 123 ++++++++++++++++++++-
 include/ginkgo/core/base/executor.hpp |  33 ++----
 include/ginkgo/core/base/memory.hpp   | 151 ++++++++++++++++++--------
 15 files changed, 348 insertions(+), 322 deletions(-)
 delete mode 100644 dpcpp/base/memory.dp.cpp
 delete mode 100644 dpcpp/test/base/memory.dp.cpp

diff --git a/core/base/memory.cpp b/core/base/memory.cpp
index 4e9f0b7e24a..b6c6f8f265c 100644
--- a/core/base/memory.cpp
+++ b/core/base/memory.cpp
@@ -42,7 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace gko {
 
 
-void* CpuAllocator::allocate(size_type num_bytes) const
+void* CpuAllocator::allocate(size_type num_bytes)
 {
     auto ptr = ::operator new (num_bytes, std::nothrow_t{});
     GKO_ENSURE_ALLOCATED(ptr, "cpu", num_bytes);
@@ -50,7 +50,7 @@ void* CpuAllocator::allocate(size_type num_bytes) const
 }
 
 
-void CpuAllocator::deallocate(void* ptr) const
+void CpuAllocator::deallocate(void* ptr)
 {
     ::operator delete (ptr, std::nothrow_t{});
 }
diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp
index f8489908cc9..03ab12deb46 100644
--- a/core/device_hooks/cuda_hooks.cpp
+++ b/core/device_hooks/cuda_hooks.cpp
@@ -54,43 +54,55 @@ version version_info::get_cuda_version() noexcept
 }
 
 
-void* CudaAllocator::allocate(size_type num_bytes) const GKO_NOT_COMPILED(cuda);
+void* CudaAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(cuda);
 
 
-void CudaAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda);
+void CudaAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda);
 
 
 CudaAsyncAllocator::CudaAsyncAllocator(CUstream_st* stream)
     GKO_NOT_COMPILED(cuda);
 
 
-void* CudaAsyncAllocator::allocate(size_type num_bytes) const
-    GKO_NOT_COMPILED(cuda);
+void* CudaAsyncAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(cuda);
+
 
+void CudaAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda);
 
-void CudaAsyncAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda);
+
+bool CudaAsyncAllocator::check_environment(int device_id,
+                                           CUstream_st* stream) const
+    GKO_NOT_COMPILED(cuda);
 
 
 CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id, unsigned int flags)
     GKO_NOT_COMPILED(cuda);
 
 
-void* CudaUnifiedAllocator::allocate(size_type num_bytes) const
+void* CudaUnifiedAllocator::allocate(size_type num_bytes)
     GKO_NOT_COMPILED(cuda);
 
 
-void CudaUnifiedAllocator::deallocate(void* dev_ptr) const
+void CudaUnifiedAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda);
+
+
+bool CudaUnifiedAllocator::check_environment(int device_id,
+                                             CUstream_st* stream) const
     GKO_NOT_COMPILED(cuda);
 
 
 CudaHostAllocator::CudaHostAllocator(int device_id) GKO_NOT_COMPILED(cuda);
 
 
-void* CudaHostAllocator::allocate(size_type num_bytes) const
-    GKO_NOT_COMPILED(cuda);
+void* CudaHostAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(cuda);
+
 
+void CudaHostAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda);
 
-void CudaHostAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda);
+
+bool CudaHostAllocator::check_environment(int device_id,
+                                          CUstream_st* stream) const
+    GKO_NOT_COMPILED(cuda);
 
 
 std::shared_ptr<CudaExecutor> CudaExecutor::create(
diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp
index 1981c712872..532e9c55bbe 100644
--- a/core/device_hooks/dpcpp_hooks.cpp
+++ b/core/device_hooks/dpcpp_hooks.cpp
@@ -53,34 +53,6 @@ version version_info::get_dpcpp_version() noexcept
 }
 
 
-DpcppAllocatorBase::DpcppAllocatorBase(sycl::queue*) GKO_NOT_COMPILED(dpcpp);
-
-
-void* DpcppAllocatorBase::allocate(size_type num_bytes) const
-    GKO_NOT_COMPILED(dpcpp);
-
-
-void DpcppAllocatorBase::deallocate(void* ptr) const GKO_NOT_COMPILED(dpcpp);
-
-
-void* DpcppAllocator::allocate_impl(sycl::queue* queue,
-                                    size_type num_bytes) const
-    GKO_NOT_COMPILED(dpcpp);
-
-
-void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const
-    GKO_NOT_COMPILED(dpcpp);
-
-
-void* DpcppUnifiedAllocator::allocate_impl(sycl::queue* queue,
-                                           size_type num_bytes) const
-    GKO_NOT_COMPILED(dpcpp);
-
-
-void DpcppUnifiedAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const
-    GKO_NOT_COMPILED(dpcpp);
-
-
 std::shared_ptr<DpcppExecutor> DpcppExecutor::create(
     int device_id, std::shared_ptr<Executor> master, std::string device_type,
     dpcpp_queue_property property)
diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp
index ba7563f1ef0..dec1de15933 100644
--- a/core/device_hooks/hip_hooks.cpp
+++ b/core/device_hooks/hip_hooks.cpp
@@ -55,10 +55,54 @@ version version_info::get_hip_version() noexcept
 }
 
 
-void* HipAllocator::allocate(size_type num_bytes) const GKO_NOT_COMPILED(hip);
+void* HipAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
 
 
-void HipAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(hip);
+void HipAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
+
+
+HipAsyncAllocator::HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream)
+    GKO_NOT_COMPILED(hip);
+
+
+void* HipAsyncAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
+
+
+void HipAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
+
+
+bool HipAsyncAllocator::check_environment(int device_id,
+                                          GKO_HIP_STREAM_STRUCT* stream) const
+    GKO_NOT_COMPILED(hip);
+
+
+HipUnifiedAllocator::HipUnifiedAllocator(int device_id, unsigned int flags)
+    GKO_NOT_COMPILED(hip);
+
+
+void* HipUnifiedAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
+
+
+void HipUnifiedAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
+
+
+bool HipUnifiedAllocator::check_environment(int device_id,
+                                            GKO_HIP_STREAM_STRUCT* stream) const
+    GKO_NOT_COMPILED(hip);
+
+
+HipHostAllocator::HipHostAllocator(int device_id) GKO_NOT_COMPILED(hip);
+
+
+void* HipHostAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip);
+
+
+void HipHostAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip);
+
+
+bool HipHostAllocator::check_environment(int device_id,
+                                         GKO_HIP_STREAM_STRUCT* stream) const
+    GKO_NOT_COMPILED(hip);
 
 
 std::shared_ptr<HipExecutor> HipExecutor::create(
@@ -76,8 +120,7 @@ std::shared_ptr<HipExecutor> HipExecutor::create(
     std::shared_ptr<HipAllocatorBase> alloc, GKO_HIP_STREAM_STRUCT* stream)
 {
     return std::shared_ptr<HipExecutor>(
-        new HipExecutor(device_id, std::move(master),
-                        std::make_shared<HipAllocator>(), stream));
+        new HipExecutor(device_id, std::move(master), alloc, stream));
 }
 
 
diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp
index 94e7bc02d79..13cba09e2b6 100644
--- a/core/test/base/executor.cpp
+++ b/core/test/base/executor.cpp
@@ -386,7 +386,7 @@ TEST(Executor, CanVerifyMemory)
 
 
 struct MockAllocator : gko::CpuAllocator {
-    void deallocate(void* ptr) const noexcept override
+    void deallocate(void* ptr) noexcept override
     {
         called_free = true;
         CpuAllocator::deallocate(ptr);
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index f6e838dd2dd..faf90037a0f 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -80,9 +80,8 @@ std::shared_ptr<CudaExecutor> CudaExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
     allocation_mode alloc_mode, cudaStream_t stream)
 {
-    return std::shared_ptr<CudaExecutor>(
-        new CudaExecutor(device_id, std::move(master),
-                         allocator_from_mode(device_id, alloc_mode), stream));
+    return create(device_id, master, allocator_from_mode(device_id, alloc_mode),
+                  stream);
 }
 
 
@@ -90,6 +89,10 @@ std::shared_ptr<CudaExecutor> CudaExecutor::create(
     int device_id, std::shared_ptr<Executor> master,
     std::shared_ptr<CudaAllocatorBase> alloc, cudaStream_t stream)
 {
+    if (!alloc->check_environment(device_id, stream)) {
+        throw Error{__FILE__, __LINE__,
+                    "Allocator uses incorrect stream or device ID."};
+    }
     return std::shared_ptr<CudaExecutor>(new CudaExecutor(
         device_id, std::move(master), std::move(alloc), stream));
 }
diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp
index afc1f9f62fa..08c64c0ba05 100644
--- a/cuda/base/memory.cpp
+++ b/cuda/base/memory.cpp
@@ -82,7 +82,7 @@ namespace gko {
 #endif
 
 
-void* CudaAllocator::allocate(size_type num_bytes) const
+void* CudaAllocator::allocate(size_type num_bytes)
 {
     void* ptr{};
     GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMalloc(&ptr, num_bytes),
@@ -91,7 +91,7 @@ void* CudaAllocator::allocate(size_type num_bytes) const
 }
 
 
-void CudaAllocator::deallocate(void* ptr) const
+void CudaAllocator::deallocate(void* ptr)
 {
     GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr));
 }
@@ -103,7 +103,7 @@ void CudaAllocator::deallocate(void* ptr) const
 CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} {}
 
 
-void* CudaAsyncAllocator::allocate(size_type num_bytes) const
+void* CudaAsyncAllocator::allocate(size_type num_bytes)
 {
     void* ptr{};
     GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(
@@ -112,7 +112,7 @@ void* CudaAsyncAllocator::allocate(size_type num_bytes) const
 }
 
 
-void CudaAsyncAllocator::deallocate(void* ptr) const
+void CudaAsyncAllocator::deallocate(void* ptr)
 {
     GKO_EXIT_ON_CUDA_ERROR(cudaFreeAsync(ptr, stream_));
 }
@@ -121,10 +121,10 @@ void CudaAsyncAllocator::deallocate(void* ptr) const
 #else  // Fall back to regular allocation
 
 
-CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{} {}
+CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} {}
 
 
-void* CudaAsyncAllocator::allocate(size_type num_bytes) const
+void* CudaAsyncAllocator::allocate(size_type num_bytes)
 {
     void* ptr{};
     GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMalloc(&ptr, num_bytes),
@@ -133,7 +133,7 @@ void* CudaAsyncAllocator::allocate(size_type num_bytes) const
 }
 
 
-void CudaAsyncAllocator::deallocate(void* ptr) const
+void CudaAsyncAllocator::deallocate(void* ptr)
 {
     GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr));
 }
@@ -142,6 +142,13 @@ void CudaAsyncAllocator::deallocate(void* ptr) const
 #endif
 
 
+bool CudaAsyncAllocator::check_environment(int device_id,
+                                           CUstream_st* stream) const
+{
+    return stream == stream_;
+}
+
+
 CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id)
     : CudaUnifiedAllocator{device_id, cudaMemAttachGlobal}
 {}
@@ -152,7 +159,7 @@ CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id, unsigned int flags)
 {}
 
 
-void* CudaUnifiedAllocator::allocate(size_type num_bytes) const
+void* CudaUnifiedAllocator::allocate(size_type num_bytes)
 {
     // we need to set the device ID in case this gets used in a host executor
     detail::cuda_scoped_device_id_guard g(device_id_);
@@ -163,7 +170,7 @@ void* CudaUnifiedAllocator::allocate(size_type num_bytes) const
 }
 
 
-void CudaUnifiedAllocator::deallocate(void* ptr) const
+void CudaUnifiedAllocator::deallocate(void* ptr)
 {
     // we need to set the device ID in case this gets used in a host executor
     detail::cuda_scoped_device_id_guard g(device_id_);
@@ -171,10 +178,17 @@ void CudaUnifiedAllocator::deallocate(void* ptr) const
 }
 
 
+bool CudaUnifiedAllocator::check_environment(int device_id,
+                                             CUstream_st* stream) const
+{
+    return device_id == device_id_;
+}
+
+
 CudaHostAllocator::CudaHostAllocator(int device_id) : device_id_{device_id} {}
 
 
-void* CudaHostAllocator::allocate(size_type num_bytes) const
+void* CudaHostAllocator::allocate(size_type num_bytes)
 {
     // we need to set the device ID in case this gets used in a host executor
     detail::cuda_scoped_device_id_guard g(device_id_);
@@ -185,7 +199,7 @@ void* CudaHostAllocator::allocate(size_type num_bytes) const
 }
 
 
-void CudaHostAllocator::deallocate(void* ptr) const
+void CudaHostAllocator::deallocate(void* ptr)
 {
     // we need to set the device ID in case this gets used in a host executor
     detail::cuda_scoped_device_id_guard g(device_id_);
@@ -193,4 +207,11 @@ void CudaHostAllocator::deallocate(void* ptr) const
 }
 
 
+bool CudaHostAllocator::check_environment(int device_id,
+                                          CUstream_st* stream) const
+{
+    return device_id == device_id_;
+}
+
+
 }  // namespace gko
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 55763ca5525..31b5e0543ba 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -10,7 +10,6 @@ target_sources(ginkgo_dpcpp
     base/executor.dp.cpp
     base/helper.dp.cpp
     base/index_set_kernels.dp.cpp
-    base/memory.dp.cpp
     base/scoped_device_id.dp.cpp
     base/timer.dp.cpp
     base/version.dp.cpp
diff --git a/dpcpp/base/memory.dp.cpp b/dpcpp/base/memory.dp.cpp
deleted file mode 100644
index 2582fa331a0..00000000000
--- a/dpcpp/base/memory.dp.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2023, the Ginkgo authors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-******************************<GINKGO LICENSE>*******************************/
-
-#include <ginkgo/core/base/memory.hpp>
-
-
-#include <CL/sycl.hpp>
-
-
-namespace gko {
-
-
-DpcppAllocatorBase::DpcppAllocatorBase(sycl::queue* queue) : queue_{queue} {}
-
-
-void* DpcppAllocatorBase::allocate(size_type num_bytes) const
-{
-    return this->allocate_impl(queue_, num_bytes);
-}
-
-
-void DpcppAllocatorBase::deallocate(void* ptr) const
-{
-    this->deallocate_impl(queue_, ptr);
-}
-
-
-void* DpcppAllocator::allocate_impl(sycl::queue* queue,
-                                    size_type num_bytes) const
-{
-    return sycl::malloc_device(num_bytes, *queue);
-}
-
-
-void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const
-{
-    queue->wait_and_throw();
-    sycl::free(ptr, queue->get_context());
-}
-
-
-void* DpcppUnifiedAllocator::allocate_impl(sycl::queue* queue,
-                                           size_type num_bytes)
-{
-    return sycl::malloc_shared(num_bytes, *queue);
-}
-
-
-void DpcppUnifiedAllocator::deallocate_impl(sycl::queue* queue, void* ptr)
-{
-    queue->wait_and_throw();
-    sycl::free(ptr, queue->get_context());
-}
-
-
-}  // namespace gko
diff --git a/dpcpp/test/base/CMakeLists.txt b/dpcpp/test/base/CMakeLists.txt
index 5c0ca601f04..bb9c8a75050 100644
--- a/dpcpp/test/base/CMakeLists.txt
+++ b/dpcpp/test/base/CMakeLists.txt
@@ -3,4 +3,3 @@ ginkgo_create_dpcpp_test(dim3)
 ginkgo_create_dpcpp_test(kernel_launch)
 # set correct flags for kernel_launch.hpp
 target_compile_definitions(dpcpp_test_base_kernel_launch PRIVATE GKO_COMPILING_DPCPP)
-ginkgo_create_dpcpp_test(memory)
\ No newline at end of file
diff --git a/dpcpp/test/base/memory.dp.cpp b/dpcpp/test/base/memory.dp.cpp
deleted file mode 100644
index e587660cde3..00000000000
--- a/dpcpp/test/base/memory.dp.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/*******************************<GINKGO LICENSE>******************************
-Copyright (c) 2017-2023, the Ginkgo authors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-******************************<GINKGO LICENSE>*******************************/
-
-#include <ginkgo/core/base/memory.hpp>
-
-
-#include <memory>
-#include <type_traits>
-
-
-#include <gtest/gtest.h>
-
-
-#include <ginkgo/core/base/exception.hpp>
-#include <ginkgo/core/base/exception_helpers.hpp>
-#include <ginkgo/core/base/executor.hpp>
-
-
-#include "dpcpp/test/utils.hpp"
-
-
-namespace {
-
-
-class Memory : public ::testing::Test {
-protected:
-    Memory()
-        : exec{gko::DpcppExecutor::create(0, gko::OmpExecutor::create())},
-          host_exec_with_unified{gko::OmpExecutor::create(
-              std::make_shared<gko::DpcppUnifiedAllocator>(exec->get_queue()))},
-          exec_with_unified{gko::DpcppExecutor::create(
-              exec->get_queue(), host_exec_with_unified,
-              std::make_shared<gko::DpcppUnifiedAllocator>(exec->get_queue()))}
-    {}
-
-    std::shared_ptr<gko::DpcppExecutor> exec;
-    std::shared_ptr<gko::OmpExecutor> host_exec_with_unified;
-    std::shared_ptr<gko::DpcppExecutor> exec_with_unified;
-};
-
-
-TEST_F(Memory, DeviceAllocationWorks)
-{
-    gko::array<int> data{exec, {1, 2}};
-
-    GKO_ASSERT_ARRAY_EQ(data, I<int>({1, 2}));
-}
-
-
-TEST_F(Memory, UnifiedDeviceAllocationWorks)
-{
-    gko::array<int> data{exec_with_unified, {1, 2}};
-    exec->synchronize();
-
-    ASSERT_EQ(data.get_const_data()[0], 1);
-    ASSERT_EQ(data.get_const_data()[1], 2);
-}
-
-
-TEST_F(Memory, HostUnifiedAllocationWorks)
-{
-    gko::array<int> data{host_exec_with_unified, {1, 2}};
-
-    ASSERT_EQ(data.get_const_data()[0], 1);
-    ASSERT_EQ(data.get_const_data()[1], 2);
-}
-
-
-}  // namespace
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index 6b4b0fd5ddc..2df5c9a4847 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -60,9 +60,8 @@ std::shared_ptr<HipExecutor> HipExecutor::create(
     int device_id, std::shared_ptr<Executor> master, bool device_reset,
     allocation_mode alloc_mode, hipStream_t stream)
 {
-    return std::shared_ptr<HipExecutor>(
-        new HipExecutor(device_id, std::move(master),
-                        std::make_shared<HipAllocator>(), stream));
+    return create(device_id, std::move(master),
+                  std::make_shared<HipAllocator>(), stream);
 }
 
 
@@ -70,6 +69,10 @@ std::shared_ptr<HipExecutor> HipExecutor::create(
     int device_id, std::shared_ptr<Executor> master,
     std::shared_ptr<HipAllocatorBase> alloc, hipStream_t stream)
 {
+    if (!alloc->check_environment(device_id, stream)) {
+        throw Error{__FILE__, __LINE__,
+                    "Allocator uses incorrect stream or device ID."};
+    }
     return std::shared_ptr<HipExecutor>(new HipExecutor(
         device_id, std::move(master), std::move(alloc), stream));
 }
diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp
index f2a8977525f..7acb208173a 100644
--- a/hip/base/memory.hip.cpp
+++ b/hip/base/memory.hip.cpp
@@ -39,6 +39,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/exception_helpers.hpp>
 
 
+#include "hip/base/scoped_device_id.hip.hpp"
+
+
 namespace gko {
 
 
@@ -79,7 +82,7 @@ namespace gko {
 #endif
 
 
-void* HipAllocator::allocate(size_type num_bytes) const
+void* HipAllocator::allocate(size_type num_bytes)
 {
     void* dev_ptr{};
     GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(hipMalloc(&dev_ptr, num_bytes),
@@ -88,10 +91,126 @@ void* HipAllocator::allocate(size_type num_bytes) const
 }
 
 
-void HipAllocator::deallocate(void* dev_ptr) const
+void HipAllocator::deallocate(void* dev_ptr)
 {
     GKO_EXIT_ON_HIP_ERROR(hipFree(dev_ptr));
 }
 
 
+#if HIP_VERSION_MAJOR >= 5
+
+
+HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream} {}
+
+
+void* HipAsyncAllocator::allocate(size_type num_bytes)
+{
+    void* ptr{};
+    GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(
+        hipMallocAsync(&ptr, num_bytes, stream_), num_bytes);
+    return ptr;
+}
+
+
+void HipAsyncAllocator::deallocate(void* ptr)
+{
+    GKO_EXIT_ON_HIP_ERROR(hipFreeAsync(ptr, stream_));
+}
+
+
+#else  // Fall back to regular allocation
+
+
+HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream} {}
+
+
+void* HipAsyncAllocator::allocate(size_type num_bytes)
+{
+    void* ptr{};
+    GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(hipMalloc(&ptr, num_bytes), num_bytes);
+    return ptr;
+}
+
+
+void HipAsyncAllocator::deallocate(void* ptr)
+{
+    GKO_EXIT_ON_HIP_ERROR(hipFree(ptr));
+}
+
+
+#endif
+
+
+bool HipAsyncAllocator::check_environment(int device_id,
+                                          hipStream_t stream) const
+{
+    return stream == stream_;
+}
+
+
+HipUnifiedAllocator::HipUnifiedAllocator(int device_id)
+    : HipUnifiedAllocator{device_id, hipMemAttachGlobal}
+{}
+
+
+HipUnifiedAllocator::HipUnifiedAllocator(int device_id, unsigned int flags)
+    : device_id_{device_id}, flags_{flags}
+{}
+
+
+void* HipUnifiedAllocator::allocate(size_type num_bytes)
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::hip_scoped_device_id_guard g(device_id_);
+    void* ptr{};
+    GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(
+        hipMallocManaged(&ptr, num_bytes, flags_), num_bytes);
+    return ptr;
+}
+
+
+void HipUnifiedAllocator::deallocate(void* ptr)
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::hip_scoped_device_id_guard g(device_id_);
+    GKO_EXIT_ON_HIP_ERROR(hipFree(ptr));
+}
+
+
+bool HipUnifiedAllocator::check_environment(int device_id,
+                                            hipStream_t stream) const
+{
+    return device_id == device_id_;
+}
+
+
+HipHostAllocator::HipHostAllocator(int device_id) : device_id_{device_id} {}
+
+
+void* HipHostAllocator::allocate(size_type num_bytes)
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::hip_scoped_device_id_guard g(device_id_);
+    void* ptr{};
+    GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(hipHostMalloc(&ptr, num_bytes),
+                                        num_bytes);
+    return ptr;
+}
+
+
+void HipHostAllocator::deallocate(void* ptr)
+{
+    // we need to set the device ID in case this gets used in a host executor
+    detail::hip_scoped_device_id_guard g(device_id_);
+    GKO_EXIT_ON_HIP_ERROR(hipFreeHost(ptr));
+}
+
+
+bool HipHostAllocator::check_environment(int device_id,
+                                         hipStream_t stream) const
+{
+    return device_id == device_id_;
+}
+
+
 }  // namespace gko
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 4545b216f86..f033873e392 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -1717,10 +1717,13 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
      * @param alloc_mode  the allocation mode that the executor should operate
      *                    on. See @allocation_mode for more details
      */
-    [[deprecated("")]] static std::shared_ptr<HipExecutor> create(
-        int device_id, std::shared_ptr<Executor> master, bool device_reset,
-        allocation_mode alloc_mode = default_hip_alloc_mode,
-        GKO_HIP_STREAM_STRUCT* stream = nullptr);
+    [[deprecated(
+        "device_reset is deprecated entirely, call hipDeviceReset directly. "
+        "alloc_mode was replaced by the Allocator type "
+        "hierarchy.")]] static std::shared_ptr<HipExecutor>
+    create(int device_id, std::shared_ptr<Executor> master, bool device_reset,
+           allocation_mode alloc_mode = default_hip_alloc_mode,
+           GKO_HIP_STREAM_STRUCT* stream = nullptr);
 
     static std::shared_ptr<HipExecutor> create(
         int device_id, std::shared_ptr<Executor> master,
@@ -1914,28 +1917,6 @@ class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
         std::string device_type = "all",
         dpcpp_queue_property property = dpcpp_queue_property::in_order);
 
-    /**
-     * Creates a new DpcppExecutor from an existing SYCL queue.
-     *
-     * @param queue  the DPCPP device id of this device
-     * @param master  an executor on the host that is used to invoke the device
-     *                kernels
-     */
-    static std::shared_ptr<DpcppExecutor> create(
-        sycl::queue* queue, std::shared_ptr<Executor> master);
-
-    /**
-     * Creates a new DpcppExecutor from an existing SYCL queue.
-     *
-     * @param queue  the DPCPP device id of this device
-     * @param master  an executor on the host that is used to invoke the device
-     *                kernels
-     * @param alloc  the allocator used for memory allocation
-     */
-    static std::shared_ptr<DpcppExecutor> create(
-        sycl::queue* queue, std::shared_ptr<Executor> master,
-        std::shared_ptr<DpcppAllocatorBase> alloc);
-
     std::shared_ptr<Executor> get_master() noexcept override;
 
     std::shared_ptr<const Executor> get_master() const noexcept override;
diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp
index 872a25a9a33..1086c9aacb4 100644
--- a/include/ginkgo/core/base/memory.hpp
+++ b/include/ginkgo/core/base/memory.hpp
@@ -49,9 +49,9 @@ class Allocator {
 public:
     virtual ~Allocator() = default;
 
-    virtual void* allocate(size_type num_bytes) const = 0;
+    virtual void* allocate(size_type num_bytes) = 0;
 
-    virtual void deallocate(void* ptr) const = 0;
+    virtual void deallocate(void* ptr) = 0;
 };
 
 
@@ -65,34 +65,49 @@ class CpuAllocatorBase : public Allocator {};
 /**
  * Implement this interface to provide an allocator for CudaExecutor.
  */
-class CudaAllocatorBase : public Allocator {};
+class CudaAllocatorBase : public Allocator {
+    friend class CudaExecutor;
 
-
-/**
- * Implement this interface to provide an allocator for HipExecutor.
- */
-class HipAllocatorBase : public Allocator {};
+protected:
+    /**
+     * Checks if the allocator can be used safely with the provided device ID
+     * and stream. The check is necessary to ensure safe usage of stream-ordered
+     * allocators and unified shared memory allocators.
+     *
+     * @param device_id the device ID the allocator will be used in.
+     * @param stream the stream the allocator will be used with.
+     * @return true if and only if the allocator can be used by CudaExecutor in
+     *         the given environment.
+     */
+    virtual bool check_environment(int device_id, CUstream_st* stream) const
+    {
+        return true;
+    }
+};
 
 
 /**
- * Implement this interface to provide an allocator for DpcppExecutor.
+ * Implement this interface to provide an allocator for HipExecutor.
  */
-class DpcppAllocatorBase : public Allocator {
-public:
-    DpcppAllocatorBase(sycl::queue* queue);
-
-    void* allocate(size_type num_bytes) const final;
-
-    void deallocate(void* ptr) const final;
+class HipAllocatorBase : public Allocator {
+    friend class HipExecutor;
 
 protected:
-    virtual void* allocate_impl(sycl::queue* queue,
-                                size_type num_bytes) const = 0;
-
-    virtual void deallocate_impl(sycl::queue* queue, void* ptr) const = 0;
-
-private:
-    sycl::queue* queue_;
+    /**
+     * Checks if the allocator can be used safely with the provided device ID
+     * and stream. The check is necessary to ensure safe usage of stream-ordered
+     * allocators and unified shared memory allocators.
+     *
+     * @param device_id the device ID the allocator will be used in.
+     * @param stream the stream the allocator will be used with.
+     * @return true if and only if the allocator can be used by CudaExecutor in
+     *         the given environment.
+     */
+    virtual bool check_environment(int device_id,
+                                   GKO_HIP_STREAM_STRUCT* stream) const
+    {
+        return true;
+    }
 };
 
 
@@ -101,9 +116,9 @@ class DpcppAllocatorBase : public Allocator {
  */
 class CpuAllocator : public CpuAllocatorBase {
 public:
-    void* allocate(size_type num_bytes) const override;
+    void* allocate(size_type num_bytes) override;
 
-    void deallocate(void* ptr) const override;
+    void deallocate(void* ptr) override;
 };
 
 
@@ -112,9 +127,9 @@ class CpuAllocator : public CpuAllocatorBase {
  */
 class CudaAllocator : public CudaAllocatorBase {
 public:
-    void* allocate(size_type num_bytes) const override;
+    void* allocate(size_type num_bytes) override;
 
-    void deallocate(void* ptr) const override;
+    void deallocate(void* ptr) override;
 };
 
 
@@ -123,12 +138,14 @@ class CudaAllocator : public CudaAllocatorBase {
  */
 class CudaAsyncAllocator : public CudaAllocatorBase {
 public:
-    void* allocate(size_type num_bytes) const override;
+    void* allocate(size_type num_bytes) override;
 
-    void deallocate(void* ptr) const override;
+    void deallocate(void* ptr) override;
 
     CudaAsyncAllocator(CUstream_st* stream);
 
+    bool check_environment(int device_id, CUstream_st* stream) const override;
+
 private:
     CUstream_st* stream_;
 };
@@ -139,14 +156,17 @@ class CudaAsyncAllocator : public CudaAllocatorBase {
  */
 class CudaUnifiedAllocator : public CudaAllocatorBase, public CpuAllocatorBase {
 public:
-    void* allocate(size_type num_bytes) const override;
+    void* allocate(size_type num_bytes) override;
 
-    void deallocate(void* ptr) const override;
+    void deallocate(void* ptr) override;
 
     CudaUnifiedAllocator(int device_id);
 
     CudaUnifiedAllocator(int device_id, unsigned int flags);
 
+protected:
+    bool check_environment(int device_id, CUstream_st* stream) const override;
+
 private:
     int device_id_;
     unsigned int flags_;
@@ -154,16 +174,19 @@ class CudaUnifiedAllocator : public CudaAllocatorBase, public CpuAllocatorBase {
 
 
 /*
- * Allocator using cudaMallocHost.
+ * Allocator using cudaHostMalloc.
  */
 class CudaHostAllocator : public CudaAllocatorBase, public CpuAllocatorBase {
 public:
-    void* allocate(size_type num_bytes) const override;
+    void* allocate(size_type num_bytes) override;
 
-    void deallocate(void* ptr) const override;
+    void deallocate(void* ptr) override;
 
     CudaHostAllocator(int device_id);
 
+protected:
+    bool check_environment(int device_id, CUstream_st* stream) const override;
+
 private:
     int device_id_;
 };
@@ -174,38 +197,72 @@ class CudaHostAllocator : public CudaAllocatorBase, public CpuAllocatorBase {
  */
 class HipAllocator : public HipAllocatorBase {
 public:
-    void* allocate(size_type num_bytes) const override;
+    void* allocate(size_type num_bytes) override;
+
+    void deallocate(void* ptr) override;
+};
+
+
+/*
+ * Allocator using hipMallocAsync.
+ */
+class HipAsyncAllocator : public HipAllocatorBase {
+public:
+    void* allocate(size_type num_bytes) override;
 
-    void deallocate(void* ptr) const override;
+    void deallocate(void* ptr) override;
+
+    HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream);
+
+protected:
+    bool check_environment(int device_id,
+                           GKO_HIP_STREAM_STRUCT* stream) const override;
+
+private:
+    GKO_HIP_STREAM_STRUCT* stream_;
 };
 
 
 /*
- * Allocator using sycl::malloc_device.
+ * Allocator using hipMallocManaged
  */
-class DpcppAllocator : public DpcppAllocatorBase {
+class HipUnifiedAllocator : public HipAllocatorBase, public CpuAllocatorBase {
 public:
-    using DpcppAllocatorBase::DpcppAllocatorBase;
+    void* allocate(size_type num_bytes) override;
+
+    void deallocate(void* ptr) override;
+
+    HipUnifiedAllocator(int device_id);
+
+    HipUnifiedAllocator(int device_id, unsigned int flags);
 
 protected:
-    void* allocate_impl(sycl::queue* queue, size_type num_bytes) const override;
+    bool check_environment(int device_id,
+                           GKO_HIP_STREAM_STRUCT* stream) const override;
 
-    void deallocate_impl(sycl::queue* queue, void* ptr) const override;
+private:
+    int device_id_;
+    unsigned int flags_;
 };
 
 
 /*
- * Allocator using sycl::malloc_shared.
+ * Allocator using hipHostAlloc.
  */
-class DpcppUnifiedAllocator : public DpcppAllocatorBase,
-                              public CpuAllocatorBase {
+class HipHostAllocator : public HipAllocatorBase, public CpuAllocatorBase {
 public:
-    using DpcppAllocatorBase::DpcppAllocatorBase;
+    void* allocate(size_type num_bytes) override;
+
+    void deallocate(void* ptr) override;
+
+    HipHostAllocator(int device_id);
 
 protected:
-    void* allocate_impl(sycl::queue* queue, size_type num_bytes) const override;
+    bool check_environment(int device_id,
+                           GKO_HIP_STREAM_STRUCT* stream) const override;
 
-    void deallocate_impl(sycl::queue* queue, void* ptr) const override;
+private:
+    int device_id_;
 };