From 1448e12d7093450825f1688c2da48cf971a0bb85 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 30 Mar 2023 12:20:20 +0200 Subject: [PATCH 01/14] add allocator support to all executors --- benchmark/utils/general.hpp | 4 +- core/CMakeLists.txt | 1 + core/base/memory.cpp | 59 ++++ core/device_hooks/cuda_hooks.cpp | 44 +++ core/device_hooks/dpcpp_hooks.cpp | 18 ++ core/device_hooks/hip_hooks.cpp | 19 ++ core/test/base/executor.cpp | 79 +----- cuda/CMakeLists.txt | 4 + cuda/base/device.cpp | 65 +++++ cuda/base/executor.cpp | 173 ++---------- cuda/base/memory.cpp | 168 +++++++++++ cuda/base/nvtx.cpp | 96 +++++++ .../stream.cpp} | 53 ++-- cuda/test/base/CMakeLists.txt | 2 +- cuda/test/base/cuda_executor.cu | 15 +- cuda/test/base/memory.cpp | 126 +++++++++ cuda/test/utils.hpp | 4 +- devices/cuda/executor.cpp | 27 -- devices/hip/executor.cpp | 34 --- devices/omp/executor.cpp | 7 +- dpcpp/base/executor.dp.cpp | 33 +++ .../base/memory.dp.cpp | 47 ++-- dpcpp/test/base/CMakeLists.txt | 1 + dpcpp/test/base/memory.dp.cpp | 98 +++++++ .../adaptiveprecision-blockjacobi.cpp | 7 +- examples/cb-gmres/cb-gmres.cpp | 7 +- examples/custom-logger/custom-logger.cpp | 7 +- .../custom-matrix-format.cpp | 7 +- .../custom-stopping-criterion.cpp | 7 +- .../ilu-preconditioned-solver.cpp | 7 +- .../inverse-iteration/inverse-iteration.cpp | 7 +- .../ir-ilu-preconditioned-solver.cpp | 7 +- .../iterative-refinement.cpp | 7 +- .../minimal-cuda-solver.cpp | 2 +- .../mixed-multigrid-preconditioned-solver.cpp | 7 +- .../mixed-multigrid-solver.cpp | 7 +- .../mixed-precision-ir/mixed-precision-ir.cpp | 7 +- examples/mixed-spmv/mixed-spmv.cpp | 7 +- ...igrid-preconditioned-solver-customized.cpp | 7 +- .../multigrid-preconditioned-solver.cpp | 7 +- .../nine-pt-stencil-solver.cpp | 7 +- examples/papi-logging/papi-logging.cpp | 7 +- .../performance-debugging.cpp | 7 +- examples/poisson-solver/poisson-solver.cpp | 7 +- .../preconditioned-solver.cpp | 7 +- .../simple-solver-logging.cpp | 7 +- examples/simple-solver/simple-solver.cpp | 7 +- .../three-pt-stencil-solver.cpp | 7 +- hip/CMakeLists.txt | 4 + hip/base/device.hip.cpp | 67 +++++ hip/base/executor.hip.cpp | 137 +-------- hip/base/memory.hip.cpp | 97 +++++++ hip/base/roctx.hip.cpp | 70 +++++ hip/base/stream.hip.cpp | 78 +++++ hip/test/base/CMakeLists.txt | 1 - hip/test/base/hip_executor.hip.cpp | 14 +- hip/test/utils.hip.hpp | 4 +- include/ginkgo/core/base/executor.hpp | 266 +++++------------- include/ginkgo/core/base/fwd_defs.hpp | 90 ++++++ include/ginkgo/core/base/memory.hpp | 211 ++++++++++++++ include/ginkgo/core/base/stream.hpp | 124 ++++++++ include/ginkgo/ginkgo.hpp | 3 + test/utils/executor.hpp | 19 +- test/utils/mpi/executor.hpp | 5 +- 64 files changed, 1755 insertions(+), 775 deletions(-) create mode 100644 core/base/memory.cpp create mode 100644 cuda/base/device.cpp create mode 100644 cuda/base/memory.cpp create mode 100644 cuda/base/nvtx.cpp rename cuda/{test/base/cuda_executor_reset.cpp => base/stream.cpp} (62%) create mode 100644 cuda/test/base/memory.cpp rename hip/test/base/hip_executor_reset.cpp => dpcpp/base/memory.dp.cpp (63%) create mode 100644 dpcpp/test/base/memory.dp.cpp create mode 100644 hip/base/device.hip.cpp create mode 100644 hip/base/memory.hip.cpp create mode 100644 hip/base/roctx.hip.cpp create mode 100644 hip/base/stream.hip.cpp create mode 100644 include/ginkgo/core/base/fwd_defs.hpp create mode 100644 include/ginkgo/core/base/memory.hpp create mode 100644 include/ginkgo/core/base/stream.hpp diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index 92c3e5c9b13..35077f66d4b 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -337,12 +337,12 @@ const std::map(bool)>> {"cuda", [](bool) { return gko::CudaExecutor::create(FLAGS_device_id, - gko::OmpExecutor::create(), true); + gko::OmpExecutor::create()); }}, {"hip", [](bool) { return gko::HipExecutor::create(FLAGS_device_id, - gko::OmpExecutor::create(), true); + gko::OmpExecutor::create()); }}, {"dpcpp", [](bool use_gpu_timer) { auto property = dpcpp_queue_property::in_order; diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 2f9643115c9..49cf89b66d6 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -10,6 +10,7 @@ target_sources(ginkgo base/device_matrix_data.cpp base/executor.cpp base/index_set.cpp + base/memory.cpp base/mpi.cpp base/mtx_io.cpp base/perturbation.cpp diff --git a/core/base/memory.cpp b/core/base/memory.cpp new file mode 100644 index 00000000000..88d97bcc765 --- /dev/null +++ b/core/base/memory.cpp @@ -0,0 +1,59 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +namespace gko { + + +void* CpuAllocator::allocate(size_type num_bytes) const +{ + auto ptr = ::operator new (num_bytes, std::nothrow_t{}); + GKO_ENSURE_ALLOCATED(ptr, "cpu", num_bytes); + return ptr; +} + + +void CpuAllocator::deallocate(void* ptr) const +{ + ::operator delete (ptr, std::nothrow_t{}); +} + + +} // namespace gko \ No newline at end of file diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp index dd4c3f19f7c..cdecf735a9d 100644 --- a/core/device_hooks/cuda_hooks.cpp +++ b/core/device_hooks/cuda_hooks.cpp @@ -35,6 +35,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include +#include #include #include #include @@ -52,6 +54,45 @@ version version_info::get_cuda_version() noexcept } +void* CudaAllocator::allocate(size_type num_bytes) const GKO_NOT_COMPILED(cuda); + + +void CudaAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda); + + +CudaAsyncAllocator::CudaAsyncAllocator(CUstream_st* stream) + GKO_NOT_COMPILED(cuda); + + +void* CudaAsyncAllocator::allocate(size_type num_bytes) const + GKO_NOT_COMPILED(cuda); + + +void CudaAsyncAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda); + + +CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id, unsigned int flags) + GKO_NOT_COMPILED(cuda); + + +void* CudaUnifiedAllocator::allocate(size_type num_bytes) const + GKO_NOT_COMPILED(cuda); + + +void CudaUnifiedAllocator::deallocate(void* dev_ptr) const + GKO_NOT_COMPILED(cuda); + + +CudaHostAllocator::CudaHostAllocator(int device_id) GKO_NOT_COMPILED(cuda); + + +void* CudaHostAllocator::allocate(size_type num_bytes) const + GKO_NOT_COMPILED(cuda); + + +void CudaHostAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda); + + std::shared_ptr CudaExecutor::create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode, CUstream_st* stream) @@ -154,6 +195,9 @@ scoped_device_id_guard::scoped_device_id_guard(const CudaExecutor* exec, GKO_NOT_COMPILED(cuda); +cuda_stream::cuda_stream() GKO_NOT_COMPILED(cuda); + + cuda_stream::cuda_stream(int device_id) GKO_NOT_COMPILED(cuda); diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp index a08f1f608fb..0ee3e6f289f 100644 --- a/core/device_hooks/dpcpp_hooks.cpp +++ b/core/device_hooks/dpcpp_hooks.cpp @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include @@ -52,6 +53,23 @@ version version_info::get_dpcpp_version() noexcept } +void* DpcppAllocator::allocate_impl(sycl::queue* queue, size_type size) const + GKO_NOT_COMPILED(dpcpp); + + +void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const + GKO_NOT_COMPILED(dpcpp); + + +void* DpcppUnifiedAllocator::allocate_impl(sycl::queue* queue, + size_type size) const + GKO_NOT_COMPILED(dpcpp); + + +void DpcppUnifiedAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const + GKO_NOT_COMPILED(dpcpp); + + std::shared_ptr DpcppExecutor::create( int device_id, std::shared_ptr master, std::string device_type, dpcpp_queue_property property) diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp index 50637f7b3f0..739dac39f08 100644 --- a/core/device_hooks/hip_hooks.cpp +++ b/core/device_hooks/hip_hooks.cpp @@ -53,6 +53,22 @@ version version_info::get_hip_version() noexcept } +void* HipAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip); + + +void HipAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); + + +HipAsyncAllocator::HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream) + GKO_NOT_COMPILED(hip); + + +void* HipAsyncAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip); + + +void HipAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); + + std::shared_ptr HipExecutor::create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode, GKO_HIP_STREAM_STRUCT* stream) @@ -155,6 +171,9 @@ scoped_device_id_guard::scoped_device_id_guard(const HipExecutor* exec, GKO_NOT_COMPILED(hip); +hip_stream::hip_stream() GKO_NOT_COMPILED(hip); + + hip_stream::hip_stream(int device_id) GKO_NOT_COMPILED(hip); diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp index 0d64dfcf3cf..71064cf01d2 100644 --- a/core/test/base/executor.cpp +++ b/core/test/base/executor.cpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include "ginkgo/core/base/memory.hpp" #if defined(__unix__) || defined(__APPLE__) @@ -263,35 +264,6 @@ TEST(CudaExecutor, KnowsItsDeviceId) } -TEST(CudaExecutor, CanGetDeviceResetBoolean) -{ - auto omp = gko::OmpExecutor::create(); - auto cuda = gko::CudaExecutor::create(0, omp); - - ASSERT_EQ(false, cuda->get_device_reset()); -} - - -TEST(CudaExecutor, CanSetDefaultDeviceResetBoolean) -{ - auto omp = gko::OmpExecutor::create(); - auto cuda = gko::CudaExecutor::create(0, omp, true); - - ASSERT_EQ(true, cuda->get_device_reset()); -} - - -TEST(CudaExecutor, CanSetDeviceResetBoolean) -{ - auto omp = gko::OmpExecutor::create(); - auto cuda = gko::CudaExecutor::create(0, omp); - - cuda->set_device_reset(true); - - ASSERT_EQ(true, cuda->get_device_reset()); -} - - TEST(HipExecutor, KnowsItsMaster) { auto omp = gko::OmpExecutor::create(); @@ -310,35 +282,6 @@ TEST(HipExecutor, KnowsItsDeviceId) } -TEST(HipExecutor, CanGetDeviceResetBoolean) -{ - auto omp = gko::OmpExecutor::create(); - auto hip = gko::HipExecutor::create(0, omp); - - ASSERT_EQ(false, hip->get_device_reset()); -} - - -TEST(HipExecutor, CanSetDefaultDeviceResetBoolean) -{ - auto omp = gko::OmpExecutor::create(); - auto hip = gko::HipExecutor::create(0, omp, true); - - ASSERT_EQ(true, hip->get_device_reset()); -} - - -TEST(HipExecutor, CanSetDeviceResetBoolean) -{ - auto omp = gko::OmpExecutor::create(); - auto hip = gko::HipExecutor::create(0, omp); - - hip->set_device_reset(true); - - ASSERT_EQ(true, hip->get_device_reset()); -} - - TEST(DpcppExecutor, KnowsItsMaster) { auto omp = gko::OmpExecutor::create(); @@ -442,20 +385,11 @@ TEST(Executor, CanVerifyMemory) } -template -struct mock_free : T { - /** - * @internal Due to a bug with gcc 5.3, the constructor needs to be called - * with `()` operator instead of `{}`. - */ - template - mock_free(Params&&... params) : T(std::forward(params)...) - {} - - void raw_free(void* ptr) const noexcept override +struct MockAllocator : gko::CpuAllocator { + void deallocate(void* ptr) const noexcept override { called_free = true; - T::raw_free(ptr); + CpuAllocator::deallocate(ptr); } mutable bool called_free{false}; @@ -464,12 +398,13 @@ struct mock_free : T { TEST(ExecutorDeleter, DeletesObject) { - auto ref = std::make_shared>(); + auto alloc = std::make_shared(); + auto ref = gko::ReferenceExecutor::create(alloc); auto x = ref->alloc(5); gko::executor_deleter{ref}(x); - ASSERT_TRUE(ref->called_free); + ASSERT_TRUE(alloc->called_free); } diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index bbe7a953dbd..aecf4e1c2f2 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -1,11 +1,15 @@ add_library(ginkgo_cuda $ "") target_sources(ginkgo_cuda PRIVATE + base/device.cpp base/device_matrix_data_kernels.cu base/exception.cpp base/executor.cpp base/index_set_kernels.cpp + base/memory.cpp + base/nvtx.cpp base/scoped_device_id.cpp + base/stream.cpp base/timer.cpp base/version.cpp components/prefix_sum_kernels.cu diff --git a/cuda/base/device.cpp b/cuda/base/device.cpp new file mode 100644 index 00000000000..31ab5bcde63 --- /dev/null +++ b/cuda/base/device.cpp @@ -0,0 +1,65 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "cuda/base/device.hpp" + + +#include + + +#include + + +#include "cuda/base/scoped_device_id.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { + + +void reset_device(int device_id) +{ + gko::detail::cuda_scoped_device_id_guard guard{device_id}; + cudaDeviceReset(); +} + + +void destroy_event(CUevent_st* event) +{ + GKO_ASSERT_NO_CUDA_ERRORS(cudaEventDestroy(event)); +} + + +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index e474d9c9f49..f6e838dd2dd 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -39,18 +39,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#ifdef GKO_LEGACY_NVTX -#include -#else -#include -#endif #include #include #include #include -#include +#include #include "cuda/base/config.hpp" @@ -65,25 +60,38 @@ namespace gko { #include "common/cuda_hip/base/executor.hpp.inc" +std::unique_ptr allocator_from_mode(int device_id, + allocation_mode mode) +{ + switch (mode) { + case allocation_mode::device: + return std::make_unique(); + case allocation_mode::unified_global: + return std::make_unique(device_id); + case allocation_mode::unified_host: + return std::make_unique(device_id); + default: + GKO_NOT_SUPPORTED(mode); + } +} + + std::shared_ptr CudaExecutor::create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode, cudaStream_t stream) { return std::shared_ptr( - new CudaExecutor(device_id, std::move(master), device_reset, alloc_mode, - stream), - [device_id](CudaExecutor* exec) { - auto device_reset = exec->get_device_reset(); - std::lock_guard guard( - nvidia_device::get_mutex(device_id)); - delete exec; - auto& num_execs = nvidia_device::get_num_execs(device_id); - num_execs--; - if (!num_execs && device_reset) { - detail::cuda_scoped_device_id_guard g(device_id); - cudaDeviceReset(); - } - }); + new CudaExecutor(device_id, std::move(master), + allocator_from_mode(device_id, alloc_mode), stream)); +} + + +std::shared_ptr CudaExecutor::create( + int device_id, std::shared_ptr master, + std::shared_ptr alloc, cudaStream_t stream) +{ + return std::shared_ptr(new CudaExecutor( + device_id, std::move(master), std::move(alloc), stream)); } @@ -123,41 +131,14 @@ void OmpExecutor::raw_copy_to(const CudaExecutor* dest, size_type num_bytes, void CudaExecutor::raw_free(void* ptr) const noexcept { detail::cuda_scoped_device_id_guard g(this->get_device_id()); - auto error_code = cudaFree(ptr); - if (error_code != cudaSuccess) { -#if GKO_VERBOSE_LEVEL >= 1 - // Unfortunately, if memory free fails, there's not much we can do - std::cerr << "Unrecoverable CUDA error on device " - << this->get_device_id() << " in " << __func__ << ": " - << cudaGetErrorName(error_code) << ": " - << cudaGetErrorString(error_code) << std::endl - << "Exiting program" << std::endl; -#endif // GKO_VERBOSE_LEVEL >= 1 - std::exit(error_code); - } + alloc_->deallocate(ptr); } void* CudaExecutor::raw_alloc(size_type num_bytes) const { - void* dev_ptr = nullptr; detail::cuda_scoped_device_id_guard g(this->get_device_id()); - int error_code = 0; - if (this->alloc_mode_ == allocation_mode::unified_host) { - error_code = cudaMallocManaged(&dev_ptr, num_bytes, cudaMemAttachHost); - } else if (this->alloc_mode_ == allocation_mode::unified_global) { - error_code = - cudaMallocManaged(&dev_ptr, num_bytes, cudaMemAttachGlobal); - } else if (this->alloc_mode_ == allocation_mode::device) { - error_code = cudaMalloc(&dev_ptr, num_bytes); - } else { - GKO_NOT_SUPPORTED(this->alloc_mode_); - } - if (error_code != cudaErrorMemoryAllocation) { - GKO_ASSERT_NO_CUDA_ERRORS(error_code); - } - GKO_ENSURE_ALLOCATED(dev_ptr, "cuda", num_bytes); - return dev_ptr; + return alloc_->allocate(num_bytes); } @@ -298,98 +279,4 @@ void CudaExecutor::init_handles() } -cuda_stream::cuda_stream(int device_id) : stream_{}, device_id_(device_id) -{ - detail::cuda_scoped_device_id_guard g(device_id_); - GKO_ASSERT_NO_CUDA_ERRORS(cudaStreamCreate(&stream_)); -} - - -cuda_stream::~cuda_stream() -{ - if (stream_) { - detail::cuda_scoped_device_id_guard g(device_id_); - cudaStreamDestroy(stream_); - } -} - - -cuda_stream::cuda_stream(cuda_stream&& other) - : stream_{std::exchange(other.stream_, nullptr)}, - device_id_(std::exchange(other.device_id_, -1)) -{} - - -CUstream_st* cuda_stream::get() const { return stream_; } - - -namespace log { - - -// "GKO" in ASCII to avoid collision with other application's categories -constexpr static uint32 category_magic_offset = 0x676B6FU; - - -void init_nvtx() -{ -#define NAMED_CATEGORY(_name) \ - nvtxNameCategory(static_cast(profile_event_category::_name) + \ - category_magic_offset, \ - "gko::" #_name) - NAMED_CATEGORY(memory); - NAMED_CATEGORY(operation); - NAMED_CATEGORY(object); - NAMED_CATEGORY(linop); - NAMED_CATEGORY(factory); - NAMED_CATEGORY(solver); - NAMED_CATEGORY(criterion); - NAMED_CATEGORY(user); - NAMED_CATEGORY(internal); -#undef NAMED_CATEGORY -} - - -std::function begin_nvtx_fn( - uint32_t color_argb) -{ - return [color_argb](const char* name, profile_event_category category) { - nvtxEventAttributes_t attr{}; - attr.version = NVTX_VERSION; - attr.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; - attr.category = static_cast(category) + category_magic_offset; - attr.colorType = NVTX_COLOR_ARGB; - attr.color = color_argb; - attr.payloadType = NVTX_PAYLOAD_UNKNOWN; - attr.messageType = NVTX_MESSAGE_TYPE_ASCII; - attr.message.ascii = name; - nvtxRangePushEx(&attr); - }; -} - - -void end_nvtx(const char* name, profile_event_category) { nvtxRangePop(); } - - -} // namespace log - - -namespace kernels { -namespace cuda { - - -void reset_device(int device_id) -{ - gko::detail::cuda_scoped_device_id_guard guard{device_id}; - cudaDeviceReset(); -} - - -void destroy_event(CUevent_st* event) -{ - GKO_ASSERT_NO_CUDA_ERRORS(cudaEventDestroy(event)); -} - - -} // namespace cuda -} // namespace kernels } // namespace gko diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp new file mode 100644 index 00000000000..11dee81ad42 --- /dev/null +++ b/cuda/base/memory.cpp @@ -0,0 +1,168 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include "cuda/base/scoped_device_id.hpp" + + +namespace gko { + + +#define GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(_operation, _size) \ + { \ + auto error_code = _operation; \ + if (error_code == cudaErrorMemoryAllocation) { \ + throw AllocationError(__FILE__, __LINE__, "cuda", _size); \ + } else { \ + GKO_ASSERT_NO_CUDA_ERRORS(error_code); \ + } \ + } + + +#if GKO_VERBOSE_LEVEL >= 1 +#define GKO_EXIT_ON_CUDA_ERROR(_operation) \ + { \ + const auto error_code = _operation; \ + if (error_code != cudaSuccess) { \ + int device_id{-1}; \ + cudaGetDevice(&device_id); \ + std::cerr << "Unrecoverable CUDA error on device " << device_id \ + << " in " << __func__ << ":" << __LINE__ << ": " \ + << cudaGetErrorName(error_code) << ": " \ + << cudaGetErrorString(error_code) << std::endl \ + << "Exiting program" << std::endl; \ + std::exit(error_code); \ + } \ + } +#else +#define GKO_EXIT_ON_CUDA_ERROR(_operation) \ + { \ + const auto error_code = _operation; \ + if (error_code != cudaSuccess) { \ + std::exit(error_code); \ + } \ + } +#endif + + +void* CudaAllocator::allocate(size_type num_bytes) const +{ + void* ptr{}; + GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMalloc(&ptr, num_bytes), + num_bytes); + return ptr; +} + + +void CudaAllocator::deallocate(void* ptr) const +{ + GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr)); +} + + +CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} {} + + +void* CudaAsyncAllocator::allocate(size_type num_bytes) const +{ + void* ptr{}; + GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS( + cudaMallocAsync(&ptr, num_bytes, stream_), num_bytes); + return ptr; +} + +void CudaAsyncAllocator::deallocate(void* ptr) const +{ + GKO_EXIT_ON_CUDA_ERROR(cudaFreeAsync(ptr, stream_)); +} + + +CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id) + : CudaUnifiedAllocator{device_id, cudaMemAttachGlobal} +{} + + +CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id, unsigned int flags) + : device_id_{device_id}, flags_{flags} +{} + + +void* CudaUnifiedAllocator::allocate(size_type num_bytes) const +{ + // we need to set the device ID in case this gets used in a host executor + detail::cuda_scoped_device_id_guard g(device_id_); + void* ptr{}; + GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS( + cudaMallocManaged(&ptr, num_bytes, flags_), num_bytes); + return ptr; +} + + +void CudaUnifiedAllocator::deallocate(void* ptr) const +{ + // we need to set the device ID in case this gets used in a host executor + detail::cuda_scoped_device_id_guard g(device_id_); + GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr)); +} + + +CudaHostAllocator::CudaHostAllocator(int device_id) : device_id_{device_id} {} + + +void* CudaHostAllocator::allocate(size_type num_bytes) const +{ + // we need to set the device ID in case this gets used in a host executor + detail::cuda_scoped_device_id_guard g(device_id_); + void* ptr{}; + GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMallocHost(&ptr, num_bytes), + num_bytes); + return ptr; +} + + +void CudaHostAllocator::deallocate(void* ptr) const +{ + // we need to set the device ID in case this gets used in a host executor + detail::cuda_scoped_device_id_guard g(device_id_); + GKO_EXIT_ON_CUDA_ERROR(cudaFreeHost(ptr)); +} + + +} // namespace gko \ No newline at end of file diff --git a/cuda/base/nvtx.cpp b/cuda/base/nvtx.cpp new file mode 100644 index 00000000000..e313c110ea2 --- /dev/null +++ b/cuda/base/nvtx.cpp @@ -0,0 +1,96 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#ifdef GKO_LEGACY_NVTX +#include +#else +#include +#endif + + +#include + + +namespace gko { +namespace log { + + +// "GKO" in ASCII to avoid collision with other application's categories +constexpr static uint32 category_magic_offset = 0x676B6FU; + + +void init_nvtx() +{ +#define NAMED_CATEGORY(_name) \ + nvtxNameCategory(static_cast(profile_event_category::_name) + \ + category_magic_offset, \ + "gko::" #_name) + NAMED_CATEGORY(memory); + NAMED_CATEGORY(operation); + NAMED_CATEGORY(object); + NAMED_CATEGORY(linop); + NAMED_CATEGORY(factory); + NAMED_CATEGORY(solver); + NAMED_CATEGORY(criterion); + NAMED_CATEGORY(user); + NAMED_CATEGORY(internal); +#undef NAMED_CATEGORY +} + + +std::function begin_nvtx_fn( + uint32_t color_argb) +{ + return [color_argb](const char* name, profile_event_category category) { + nvtxEventAttributes_t attr{}; + attr.version = NVTX_VERSION; + attr.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + attr.category = static_cast(category) + category_magic_offset; + attr.colorType = NVTX_COLOR_ARGB; + attr.color = color_argb; + attr.payloadType = NVTX_PAYLOAD_UNKNOWN; + attr.messageType = NVTX_MESSAGE_TYPE_ASCII; + attr.message.ascii = name; + nvtxRangePushEx(&attr); + }; +} + + +void end_nvtx(const char* name, profile_event_category) { nvtxRangePop(); } + + +} // namespace log +} // namespace gko diff --git a/cuda/test/base/cuda_executor_reset.cpp b/cuda/base/stream.cpp similarity index 62% rename from cuda/test/base/cuda_executor_reset.cpp rename to cuda/base/stream.cpp index c8159b9c4d7..8c6aa92c28b 100644 --- a/cuda/test/base/cuda_executor_reset.cpp +++ b/cuda/base/stream.cpp @@ -30,58 +30,45 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include +#include -#include +#include +#include -#include +#include "cuda/base/scoped_device_id.hpp" -namespace { +namespace gko { -#define GTEST_ASSERT_NO_EXIT(statement) \ - ASSERT_EXIT({ {statement} exit(0); }, ::testing::ExitedWithCode(0), "") +cuda_stream::cuda_stream() : stream_{}, device_id_{-1} {} -TEST(DeviceReset, HipCuda) +cuda_stream::cuda_stream(int device_id) : stream_{}, device_id_(device_id) { - GTEST_ASSERT_NO_EXIT({ - auto ref = gko::ReferenceExecutor::create(); - auto hip = gko::HipExecutor::create(0, ref, true); - auto cuda = gko::CudaExecutor::create(0, ref, true); - }); + detail::cuda_scoped_device_id_guard g(device_id_); + GKO_ASSERT_NO_CUDA_ERRORS(cudaStreamCreate(&stream_)); } -TEST(DeviceReset, CudaHip) +cuda_stream::~cuda_stream() { - GTEST_ASSERT_NO_EXIT({ - auto ref = gko::ReferenceExecutor::create(); - auto cuda = gko::CudaExecutor::create(0, ref, true); - auto hip = gko::HipExecutor::create(0, ref, true); - }); + if (stream_) { + detail::cuda_scoped_device_id_guard g(device_id_); + cudaStreamDestroy(stream_); + } } -void func() -{ - auto ref = gko::ReferenceExecutor::create(); - auto exec = gko::CudaExecutor::create(0, ref, true); -} +cuda_stream::cuda_stream(cuda_stream&& other) + : stream_{std::exchange(other.stream_, nullptr)}, + device_id_(std::exchange(other.device_id_, -1)) +{} -TEST(DeviceReset, CudaCuda) -{ - GTEST_ASSERT_NO_EXIT({ - std::thread t1(func); - std::thread t2(func); - t1.join(); - t2.join(); - }); -} +CUstream_st* cuda_stream::get() const { return stream_; } -} // namespace +} // namespace gko diff --git a/cuda/test/base/CMakeLists.txt b/cuda/test/base/CMakeLists.txt index 9be3caf9faa..a213e65277a 100644 --- a/cuda/test/base/CMakeLists.txt +++ b/cuda/test/base/CMakeLists.txt @@ -1,7 +1,6 @@ ginkgo_create_cuda_test(array) ginkgo_create_cuda_test(cuda_executor) ginkgo_create_test(index_set) -ginkgo_create_test(cuda_executor_reset ADDITIONAL_LIBRARIES Threads::Threads) if(GINKGO_HAVE_HWLOC) find_package(NUMA REQUIRED) ginkgo_create_cuda_test(cuda_executor_topology ADDITIONAL_LIBRARIES NUMA::NUMA) @@ -10,4 +9,5 @@ ginkgo_create_cuda_test(exception_helpers) ginkgo_create_cuda_test(kernel_launch) ginkgo_create_cuda_test(lin_op) ginkgo_create_cuda_test(math) +ginkgo_create_test(memory) ginkgo_create_cuda_test(scoped_device_id) diff --git a/cuda/test/base/cuda_executor.cu b/cuda/test/base/cuda_executor.cu index 5f489ac22f0..afb23c06186 100644 --- a/cuda/test/base/cuda_executor.cu +++ b/cuda/test/base/cuda_executor.cu @@ -42,6 +42,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include + #include "common/cuda_hip/base/executor.hpp.inc" #include "cuda/base/scoped_device_id.hpp" @@ -103,18 +105,19 @@ protected: ASSERT_GT(gko::CudaExecutor::get_num_devices(), 0); #ifdef GKO_TEST_NONDEFAULT_STREAM cuda = gko::CudaExecutor::create( - 0, omp, false, gko::default_cuda_alloc_mode, stream.get()); + 0, omp, std::make_shared(), stream.get()); cuda2 = gko::CudaExecutor::create( - gko::CudaExecutor::get_num_devices() - 1, omp, false, - gko::default_cuda_alloc_mode, other_stream.get()); + gko::CudaExecutor::get_num_devices() - 1, omp, + std::make_shared(), other_stream.get()); cuda3 = gko::CudaExecutor::create( - 0, omp, false, gko::allocation_mode::unified_global, stream.get()); + 0, omp, std::make_shared(0), + stream.get()); #else cuda = gko::CudaExecutor::create(0, omp); cuda2 = gko::CudaExecutor::create( gko::CudaExecutor::get_num_devices() - 1, omp); - cuda3 = gko::CudaExecutor::create(0, omp, false, - gko::allocation_mode::unified_global); + cuda3 = gko::CudaExecutor::create( + 0, omp, std::make_shared(0)); #endif } diff --git a/cuda/test/base/memory.cpp b/cuda/test/base/memory.cpp new file mode 100644 index 00000000000..a329817f4af --- /dev/null +++ b/cuda/test/base/memory.cpp @@ -0,0 +1,126 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include + + +#include + + +#include +#include +#include + + +#include "cuda/test/utils.hpp" + + +namespace { + + +class Memory : public CudaTestFixture { +protected: + Memory() + : host_exec_with_pinned{gko::OmpExecutor::create( + std::make_shared(0))}, + host_exec_with_unified{gko::OmpExecutor::create( + std::make_shared(0))}, + exec_with_normal{gko::CudaExecutor::create( + 0, ref, std::make_shared(), + exec->get_stream())}, + exec_with_async{gko::CudaExecutor::create( + 0, host_exec_with_pinned, + std::make_shared(exec->get_stream()), + exec->get_stream())}, + exec_with_unified{gko::CudaExecutor::create( + 0, host_exec_with_unified, + std::make_shared(0), + exec->get_stream())} + {} + + std::shared_ptr host_exec_with_pinned; + std::shared_ptr host_exec_with_unified; + std::shared_ptr exec_with_normal; + std::shared_ptr exec_with_async; + std::shared_ptr exec_with_unified; +}; + + +TEST_F(Memory, DeviceAllocationWorks) +{ + gko::array data{exec_with_normal, {1, 2}}; + + GKO_ASSERT_ARRAY_EQ(data, I({1, 2})); +} + + +TEST_F(Memory, AsyncDeviceAllocationWorks) +{ + gko::array data{exec_with_async, {1, 2}}; + + GKO_ASSERT_ARRAY_EQ(data, I({1, 2})); +} + + +TEST_F(Memory, UnifiedDeviceAllocationWorks) +{ + gko::array data{exec_with_unified, {1, 2}}; + exec->synchronize(); + + ASSERT_EQ(data.get_const_data()[0], 1); + ASSERT_EQ(data.get_const_data()[1], 2); +} + + +TEST_F(Memory, HostUnifiedAllocationWorks) +{ + gko::array data{host_exec_with_unified, {1, 2}}; + + ASSERT_EQ(data.get_const_data()[0], 1); + ASSERT_EQ(data.get_const_data()[1], 2); +} + + +TEST_F(Memory, HostPinnedAllocationWorks) +{ + gko::array data{host_exec_with_pinned, {1, 2}}; + + ASSERT_EQ(data.get_const_data()[0], 1); + ASSERT_EQ(data.get_const_data()[1], 2); +} + + +} // namespace diff --git a/cuda/test/utils.hpp b/cuda/test/utils.hpp index 814405ba0d9..e1156b91903 100644 --- a/cuda/test/utils.hpp +++ b/cuda/test/utils.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include "cuda/base/device.hpp" @@ -60,8 +61,9 @@ class CudaTestFixture : public ::testing::Test { CudaTestFixture() : ref(gko::ReferenceExecutor::create()), #ifdef GKO_TEST_NONDEFAULT_STREAM + stream(0), exec(gko::CudaExecutor::create( - 0, ref, false, gko::default_cuda_alloc_mode, stream.get())) + 0, ref, std::make_shared(), stream.get())) #else exec(gko::CudaExecutor::create(0, ref)) #endif diff --git a/devices/cuda/executor.cpp b/devices/cuda/executor.cpp index d066d272f81..3789274c3f3 100644 --- a/devices/cuda/executor.cpp +++ b/devices/cuda/executor.cpp @@ -64,31 +64,4 @@ bool CudaExecutor::verify_memory_to(const HipExecutor* dest_exec) const } -void CudaExecutor::increase_num_execs(unsigned device_id) -{ -#ifdef GKO_COMPILING_CUDA_DEVICE - // increase the Cuda Device count only when ginkgo build cuda - std::lock_guard guard(nvidia_device::get_mutex(device_id)); - nvidia_device::get_num_execs(device_id)++; -#endif // GKO_COMPILING_CUDA_DEVICE -} - - -void CudaExecutor::decrease_num_execs(unsigned device_id) -{ -#ifdef GKO_COMPILING_CUDA_DEVICE - // increase the Cuda Device count only when ginkgo build cuda - std::lock_guard guard(nvidia_device::get_mutex(device_id)); - nvidia_device::get_num_execs(device_id)--; -#endif // GKO_COMPILING_CUDA_DEVICE -} - - -unsigned CudaExecutor::get_num_execs(unsigned device_id) -{ - std::lock_guard guard(nvidia_device::get_mutex(device_id)); - return nvidia_device::get_num_execs(device_id); -} - - } // namespace gko diff --git a/devices/hip/executor.cpp b/devices/hip/executor.cpp index 60efb4c53a3..b044074c19e 100644 --- a/devices/hip/executor.cpp +++ b/devices/hip/executor.cpp @@ -61,38 +61,4 @@ bool HipExecutor::verify_memory_to(const CudaExecutor* dest_exec) const } -#if (GINKGO_HIP_PLATFORM_NVCC == 1) -using hip_device_class = nvidia_device; -#else -using hip_device_class = amd_device; -#endif - - -void HipExecutor::increase_num_execs(int device_id) -{ -#ifdef GKO_COMPILING_HIP_DEVICE - // increase the HIP Device count only when ginkgo build hip - std::lock_guard guard(hip_device_class::get_mutex(device_id)); - hip_device_class::get_num_execs(device_id)++; -#endif // GKO_COMPILING_HIP_DEVICE -} - - -void HipExecutor::decrease_num_execs(int device_id) -{ -#ifdef GKO_COMPILING_HIP_DEVICE - // increase the HIP Device count only when ginkgo build hip - std::lock_guard guard(hip_device_class::get_mutex(device_id)); - hip_device_class::get_num_execs(device_id)--; -#endif // GKO_COMPILING_HIP_DEVICE -} - - -int HipExecutor::get_num_execs(int device_id) -{ - std::lock_guard guard(hip_device_class::get_mutex(device_id)); - return hip_device_class::get_num_execs(device_id); -} - - } // namespace gko diff --git a/devices/omp/executor.cpp b/devices/omp/executor.cpp index 352216f7633..f8e700bc2d5 100644 --- a/devices/omp/executor.cpp +++ b/devices/omp/executor.cpp @@ -55,7 +55,10 @@ void OmpExecutor::populate_exec_info(const machine_topology* mach_topo) } -void OmpExecutor::raw_free(void* ptr) const noexcept { std::free(ptr); } +void OmpExecutor::raw_free(void* ptr) const noexcept +{ + return alloc_->deallocate(ptr); +} std::shared_ptr OmpExecutor::get_master() noexcept @@ -72,7 +75,7 @@ std::shared_ptr OmpExecutor::get_master() const noexcept void* OmpExecutor::raw_alloc(size_type num_bytes) const { - return GKO_ENSURE_ALLOCATED(std::malloc(num_bytes), "OMP", num_bytes); + return alloc_->allocate(num_bytes); } diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp index c2015c8664c..d668331a43b 100644 --- a/dpcpp/base/executor.dp.cpp +++ b/dpcpp/base/executor.dp.cpp @@ -51,6 +51,39 @@ namespace gko { namespace detail { +DpcppAllocator::DpcppAllocator(sycl::queue* queue) : queue_{queue} {} + + +void* DpcppAllocator::allocate(size_type size) +{ + return sycl::malloc_device(size, *queue_); +} + + +void DpcppAllocator::deallocate(void* ptr) +{ + queue_->wait_and_throw(); + sycl::free(ptr, queue_->get_context()); +} + + +DpcppUnifiedAllocator::DpcppUnifiedAllocator(sycl::queue* queue) : queue_{queue} +{} + + +void* DpcppUnifiedAllocator::allocate(size_type size) +{ + return sycl::malloc_shared(size, *queue_); +} + + +void DpcppUnifiedAllocator::deallocate(void* ptr) +{ + queue_->wait_and_throw(); + sycl::free(ptr, queue_->get_context()); +} + + const std::vector get_devices(std::string device_type) { std::map device_type_map{ diff --git a/hip/test/base/hip_executor_reset.cpp b/dpcpp/base/memory.dp.cpp similarity index 63% rename from hip/test/base/hip_executor_reset.cpp rename to dpcpp/base/memory.dp.cpp index 39e3252e053..b1ccd007dea 100644 --- a/hip/test/base/hip_executor_reset.cpp +++ b/dpcpp/base/memory.dp.cpp @@ -30,58 +30,43 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include +#include -#include +#include -#include +namespace gko { -namespace { +DpcppAllocatorBase::DpcppAllocatorBase(sycl::queue* queue) : queue_{queue} {} -#define GTEST_ASSERT_NO_EXIT(statement) \ - ASSERT_EXIT({ {statement} exit(0); }, ::testing::ExitedWithCode(0), "") - - -TEST(DeviceReset, HipCuda) +void* DpcppAllocator::allocate_impl(sycl::queue* queue, + size_type num_bytes) const { - GTEST_ASSERT_NO_EXIT({ - auto ref = gko::ReferenceExecutor::create(); - auto hip = gko::HipExecutor::create(0, ref, true); - auto cuda = gko::CudaExecutor::create(0, ref, true); - }); + return sycl::malloc_device(size, *queue); } -TEST(DeviceReset, CudaHip) +void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const { - GTEST_ASSERT_NO_EXIT({ - auto ref = gko::ReferenceExecutor::create(); - auto cuda = gko::CudaExecutor::create(0, ref, true); - auto hip = gko::HipExecutor::create(0, ref, true); - }); + queue->wait_and_throw(); + sycl::free(ptr, queue->get_context()); } -void func() +void* DpcppUnifiedAllocator::allocate(size_type num_bytes) { - auto ref = gko::ReferenceExecutor::create(); - auto exec = gko::HipExecutor::create(0, ref, true); + return sycl::malloc_shared(size, *queue_); } -TEST(DeviceReset, HipHip) +void DpcppUnifiedAllocator::deallocate(void* ptr) { - GTEST_ASSERT_NO_EXIT({ - std::thread t1(func); - std::thread t2(func); - t1.join(); - t2.join(); - }); + queue_->wait_and_throw(); + sycl::free(ptr, queue_->get_context()); } -} // namespace +} // namespace gko diff --git a/dpcpp/test/base/CMakeLists.txt b/dpcpp/test/base/CMakeLists.txt index bb9c8a75050..5c0ca601f04 100644 --- a/dpcpp/test/base/CMakeLists.txt +++ b/dpcpp/test/base/CMakeLists.txt @@ -3,3 +3,4 @@ ginkgo_create_dpcpp_test(dim3) ginkgo_create_dpcpp_test(kernel_launch) # set correct flags for kernel_launch.hpp target_compile_definitions(dpcpp_test_base_kernel_launch PRIVATE GKO_COMPILING_DPCPP) +ginkgo_create_dpcpp_test(memory) \ No newline at end of file diff --git a/dpcpp/test/base/memory.dp.cpp b/dpcpp/test/base/memory.dp.cpp new file mode 100644 index 00000000000..e587660cde3 --- /dev/null +++ b/dpcpp/test/base/memory.dp.cpp @@ -0,0 +1,98 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include + + +#include + + +#include +#include +#include + + +#include "dpcpp/test/utils.hpp" + + +namespace { + + +class Memory : public ::testing::Test { +protected: + Memory() + : exec{gko::DpcppExecutor::create(0, gko::OmpExecutor::create())}, + host_exec_with_unified{gko::OmpExecutor::create( + std::make_shared(exec->get_queue()))}, + exec_with_unified{gko::DpcppExecutor::create( + exec->get_queue(), host_exec_with_unified, + std::make_shared(exec->get_queue()))} + {} + + std::shared_ptr exec; + std::shared_ptr host_exec_with_unified; + std::shared_ptr exec_with_unified; +}; + + +TEST_F(Memory, DeviceAllocationWorks) +{ + gko::array data{exec, {1, 2}}; + + GKO_ASSERT_ARRAY_EQ(data, I({1, 2})); +} + + +TEST_F(Memory, UnifiedDeviceAllocationWorks) +{ + gko::array data{exec_with_unified, {1, 2}}; + exec->synchronize(); + + ASSERT_EQ(data.get_const_data()[0], 1); + ASSERT_EQ(data.get_const_data()[1], 2); +} + + +TEST_F(Memory, HostUnifiedAllocationWorks) +{ + gko::array data{host_exec_with_unified, {1, 2}}; + + ASSERT_EQ(data.get_const_data()[0], 1); + ASSERT_EQ(data.get_const_data()[1], 2); +} + + +} // namespace diff --git a/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp b/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp index b300292e9a3..79b197aacc8 100644 --- a/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp +++ b/examples/adaptiveprecision-blockjacobi/adaptiveprecision-blockjacobi.cpp @@ -68,13 +68,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/cb-gmres/cb-gmres.cpp b/examples/cb-gmres/cb-gmres.cpp index c0235f75e55..b096e48c71a 100644 --- a/examples/cb-gmres/cb-gmres.cpp +++ b/examples/cb-gmres/cb-gmres.cpp @@ -108,13 +108,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/custom-logger/custom-logger.cpp b/examples/custom-logger/custom-logger.cpp index c2270cadb0d..7e6cf531edd 100644 --- a/examples/custom-logger/custom-logger.cpp +++ b/examples/custom-logger/custom-logger.cpp @@ -249,13 +249,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/custom-matrix-format/custom-matrix-format.cpp b/examples/custom-matrix-format/custom-matrix-format.cpp index af08dbdf226..4610413fe9c 100644 --- a/examples/custom-matrix-format/custom-matrix-format.cpp +++ b/examples/custom-matrix-format/custom-matrix-format.cpp @@ -255,13 +255,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp index 9389f86cc45..e07f1bf92fb 100644 --- a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp +++ b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp @@ -158,13 +158,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp b/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp index aa32e0e879a..33946b7de44 100644 --- a/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp +++ b/examples/ilu-preconditioned-solver/ilu-preconditioned-solver.cpp @@ -68,13 +68,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/inverse-iteration/inverse-iteration.cpp b/examples/inverse-iteration/inverse-iteration.cpp index 5d8270f1ca1..460370b7e00 100644 --- a/examples/inverse-iteration/inverse-iteration.cpp +++ b/examples/inverse-iteration/inverse-iteration.cpp @@ -72,13 +72,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp b/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp index e676e15cc6d..407a083e548 100644 --- a/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp +++ b/examples/ir-ilu-preconditioned-solver/ir-ilu-preconditioned-solver.cpp @@ -71,13 +71,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/iterative-refinement/iterative-refinement.cpp b/examples/iterative-refinement/iterative-refinement.cpp index cbd2156be60..14384eaab52 100644 --- a/examples/iterative-refinement/iterative-refinement.cpp +++ b/examples/iterative-refinement/iterative-refinement.cpp @@ -68,13 +68,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/minimal-cuda-solver/minimal-cuda-solver.cpp b/examples/minimal-cuda-solver/minimal-cuda-solver.cpp index 7182bc9ad8c..5a7a8c086af 100644 --- a/examples/minimal-cuda-solver/minimal-cuda-solver.cpp +++ b/examples/minimal-cuda-solver/minimal-cuda-solver.cpp @@ -36,7 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int main() { // Instantiate a CUDA executor - auto gpu = gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true); + auto gpu = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); // Read data auto A = gko::read>(std::cin, gpu); auto b = gko::read>(std::cin, gpu); diff --git a/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp b/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp index 6f1600d2805..9edd7ff29a1 100644 --- a/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp +++ b/examples/mixed-multigrid-preconditioned-solver/mixed-multigrid-preconditioned-solver.cpp @@ -71,13 +71,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp b/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp index d3f45cda916..cbecbbbdc02 100644 --- a/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp +++ b/examples/mixed-multigrid-solver/mixed-multigrid-solver.cpp @@ -69,13 +69,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/mixed-precision-ir/mixed-precision-ir.cpp b/examples/mixed-precision-ir/mixed-precision-ir.cpp index 3510a2163e1..0882d755cdc 100644 --- a/examples/mixed-precision-ir/mixed-precision-ir.cpp +++ b/examples/mixed-precision-ir/mixed-precision-ir.cpp @@ -76,13 +76,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/mixed-spmv/mixed-spmv.cpp b/examples/mixed-spmv/mixed-spmv.cpp index 78461de39ef..6b327c1c708 100644 --- a/examples/mixed-spmv/mixed-spmv.cpp +++ b/examples/mixed-spmv/mixed-spmv.cpp @@ -170,13 +170,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp b/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp index 6f75ca29630..a455ca2e8ed 100644 --- a/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp +++ b/examples/multigrid-preconditioned-solver-customized/multigrid-preconditioned-solver-customized.cpp @@ -64,13 +64,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp b/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp index 7f47d039072..75c03259c67 100644 --- a/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp +++ b/examples/multigrid-preconditioned-solver/multigrid-preconditioned-solver.cpp @@ -62,13 +62,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp index 51fdf97d4a4..05ee0503a5f 100644 --- a/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp +++ b/examples/nine-pt-stencil-solver/nine-pt-stencil-solver.cpp @@ -230,13 +230,12 @@ void solve_system(const std::string& executor_string, {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/papi-logging/papi-logging.cpp b/examples/papi-logging/papi-logging.cpp index 0d81ef65909..1ae2ae9ec08 100644 --- a/examples/papi-logging/papi-logging.cpp +++ b/examples/papi-logging/papi-logging.cpp @@ -151,13 +151,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/performance-debugging/performance-debugging.cpp b/examples/performance-debugging/performance-debugging.cpp index f357a8d4619..5f036728924 100644 --- a/examples/performance-debugging/performance-debugging.cpp +++ b/examples/performance-debugging/performance-debugging.cpp @@ -371,13 +371,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/poisson-solver/poisson-solver.cpp b/examples/poisson-solver/poisson-solver.cpp index 7602600a514..e16f0b26968 100644 --- a/examples/poisson-solver/poisson-solver.cpp +++ b/examples/poisson-solver/poisson-solver.cpp @@ -144,13 +144,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/preconditioned-solver/preconditioned-solver.cpp b/examples/preconditioned-solver/preconditioned-solver.cpp index 37963f205cc..b64b588c4ef 100644 --- a/examples/preconditioned-solver/preconditioned-solver.cpp +++ b/examples/preconditioned-solver/preconditioned-solver.cpp @@ -69,13 +69,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/simple-solver-logging/simple-solver-logging.cpp b/examples/simple-solver-logging/simple-solver-logging.cpp index 6aa85462605..02318dd7784 100644 --- a/examples/simple-solver-logging/simple-solver-logging.cpp +++ b/examples/simple-solver-logging/simple-solver-logging.cpp @@ -85,13 +85,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/simple-solver/simple-solver.cpp b/examples/simple-solver/simple-solver.cpp index 8f665f98496..81dc9ee6d74 100644 --- a/examples/simple-solver/simple-solver.cpp +++ b/examples/simple-solver/simple-solver.cpp @@ -89,13 +89,12 @@ int main(int argc, char* argv[]) {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp index 6bf3cc21a8a..63adfaa5571 100644 --- a/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp +++ b/examples/three-pt-stencil-solver/three-pt-stencil-solver.cpp @@ -165,13 +165,12 @@ void solve_system(const std::string& executor_string, {"omp", [] { return gko::OmpExecutor::create(); }}, {"cuda", [] { - return gko::CudaExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::CudaExecutor::create(0, + gko::OmpExecutor::create()); }}, {"hip", [] { - return gko::HipExecutor::create(0, gko::OmpExecutor::create(), - true); + return gko::HipExecutor::create(0, gko::OmpExecutor::create()); }}, {"dpcpp", [] { diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 6c6fc235f45..61b06ad4058 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -1,9 +1,13 @@ set(GINKGO_HIP_SOURCES + base/device.hip.cpp base/device_matrix_data_kernels.hip.cpp base/exception.hip.cpp base/executor.hip.cpp base/index_set_kernels.hip.cpp + base/memory.hip.cpp + base/roctx.hip.cpp base/scoped_device_id.hip.cpp + base/stream.hip.cpp base/timer.hip.cpp base/version.hip.cpp components/prefix_sum_kernels.hip.cpp diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp new file mode 100644 index 00000000000..b5ec1bec6d6 --- /dev/null +++ b/hip/base/device.hip.cpp @@ -0,0 +1,67 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include +#include +#include + + +#include "hip/base/scoped_device_id.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +void reset_device(int device_id) +{ + gko::detail::hip_scoped_device_id_guard guard{device_id}; + hipDeviceReset(); +} + + +void destroy_event(GKO_HIP_EVENT_STRUCT* event) +{ + GKO_ASSERT_NO_HIP_ERRORS(hipEventDestroy(event)); +} + + +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp index cd8a485c19d..6b4b0fd5ddc 100644 --- a/hip/base/executor.hip.cpp +++ b/hip/base/executor.hip.cpp @@ -37,15 +37,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX -#include -#endif #include #include #include -#include #include "hip/base/config.hip.hpp" @@ -60,32 +56,22 @@ namespace gko { #include "common/cuda_hip/base/executor.hpp.inc" -#if (GINKGO_HIP_PLATFORM_NVCC == 1) -using hip_device_class = nvidia_device; -#else -using hip_device_class = amd_device; -#endif - - std::shared_ptr HipExecutor::create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode, hipStream_t stream) { return std::shared_ptr( - new HipExecutor(device_id, std::move(master), device_reset, alloc_mode, - stream), - [device_id](HipExecutor* exec) { - auto device_reset = exec->get_device_reset(); - std::lock_guard guard( - hip_device_class::get_mutex(device_id)); - delete exec; - auto& num_execs = hip_device_class::get_num_execs(device_id); - num_execs--; - if (!num_execs && device_reset) { - detail::hip_scoped_device_id_guard g(device_id); - hipDeviceReset(); - } - }); + new HipExecutor(device_id, std::move(master), + std::make_shared(), stream)); +} + + +std::shared_ptr HipExecutor::create( + int device_id, std::shared_ptr master, + std::shared_ptr alloc, hipStream_t stream) +{ + return std::shared_ptr(new HipExecutor( + device_id, std::move(master), std::move(alloc), stream)); } @@ -125,42 +111,14 @@ void OmpExecutor::raw_copy_to(const HipExecutor* dest, size_type num_bytes, void HipExecutor::raw_free(void* ptr) const noexcept { detail::hip_scoped_device_id_guard g(this->get_device_id()); - auto error_code = hipFree(ptr); - if (error_code != hipSuccess) { -#if GKO_VERBOSE_LEVEL >= 1 - // Unfortunately, if memory free fails, there's not much we can do - std::cerr << "Unrecoverable HIP error on device " - << this->get_device_id() << " in " << __func__ << ": " - << hipGetErrorName(error_code) << ": " - << hipGetErrorString(error_code) << std::endl - << "Exiting program" << std::endl; -#endif // GKO_VERBOSE_LEVEL >= 1 - std::exit(error_code); - } + alloc_->deallocate(ptr); } void* HipExecutor::raw_alloc(size_type num_bytes) const { - void* dev_ptr = nullptr; detail::hip_scoped_device_id_guard g(this->get_device_id()); - int error_code = 0; - if (this->alloc_mode_ == allocation_mode::device) { - error_code = hipMalloc(&dev_ptr, num_bytes); -#if !(GKO_HIP_PLATFORM_HCC == 1) - } else if (this->alloc_mode_ == allocation_mode::unified_global) { - error_code = hipMallocManaged(&dev_ptr, num_bytes, hipMemAttachGlobal); - } else if (this->alloc_mode_ == allocation_mode::unified_host) { - error_code = hipMallocManaged(&dev_ptr, num_bytes, hipMemAttachHost); -#endif - } else { - GKO_NOT_SUPPORTED(this->alloc_mode_); - } - if (error_code != hipErrorMemoryAllocation) { - GKO_ASSERT_NO_HIP_ERRORS(error_code); - } - GKO_ENSURE_ALLOCATED(dev_ptr, "hip", num_bytes); - return dev_ptr; + return alloc_->allocate(num_bytes); } @@ -309,73 +267,4 @@ void HipExecutor::init_handles() } -hip_stream::hip_stream(int device_id) : stream_{}, device_id_(device_id) -{ - detail::hip_scoped_device_id_guard g(device_id_); - GKO_ASSERT_NO_HIP_ERRORS(hipStreamCreate(&stream_)); -} - - -hip_stream::~hip_stream() -{ - if (stream_) { - detail::hip_scoped_device_id_guard g(device_id_); - hipStreamDestroy(stream_); - } -} - - -hip_stream::hip_stream(hip_stream&& other) - : stream_{std::exchange(other.stream_, nullptr)}, - device_id_{std::exchange(other.device_id_, -1)} -{} - - -GKO_HIP_STREAM_STRUCT* hip_stream::get() const { return stream_; } - - -namespace log { - - -#if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX - -void begin_roctx(const char* name, profile_event_category) -{ - roctxRangePush(name); -} - -void end_roctx(const char*, profile_event_category) { roctxRangePop(); } - -#else - -void begin_roctx(const char* name, profile_event_category) - GKO_NOT_COMPILED(roctx); - -void end_roctx(const char*, profile_event_category) GKO_NOT_COMPILED(roctx); - -#endif - - -} // namespace log - - -namespace kernels { -namespace hip { - - -void reset_device(int device_id) -{ - gko::detail::hip_scoped_device_id_guard guard{device_id}; - hipDeviceReset(); -} - - -void destroy_event(GKO_HIP_EVENT_STRUCT* event) -{ - GKO_ASSERT_NO_HIP_ERRORS(hipEventDestroy(event)); -} - - -} // namespace hip -} // namespace kernels } // namespace gko diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp new file mode 100644 index 00000000000..f2a8977525f --- /dev/null +++ b/hip/base/memory.hip.cpp @@ -0,0 +1,97 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +namespace gko { + + +#define GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(_operation, _size) \ + { \ + auto error_code = _operation; \ + if (error_code == hipErrorMemoryAllocation) { \ + throw AllocationError(__FILE__, __LINE__, "hip", _size); \ + } else { \ + GKO_ASSERT_NO_HIP_ERRORS(error_code); \ + } \ + } + + +#if GKO_VERBOSE_LEVEL >= 1 +#define GKO_EXIT_ON_HIP_ERROR(_operation) \ + { \ + const auto error_code = _operation; \ + if (error_code != hipSuccess) { \ + int device_id{-1}; \ + hipGetDevice(&device_id); \ + std::cerr << "Unrecoverable HIP error on device " << device_id \ + << " in " << __func__ << ": " \ + << hipGetErrorName(error_code) << ": " \ + << hipGetErrorString(error_code) << std::endl \ + << "Exiting program" << std::endl; \ + std::exit(error_code); \ + } \ + } +#else +#define GKO_EXIT_ON_HIP_ERROR(_operation) \ + { \ + const auto error_code = _operation; \ + if (error_code != hipSuccess) { \ + std::exit(error_code); \ + } \ + } +#endif + + +void* HipAllocator::allocate(size_type num_bytes) const +{ + void* dev_ptr{}; + GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(hipMalloc(&dev_ptr, num_bytes), + num_bytes); + return dev_ptr; +} + + +void HipAllocator::deallocate(void* dev_ptr) const +{ + GKO_EXIT_ON_HIP_ERROR(hipFree(dev_ptr)); +} + + +} // namespace gko diff --git a/hip/base/roctx.hip.cpp b/hip/base/roctx.hip.cpp new file mode 100644 index 00000000000..9f309b93362 --- /dev/null +++ b/hip/base/roctx.hip.cpp @@ -0,0 +1,70 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX +#include +#endif + + +#include +#include + + +namespace gko { +namespace log { + + +#if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX + +void begin_roctx(const char* name, profile_event_category) +{ + roctxRangePush(name); +} + +void end_roctx(const char*, profile_event_category) { roctxRangePop(); } + +#else + +void begin_roctx(const char* name, profile_event_category) + GKO_NOT_COMPILED(roctx); + +void end_roctx(const char*, profile_event_category) GKO_NOT_COMPILED(roctx); + +#endif + + +} // namespace log +} // namespace gko diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp new file mode 100644 index 00000000000..e5817eb9ebd --- /dev/null +++ b/hip/base/stream.hip.cpp @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include +#include +#include + + +#include "hip/base/scoped_device_id.hip.hpp" + + +namespace gko { + + +hip_stream::hip_stream() : stream_{}, device_id_{-1} {} + + +hip_stream::hip_stream(int device_id) : stream_{}, device_id_(device_id) +{ + detail::hip_scoped_device_id_guard g(device_id_); + GKO_ASSERT_NO_HIP_ERRORS(hipStreamCreate(&stream_)); +} + + +hip_stream::~hip_stream() +{ + if (stream_) { + detail::hip_scoped_device_id_guard g(device_id_); + hipStreamDestroy(stream_); + } +} + + +hip_stream::hip_stream(hip_stream&& other) + : stream_{std::exchange(other.stream_, nullptr)}, + device_id_{std::exchange(other.device_id_, -1)} +{} + + +GKO_HIP_STREAM_STRUCT* hip_stream::get() const { return stream_; } + + +} // namespace gko diff --git a/hip/test/base/CMakeLists.txt b/hip/test/base/CMakeLists.txt index 7ed0d2ceb52..f597a3d6e3d 100644 --- a/hip/test/base/CMakeLists.txt +++ b/hip/test/base/CMakeLists.txt @@ -1,6 +1,5 @@ ginkgo_create_hip_test(hip_executor) ginkgo_create_test(index_set) -ginkgo_create_test(hip_executor_reset ADDITIONAL_LIBRARIES Threads::Threads) if(GINKGO_HAVE_HWLOC) find_package(NUMA REQUIRED) ginkgo_create_hip_test(hip_executor_topology ADDITIONAL_LIBRARIES NUMA::NUMA) diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp index d27dd58d132..e531fa739e6 100644 --- a/hip/test/base/hip_executor.hip.cpp +++ b/hip/test/base/hip_executor.hip.cpp @@ -109,18 +109,18 @@ class HipExecutor : public ::testing::Test { ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); #ifdef GKO_TEST_NONDEFAULT_STREAM hip = gko::HipExecutor::create( - 0, omp, false, gko::default_hip_alloc_mode, stream.get()); - hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1, - omp, false, gko::default_hip_alloc_mode, - other_stream.get()); + 0, omp, std::make_shared(), stream.get()); + hip2 = gko::HipExecutor::create( + gko::HipExecutor::get_num_devices() - 1, omp, + std::make_shared(), other_stream.get()); hip3 = gko::HipExecutor::create( - 0, omp, false, gko::allocation_mode::unified_global, stream.get()); + 0, omp, std::make_shared(), stream.get()); #else hip = gko::HipExecutor::create(0, omp); hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1, omp); - hip3 = gko::HipExecutor::create(0, omp, false, - gko::allocation_mode::unified_global); + hip3 = gko::HipExecutor::create(0, omp, + std::make_shared()); #endif } diff --git a/hip/test/utils.hip.hpp b/hip/test/utils.hip.hpp index 9337da14139..bf7073cf9a1 100644 --- a/hip/test/utils.hip.hpp +++ b/hip/test/utils.hip.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include "hip/base/device.hpp" @@ -60,8 +61,9 @@ class HipTestFixture : public ::testing::Test { HipTestFixture() : ref(gko::ReferenceExecutor::create()), #ifdef GKO_TEST_NONDEFAULT_STREAM + stream(0), exec(gko::HipExecutor::create( - 0, ref, false, gko::default_hip_alloc_mode, stream.get())) + 0, ref, std::make_shared(), stream.get())) #else exec(gko::HipExecutor::create(0, ref)) #endif diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 965cd562bff..4545b216f86 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -47,7 +47,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include +#include #include #include #include @@ -121,33 +123,6 @@ constexpr allocation_mode default_hip_alloc_mode = } // namespace gko -// after intel/llvm September'22 release, which uses major version 6, they -// introduce another inline namespace _V1. -#if GINKGO_DPCPP_MAJOR_VERSION >= 6 -namespace sycl { -inline namespace _V1 { - - -class queue; -class event; - - -} // namespace _V1 -} // namespace sycl -#else // GINKGO_DPCPP_MAJOR_VERSION < 6 -inline namespace cl { -namespace sycl { - - -class queue; -class event; - - -} // namespace sycl -} // namespace cl -#endif - - /** * The enum class is for the dpcpp queue property. It's legal to use a binary * or(|) operation to combine several properties. @@ -172,29 +147,6 @@ GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a, } -struct cublasContext; - -struct cusparseContext; - -struct CUstream_st; - -struct CUevent_st; - -struct hipblasContext; - -struct hipsparseContext; - -#if GINKGO_HIP_PLATFORM_HCC -struct ihipStream_t; -struct ihipEvent_t; -#define GKO_HIP_STREAM_STRUCT ihipStream_t -#define GKO_HIP_EVENT_STRUCT ihipEvent_t -#else -#define GKO_HIP_STREAM_STRUCT CUstream_st -#define GKO_HIP_EVENT_STRUCT CUevent_st -#endif - - namespace gko { @@ -1355,26 +1307,14 @@ class EnableDeviceReset { * * @param device_reset whether to allow a device reset or not */ - void set_device_reset(bool device_reset) { device_reset_ = device_reset; } + void set_device_reset(bool device_reset) {} /** * Returns the current status of the device reset boolean for this executor. * * @return the current status of the device reset boolean for this executor. */ - bool get_device_reset() { return device_reset_; } - -protected: - /** - * Instantiate an EnableDeviceReset class - * - * @param device_reset the starting device_reset status. Defaults to false. - */ - EnableDeviceReset(bool device_reset = false) : device_reset_{device_reset} - {} - -private: - bool device_reset_{}; + bool get_device_reset() { return false; } }; @@ -1411,9 +1351,11 @@ class OmpExecutor : public detail::ExecutorBase, /** * Creates a new OmpExecutor. */ - static std::shared_ptr create() + static std::shared_ptr create( + std::shared_ptr alloc = + std::make_shared()) { - return std::shared_ptr(new OmpExecutor()); + return std::shared_ptr(new OmpExecutor(std::move(alloc))); } std::shared_ptr get_master() noexcept override; @@ -1435,7 +1377,8 @@ class OmpExecutor : public detail::ExecutorBase, scoped_device_id_guard get_scoped_device_id_guard() const override; protected: - OmpExecutor() + OmpExecutor(std::shared_ptr alloc) + : alloc_{std::move(alloc)} { this->OmpExecutor::populate_exec_info(machine_topology::get_instance()); } @@ -1457,6 +1400,8 @@ class OmpExecutor : public detail::ExecutorBase, GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false); bool verify_memory_to(const DpcppExecutor* dest_exec) const override; + + std::shared_ptr alloc_; }; @@ -1476,9 +1421,12 @@ using DefaultExecutor = OmpExecutor; */ class ReferenceExecutor : public OmpExecutor { public: - static std::shared_ptr create() + static std::shared_ptr create( + std::shared_ptr alloc = + std::make_shared()) { - return std::shared_ptr(new ReferenceExecutor()); + return std::shared_ptr( + new ReferenceExecutor(std::move(alloc))); } scoped_device_id_guard get_scoped_device_id_guard() const override @@ -1495,7 +1443,8 @@ class ReferenceExecutor : public OmpExecutor { } protected: - ReferenceExecutor() + ReferenceExecutor(std::shared_ptr alloc) + : OmpExecutor{std::move(alloc)} { this->ReferenceExecutor::populate_exec_info( machine_topology::get_instance()); @@ -1550,15 +1499,32 @@ class CudaExecutor : public detail::ExecutorBase, * @param device_id the CUDA device id of this device * @param master an executor on the host that is used to invoke the device * kernels - * @param device_reset whether to reset the device after the object exits - * the scope. + * @param device_reset this option no longer has any effect. * @param alloc_mode the allocation mode that the executor should operate * on. See @allocation_mode for more details + * @param stream the stream to execute operations on. + */ + [[deprecated( + "device_reset is deprecated entirely, call cudaDeviceReset directly. " + "alloc_mode was replaced by the Allocator type " + "hierarchy.")]] static std::shared_ptr + create(int device_id, std::shared_ptr master, bool device_reset, + allocation_mode alloc_mode = default_cuda_alloc_mode, + CUstream_st* stream = nullptr); + + /** + * Creates a new CudaExecutor with a custom allocator and device stream. + * + * @param device_id the CUDA device id of this device + * @param master an executor on the host that is used to invoke the device + * kernels. + * @param alloc the allocator to use for device memory allocations. + * @param stream the stream to execute operations on. */ static std::shared_ptr create( int device_id, std::shared_ptr master, - bool device_reset = false, - allocation_mode alloc_mode = default_cuda_alloc_mode, + std::shared_ptr alloc = + std::make_shared(), CUstream_st* stream = nullptr); std::shared_ptr get_master() noexcept override; @@ -1679,26 +1645,15 @@ class CudaExecutor : public detail::ExecutorBase, void init_handles(); CudaExecutor(int device_id, std::shared_ptr master, - bool device_reset = false, - allocation_mode alloc_mode = default_cuda_alloc_mode, - CUstream_st* stream = nullptr) - : EnableDeviceReset{device_reset}, - master_(master), - alloc_mode_{alloc_mode}, - stream_{stream} + std::shared_ptr alloc, CUstream_st* stream) + : alloc_{std::move(alloc)}, master_(master), stream_{stream} { this->get_exec_info().device_id = device_id; this->get_exec_info().num_computing_units = 0; this->get_exec_info().num_pu_per_cu = 0; this->CudaExecutor::populate_exec_info( machine_topology::get_instance()); - - // it only gets attribute from device, so it should not be affected by - // DeviceReset. this->set_gpu_property(); - // increase the number of executor before any operations may be affected - // by DeviceReset. - increase_num_execs(this->get_exec_info().device_id); this->init_handles(); } @@ -1718,12 +1673,6 @@ class CudaExecutor : public detail::ExecutorBase, bool verify_memory_to(const CudaExecutor* dest_exec) const override; - static void increase_num_execs(unsigned device_id); - - static void decrease_num_execs(unsigned device_id); - - static unsigned get_num_execs(unsigned device_id); - void populate_exec_info(const machine_topology* mach_topo) override; private: @@ -1733,45 +1682,8 @@ class CudaExecutor : public detail::ExecutorBase, using handle_manager = std::unique_ptr>; handle_manager cublas_handle_; handle_manager cusparse_handle_; + std::shared_ptr alloc_; CUstream_st* stream_; - - allocation_mode alloc_mode_; -}; - - -/** - * An RAII wrapper for a custom CUDA stream. - * The stream will be created on construction and destroyed when the lifetime of - * the wrapper ends. - */ -class cuda_stream { -public: - /** Creates a new custom CUDA stream. */ - cuda_stream(int device_id = 0); - - /** Destroys the custom CUDA stream, if it wasn't moved-from already. */ - ~cuda_stream(); - - cuda_stream(const cuda_stream&) = delete; - - /** Move-constructs from an existing stream, which will be emptied. */ - cuda_stream(cuda_stream&&); - - cuda_stream& operator=(const cuda_stream&) = delete; - - /** Move-assigns from an existing stream, which will be emptied. */ - cuda_stream& operator=(cuda_stream&&) = delete; - - /** - * Returns the native CUDA stream handle. - * In a moved-from cuda_stream, this will return nullptr. - */ - CUstream_st* get() const; - -private: - CUstream_st* stream_; - - int device_id_; }; @@ -1805,10 +1717,15 @@ class HipExecutor : public detail::ExecutorBase, * @param alloc_mode the allocation mode that the executor should operate * on. See @allocation_mode for more details */ + [[deprecated("")]] static std::shared_ptr create( + int device_id, std::shared_ptr master, bool device_reset, + allocation_mode alloc_mode = default_hip_alloc_mode, + GKO_HIP_STREAM_STRUCT* stream = nullptr); + static std::shared_ptr create( int device_id, std::shared_ptr master, - bool device_reset = false, - allocation_mode alloc_mode = default_hip_alloc_mode, + std::shared_ptr alloc = + std::make_shared(), GKO_HIP_STREAM_STRUCT* stream = nullptr); std::shared_ptr get_master() noexcept override; @@ -1923,25 +1840,15 @@ class HipExecutor : public detail::ExecutorBase, void init_handles(); HipExecutor(int device_id, std::shared_ptr master, - bool device_reset = false, - allocation_mode alloc_mode = default_hip_alloc_mode, - GKO_HIP_STREAM_STRUCT* stream = nullptr) - : EnableDeviceReset{device_reset}, - master_(master), - alloc_mode_(alloc_mode), - stream_{stream} + std::shared_ptr alloc, + GKO_HIP_STREAM_STRUCT* stream) + : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream} { this->get_exec_info().device_id = device_id; this->get_exec_info().num_computing_units = 0; this->get_exec_info().num_pu_per_cu = 0; this->HipExecutor::populate_exec_info(machine_topology::get_instance()); - - // it only gets attribute from device, so it should not be affected by - // DeviceReset. this->set_gpu_property(); - // increase the number of executor before any operations may be affected - // by DeviceReset. - increase_num_execs(this->get_exec_info().device_id); this->init_handles(); } @@ -1961,12 +1868,6 @@ class HipExecutor : public detail::ExecutorBase, bool verify_memory_to(const HipExecutor* dest_exec) const override; - static void increase_num_execs(int device_id); - - static void decrease_num_execs(int device_id); - - static int get_num_execs(int device_id); - void populate_exec_info(const machine_topology* mach_topo) override; private: @@ -1976,48 +1877,11 @@ class HipExecutor : public detail::ExecutorBase, using handle_manager = std::unique_ptr>; handle_manager hipblas_handle_; handle_manager hipsparse_handle_; - - allocation_mode alloc_mode_; + std::shared_ptr alloc_; GKO_HIP_STREAM_STRUCT* stream_; }; -/** - * An RAII wrapper for a custom HIP stream. - * The stream will be created on construction and destroyed when the lifetime of - * the wrapper ends. - */ -class hip_stream { -public: - /** Creates a new custom HIP stream. */ - hip_stream(int device_id = 0); - - /** Destroys the custom HIP stream, if it wasn't moved-from already. */ - ~hip_stream(); - - hip_stream(const hip_stream&) = delete; - - /** Move-constructs from an existing stream, which will be emptied. */ - hip_stream(hip_stream&&); - - hip_stream& operator=(const hip_stream&) = delete; - - /** Move-assigns from an existing stream, which will be emptied. */ - hip_stream& operator=(hip_stream&&) = delete; - - /** - * Returns the native HIP stream handle. - * In a moved-from hip_stream, this will return nullptr. - */ - GKO_HIP_STREAM_STRUCT* get() const; - -private: - GKO_HIP_STREAM_STRUCT* stream_; - - int device_id_; -}; - - namespace kernels { namespace hip { using DefaultExecutor = HipExecutor; @@ -2050,6 +1914,28 @@ class DpcppExecutor : public detail::ExecutorBase, std::string device_type = "all", dpcpp_queue_property property = dpcpp_queue_property::in_order); + /** + * Creates a new DpcppExecutor from an existing SYCL queue. + * + * @param queue the DPCPP device id of this device + * @param master an executor on the host that is used to invoke the device + * kernels + */ + static std::shared_ptr create( + sycl::queue* queue, std::shared_ptr master); + + /** + * Creates a new DpcppExecutor from an existing SYCL queue. + * + * @param queue the DPCPP device id of this device + * @param master an executor on the host that is used to invoke the device + * kernels + * @param alloc the allocator used for memory allocation + */ + static std::shared_ptr create( + sycl::queue* queue, std::shared_ptr master, + std::shared_ptr alloc); + std::shared_ptr get_master() noexcept override; std::shared_ptr get_master() const noexcept override; diff --git a/include/ginkgo/core/base/fwd_defs.hpp b/include/ginkgo/core/base/fwd_defs.hpp new file mode 100644 index 00000000000..5f0cbd9d960 --- /dev/null +++ b/include/ginkgo/core/base/fwd_defs.hpp @@ -0,0 +1,90 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_FWD_DEFS_HPP_ +#define GKO_PUBLIC_CORE_BASE_FWD_DEFS_HPP_ + + +#include + + +struct cublasContext; + +struct cusparseContext; + +struct CUstream_st; + +struct CUevent_st; + +struct hipblasContext; + +struct hipsparseContext; + +#if GINKGO_HIP_PLATFORM_HCC +struct ihipStream_t; +struct ihipEvent_t; +#define GKO_HIP_STREAM_STRUCT ihipStream_t +#define GKO_HIP_EVENT_STRUCT ihipEvent_t +#else +#define GKO_HIP_STREAM_STRUCT CUstream_st +#define GKO_HIP_EVENT_STRUCT CUevent_st +#endif + + +// after intel/llvm September'22 release, which uses major version 6, they +// introduce another inline namespace _V1. +#if GINKGO_DPCPP_MAJOR_VERSION >= 6 +namespace sycl { +inline namespace _V1 { + + +class queue; +class event; + + +} // namespace _V1 +} // namespace sycl +#else // GINKGO_DPCPP_MAJOR_VERSION < 6 +inline namespace cl { +namespace sycl { + + +class queue; +class event; + + +} // namespace sycl +} // namespace cl +#endif + + +#endif // GKO_PUBLIC_CORE_BASE_FWD_DEFS_HPP_ diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp new file mode 100644 index 00000000000..ec25920dcea --- /dev/null +++ b/include/ginkgo/core/base/memory.hpp @@ -0,0 +1,211 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_MEMORY_HPP_ +#define GKO_PUBLIC_CORE_BASE_MEMORY_HPP_ + + +#include +#include + + +namespace gko { + + +/** + * Provides generic allocation and deallocation functionality to be used by an + * Executor. + */ +class Allocator { +public: + virtual ~Allocator() = default; + + virtual void* allocate(size_type num_bytes) const = 0; + + virtual void deallocate(void* ptr) const = 0; +}; + + +/** + * Implement this interface to provide an allocator for OmpExecutor or + * ReferenceExecutor. + */ +class CpuAllocatorBase : public Allocator {}; + + +/** + * Implement this interface to provide an allocator for CudaExecutor. + */ +class CudaAllocatorBase : public Allocator {}; + + +/** + * Implement this interface to provide an allocator for HipExecutor. + */ +class HipAllocatorBase : public Allocator {}; + + +/** + * Implement this interface to provide an allocator for DpcppExecutor. + */ +class DpcppAllocatorBase : public Allocator { +public: + DpcppAllocatorBase(sycl::queue* queue); + +protected: + virtual void* allocate_impl(sycl::queue* queue, + size_type num_bytes) const = 0; + + virtual void deallocate_impl(sycl::queue* queue, void* ptr) const = 0; + +private: + sycl::queue* queue_; +}; + + +/** + * Allocator using new/delete. + */ +class CpuAllocator : public CpuAllocatorBase { +public: + void* allocate(size_type num_bytes) const override; + + void deallocate(void* ptr) const override; +}; + + +/** + * Allocator using cudaMalloc. + */ +class CudaAllocator : public CudaAllocatorBase { +public: + void* allocate(size_type num_bytes) const override; + + void deallocate(void* ptr) const override; +}; + + +/* + * Allocator using cudaMallocAsync. + */ +class CudaAsyncAllocator : public CudaAllocatorBase { +public: + void* allocate(size_type num_bytes) const override; + + void deallocate(void* ptr) const override; + + CudaAsyncAllocator(CUstream_st* stream); + +private: + CUstream_st* stream_; +}; + + +/* + * Allocator using cudaMallocManaged + */ +class CudaUnifiedAllocator : public CudaAllocatorBase, public CpuAllocatorBase { +public: + void* allocate(size_type num_bytes) const override; + + void deallocate(void* ptr) const override; + + CudaUnifiedAllocator(int device_id); + + CudaUnifiedAllocator(int device_id, unsigned int flags); + +private: + int device_id_; + unsigned int flags_; +}; + + +/* + * Allocator using cudaMallocHost. + */ +class CudaHostAllocator : public CudaAllocatorBase, public CpuAllocatorBase { +public: + void* allocate(size_type num_bytes) const override; + + void deallocate(void* ptr) const override; + + CudaHostAllocator(int device_id); + +private: + int device_id_; +}; + + +/* + * Allocator using hipMalloc. + */ +class HipAllocator : public HipAllocatorBase { +public: + void* allocate(size_type num_bytes) const override; + + void deallocate(void* ptr) const override; +}; + + +/* + * Allocator using sycl::malloc_device. + */ +class DpcppAllocator : public DpcppAllocatorBase { +public: + using DpcppAllocatorBase::DpcppAllocatorBase; + +protected: + void* allocate_impl(sycl::queue* queue, size_type num_bytes) const override; + + void deallocate_impl(sycl::queue* queue, void* ptr) const override; +}; + + +/* + * Allocator using sycl::malloc_shared. + */ +class DpcppUnifiedAllocator : public DpcppAllocatorBase, + public CpuAllocatorBase { +public: + using DpcppAllocatorBase::DpcppAllocatorBase; + +protected: + void* allocate_impl(sycl::queue* queue, size_type num_bytes) const override; + + void deallocate_impl(sycl::queue* queue, void* ptr) const override; +}; + + +} // namespace gko + + +#endif // GKO_PUBLIC_CORE_BASE_MEMORY_HPP_ diff --git a/include/ginkgo/core/base/stream.hpp b/include/ginkgo/core/base/stream.hpp new file mode 100644 index 00000000000..4bb4aeecf9e --- /dev/null +++ b/include/ginkgo/core/base/stream.hpp @@ -0,0 +1,124 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_STREAM_HPP_ +#define GKO_PUBLIC_CORE_BASE_STREAM_HPP_ + + +#include + + +namespace gko { + + +/** + * An RAII wrapper for a custom CUDA stream. + * The stream will be created on construction and destroyed when the lifetime of + * the wrapper ends. + */ +class cuda_stream { +public: + /** Creates an empty stream wrapper, representing the default stream. */ + cuda_stream(); + + /** Creates a new custom CUDA stream. */ + cuda_stream(int device_id); + + /** Destroys the custom CUDA stream, if it isn't empty. */ + ~cuda_stream(); + + cuda_stream(const cuda_stream&) = delete; + + /** Move-constructs from an existing stream, which will be emptied. */ + cuda_stream(cuda_stream&&); + + cuda_stream& operator=(const cuda_stream&) = delete; + + /** Move-assigns from an existing stream, which will be emptied. */ + cuda_stream& operator=(cuda_stream&&) = delete; + + /** + * Returns the native CUDA stream handle. + * In an empty cuda_stream, this will return nullptr. + */ + CUstream_st* get() const; + +private: + CUstream_st* stream_; + + int device_id_; +}; + + +/** + * An RAII wrapper for a custom HIP stream. + * The stream will be created on construction and destroyed when the lifetime of + * the wrapper ends. + */ +class hip_stream { +public: + /** Creates an empty stream wrapper, representing the default stream. */ + hip_stream(); + + /** Creates a new custom HIP stream. */ + hip_stream(int device_id); + + /** Destroys the custom HIP stream, if it isn't empty. */ + ~hip_stream(); + + hip_stream(const hip_stream&) = delete; + + /** Move-constructs from an existing stream, which will be emptied. */ + hip_stream(hip_stream&&); + + hip_stream& operator=(const hip_stream&) = delete; + + /** Move-assigns from an existing stream, which will be emptied. */ + hip_stream& operator=(hip_stream&&) = delete; + + /** + * Returns the native HIP stream handle. + * In an empty hip_stream, this will return nullptr. + */ + GKO_HIP_STREAM_STRUCT* get() const; + +private: + GKO_HIP_STREAM_STRUCT* stream_; + + int device_id_; +}; + + +} // namespace gko + + +#endif // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_ diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index 93663b02290..d73bf669700 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -55,6 +56,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -65,6 +67,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index 25482cf18c8..33e6258fbbd 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -44,6 +44,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + #ifdef GKO_COMPILING_CUDA #include "cuda/base/device.hpp" @@ -106,8 +109,8 @@ inline void init_executor(std::shared_ptr ref, if (gko::CudaExecutor::get_num_devices() == 0) { throw std::runtime_error{"No suitable CUDA devices"}; } - exec = gko::CudaExecutor::create(0, ref, false, - gko::default_cuda_alloc_mode, stream); + exec = gko::CudaExecutor::create( + 0, ref, std::make_shared(stream), stream); } } @@ -119,8 +122,8 @@ inline void init_executor(std::shared_ptr ref, if (gko::HipExecutor::get_num_devices() == 0) { throw std::runtime_error{"No suitable HIP devices"}; } - exec = gko::HipExecutor::create(0, ref, false, gko::default_hip_alloc_mode, - stream); + exec = gko::HipExecutor::create( + 0, ref, std::make_shared(), stream); } @@ -146,7 +149,13 @@ class CommonTestFixture : public ::testing::Test { #endif using index_type = int; - CommonTestFixture() : ref{gko::ReferenceExecutor::create()} + CommonTestFixture() + : +#if defined(GKO_TEST_NONDEFAULT_STREAM) && \ + (defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)) + stream{0}, +#endif + ref{gko::ReferenceExecutor::create()} { #if defined(GKO_TEST_NONDEFAULT_STREAM) && \ (defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP)) diff --git a/test/utils/mpi/executor.hpp b/test/utils/mpi/executor.hpp index 59c3f1e3f3b..d8c94e01804 100644 --- a/test/utils/mpi/executor.hpp +++ b/test/utils/mpi/executor.hpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include inline void init_executor(std::shared_ptr, @@ -71,7 +72,7 @@ inline void init_executor(std::shared_ptr ref, exec = gko::CudaExecutor::create( gko::experimental::mpi::map_rank_to_device_id( MPI_COMM_WORLD, gko::CudaExecutor::get_num_devices()), - ref, false, gko::default_cuda_alloc_mode, stream); + ref, std::make_shared(), stream); } } @@ -86,7 +87,7 @@ inline void init_executor(std::shared_ptr ref, exec = gko::HipExecutor::create( gko::experimental::mpi::map_rank_to_device_id( MPI_COMM_WORLD, gko::HipExecutor::get_num_devices()), - ref, false, gko::default_hip_alloc_mode, stream); + ref, std::make_shared(), stream); } From e046562667c920addcda2b9c91e560f0ee037c8a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 30 Mar 2023 12:23:58 +0200 Subject: [PATCH 02/14] reset to default CUDA allocator --- test/utils/executor.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index 33e6258fbbd..c588ac74260 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -110,7 +110,7 @@ inline void init_executor(std::shared_ptr ref, throw std::runtime_error{"No suitable CUDA devices"}; } exec = gko::CudaExecutor::create( - 0, ref, std::make_shared(stream), stream); + 0, ref, std::make_shared(stream), stream); } } From 84ec96035657b62fe9fb3278873afe5b571a8f42 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 30 Mar 2023 16:18:42 +0200 Subject: [PATCH 03/14] fix some compilation issues --- core/device_hooks/cuda_hooks.cpp | 12 ++++++++++- core/device_hooks/dpcpp_hooks.cpp | 15 +++++++++++-- core/device_hooks/hip_hooks.cpp | 25 +++++++++------------- cuda/base/memory.cpp | 28 ++++++++++++++++++++++++ dpcpp/CMakeLists.txt | 1 + dpcpp/base/executor.dp.cpp | 33 ----------------------------- dpcpp/base/memory.dp.cpp | 25 ++++++++++++++++------ include/ginkgo/core/base/memory.hpp | 4 ++++ 8 files changed, 86 insertions(+), 57 deletions(-) diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp index cdecf735a9d..f8489908cc9 100644 --- a/core/device_hooks/cuda_hooks.cpp +++ b/core/device_hooks/cuda_hooks.cpp @@ -96,9 +96,19 @@ void CudaHostAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda); std::shared_ptr CudaExecutor::create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode, CUstream_st* stream) +{ + return std::shared_ptr( + new CudaExecutor(device_id, std::move(master), + std::make_shared(), stream)); +} + + +std::shared_ptr CudaExecutor::create( + int device_id, std::shared_ptr master, + std::shared_ptr alloc, CUstream_st* stream) { return std::shared_ptr(new CudaExecutor( - device_id, std::move(master), device_reset, alloc_mode, stream)); + device_id, std::move(master), std::move(alloc), stream)); } diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp index 0ee3e6f289f..1981c712872 100644 --- a/core/device_hooks/dpcpp_hooks.cpp +++ b/core/device_hooks/dpcpp_hooks.cpp @@ -53,7 +53,18 @@ version version_info::get_dpcpp_version() noexcept } -void* DpcppAllocator::allocate_impl(sycl::queue* queue, size_type size) const +DpcppAllocatorBase::DpcppAllocatorBase(sycl::queue*) GKO_NOT_COMPILED(dpcpp); + + +void* DpcppAllocatorBase::allocate(size_type num_bytes) const + GKO_NOT_COMPILED(dpcpp); + + +void DpcppAllocatorBase::deallocate(void* ptr) const GKO_NOT_COMPILED(dpcpp); + + +void* DpcppAllocator::allocate_impl(sycl::queue* queue, + size_type num_bytes) const GKO_NOT_COMPILED(dpcpp); @@ -62,7 +73,7 @@ void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const void* DpcppUnifiedAllocator::allocate_impl(sycl::queue* queue, - size_type size) const + size_type num_bytes) const GKO_NOT_COMPILED(dpcpp); diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp index 739dac39f08..54486cc4e74 100644 --- a/core/device_hooks/hip_hooks.cpp +++ b/core/device_hooks/hip_hooks.cpp @@ -36,10 +36,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include #include +#include "ginkgo/core/base/memory.hpp" namespace gko { @@ -53,29 +55,22 @@ version version_info::get_hip_version() noexcept } -void* HipAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip); +void* HipAllocator::allocate(size_type num_bytes) const GKO_NOT_COMPILED(hip); -void HipAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); +void HipAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(hip); -HipAsyncAllocator::HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream) +std::shared_ptr HipExecutor::create( + int device_id, std::shared_ptr master, bool device_reset, + allocation_mode alloc_mode, GKO_HIP_STREAM_STRUCT* stream) GKO_NOT_COMPILED(hip); -void* HipAsyncAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip); - - -void HipAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); - - std::shared_ptr HipExecutor::create( - int device_id, std::shared_ptr master, bool device_reset, - allocation_mode alloc_mode, GKO_HIP_STREAM_STRUCT* stream) -{ - return std::shared_ptr(new HipExecutor( - device_id, std::move(master), device_reset, alloc_mode, stream)); -} + int device_id, std::shared_ptr master, + std::shared_ptr alloc, GKO_HIP_STREAM_STRUCT* stream) + GKO_NOT_COMPILED(hip); void HipExecutor::populate_exec_info(const machine_topology* mach_topo) diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp index 11dee81ad42..c1b0a5d517f 100644 --- a/cuda/base/memory.cpp +++ b/cuda/base/memory.cpp @@ -97,6 +97,9 @@ void CudaAllocator::deallocate(void* ptr) const } +#if CUDA_VERSION >= 11020 + + CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} {} @@ -108,12 +111,37 @@ void* CudaAsyncAllocator::allocate(size_type num_bytes) const return ptr; } + void CudaAsyncAllocator::deallocate(void* ptr) const { GKO_EXIT_ON_CUDA_ERROR(cudaFreeAsync(ptr, stream_)); } +#else // Fall back to regular allocation + + +CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{} {} + + +void* CudaAsyncAllocator::allocate(size_type num_bytes) const +{ + void* ptr{}; + GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMalloc(&ptr, num_bytes), + num_bytes); + return ptr; +} + + +void CudaAsyncAllocator::deallocate(void* ptr) const +{ + GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr)); +} + + +#endif + + CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id) : CudaUnifiedAllocator{device_id, cudaMemAttachGlobal} {} diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 31b5e0543ba..55763ca5525 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -10,6 +10,7 @@ target_sources(ginkgo_dpcpp base/executor.dp.cpp base/helper.dp.cpp base/index_set_kernels.dp.cpp + base/memory.dp.cpp base/scoped_device_id.dp.cpp base/timer.dp.cpp base/version.dp.cpp diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp index d668331a43b..c2015c8664c 100644 --- a/dpcpp/base/executor.dp.cpp +++ b/dpcpp/base/executor.dp.cpp @@ -51,39 +51,6 @@ namespace gko { namespace detail { -DpcppAllocator::DpcppAllocator(sycl::queue* queue) : queue_{queue} {} - - -void* DpcppAllocator::allocate(size_type size) -{ - return sycl::malloc_device(size, *queue_); -} - - -void DpcppAllocator::deallocate(void* ptr) -{ - queue_->wait_and_throw(); - sycl::free(ptr, queue_->get_context()); -} - - -DpcppUnifiedAllocator::DpcppUnifiedAllocator(sycl::queue* queue) : queue_{queue} -{} - - -void* DpcppUnifiedAllocator::allocate(size_type size) -{ - return sycl::malloc_shared(size, *queue_); -} - - -void DpcppUnifiedAllocator::deallocate(void* ptr) -{ - queue_->wait_and_throw(); - sycl::free(ptr, queue_->get_context()); -} - - const std::vector get_devices(std::string device_type) { std::map device_type_map{ diff --git a/dpcpp/base/memory.dp.cpp b/dpcpp/base/memory.dp.cpp index b1ccd007dea..2582fa331a0 100644 --- a/dpcpp/base/memory.dp.cpp +++ b/dpcpp/base/memory.dp.cpp @@ -42,10 +42,22 @@ namespace gko { DpcppAllocatorBase::DpcppAllocatorBase(sycl::queue* queue) : queue_{queue} {} +void* DpcppAllocatorBase::allocate(size_type num_bytes) const +{ + return this->allocate_impl(queue_, num_bytes); +} + + +void DpcppAllocatorBase::deallocate(void* ptr) const +{ + this->deallocate_impl(queue_, ptr); +} + + void* DpcppAllocator::allocate_impl(sycl::queue* queue, size_type num_bytes) const { - return sycl::malloc_device(size, *queue); + return sycl::malloc_device(num_bytes, *queue); } @@ -56,16 +68,17 @@ void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const } -void* DpcppUnifiedAllocator::allocate(size_type num_bytes) +void* DpcppUnifiedAllocator::allocate_impl(sycl::queue* queue, + size_type num_bytes) { - return sycl::malloc_shared(size, *queue_); + return sycl::malloc_shared(num_bytes, *queue); } -void DpcppUnifiedAllocator::deallocate(void* ptr) +void DpcppUnifiedAllocator::deallocate_impl(sycl::queue* queue, void* ptr) { - queue_->wait_and_throw(); - sycl::free(ptr, queue_->get_context()); + queue->wait_and_throw(); + sycl::free(ptr, queue->get_context()); } diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp index ec25920dcea..872a25a9a33 100644 --- a/include/ginkgo/core/base/memory.hpp +++ b/include/ginkgo/core/base/memory.hpp @@ -81,6 +81,10 @@ class DpcppAllocatorBase : public Allocator { public: DpcppAllocatorBase(sycl::queue* queue); + void* allocate(size_type num_bytes) const final; + + void deallocate(void* ptr) const final; + protected: virtual void* allocate_impl(sycl::queue* queue, size_type num_bytes) const = 0; From ec67b1990838217d82ad987103a66d609a18d207 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 31 May 2023 14:08:44 +0200 Subject: [PATCH 04/14] formatting --- core/base/memory.cpp | 2 +- core/device_hooks/hip_hooks.cpp | 2 +- core/test/base/executor.cpp | 2 +- cuda/base/device.cpp | 4 +--- cuda/base/memory.cpp | 2 +- cuda/base/nvtx.cpp | 4 +++- cuda/base/stream.cpp | 4 +++- cuda/test/base/cuda_executor.cu | 1 - hip/base/device.hip.cpp | 4 ++-- hip/base/roctx.hip.cpp | 4 +++- include/ginkgo/core/base/stream.hpp | 2 +- 11 files changed, 17 insertions(+), 14 deletions(-) diff --git a/core/base/memory.cpp b/core/base/memory.cpp index 88d97bcc765..4e9f0b7e24a 100644 --- a/core/base/memory.cpp +++ b/core/base/memory.cpp @@ -56,4 +56,4 @@ void CpuAllocator::deallocate(void* ptr) const } -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp index 54486cc4e74..4dbe6409c01 100644 --- a/core/device_hooks/hip_hooks.cpp +++ b/core/device_hooks/hip_hooks.cpp @@ -36,12 +36,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include #include #include -#include "ginkgo/core/base/memory.hpp" namespace gko { diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp index 71064cf01d2..94e7bc02d79 100644 --- a/core/test/base/executor.cpp +++ b/core/test/base/executor.cpp @@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#include "ginkgo/core/base/memory.hpp" #if defined(__unix__) || defined(__APPLE__) @@ -47,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include namespace { diff --git a/cuda/base/device.cpp b/cuda/base/device.cpp index 31ab5bcde63..2db0876ca95 100644 --- a/cuda/base/device.cpp +++ b/cuda/base/device.cpp @@ -30,15 +30,13 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "cuda/base/device.hpp" - - #include #include +#include "cuda/base/device.hpp" #include "cuda/base/scoped_device_id.hpp" diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp index c1b0a5d517f..afc1f9f62fa 100644 --- a/cuda/base/memory.cpp +++ b/cuda/base/memory.cpp @@ -193,4 +193,4 @@ void CudaHostAllocator::deallocate(void* ptr) const } -} // namespace gko \ No newline at end of file +} // namespace gko diff --git a/cuda/base/nvtx.cpp b/cuda/base/nvtx.cpp index e313c110ea2..3cbc59299b0 100644 --- a/cuda/base/nvtx.cpp +++ b/cuda/base/nvtx.cpp @@ -30,10 +30,12 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +#include + + #include -#include #ifdef GKO_LEGACY_NVTX #include #else diff --git a/cuda/base/stream.cpp b/cuda/base/stream.cpp index 8c6aa92c28b..0bbc9b1cc83 100644 --- a/cuda/base/stream.cpp +++ b/cuda/base/stream.cpp @@ -30,11 +30,13 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +#include + + #include #include -#include #include "cuda/base/scoped_device_id.hpp" diff --git a/cuda/test/base/cuda_executor.cu b/cuda/test/base/cuda_executor.cu index afb23c06186..c81799e0dae 100644 --- a/cuda/test/base/cuda_executor.cu +++ b/cuda/test/base/cuda_executor.cu @@ -44,7 +44,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include - #include "common/cuda_hip/base/executor.hpp.inc" #include "cuda/base/scoped_device_id.hpp" #include "cuda/test/utils.hpp" diff --git a/hip/base/device.hip.cpp b/hip/base/device.hip.cpp index b5ec1bec6d6..9a01d6aacee 100644 --- a/hip/base/device.hip.cpp +++ b/hip/base/device.hip.cpp @@ -30,15 +30,15 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include +#include #include #include -#include #include +#include #include "hip/base/scoped_device_id.hip.hpp" diff --git a/hip/base/roctx.hip.cpp b/hip/base/roctx.hip.cpp index 9f309b93362..a01bc11dc47 100644 --- a/hip/base/roctx.hip.cpp +++ b/hip/base/roctx.hip.cpp @@ -30,10 +30,12 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +#include + + #include -#include #if GINKGO_HIP_PLATFORM_HCC && GKO_HAVE_ROCTX #include #endif diff --git a/include/ginkgo/core/base/stream.hpp b/include/ginkgo/core/base/stream.hpp index 4bb4aeecf9e..8ee8333e41a 100644 --- a/include/ginkgo/core/base/stream.hpp +++ b/include/ginkgo/core/base/stream.hpp @@ -121,4 +121,4 @@ class hip_stream { } // namespace gko -#endif // GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_ +#endif // GKO_PUBLIC_CORE_BASE_STREAM_HPP_ From b985d6349c189a2f5bffba100db36e09c755ab2a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 1 Jun 2023 12:00:18 +0200 Subject: [PATCH 05/14] fix compilation --- core/device_hooks/hip_hooks.cpp | 12 ++++++++++-- test/utils/executor.hpp | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp index 4dbe6409c01..ba7563f1ef0 100644 --- a/core/device_hooks/hip_hooks.cpp +++ b/core/device_hooks/hip_hooks.cpp @@ -64,13 +64,21 @@ void HipAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(hip); std::shared_ptr HipExecutor::create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode, GKO_HIP_STREAM_STRUCT* stream) - GKO_NOT_COMPILED(hip); +{ + return std::shared_ptr( + new HipExecutor(device_id, std::move(master), + std::make_shared(), stream)); +} std::shared_ptr HipExecutor::create( int device_id, std::shared_ptr master, std::shared_ptr alloc, GKO_HIP_STREAM_STRUCT* stream) - GKO_NOT_COMPILED(hip); +{ + return std::shared_ptr( + new HipExecutor(device_id, std::move(master), + std::make_shared(), stream)); +} void HipExecutor::populate_exec_info(const machine_topology* mach_topo) diff --git a/test/utils/executor.hpp b/test/utils/executor.hpp index c588ac74260..200f4652644 100644 --- a/test/utils/executor.hpp +++ b/test/utils/executor.hpp @@ -110,7 +110,7 @@ inline void init_executor(std::shared_ptr ref, throw std::runtime_error{"No suitable CUDA devices"}; } exec = gko::CudaExecutor::create( - 0, ref, std::make_shared(stream), stream); + 0, ref, std::make_shared(), stream); } } From 51e8399fb5e1b708c61356af5bbd76bf54135774 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 6 Jul 2023 15:45:46 +0200 Subject: [PATCH 06/14] update interface and remove DPC++ allocator --- core/base/memory.cpp | 4 +- core/device_hooks/cuda_hooks.cpp | 32 ++++-- core/device_hooks/dpcpp_hooks.cpp | 28 ----- core/device_hooks/hip_hooks.cpp | 51 ++++++++- core/test/base/executor.cpp | 2 +- cuda/base/executor.cpp | 9 +- cuda/base/memory.cpp | 43 ++++++-- dpcpp/CMakeLists.txt | 1 - dpcpp/base/memory.dp.cpp | 85 --------------- dpcpp/test/base/CMakeLists.txt | 1 - dpcpp/test/base/memory.dp.cpp | 98 ----------------- hip/base/executor.hip.cpp | 9 +- hip/base/memory.hip.cpp | 123 ++++++++++++++++++++- include/ginkgo/core/base/executor.hpp | 33 ++---- include/ginkgo/core/base/memory.hpp | 151 ++++++++++++++++++-------- 15 files changed, 348 insertions(+), 322 deletions(-) delete mode 100644 dpcpp/base/memory.dp.cpp delete mode 100644 dpcpp/test/base/memory.dp.cpp diff --git a/core/base/memory.cpp b/core/base/memory.cpp index 4e9f0b7e24a..b6c6f8f265c 100644 --- a/core/base/memory.cpp +++ b/core/base/memory.cpp @@ -42,7 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { -void* CpuAllocator::allocate(size_type num_bytes) const +void* CpuAllocator::allocate(size_type num_bytes) { auto ptr = ::operator new (num_bytes, std::nothrow_t{}); GKO_ENSURE_ALLOCATED(ptr, "cpu", num_bytes); @@ -50,7 +50,7 @@ void* CpuAllocator::allocate(size_type num_bytes) const } -void CpuAllocator::deallocate(void* ptr) const +void CpuAllocator::deallocate(void* ptr) { ::operator delete (ptr, std::nothrow_t{}); } diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp index f8489908cc9..03ab12deb46 100644 --- a/core/device_hooks/cuda_hooks.cpp +++ b/core/device_hooks/cuda_hooks.cpp @@ -54,43 +54,55 @@ version version_info::get_cuda_version() noexcept } -void* CudaAllocator::allocate(size_type num_bytes) const GKO_NOT_COMPILED(cuda); +void* CudaAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(cuda); -void CudaAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda); +void CudaAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda); CudaAsyncAllocator::CudaAsyncAllocator(CUstream_st* stream) GKO_NOT_COMPILED(cuda); -void* CudaAsyncAllocator::allocate(size_type num_bytes) const - GKO_NOT_COMPILED(cuda); +void* CudaAsyncAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(cuda); + +void CudaAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda); -void CudaAsyncAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda); + +bool CudaAsyncAllocator::check_environment(int device_id, + CUstream_st* stream) const + GKO_NOT_COMPILED(cuda); CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id, unsigned int flags) GKO_NOT_COMPILED(cuda); -void* CudaUnifiedAllocator::allocate(size_type num_bytes) const +void* CudaUnifiedAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(cuda); -void CudaUnifiedAllocator::deallocate(void* dev_ptr) const +void CudaUnifiedAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda); + + +bool CudaUnifiedAllocator::check_environment(int device_id, + CUstream_st* stream) const GKO_NOT_COMPILED(cuda); CudaHostAllocator::CudaHostAllocator(int device_id) GKO_NOT_COMPILED(cuda); -void* CudaHostAllocator::allocate(size_type num_bytes) const - GKO_NOT_COMPILED(cuda); +void* CudaHostAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(cuda); + +void CudaHostAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(cuda); -void CudaHostAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(cuda); + +bool CudaHostAllocator::check_environment(int device_id, + CUstream_st* stream) const + GKO_NOT_COMPILED(cuda); std::shared_ptr CudaExecutor::create( diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp index 1981c712872..532e9c55bbe 100644 --- a/core/device_hooks/dpcpp_hooks.cpp +++ b/core/device_hooks/dpcpp_hooks.cpp @@ -53,34 +53,6 @@ version version_info::get_dpcpp_version() noexcept } -DpcppAllocatorBase::DpcppAllocatorBase(sycl::queue*) GKO_NOT_COMPILED(dpcpp); - - -void* DpcppAllocatorBase::allocate(size_type num_bytes) const - GKO_NOT_COMPILED(dpcpp); - - -void DpcppAllocatorBase::deallocate(void* ptr) const GKO_NOT_COMPILED(dpcpp); - - -void* DpcppAllocator::allocate_impl(sycl::queue* queue, - size_type num_bytes) const - GKO_NOT_COMPILED(dpcpp); - - -void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const - GKO_NOT_COMPILED(dpcpp); - - -void* DpcppUnifiedAllocator::allocate_impl(sycl::queue* queue, - size_type num_bytes) const - GKO_NOT_COMPILED(dpcpp); - - -void DpcppUnifiedAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const - GKO_NOT_COMPILED(dpcpp); - - std::shared_ptr DpcppExecutor::create( int device_id, std::shared_ptr master, std::string device_type, dpcpp_queue_property property) diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp index ba7563f1ef0..dec1de15933 100644 --- a/core/device_hooks/hip_hooks.cpp +++ b/core/device_hooks/hip_hooks.cpp @@ -55,10 +55,54 @@ version version_info::get_hip_version() noexcept } -void* HipAllocator::allocate(size_type num_bytes) const GKO_NOT_COMPILED(hip); +void* HipAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip); -void HipAllocator::deallocate(void* dev_ptr) const GKO_NOT_COMPILED(hip); +void HipAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); + + +HipAsyncAllocator::HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream) + GKO_NOT_COMPILED(hip); + + +void* HipAsyncAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip); + + +void HipAsyncAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); + + +bool HipAsyncAllocator::check_environment(int device_id, + GKO_HIP_STREAM_STRUCT* stream) const + GKO_NOT_COMPILED(hip); + + +HipUnifiedAllocator::HipUnifiedAllocator(int device_id, unsigned int flags) + GKO_NOT_COMPILED(hip); + + +void* HipUnifiedAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip); + + +void HipUnifiedAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); + + +bool HipUnifiedAllocator::check_environment(int device_id, + GKO_HIP_STREAM_STRUCT* stream) const + GKO_NOT_COMPILED(hip); + + +HipHostAllocator::HipHostAllocator(int device_id) GKO_NOT_COMPILED(hip); + + +void* HipHostAllocator::allocate(size_type num_bytes) GKO_NOT_COMPILED(hip); + + +void HipHostAllocator::deallocate(void* dev_ptr) GKO_NOT_COMPILED(hip); + + +bool HipHostAllocator::check_environment(int device_id, + GKO_HIP_STREAM_STRUCT* stream) const + GKO_NOT_COMPILED(hip); std::shared_ptr HipExecutor::create( @@ -76,8 +120,7 @@ std::shared_ptr HipExecutor::create( std::shared_ptr alloc, GKO_HIP_STREAM_STRUCT* stream) { return std::shared_ptr( - new HipExecutor(device_id, std::move(master), - std::make_shared(), stream)); + new HipExecutor(device_id, std::move(master), alloc, stream)); } diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp index 94e7bc02d79..13cba09e2b6 100644 --- a/core/test/base/executor.cpp +++ b/core/test/base/executor.cpp @@ -386,7 +386,7 @@ TEST(Executor, CanVerifyMemory) struct MockAllocator : gko::CpuAllocator { - void deallocate(void* ptr) const noexcept override + void deallocate(void* ptr) noexcept override { called_free = true; CpuAllocator::deallocate(ptr); diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index f6e838dd2dd..faf90037a0f 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -80,9 +80,8 @@ std::shared_ptr CudaExecutor::create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode, cudaStream_t stream) { - return std::shared_ptr( - new CudaExecutor(device_id, std::move(master), - allocator_from_mode(device_id, alloc_mode), stream)); + return create(device_id, master, allocator_from_mode(device_id, alloc_mode), + stream); } @@ -90,6 +89,10 @@ std::shared_ptr CudaExecutor::create( int device_id, std::shared_ptr master, std::shared_ptr alloc, cudaStream_t stream) { + if (!alloc->check_environment(device_id, stream)) { + throw Error{__FILE__, __LINE__, + "Allocator uses incorrect stream or device ID."}; + } return std::shared_ptr(new CudaExecutor( device_id, std::move(master), std::move(alloc), stream)); } diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp index afc1f9f62fa..08c64c0ba05 100644 --- a/cuda/base/memory.cpp +++ b/cuda/base/memory.cpp @@ -82,7 +82,7 @@ namespace gko { #endif -void* CudaAllocator::allocate(size_type num_bytes) const +void* CudaAllocator::allocate(size_type num_bytes) { void* ptr{}; GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMalloc(&ptr, num_bytes), @@ -91,7 +91,7 @@ void* CudaAllocator::allocate(size_type num_bytes) const } -void CudaAllocator::deallocate(void* ptr) const +void CudaAllocator::deallocate(void* ptr) { GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr)); } @@ -103,7 +103,7 @@ void CudaAllocator::deallocate(void* ptr) const CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} {} -void* CudaAsyncAllocator::allocate(size_type num_bytes) const +void* CudaAsyncAllocator::allocate(size_type num_bytes) { void* ptr{}; GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS( @@ -112,7 +112,7 @@ void* CudaAsyncAllocator::allocate(size_type num_bytes) const } -void CudaAsyncAllocator::deallocate(void* ptr) const +void CudaAsyncAllocator::deallocate(void* ptr) { GKO_EXIT_ON_CUDA_ERROR(cudaFreeAsync(ptr, stream_)); } @@ -121,10 +121,10 @@ void CudaAsyncAllocator::deallocate(void* ptr) const #else // Fall back to regular allocation -CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{} {} +CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} {} -void* CudaAsyncAllocator::allocate(size_type num_bytes) const +void* CudaAsyncAllocator::allocate(size_type num_bytes) { void* ptr{}; GKO_ASSERT_NO_CUDA_ALLOCATION_ERRORS(cudaMalloc(&ptr, num_bytes), @@ -133,7 +133,7 @@ void* CudaAsyncAllocator::allocate(size_type num_bytes) const } -void CudaAsyncAllocator::deallocate(void* ptr) const +void CudaAsyncAllocator::deallocate(void* ptr) { GKO_EXIT_ON_CUDA_ERROR(cudaFree(ptr)); } @@ -142,6 +142,13 @@ void CudaAsyncAllocator::deallocate(void* ptr) const #endif +bool CudaAsyncAllocator::check_environment(int device_id, + CUstream_st* stream) const +{ + return stream == stream_; +} + + CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id) : CudaUnifiedAllocator{device_id, cudaMemAttachGlobal} {} @@ -152,7 +159,7 @@ CudaUnifiedAllocator::CudaUnifiedAllocator(int device_id, unsigned int flags) {} -void* CudaUnifiedAllocator::allocate(size_type num_bytes) const +void* CudaUnifiedAllocator::allocate(size_type num_bytes) { // we need to set the device ID in case this gets used in a host executor detail::cuda_scoped_device_id_guard g(device_id_); @@ -163,7 +170,7 @@ void* CudaUnifiedAllocator::allocate(size_type num_bytes) const } -void CudaUnifiedAllocator::deallocate(void* ptr) const +void CudaUnifiedAllocator::deallocate(void* ptr) { // we need to set the device ID in case this gets used in a host executor detail::cuda_scoped_device_id_guard g(device_id_); @@ -171,10 +178,17 @@ void CudaUnifiedAllocator::deallocate(void* ptr) const } +bool CudaUnifiedAllocator::check_environment(int device_id, + CUstream_st* stream) const +{ + return device_id == device_id_; +} + + CudaHostAllocator::CudaHostAllocator(int device_id) : device_id_{device_id} {} -void* CudaHostAllocator::allocate(size_type num_bytes) const +void* CudaHostAllocator::allocate(size_type num_bytes) { // we need to set the device ID in case this gets used in a host executor detail::cuda_scoped_device_id_guard g(device_id_); @@ -185,7 +199,7 @@ void* CudaHostAllocator::allocate(size_type num_bytes) const } -void CudaHostAllocator::deallocate(void* ptr) const +void CudaHostAllocator::deallocate(void* ptr) { // we need to set the device ID in case this gets used in a host executor detail::cuda_scoped_device_id_guard g(device_id_); @@ -193,4 +207,11 @@ void CudaHostAllocator::deallocate(void* ptr) const } +bool CudaHostAllocator::check_environment(int device_id, + CUstream_st* stream) const +{ + return device_id == device_id_; +} + + } // namespace gko diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 55763ca5525..31b5e0543ba 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -10,7 +10,6 @@ target_sources(ginkgo_dpcpp base/executor.dp.cpp base/helper.dp.cpp base/index_set_kernels.dp.cpp - base/memory.dp.cpp base/scoped_device_id.dp.cpp base/timer.dp.cpp base/version.dp.cpp diff --git a/dpcpp/base/memory.dp.cpp b/dpcpp/base/memory.dp.cpp deleted file mode 100644 index 2582fa331a0..00000000000 --- a/dpcpp/base/memory.dp.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2023, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include - - -#include - - -namespace gko { - - -DpcppAllocatorBase::DpcppAllocatorBase(sycl::queue* queue) : queue_{queue} {} - - -void* DpcppAllocatorBase::allocate(size_type num_bytes) const -{ - return this->allocate_impl(queue_, num_bytes); -} - - -void DpcppAllocatorBase::deallocate(void* ptr) const -{ - this->deallocate_impl(queue_, ptr); -} - - -void* DpcppAllocator::allocate_impl(sycl::queue* queue, - size_type num_bytes) const -{ - return sycl::malloc_device(num_bytes, *queue); -} - - -void DpcppAllocator::deallocate_impl(sycl::queue* queue, void* ptr) const -{ - queue->wait_and_throw(); - sycl::free(ptr, queue->get_context()); -} - - -void* DpcppUnifiedAllocator::allocate_impl(sycl::queue* queue, - size_type num_bytes) -{ - return sycl::malloc_shared(num_bytes, *queue); -} - - -void DpcppUnifiedAllocator::deallocate_impl(sycl::queue* queue, void* ptr) -{ - queue->wait_and_throw(); - sycl::free(ptr, queue->get_context()); -} - - -} // namespace gko diff --git a/dpcpp/test/base/CMakeLists.txt b/dpcpp/test/base/CMakeLists.txt index 5c0ca601f04..bb9c8a75050 100644 --- a/dpcpp/test/base/CMakeLists.txt +++ b/dpcpp/test/base/CMakeLists.txt @@ -3,4 +3,3 @@ ginkgo_create_dpcpp_test(dim3) ginkgo_create_dpcpp_test(kernel_launch) # set correct flags for kernel_launch.hpp target_compile_definitions(dpcpp_test_base_kernel_launch PRIVATE GKO_COMPILING_DPCPP) -ginkgo_create_dpcpp_test(memory) \ No newline at end of file diff --git a/dpcpp/test/base/memory.dp.cpp b/dpcpp/test/base/memory.dp.cpp deleted file mode 100644 index e587660cde3..00000000000 --- a/dpcpp/test/base/memory.dp.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2023, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include - - -#include -#include - - -#include - - -#include -#include -#include - - -#include "dpcpp/test/utils.hpp" - - -namespace { - - -class Memory : public ::testing::Test { -protected: - Memory() - : exec{gko::DpcppExecutor::create(0, gko::OmpExecutor::create())}, - host_exec_with_unified{gko::OmpExecutor::create( - std::make_shared(exec->get_queue()))}, - exec_with_unified{gko::DpcppExecutor::create( - exec->get_queue(), host_exec_with_unified, - std::make_shared(exec->get_queue()))} - {} - - std::shared_ptr exec; - std::shared_ptr host_exec_with_unified; - std::shared_ptr exec_with_unified; -}; - - -TEST_F(Memory, DeviceAllocationWorks) -{ - gko::array data{exec, {1, 2}}; - - GKO_ASSERT_ARRAY_EQ(data, I({1, 2})); -} - - -TEST_F(Memory, UnifiedDeviceAllocationWorks) -{ - gko::array data{exec_with_unified, {1, 2}}; - exec->synchronize(); - - ASSERT_EQ(data.get_const_data()[0], 1); - ASSERT_EQ(data.get_const_data()[1], 2); -} - - -TEST_F(Memory, HostUnifiedAllocationWorks) -{ - gko::array data{host_exec_with_unified, {1, 2}}; - - ASSERT_EQ(data.get_const_data()[0], 1); - ASSERT_EQ(data.get_const_data()[1], 2); -} - - -} // namespace diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp index 6b4b0fd5ddc..2df5c9a4847 100644 --- a/hip/base/executor.hip.cpp +++ b/hip/base/executor.hip.cpp @@ -60,9 +60,8 @@ std::shared_ptr HipExecutor::create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode, hipStream_t stream) { - return std::shared_ptr( - new HipExecutor(device_id, std::move(master), - std::make_shared(), stream)); + return create(device_id, std::move(master), + std::make_shared(), stream); } @@ -70,6 +69,10 @@ std::shared_ptr HipExecutor::create( int device_id, std::shared_ptr master, std::shared_ptr alloc, hipStream_t stream) { + if (!alloc->check_environment(device_id, stream)) { + throw Error{__FILE__, __LINE__, + "Allocator uses incorrect stream or device ID."}; + } return std::shared_ptr(new HipExecutor( device_id, std::move(master), std::move(alloc), stream)); } diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp index f2a8977525f..7acb208173a 100644 --- a/hip/base/memory.hip.cpp +++ b/hip/base/memory.hip.cpp @@ -39,6 +39,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "hip/base/scoped_device_id.hip.hpp" + + namespace gko { @@ -79,7 +82,7 @@ namespace gko { #endif -void* HipAllocator::allocate(size_type num_bytes) const +void* HipAllocator::allocate(size_type num_bytes) { void* dev_ptr{}; GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(hipMalloc(&dev_ptr, num_bytes), @@ -88,10 +91,126 @@ void* HipAllocator::allocate(size_type num_bytes) const } -void HipAllocator::deallocate(void* dev_ptr) const +void HipAllocator::deallocate(void* dev_ptr) { GKO_EXIT_ON_HIP_ERROR(hipFree(dev_ptr)); } +#if HIP_VERSION_MAJOR >= 5 + + +HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream} {} + + +void* HipAsyncAllocator::allocate(size_type num_bytes) +{ + void* ptr{}; + GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS( + hipMallocAsync(&ptr, num_bytes, stream_), num_bytes); + return ptr; +} + + +void HipAsyncAllocator::deallocate(void* ptr) +{ + GKO_EXIT_ON_HIP_ERROR(hipFreeAsync(ptr, stream_)); +} + + +#else // Fall back to regular allocation + + +HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream} {} + + +void* HipAsyncAllocator::allocate(size_type num_bytes) +{ + void* ptr{}; + GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(hipMalloc(&ptr, num_bytes), num_bytes); + return ptr; +} + + +void HipAsyncAllocator::deallocate(void* ptr) +{ + GKO_EXIT_ON_HIP_ERROR(hipFree(ptr)); +} + + +#endif + + +bool HipAsyncAllocator::check_environment(int device_id, + hipStream_t stream) const +{ + return stream == stream_; +} + + +HipUnifiedAllocator::HipUnifiedAllocator(int device_id) + : HipUnifiedAllocator{device_id, hipMemAttachGlobal} +{} + + +HipUnifiedAllocator::HipUnifiedAllocator(int device_id, unsigned int flags) + : device_id_{device_id}, flags_{flags} +{} + + +void* HipUnifiedAllocator::allocate(size_type num_bytes) +{ + // we need to set the device ID in case this gets used in a host executor + detail::hip_scoped_device_id_guard g(device_id_); + void* ptr{}; + GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS( + hipMallocManaged(&ptr, num_bytes, flags_), num_bytes); + return ptr; +} + + +void HipUnifiedAllocator::deallocate(void* ptr) +{ + // we need to set the device ID in case this gets used in a host executor + detail::hip_scoped_device_id_guard g(device_id_); + GKO_EXIT_ON_HIP_ERROR(hipFree(ptr)); +} + + +bool HipUnifiedAllocator::check_environment(int device_id, + hipStream_t stream) const +{ + return device_id == device_id_; +} + + +HipHostAllocator::HipHostAllocator(int device_id) : device_id_{device_id} {} + + +void* HipHostAllocator::allocate(size_type num_bytes) +{ + // we need to set the device ID in case this gets used in a host executor + detail::hip_scoped_device_id_guard g(device_id_); + void* ptr{}; + GKO_ASSERT_NO_HIP_ALLOCATION_ERRORS(hipHostMalloc(&ptr, num_bytes), + num_bytes); + return ptr; +} + + +void HipHostAllocator::deallocate(void* ptr) +{ + // we need to set the device ID in case this gets used in a host executor + detail::hip_scoped_device_id_guard g(device_id_); + GKO_EXIT_ON_HIP_ERROR(hipFreeHost(ptr)); +} + + +bool HipHostAllocator::check_environment(int device_id, + hipStream_t stream) const +{ + return device_id == device_id_; +} + + } // namespace gko diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 4545b216f86..f033873e392 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -1717,10 +1717,13 @@ class HipExecutor : public detail::ExecutorBase, * @param alloc_mode the allocation mode that the executor should operate * on. See @allocation_mode for more details */ - [[deprecated("")]] static std::shared_ptr create( - int device_id, std::shared_ptr master, bool device_reset, - allocation_mode alloc_mode = default_hip_alloc_mode, - GKO_HIP_STREAM_STRUCT* stream = nullptr); + [[deprecated( + "device_reset is deprecated entirely, call hipDeviceReset directly. " + "alloc_mode was replaced by the Allocator type " + "hierarchy.")]] static std::shared_ptr + create(int device_id, std::shared_ptr master, bool device_reset, + allocation_mode alloc_mode = default_hip_alloc_mode, + GKO_HIP_STREAM_STRUCT* stream = nullptr); static std::shared_ptr create( int device_id, std::shared_ptr master, @@ -1914,28 +1917,6 @@ class DpcppExecutor : public detail::ExecutorBase, std::string device_type = "all", dpcpp_queue_property property = dpcpp_queue_property::in_order); - /** - * Creates a new DpcppExecutor from an existing SYCL queue. - * - * @param queue the DPCPP device id of this device - * @param master an executor on the host that is used to invoke the device - * kernels - */ - static std::shared_ptr create( - sycl::queue* queue, std::shared_ptr master); - - /** - * Creates a new DpcppExecutor from an existing SYCL queue. - * - * @param queue the DPCPP device id of this device - * @param master an executor on the host that is used to invoke the device - * kernels - * @param alloc the allocator used for memory allocation - */ - static std::shared_ptr create( - sycl::queue* queue, std::shared_ptr master, - std::shared_ptr alloc); - std::shared_ptr get_master() noexcept override; std::shared_ptr get_master() const noexcept override; diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp index 872a25a9a33..1086c9aacb4 100644 --- a/include/ginkgo/core/base/memory.hpp +++ b/include/ginkgo/core/base/memory.hpp @@ -49,9 +49,9 @@ class Allocator { public: virtual ~Allocator() = default; - virtual void* allocate(size_type num_bytes) const = 0; + virtual void* allocate(size_type num_bytes) = 0; - virtual void deallocate(void* ptr) const = 0; + virtual void deallocate(void* ptr) = 0; }; @@ -65,34 +65,49 @@ class CpuAllocatorBase : public Allocator {}; /** * Implement this interface to provide an allocator for CudaExecutor. */ -class CudaAllocatorBase : public Allocator {}; +class CudaAllocatorBase : public Allocator { + friend class CudaExecutor; - -/** - * Implement this interface to provide an allocator for HipExecutor. - */ -class HipAllocatorBase : public Allocator {}; +protected: + /** + * Checks if the allocator can be used safely with the provided device ID + * and stream. The check is necessary to ensure safe usage of stream-ordered + * allocators and unified shared memory allocators. + * + * @param device_id the device ID the allocator will be used in. + * @param stream the stream the allocator will be used with. + * @return true if and only if the allocator can be used by CudaExecutor in + * the given environment. + */ + virtual bool check_environment(int device_id, CUstream_st* stream) const + { + return true; + } +}; /** - * Implement this interface to provide an allocator for DpcppExecutor. + * Implement this interface to provide an allocator for HipExecutor. */ -class DpcppAllocatorBase : public Allocator { -public: - DpcppAllocatorBase(sycl::queue* queue); - - void* allocate(size_type num_bytes) const final; - - void deallocate(void* ptr) const final; +class HipAllocatorBase : public Allocator { + friend class HipExecutor; protected: - virtual void* allocate_impl(sycl::queue* queue, - size_type num_bytes) const = 0; - - virtual void deallocate_impl(sycl::queue* queue, void* ptr) const = 0; - -private: - sycl::queue* queue_; + /** + * Checks if the allocator can be used safely with the provided device ID + * and stream. The check is necessary to ensure safe usage of stream-ordered + * allocators and unified shared memory allocators. + * + * @param device_id the device ID the allocator will be used in. + * @param stream the stream the allocator will be used with. + * @return true if and only if the allocator can be used by CudaExecutor in + * the given environment. + */ + virtual bool check_environment(int device_id, + GKO_HIP_STREAM_STRUCT* stream) const + { + return true; + } }; @@ -101,9 +116,9 @@ class DpcppAllocatorBase : public Allocator { */ class CpuAllocator : public CpuAllocatorBase { public: - void* allocate(size_type num_bytes) const override; + void* allocate(size_type num_bytes) override; - void deallocate(void* ptr) const override; + void deallocate(void* ptr) override; }; @@ -112,9 +127,9 @@ class CpuAllocator : public CpuAllocatorBase { */ class CudaAllocator : public CudaAllocatorBase { public: - void* allocate(size_type num_bytes) const override; + void* allocate(size_type num_bytes) override; - void deallocate(void* ptr) const override; + void deallocate(void* ptr) override; }; @@ -123,12 +138,14 @@ class CudaAllocator : public CudaAllocatorBase { */ class CudaAsyncAllocator : public CudaAllocatorBase { public: - void* allocate(size_type num_bytes) const override; + void* allocate(size_type num_bytes) override; - void deallocate(void* ptr) const override; + void deallocate(void* ptr) override; CudaAsyncAllocator(CUstream_st* stream); + bool check_environment(int device_id, CUstream_st* stream) const override; + private: CUstream_st* stream_; }; @@ -139,14 +156,17 @@ class CudaAsyncAllocator : public CudaAllocatorBase { */ class CudaUnifiedAllocator : public CudaAllocatorBase, public CpuAllocatorBase { public: - void* allocate(size_type num_bytes) const override; + void* allocate(size_type num_bytes) override; - void deallocate(void* ptr) const override; + void deallocate(void* ptr) override; CudaUnifiedAllocator(int device_id); CudaUnifiedAllocator(int device_id, unsigned int flags); +protected: + bool check_environment(int device_id, CUstream_st* stream) const override; + private: int device_id_; unsigned int flags_; @@ -154,16 +174,19 @@ class CudaUnifiedAllocator : public CudaAllocatorBase, public CpuAllocatorBase { /* - * Allocator using cudaMallocHost. + * Allocator using cudaHostMalloc. */ class CudaHostAllocator : public CudaAllocatorBase, public CpuAllocatorBase { public: - void* allocate(size_type num_bytes) const override; + void* allocate(size_type num_bytes) override; - void deallocate(void* ptr) const override; + void deallocate(void* ptr) override; CudaHostAllocator(int device_id); +protected: + bool check_environment(int device_id, CUstream_st* stream) const override; + private: int device_id_; }; @@ -174,38 +197,72 @@ class CudaHostAllocator : public CudaAllocatorBase, public CpuAllocatorBase { */ class HipAllocator : public HipAllocatorBase { public: - void* allocate(size_type num_bytes) const override; + void* allocate(size_type num_bytes) override; + + void deallocate(void* ptr) override; +}; + + +/* + * Allocator using hipMallocAsync. + */ +class HipAsyncAllocator : public HipAllocatorBase { +public: + void* allocate(size_type num_bytes) override; - void deallocate(void* ptr) const override; + void deallocate(void* ptr) override; + + HipAsyncAllocator(GKO_HIP_STREAM_STRUCT* stream); + +protected: + bool check_environment(int device_id, + GKO_HIP_STREAM_STRUCT* stream) const override; + +private: + GKO_HIP_STREAM_STRUCT* stream_; }; /* - * Allocator using sycl::malloc_device. + * Allocator using hipMallocManaged */ -class DpcppAllocator : public DpcppAllocatorBase { +class HipUnifiedAllocator : public HipAllocatorBase, public CpuAllocatorBase { public: - using DpcppAllocatorBase::DpcppAllocatorBase; + void* allocate(size_type num_bytes) override; + + void deallocate(void* ptr) override; + + HipUnifiedAllocator(int device_id); + + HipUnifiedAllocator(int device_id, unsigned int flags); protected: - void* allocate_impl(sycl::queue* queue, size_type num_bytes) const override; + bool check_environment(int device_id, + GKO_HIP_STREAM_STRUCT* stream) const override; - void deallocate_impl(sycl::queue* queue, void* ptr) const override; +private: + int device_id_; + unsigned int flags_; }; /* - * Allocator using sycl::malloc_shared. + * Allocator using hipHostAlloc. */ -class DpcppUnifiedAllocator : public DpcppAllocatorBase, - public CpuAllocatorBase { +class HipHostAllocator : public HipAllocatorBase, public CpuAllocatorBase { public: - using DpcppAllocatorBase::DpcppAllocatorBase; + void* allocate(size_type num_bytes) override; + + void deallocate(void* ptr) override; + + HipHostAllocator(int device_id); protected: - void* allocate_impl(sycl::queue* queue, size_type num_bytes) const override; + bool check_environment(int device_id, + GKO_HIP_STREAM_STRUCT* stream) const override; - void deallocate_impl(sycl::queue* queue, void* ptr) const override; +private: + int device_id_; }; From 39b3b3f288b6bdc3de081b33d08644d2ac6adecc Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 7 Jul 2023 11:49:57 +0200 Subject: [PATCH 07/14] add HIP allocator tests --- benchmark/utils/general.hpp | 9 ++- hip/base/memory.hip.cpp | 2 +- hip/test/base/CMakeLists.txt | 1 + hip/test/base/memory.hip.cpp | 126 +++++++++++++++++++++++++++++++++++ 4 files changed, 132 insertions(+), 6 deletions(-) create mode 100644 hip/test/base/memory.hip.cpp diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index 35077f66d4b..19c71b74a1a 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -368,16 +368,15 @@ const std::map****************************** +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include +#include + + +#include + + +#include +#include +#include + + +#include "hip/test/utils.hip.hpp" + + +namespace { + + +class Memory : public HipTestFixture { +protected: + Memory() + : host_exec_with_pinned{gko::OmpExecutor::create( + std::make_shared(0))}, + host_exec_with_unified{gko::OmpExecutor::create( + std::make_shared(0))}, + exec_with_normal{gko::HipExecutor::create( + 0, ref, std::make_shared(), + exec->get_stream())}, + exec_with_async{gko::HipExecutor::create( + 0, host_exec_with_pinned, + std::make_shared(exec->get_stream()), + exec->get_stream())}, + exec_with_unified{gko::HipExecutor::create( + 0, host_exec_with_unified, + std::make_shared(0), + exec->get_stream())} + {} + + std::shared_ptr host_exec_with_pinned; + std::shared_ptr host_exec_with_unified; + std::shared_ptr exec_with_normal; + std::shared_ptr exec_with_async; + std::shared_ptr exec_with_unified; +}; + + +TEST_F(Memory, DeviceAllocationWorks) +{ + gko::array data{exec_with_normal, {1, 2}}; + + GKO_ASSERT_ARRAY_EQ(data, I({1, 2})); +} + + +TEST_F(Memory, AsyncDeviceAllocationWorks) +{ + gko::array data{exec_with_async, {1, 2}}; + + GKO_ASSERT_ARRAY_EQ(data, I({1, 2})); +} + + +TEST_F(Memory, UnifiedDeviceAllocationWorks) +{ + gko::array data{exec_with_unified, {1, 2}}; + exec->synchronize(); + + ASSERT_EQ(data.get_const_data()[0], 1); + ASSERT_EQ(data.get_const_data()[1], 2); +} + + +TEST_F(Memory, HostUnifiedAllocationWorks) +{ + gko::array data{host_exec_with_unified, {1, 2}}; + + ASSERT_EQ(data.get_const_data()[0], 1); + ASSERT_EQ(data.get_const_data()[1], 2); +} + + +TEST_F(Memory, HostPinnedAllocationWorks) +{ + gko::array data{host_exec_with_pinned, {1, 2}}; + + ASSERT_EQ(data.get_const_data()[0], 1); + ASSERT_EQ(data.get_const_data()[1], 2); +} + + +} // namespace From 37524940da45380511987f3791cf3bf6eeb617b9 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 7 Jul 2023 13:22:33 +0200 Subject: [PATCH 08/14] review updates * honor allocation_mode for HIP * use correct allocation flags for cudaMallocManaged allocation_mode * use valid device_id in moved-from stream wrapper * add more deprecation warnings for device_reset functionality * documentation Co-authored-by: Yuhsiang M. Tsai Co-authored-by: Marcel Koch --- cuda/base/executor.cpp | 6 ++++-- cuda/base/stream.cpp | 4 ++-- hip/base/executor.hip.cpp | 20 ++++++++++++++++++- hip/base/roctx.hip.cpp | 2 ++ hip/base/stream.hip.cpp | 4 ++-- include/ginkgo/core/base/executor.hpp | 28 +++++++++++++++++++++++++-- include/ginkgo/core/base/stream.hpp | 12 ++++++++++-- 7 files changed, 65 insertions(+), 11 deletions(-) diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index faf90037a0f..fd16815456a 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -67,9 +67,11 @@ std::unique_ptr allocator_from_mode(int device_id, case allocation_mode::device: return std::make_unique(); case allocation_mode::unified_global: - return std::make_unique(device_id); + return std::make_unique(device_id, + cudaMemAttachGlobal); case allocation_mode::unified_host: - return std::make_unique(device_id); + return std::make_unique(device_id, + cudaMemAttachHost); default: GKO_NOT_SUPPORTED(mode); } diff --git a/cuda/base/stream.cpp b/cuda/base/stream.cpp index 0bbc9b1cc83..76027bd51e2 100644 --- a/cuda/base/stream.cpp +++ b/cuda/base/stream.cpp @@ -45,7 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { -cuda_stream::cuda_stream() : stream_{}, device_id_{-1} {} +cuda_stream::cuda_stream() : stream_{nullptr}, device_id_{} {} cuda_stream::cuda_stream(int device_id) : stream_{}, device_id_(device_id) @@ -66,7 +66,7 @@ cuda_stream::~cuda_stream() cuda_stream::cuda_stream(cuda_stream&& other) : stream_{std::exchange(other.stream_, nullptr)}, - device_id_(std::exchange(other.device_id_, -1)) + device_id_(std::exchange(other.device_id_, 0)) {} diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp index 2df5c9a4847..a89e765becb 100644 --- a/hip/base/executor.hip.cpp +++ b/hip/base/executor.hip.cpp @@ -56,12 +56,30 @@ namespace gko { #include "common/cuda_hip/base/executor.hpp.inc" +std::unique_ptr allocator_from_mode(int device_id, + allocation_mode mode) +{ + switch (mode) { + case allocation_mode::device: + return std::make_unique(); + case allocation_mode::unified_global: + return std::make_unique(device_id, + hipMemAttachGlobal); + case allocation_mode::unified_host: + return std::make_unique(device_id, + hipMemAttachHost); + default: + GKO_NOT_SUPPORTED(mode); + } +} + + std::shared_ptr HipExecutor::create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode, hipStream_t stream) { return create(device_id, std::move(master), - std::make_shared(), stream); + allocator_from_mode(device_id, alloc_mode), stream); } diff --git a/hip/base/roctx.hip.cpp b/hip/base/roctx.hip.cpp index a01bc11dc47..23b07e60254 100644 --- a/hip/base/roctx.hip.cpp +++ b/hip/base/roctx.hip.cpp @@ -56,6 +56,7 @@ void begin_roctx(const char* name, profile_event_category) roctxRangePush(name); } + void end_roctx(const char*, profile_event_category) { roctxRangePop(); } #else @@ -63,6 +64,7 @@ void end_roctx(const char*, profile_event_category) { roctxRangePop(); } void begin_roctx(const char* name, profile_event_category) GKO_NOT_COMPILED(roctx); + void end_roctx(const char*, profile_event_category) GKO_NOT_COMPILED(roctx); #endif diff --git a/hip/base/stream.hip.cpp b/hip/base/stream.hip.cpp index e5817eb9ebd..dc2d99b8b17 100644 --- a/hip/base/stream.hip.cpp +++ b/hip/base/stream.hip.cpp @@ -47,7 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { -hip_stream::hip_stream() : stream_{}, device_id_{-1} {} +hip_stream::hip_stream() : stream_{}, device_id_{} {} hip_stream::hip_stream(int device_id) : stream_{}, device_id_(device_id) @@ -68,7 +68,7 @@ hip_stream::~hip_stream() hip_stream::hip_stream(hip_stream&& other) : stream_{std::exchange(other.stream_, nullptr)}, - device_id_{std::exchange(other.device_id_, -1)} + device_id_{std::exchange(other.device_id_, 0)} {} diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index f033873e392..4f476b9286d 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -1307,14 +1307,38 @@ class EnableDeviceReset { * * @param device_reset whether to allow a device reset or not */ - void set_device_reset(bool device_reset) {} + [[deprecated( + "device_reset is no longer supported, call " + "cudaDeviceReset/hipDeviceReset manually")]] void + set_device_reset(bool device_reset) + {} /** * Returns the current status of the device reset boolean for this executor. * * @return the current status of the device reset boolean for this executor. */ - bool get_device_reset() { return false; } + [[deprecated( + "device_reset is no longer supported, call " + "cudaDeviceReset/hipDeviceReset manually")]] bool + get_device_reset() + { + return false; + } + +protected: + /** + * Instantiate an EnableDeviceReset class + * + * @param device_reset the starting device_reset status. Defaults to false. + */ + EnableDeviceReset() {} + + [[deprecated( + "device_reset is no longer supported, call " + "cudaDeviceReset/hipDeviceReset manually")]] EnableDeviceReset(bool + device_reset) + {} }; diff --git a/include/ginkgo/core/base/stream.hpp b/include/ginkgo/core/base/stream.hpp index 8ee8333e41a..f7d45f59c5a 100644 --- a/include/ginkgo/core/base/stream.hpp +++ b/include/ginkgo/core/base/stream.hpp @@ -50,7 +50,11 @@ class cuda_stream { /** Creates an empty stream wrapper, representing the default stream. */ cuda_stream(); - /** Creates a new custom CUDA stream. */ + /** + * Creates a new custom CUDA stream on the given device. + * + * @param device_id the device ID to create the stream on. + */ cuda_stream(int device_id); /** Destroys the custom CUDA stream, if it isn't empty. */ @@ -89,7 +93,11 @@ class hip_stream { /** Creates an empty stream wrapper, representing the default stream. */ hip_stream(); - /** Creates a new custom HIP stream. */ + /** + * Creates a new custom HIP stream on the given device. + * + * @param device_id the device ID to create the stream on. + */ hip_stream(int device_id); /** Destroys the custom HIP stream, if it isn't empty. */ From a5d82f1494b13a35a1cb1a2d6c4e42d8777f274e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 10 Jul 2023 14:23:11 +0000 Subject: [PATCH 09/14] avoid creating OmpExecutor in tests --- core/test/base/executor.cpp | 54 ++++++++++----------- cuda/test/base/cuda_executor.cu | 28 +++++------ cuda/test/base/cuda_executor_topology.cu | 18 +++---- hip/test/base/hip_executor.hip.cpp | 28 +++++------ hip/test/base/hip_executor_topology.hip.cpp | 19 ++++---- 5 files changed, 74 insertions(+), 73 deletions(-) diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp index 13cba09e2b6..a331d8f3485 100644 --- a/core/test/base/executor.cpp +++ b/core/test/base/executor.cpp @@ -248,17 +248,17 @@ TEST(ReferenceExecutor, IsItsOwnMaster) TEST(CudaExecutor, KnowsItsMaster) { - auto omp = gko::OmpExecutor::create(); - exec_ptr cuda = gko::CudaExecutor::create(0, omp); + auto ref = gko::ReferenceExecutor::create(); + exec_ptr cuda = gko::CudaExecutor::create(0, ref); - ASSERT_EQ(omp, cuda->get_master()); + ASSERT_EQ(ref, cuda->get_master()); } TEST(CudaExecutor, KnowsItsDeviceId) { - auto omp = gko::OmpExecutor::create(); - auto cuda = gko::CudaExecutor::create(0, omp); + auto ref = gko::ReferenceExecutor::create(); + auto cuda = gko::CudaExecutor::create(0, ref); ASSERT_EQ(0, cuda->get_device_id()); } @@ -266,17 +266,17 @@ TEST(CudaExecutor, KnowsItsDeviceId) TEST(HipExecutor, KnowsItsMaster) { - auto omp = gko::OmpExecutor::create(); - exec_ptr hip = gko::HipExecutor::create(0, omp); + auto ref = gko::ReferenceExecutor::create(); + exec_ptr hip = gko::HipExecutor::create(0, ref); - ASSERT_EQ(omp, hip->get_master()); + ASSERT_EQ(ref, hip->get_master()); } TEST(HipExecutor, KnowsItsDeviceId) { - auto omp = gko::OmpExecutor::create(); - auto hip = gko::HipExecutor::create(0, omp); + auto ref = gko::ReferenceExecutor::create(); + auto hip = gko::HipExecutor::create(0, ref); ASSERT_EQ(0, hip->get_device_id()); } @@ -284,17 +284,17 @@ TEST(HipExecutor, KnowsItsDeviceId) TEST(DpcppExecutor, KnowsItsMaster) { - auto omp = gko::OmpExecutor::create(); - exec_ptr dpcpp = gko::DpcppExecutor::create(0, omp); + auto ref = gko::ReferenceExecutor::create(); + exec_ptr dpcpp = gko::DpcppExecutor::create(0, ref); - ASSERT_EQ(omp, dpcpp->get_master()); + ASSERT_EQ(ref, dpcpp->get_master()); } TEST(DpcppExecutor, KnowsItsDeviceId) { - auto omp = gko::OmpExecutor::create(); - auto dpcpp = gko::DpcppExecutor::create(0, omp); + auto ref = gko::ReferenceExecutor::create(); + auto dpcpp = gko::DpcppExecutor::create(0, ref); ASSERT_EQ(0, dpcpp->get_device_id()); } @@ -304,13 +304,13 @@ TEST(Executor, CanVerifyMemory) { auto ref = gko::ReferenceExecutor::create(); auto omp = gko::OmpExecutor::create(); - auto hip = gko::HipExecutor::create(0, omp); - auto cuda = gko::CudaExecutor::create(0, omp); + auto hip = gko::HipExecutor::create(0, ref); + auto cuda = gko::CudaExecutor::create(0, ref); auto omp2 = gko::OmpExecutor::create(); - auto hip2 = gko::HipExecutor::create(0, omp); - auto cuda2 = gko::CudaExecutor::create(0, omp); - auto hip_1 = gko::HipExecutor::create(1, omp); - auto cuda_1 = gko::CudaExecutor::create(1, omp); + auto hip2 = gko::HipExecutor::create(0, ref); + auto cuda2 = gko::CudaExecutor::create(0, ref); + auto hip_1 = gko::HipExecutor::create(1, ref); + auto cuda_1 = gko::CudaExecutor::create(1, ref); std::shared_ptr host_dpcpp; std::shared_ptr cpu_dpcpp; std::shared_ptr gpu_dpcpp; @@ -318,16 +318,16 @@ TEST(Executor, CanVerifyMemory) std::shared_ptr cpu_dpcpp_dup; std::shared_ptr gpu_dpcpp_dup; if (gko::DpcppExecutor::get_num_devices("host")) { - host_dpcpp = gko::DpcppExecutor::create(0, omp, "host"); - host_dpcpp_dup = gko::DpcppExecutor::create(0, omp, "host"); + host_dpcpp = gko::DpcppExecutor::create(0, ref, "host"); + host_dpcpp_dup = gko::DpcppExecutor::create(0, ref, "host"); } if (gko::DpcppExecutor::get_num_devices("cpu")) { - cpu_dpcpp = gko::DpcppExecutor::create(0, omp, "cpu"); - cpu_dpcpp_dup = gko::DpcppExecutor::create(0, omp, "cpu"); + cpu_dpcpp = gko::DpcppExecutor::create(0, ref, "cpu"); + cpu_dpcpp_dup = gko::DpcppExecutor::create(0, ref, "cpu"); } if (gko::DpcppExecutor::get_num_devices("gpu")) { - gpu_dpcpp = gko::DpcppExecutor::create(0, omp, "gpu"); - gpu_dpcpp_dup = gko::DpcppExecutor::create(0, omp, "gpu"); + gpu_dpcpp = gko::DpcppExecutor::create(0, ref, "gpu"); + gpu_dpcpp_dup = gko::DpcppExecutor::create(0, ref, "gpu"); } ASSERT_EQ(false, ref->memory_accessible(omp)); diff --git a/cuda/test/base/cuda_executor.cu b/cuda/test/base/cuda_executor.cu index c81799e0dae..83cfd1827ad 100644 --- a/cuda/test/base/cuda_executor.cu +++ b/cuda/test/base/cuda_executor.cu @@ -93,7 +93,7 @@ protected: stream(0), other_stream(gko::CudaExecutor::get_num_devices() - 1), #endif - omp(gko::OmpExecutor::create()), + ref(gko::ReferenceExecutor::create()), cuda(nullptr), cuda2(nullptr), cuda3(nullptr) @@ -104,19 +104,19 @@ protected: ASSERT_GT(gko::CudaExecutor::get_num_devices(), 0); #ifdef GKO_TEST_NONDEFAULT_STREAM cuda = gko::CudaExecutor::create( - 0, omp, std::make_shared(), stream.get()); + 0, ref, std::make_shared(), stream.get()); cuda2 = gko::CudaExecutor::create( - gko::CudaExecutor::get_num_devices() - 1, omp, + gko::CudaExecutor::get_num_devices() - 1, ref, std::make_shared(), other_stream.get()); cuda3 = gko::CudaExecutor::create( - 0, omp, std::make_shared(0), + 0, ref, std::make_shared(0), stream.get()); #else - cuda = gko::CudaExecutor::create(0, omp); + cuda = gko::CudaExecutor::create(0, ref); cuda2 = gko::CudaExecutor::create( - gko::CudaExecutor::get_num_devices() - 1, omp); + gko::CudaExecutor::get_num_devices() - 1, ref); cuda3 = gko::CudaExecutor::create( - 0, omp, std::make_shared(0)); + 0, ref, std::make_shared(0)); #endif } @@ -132,7 +132,7 @@ protected: gko::cuda_stream stream; gko::cuda_stream other_stream; #endif - std::shared_ptr omp; + std::shared_ptr ref; std::shared_ptr cuda; std::shared_ptr cuda2; std::shared_ptr cuda3; @@ -141,8 +141,8 @@ protected: TEST_F(CudaExecutor, CanInstantiateTwoExecutorsOnOneDevice) { - auto cuda = gko::CudaExecutor::create(0, omp); - auto cuda2 = gko::CudaExecutor::create(0, omp); + auto cuda = gko::CudaExecutor::create(0, ref); + auto cuda2 = gko::CudaExecutor::create(0, ref); // We want automatic deinitialization to not create any error } @@ -197,7 +197,7 @@ TEST_F(CudaExecutor, CopiesDataToCuda) int orig[] = {3, 8}; auto* copy = cuda->alloc(2); - cuda->copy_from(omp, 2, orig, copy); + cuda->copy_from(ref, 2, orig, copy); check_data<<<1, 1, 0, cuda->get_stream()>>>(copy); ASSERT_NO_THROW(cuda->synchronize()); @@ -218,7 +218,7 @@ TEST_F(CudaExecutor, CanAllocateOnUnifiedMemory) int orig[] = {3, 8}; auto* copy = cuda3->alloc(2); - cuda3->copy_from(omp, 2, orig, copy); + cuda3->copy_from(ref, 2, orig, copy); check_data<<<1, 1, 0, cuda3->get_stream()>>>(copy); ASSERT_NO_THROW(cuda3->synchronize()); @@ -240,7 +240,7 @@ TEST_F(CudaExecutor, CopiesDataFromCuda) auto orig = cuda->alloc(2); init_data<<<1, 1, 0, cuda->get_stream()>>>(orig); - omp->copy_from(cuda, 2, orig, copy); + ref->copy_from(cuda, 2, orig, copy); EXPECT_EQ(3, copy[0]); ASSERT_EQ(8, copy[1]); @@ -293,7 +293,7 @@ TEST_F(CudaExecutor, CopiesDataFromCudaToCuda) cuda2->run(ExampleOperation(value)); ASSERT_EQ(value, cuda2->get_device_id()); // Put the results on OpenMP and run CPU side assertions - omp->copy_from(cuda2, 2, copy_cuda2, copy); + ref->copy_from(cuda2, 2, copy_cuda2, copy); EXPECT_EQ(3, copy[0]); ASSERT_EQ(8, copy[1]); cuda2->free(copy_cuda2); diff --git a/cuda/test/base/cuda_executor_topology.cu b/cuda/test/base/cuda_executor_topology.cu index a0ee6826ded..3b91cc7941a 100644 --- a/cuda/test/base/cuda_executor_topology.cu +++ b/cuda/test/base/cuda_executor_topology.cu @@ -60,15 +60,15 @@ namespace { class CudaExecutor : public ::testing::Test { protected: CudaExecutor() - : omp(gko::OmpExecutor::create()), cuda(nullptr), cuda2(nullptr) + : ref(gko::ReferenceExecutor::create()), cuda(nullptr), cuda2(nullptr) {} void SetUp() { ASSERT_GT(gko::CudaExecutor::get_num_devices(), 0); - cuda = gko::CudaExecutor::create(0, omp); + cuda = gko::CudaExecutor::create(0, ref); cuda2 = gko::CudaExecutor::create( - gko::CudaExecutor::get_num_devices() - 1, omp); + gko::CudaExecutor::get_num_devices() - 1, ref); } void TearDown() @@ -79,7 +79,7 @@ protected: } } - std::shared_ptr omp; + std::shared_ptr ref; std::shared_ptr cuda; std::shared_ptr cuda2; }; @@ -102,7 +102,7 @@ inline int get_core_os_id(int log_id) TEST_F(CudaExecutor, CanBindToSinglePu) { - cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create()); const int bind_pu = 1; gko::machine_topology::get_instance()->bind_to_pu(bind_pu); @@ -114,7 +114,7 @@ TEST_F(CudaExecutor, CanBindToSinglePu) TEST_F(CudaExecutor, CanBindToPus) { - cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create()); std::vector bind_pus = {1, 3}; gko::machine_topology::get_instance()->bind_to_pus(bind_pus); @@ -126,7 +126,7 @@ TEST_F(CudaExecutor, CanBindToPus) TEST_F(CudaExecutor, CanBindToCores) { - cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create()); std::vector bind_cores = {1, 3}; gko::machine_topology::get_instance()->bind_to_cores(bind_cores); @@ -138,7 +138,7 @@ TEST_F(CudaExecutor, CanBindToCores) TEST_F(CudaExecutor, ClosestCpusIsPopulated) { - cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create()); auto close_cpus = cuda->get_closest_pus(); if (close_cpus.size() == 0) { GTEST_SKIP(); @@ -150,7 +150,7 @@ TEST_F(CudaExecutor, ClosestCpusIsPopulated) TEST_F(CudaExecutor, KnowsItsNuma) { - cuda = gko::CudaExecutor::create(0, gko::OmpExecutor::create()); + cuda = gko::CudaExecutor::create(0, gko::ReferenceExecutor::create()); auto numa0 = cuda->get_closest_numa(); auto close_cpus = cuda->get_closest_pus(); if (close_cpus.size() == 0) { diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp index e531fa739e6..e63543ef77c 100644 --- a/hip/test/base/hip_executor.hip.cpp +++ b/hip/test/base/hip_executor.hip.cpp @@ -98,7 +98,7 @@ class HipExecutor : public ::testing::Test { stream(0), other_stream(gko::HipExecutor::get_num_devices() - 1), #endif - omp(gko::OmpExecutor::create()), + ref(gko::ReferenceExecutor::create()), hip(nullptr), hip2(nullptr), hip3(nullptr) @@ -109,17 +109,17 @@ class HipExecutor : public ::testing::Test { ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); #ifdef GKO_TEST_NONDEFAULT_STREAM hip = gko::HipExecutor::create( - 0, omp, std::make_shared(), stream.get()); + 0, ref, std::make_shared(), stream.get()); hip2 = gko::HipExecutor::create( - gko::HipExecutor::get_num_devices() - 1, omp, + gko::HipExecutor::get_num_devices() - 1, ref, std::make_shared(), other_stream.get()); hip3 = gko::HipExecutor::create( - 0, omp, std::make_shared(), stream.get()); + 0, ref, std::make_shared(), stream.get()); #else - hip = gko::HipExecutor::create(0, omp); + hip = gko::HipExecutor::create(0, ref); hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1, - omp); - hip3 = gko::HipExecutor::create(0, omp, + ref); + hip3 = gko::HipExecutor::create(0, ref, std::make_shared()); #endif } @@ -136,7 +136,7 @@ class HipExecutor : public ::testing::Test { gko::hip_stream stream; gko::hip_stream other_stream; #endif - std::shared_ptr omp; + std::shared_ptr ref; std::shared_ptr hip; std::shared_ptr hip2; std::shared_ptr hip3; @@ -145,8 +145,8 @@ class HipExecutor : public ::testing::Test { TEST_F(HipExecutor, CanInstantiateTwoExecutorsOnOneDevice) { - auto hip = gko::HipExecutor::create(0, omp); - auto hip2 = gko::HipExecutor::create(0, omp); + auto hip = gko::HipExecutor::create(0, ref); + auto hip2 = gko::HipExecutor::create(0, ref); // We want automatic deinitialization to not create any error } @@ -204,7 +204,7 @@ TEST_F(HipExecutor, CopiesDataToHip) int orig[] = {3, 8}; auto* copy = hip->alloc(2); - hip->copy_from(omp, 2, orig, copy); + hip->copy_from(ref, 2, orig, copy); check_data<<<1, 1, 0, hip->get_stream()>>>(copy); ASSERT_NO_THROW(hip->synchronize()); @@ -232,7 +232,7 @@ TEST_F(HipExecutor, CanAllocateOnUnifiedMemory) int orig[] = {3, 8}; auto* copy = hip3->alloc(2); - hip3->copy_from(omp, 2, orig, copy); + hip3->copy_from(ref, 2, orig, copy); check_data<<<1, 1, 0, hip3->get_stream()>>>(copy); ASSERT_NO_THROW(hip3->synchronize()); @@ -257,7 +257,7 @@ TEST_F(HipExecutor, CopiesDataFromHip) auto orig = hip->alloc(2); init_data<<<1, 1, 0, hip->get_stream()>>>(orig); - omp->copy_from(hip, 2, orig, copy); + ref->copy_from(hip, 2, orig, copy); EXPECT_EQ(3, copy[0]); ASSERT_EQ(8, copy[1]); @@ -310,7 +310,7 @@ TEST_F(HipExecutor, CopiesDataFromHipToHip) hip2->run(ExampleOperation(value)); ASSERT_EQ(value, hip2->get_device_id()); // Put the results on OpenMP and run CPU side assertions - omp->copy_from(hip2, 2, copy_hip2, copy); + ref->copy_from(hip2, 2, copy_hip2, copy); EXPECT_EQ(3, copy[0]); ASSERT_EQ(8, copy[1]); hip2->free(copy_hip2); diff --git a/hip/test/base/hip_executor_topology.hip.cpp b/hip/test/base/hip_executor_topology.hip.cpp index 394b2776319..3d6e3f2bddc 100644 --- a/hip/test/base/hip_executor_topology.hip.cpp +++ b/hip/test/base/hip_executor_topology.hip.cpp @@ -65,15 +65,16 @@ namespace { class HipExecutor : public ::testing::Test { protected: - HipExecutor() : omp(gko::OmpExecutor::create()), hip(nullptr), hip2(nullptr) + HipExecutor() + : ref(gko::ReferenceExecutor::create()), hip(nullptr), hip2(nullptr) {} void SetUp() { ASSERT_GT(gko::HipExecutor::get_num_devices(), 0); - hip = gko::HipExecutor::create(0, omp); + hip = gko::HipExecutor::create(0, ref); hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1, - omp); + ref); } void TearDown() @@ -84,7 +85,7 @@ class HipExecutor : public ::testing::Test { } } - std::shared_ptr omp; + std::shared_ptr ref; std::shared_ptr hip; std::shared_ptr hip2; }; @@ -107,7 +108,7 @@ inline int get_core_os_id(int log_id) TEST_F(HipExecutor, CanBindToSinglePu) { - hip = gko::HipExecutor::create(0, gko::OmpExecutor::create()); + hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create()); const int bind_pu = 1; gko::machine_topology::get_instance()->bind_to_pu(bind_pu); @@ -119,7 +120,7 @@ TEST_F(HipExecutor, CanBindToSinglePu) TEST_F(HipExecutor, CanBindToPus) { - hip = gko::HipExecutor::create(0, gko::OmpExecutor::create()); + hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create()); std::vector bind_pus = {1, 3}; gko::machine_topology::get_instance()->bind_to_pus(bind_pus); @@ -131,7 +132,7 @@ TEST_F(HipExecutor, CanBindToPus) TEST_F(HipExecutor, CanBindToCores) { - hip = gko::HipExecutor::create(0, gko::OmpExecutor::create()); + hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create()); std::vector bind_cores = {1, 3}; gko::machine_topology::get_instance()->bind_to_cores(bind_cores); @@ -143,7 +144,7 @@ TEST_F(HipExecutor, CanBindToCores) TEST_F(HipExecutor, ClosestCpusIsPopulated) { - hip = gko::HipExecutor::create(0, gko::OmpExecutor::create()); + hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create()); auto close_cpus = hip->get_closest_pus(); if (close_cpus.size() == 0) { GTEST_SKIP(); @@ -155,7 +156,7 @@ TEST_F(HipExecutor, ClosestCpusIsPopulated) TEST_F(HipExecutor, KnowsItsNuma) { - hip = gko::HipExecutor::create(0, gko::OmpExecutor::create()); + hip = gko::HipExecutor::create(0, gko::ReferenceExecutor::create()); auto numa0 = hip->get_closest_numa(); auto close_cpus = hip->get_closest_pus(); if (close_cpus.size() == 0) { From d925a80faf15a7a84ff9a7771faf4b435c52942c Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 10 Jul 2023 14:24:15 +0000 Subject: [PATCH 10/14] warn if using unsupported allocator --- cuda/base/memory.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cuda/base/memory.cpp b/cuda/base/memory.cpp index 08c64c0ba05..f605d9135ea 100644 --- a/cuda/base/memory.cpp +++ b/cuda/base/memory.cpp @@ -121,7 +121,13 @@ void CudaAsyncAllocator::deallocate(void* ptr) #else // Fall back to regular allocation -CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} {} +CudaAsyncAllocator::CudaAsyncAllocator(cudaStream_t stream) : stream_{stream} +{ +#if GKO_VERBOSE_LEVEL >= 1 + std::cerr << "This version of CUDA does not support cudaMallocAsync, " + "please use CudaAllocator instead of CudaAsyncAllocator.\n"; +#endif +} void* CudaAsyncAllocator::allocate(size_type num_bytes) From f61f9122012348f56996ce4073972388c9921271 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 10 Jul 2023 14:27:48 +0000 Subject: [PATCH 11/14] improve documentation Co-authored-by: Yuhsiang M. Tsai Co-authored-by: Pratik Nayak --- include/ginkgo/core/base/memory.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/ginkgo/core/base/memory.hpp b/include/ginkgo/core/base/memory.hpp index 1086c9aacb4..f421abf7da4 100644 --- a/include/ginkgo/core/base/memory.hpp +++ b/include/ginkgo/core/base/memory.hpp @@ -76,6 +76,7 @@ class CudaAllocatorBase : public Allocator { * * @param device_id the device ID the allocator will be used in. * @param stream the stream the allocator will be used with. + * * @return true if and only if the allocator can be used by CudaExecutor in * the given environment. */ @@ -100,7 +101,8 @@ class HipAllocatorBase : public Allocator { * * @param device_id the device ID the allocator will be used in. * @param stream the stream the allocator will be used with. - * @return true if and only if the allocator can be used by CudaExecutor in + * + * @return true if and only if the allocator can be used by HipExecutor in * the given environment. */ virtual bool check_environment(int device_id, From 580219a908fbf06b66a0caa6790043553bc93f83 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 19 Jul 2023 11:46:47 +0200 Subject: [PATCH 12/14] fix HIP requirements for stream-ordered allocation --- hip/base/memory.hip.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/hip/base/memory.hip.cpp b/hip/base/memory.hip.cpp index 86ac31c3154..be795bb3397 100644 --- a/hip/base/memory.hip.cpp +++ b/hip/base/memory.hip.cpp @@ -97,7 +97,7 @@ void HipAllocator::deallocate(void* dev_ptr) } -#if HIP_VERSION_MAJOR >= 5 +#if HIP_VERSION >= 50200000 HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream} {} @@ -121,7 +121,13 @@ void HipAsyncAllocator::deallocate(void* ptr) #else // Fall back to regular allocation -HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream} {} +HipAsyncAllocator::HipAsyncAllocator(hipStream_t stream) : stream_{stream} +{ +#if GKO_VERBOSE_LEVEL >= 1 + std::cerr << "This version of HIP does not support hipMallocAsync, " + "please use HipAllocator instead of HipAsyncAllocator.\n"; +#endif +} void* HipAsyncAllocator::allocate(size_type num_bytes) From 54ccdafb954f0240a7eb7101326eedf404a85055 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 19 Jul 2023 12:13:05 +0200 Subject: [PATCH 13/14] use unified allocator in some HIP tests --- hip/test/base/hip_executor.hip.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hip/test/base/hip_executor.hip.cpp b/hip/test/base/hip_executor.hip.cpp index e63543ef77c..42499384704 100644 --- a/hip/test/base/hip_executor.hip.cpp +++ b/hip/test/base/hip_executor.hip.cpp @@ -114,13 +114,14 @@ class HipExecutor : public ::testing::Test { gko::HipExecutor::get_num_devices() - 1, ref, std::make_shared(), other_stream.get()); hip3 = gko::HipExecutor::create( - 0, ref, std::make_shared(), stream.get()); + 0, ref, std::make_shared(0), + stream.get()); #else hip = gko::HipExecutor::create(0, ref); hip2 = gko::HipExecutor::create(gko::HipExecutor::get_num_devices() - 1, ref); - hip3 = gko::HipExecutor::create(0, ref, - std::make_shared()); + hip3 = gko::HipExecutor::create( + 0, ref, std::make_shared(0)); #endif } From 33e29c3390ac2e4c53a90ccada3b4ad6cde68d1f Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 19 Jul 2023 13:52:49 +0200 Subject: [PATCH 14/14] resolve ambiguous symbol --- cuda/base/executor.cpp | 8 ++++---- hip/base/executor.hip.cpp | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index fd16815456a..f296fb9da86 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -60,8 +60,8 @@ namespace gko { #include "common/cuda_hip/base/executor.hpp.inc" -std::unique_ptr allocator_from_mode(int device_id, - allocation_mode mode) +std::unique_ptr cuda_allocator_from_mode( + int device_id, allocation_mode mode) { switch (mode) { case allocation_mode::device: @@ -82,8 +82,8 @@ std::shared_ptr CudaExecutor::create( int device_id, std::shared_ptr master, bool device_reset, allocation_mode alloc_mode, cudaStream_t stream) { - return create(device_id, master, allocator_from_mode(device_id, alloc_mode), - stream); + return create(device_id, master, + cuda_allocator_from_mode(device_id, alloc_mode), stream); } diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp index a89e765becb..8d175c0e424 100644 --- a/hip/base/executor.hip.cpp +++ b/hip/base/executor.hip.cpp @@ -56,8 +56,8 @@ namespace gko { #include "common/cuda_hip/base/executor.hpp.inc" -std::unique_ptr allocator_from_mode(int device_id, - allocation_mode mode) +std::unique_ptr hip_allocator_from_mode(int device_id, + allocation_mode mode) { switch (mode) { case allocation_mode::device: @@ -79,7 +79,7 @@ std::shared_ptr HipExecutor::create( allocation_mode alloc_mode, hipStream_t stream) { return create(device_id, std::move(master), - allocator_from_mode(device_id, alloc_mode), stream); + hip_allocator_from_mode(device_id, alloc_mode), stream); }