diff --git a/cudax/include/cuda/experimental/__container/async_buffer.cuh b/cudax/include/cuda/experimental/__container/async_buffer.cuh index 40a1dca5ac7..5c4731d9c10 100644 --- a/cudax/include/cuda/experimental/__container/async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/async_buffer.cuh @@ -149,7 +149,7 @@ private: static_assert(::cuda::std::contiguous_iterator<_Iter>, "Non contiguous iterators are not supported"); // TODO use batched memcpy for non-contiguous iterators, it allows to specify stream ordered access ::cuda::__driver::__memcpyAsync( - __dest, ::cuda::std::to_address(__first), sizeof(_Tp) * __count, __buf_.stream().get()); + __dest, ::cuda::std::to_address(__first), sizeof(_Tp) * __count, __buf_.stream().value().get()); } public: @@ -158,8 +158,8 @@ public: //! @brief Copy-constructs from a async_buffer //! @param __other The other async_buffer. - _CCCL_HIDE_FROM_ABI async_buffer(const async_buffer& __other) - : __buf_(__other.memory_resource(), __other.stream(), __other.size()) + _CCCL_HIDE_FROM_ABI explicit async_buffer(const async_buffer& __other) + : __buf_(__other.memory_resource(), __other.stream().value(), __other.size()) { this->__copy_cross( __other.__unwrapped_begin(), __other.__unwrapped_end(), __unwrapped_begin(), __other.size()); @@ -176,8 +176,8 @@ public: //! @param __other The other async_buffer. _CCCL_TEMPLATE(class... _OtherProperties) _CCCL_REQUIRES(__properties_match<_OtherProperties...>) - _CCCL_HIDE_FROM_ABI explicit async_buffer(const async_buffer<_Tp, _OtherProperties...>& __other) - : __buf_(__other.memory_resource(), __other.stream(), __other.size()) + _CCCL_HIDE_FROM_ABI async_buffer(::cuda::stream_ref __stream, const async_buffer<_Tp, _OtherProperties...>& __other) + : __buf_(__other.memory_resource(), __stream, __other.size()) { this->__copy_cross( __other.__unwrapped_begin(), __other.__unwrapped_end(), __unwrapped_begin(), __other.size()); @@ -477,15 +477,16 @@ public: //! @brief Returns the stored stream //! @note Stream used to allocate the buffer is initially stored in the buffer, but can be changed with `set_stream` - [[nodiscard]] _CCCL_HIDE_FROM_ABI constexpr stream_ref stream() const noexcept + [[nodiscard]] _CCCL_HIDE_FROM_ABI constexpr ::cuda::std::optional<::cuda::stream_ref> stream() const noexcept { return __buf_.stream(); } //! @brief Replaces the stored stream //! @param __new_stream the new stream - //! @note Always synchronizes with the old stream - _CCCL_HIDE_FROM_ABI constexpr void set_stream(stream_ref __new_stream) + //! @warning This does not synchronize between \p __new_stream and the current stream. It is the user's responsibility + //! to ensure proper stream order going forward + _CCCL_HIDE_FROM_ABI constexpr void set_stream(const ::cuda::std::optional<::cuda::stream_ref>& __new_stream) { __buf_.set_stream_unsynchronized(__new_stream); } @@ -570,7 +571,7 @@ using __buffer_type_for_props = template void __copy_cross_buffers(stream_ref __stream, _BufferTo& __to, const _BufferFrom& __from) { - __stream.wait(__from.stream()); + __stream.wait(__from.stream().value()); ::cuda::__driver::__memcpyAsync( __to.__unwrapped_begin(), __from.__unwrapped_begin(), diff --git a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh index 6fcd0d55c97..fe7a926153e 100644 --- a/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/uninitialized_async_buffer.cuh @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -80,9 +81,9 @@ private: using __async_resource = ::cuda::mr::any_resource<_Properties...>; __async_resource __mr_; - ::cuda::stream_ref __stream_ = {::cudaStream_t{}}; - size_t __count_ = 0; - void* __buf_ = nullptr; + ::cuda::std::optional<::cuda::stream_ref> __stream_ = {::cudaStream_t{}}; + size_t __count_ = 0; + void* __buf_ = nullptr; template friend class uninitialized_async_buffer; @@ -195,9 +196,9 @@ public: _CCCL_HIDE_FROM_ABI uninitialized_async_buffer(__async_resource __mr, const ::cuda::stream_ref __stream, const size_t __count) : __mr_(::cuda::std::move(__mr)) - , __stream_(__stream) + , __stream_({__stream}) , __count_(__count) - , __buf_(__count_ == 0 ? nullptr : __mr_.allocate(__stream_, __get_allocation_size(__count_))) + , __buf_(__count_ == 0 ? nullptr : __mr_.allocate(__stream, __get_allocation_size(__count_))) {} _CCCL_HIDE_FROM_ABI uninitialized_async_buffer(const uninitialized_async_buffer&) = delete; @@ -208,7 +209,7 @@ public: //! Takes ownership of the allocation in \p __other and resets it _CCCL_HIDE_FROM_ABI uninitialized_async_buffer(uninitialized_async_buffer&& __other) noexcept : __mr_(::cuda::std::move(__other.__mr_)) - , __stream_(::cuda::std::exchange(__other.__stream_, ::cuda::stream_ref{::cudaStream_t{}})) + , __stream_(::cuda::std::exchange(__other.__stream_, ::cuda::std::nullopt)) , __count_(::cuda::std::exchange(__other.__count_, 0)) , __buf_(::cuda::std::exchange(__other.__buf_, nullptr)) {} @@ -220,11 +221,32 @@ public: _CCCL_REQUIRES(__properties_match<_OtherProperties...>) _CCCL_HIDE_FROM_ABI uninitialized_async_buffer(uninitialized_async_buffer<_Tp, _OtherProperties...>&& __other) noexcept : __mr_(::cuda::std::move(__other.__mr_)) - , __stream_(::cuda::std::exchange(__other.__stream_, ::cuda::stream_ref{::cudaStream_t{}})) + , __stream_(::cuda::std::exchange(__other.__stream_, ::cuda::std::nullopt)) , __count_(::cuda::std::exchange(__other.__count_, 0)) , __buf_(::cuda::std::exchange(__other.__buf_, nullptr)) {} + _CCCL_HIDE_FROM_ABI void __destroy_impl(const ::cuda::std::optional<::cuda::stream_ref>& __stream) + { + if (__buf_) + { + if (__stream) + { + __mr_.deallocate(__stream.value(), __buf_, __get_allocation_size(__count_)); + } + else + { + __mr_.deallocate_sync(__buf_, __get_allocation_size(__count_)); + } + __buf_ = nullptr; + __count_ = 0; + } + + // TODO should we make sure we move the mr only once by moving it to the if above? + // It won't work for 0 count buffers, so we would probably need a separate bool to track it + __mr_ = ::cuda::std::move(__mr_); + } + //! @brief Move-assigns a \c uninitialized_async_buffer from \p __other //! @param __other Another \c uninitialized_async_buffer //! Deallocates the current allocation and then takes ownership of the allocation in \p __other and resets it @@ -235,12 +257,9 @@ public: return *this; } - if (__buf_) - { - __mr_.deallocate(__stream_, __buf_, __get_allocation_size(__count_)); - } + __destroy_impl(__stream_); __mr_ = ::cuda::std::move(__other.__mr_); - __stream_ = ::cuda::std::exchange(__other.__stream_, ::cuda::stream_ref{::cudaStream_t{}}); + __stream_ = ::cuda::std::exchange(__other.__stream_, ::cuda::std::nullopt); __count_ = ::cuda::std::exchange(__other.__count_, 0); __buf_ = ::cuda::std::exchange(__other.__buf_, nullptr); return *this; @@ -253,15 +272,7 @@ public: //! user's responsibility to ensure that all objects within the buffer have been properly destroyed. _CCCL_HIDE_FROM_ABI void destroy(::cuda::stream_ref __stream) { - if (__buf_) - { - __mr_.deallocate(__stream, __buf_, __get_allocation_size(__count_)); - __buf_ = nullptr; - __count_ = 0; - } - // TODO should we make sure we move the mr only once by moving it to the if above? - // It won't work for 0 count buffers, so we would probably need a separate bool to track it - auto __tmp_mr = ::cuda::std::move(__mr_); + __destroy_impl(__stream); } //! @brief Destroys an \c uninitialized_async_buffer, deallocates the buffer in stream order on the stream that @@ -270,7 +281,7 @@ public: //! user's responsibility to ensure that all objects within the buffer have been properly destroyed. _CCCL_HIDE_FROM_ABI void destroy() { - destroy(__stream_); + __destroy_impl(__stream_); } //! @brief Destroys an \c uninitialized_async_buffer and deallocates the buffer in stream order on the stream @@ -342,7 +353,7 @@ public: //! @brief Returns the stored stream //! @note Stream used to allocate the buffer is initially stored in the buffer, but can be changed with `set_stream` - [[nodiscard]] _CCCL_HIDE_FROM_ABI constexpr ::cuda::stream_ref stream() const noexcept + [[nodiscard]] _CCCL_HIDE_FROM_ABI constexpr ::cuda::std::optional<::cuda::stream_ref> stream() const noexcept { return __stream_; } @@ -350,11 +361,11 @@ public: //! @brief Replaces the stored stream //! @param __new_stream the new stream //! @note Always synchronizes with the old stream - _CCCL_HIDE_FROM_ABI constexpr void set_stream(::cuda::stream_ref __new_stream) + _CCCL_HIDE_FROM_ABI constexpr void set_stream(const ::cuda::std::optional<::cuda::stream_ref>& __new_stream) { - if (__new_stream != __stream_) + if (__new_stream != __stream_ && __stream_) { - __stream_.sync(); + __stream_.value().sync(); } __stream_ = __new_stream; } @@ -363,7 +374,8 @@ public: //! @param __new_stream the new stream //! @warning This does not synchronize between \p __new_stream and the current stream. It is the user's responsibility //! to ensure proper stream order going forward - _CCCL_HIDE_FROM_ABI constexpr void set_stream_unsynchronized(::cuda::stream_ref __new_stream) noexcept + _CCCL_HIDE_FROM_ABI constexpr void + set_stream_unsynchronized(const ::cuda::std::optional<::cuda::stream_ref>& __new_stream) noexcept { __stream_ = __new_stream; } @@ -380,7 +392,7 @@ public: _CCCL_HIDE_FROM_ABI uninitialized_async_buffer __replace_allocation(const size_t __count) { // Create a new buffer with a reference to the stored memory resource and swap allocation information - uninitialized_async_buffer __ret{__fake_resource_ref{::cuda::std::addressof(__mr_)}, __stream_, __count}; + uninitialized_async_buffer __ret{__fake_resource_ref{::cuda::std::addressof(__mr_)}, __stream_.value(), __count}; ::cuda::std::swap(__count_, __ret.__count_); ::cuda::std::swap(__buf_, __ret.__buf_); return __ret; diff --git a/cudax/test/containers/async_buffer/access.cu b/cudax/test/containers/async_buffer/access.cu index 640b75cf8e9..8a05e382abe 100644 --- a/cudax/test/containers/async_buffer/access.cu +++ b/cudax/test/containers/async_buffer/access.cu @@ -54,7 +54,7 @@ C2H_CCCLRT_TEST("cudax::async_buffer access and stream", "[container][async_buff { Buffer buf{stream, resource, {T(1), T(42), T(1337), T(0)}}; - buf.stream().sync(); + stream.sync(); auto& res = buf.get_unsynchronized(2); CUDAX_CHECK(compare_value(res, T(1337))); CUDAX_CHECK(static_cast(cuda::std::addressof(res) - buf.data()) == 2); @@ -73,14 +73,14 @@ C2H_CCCLRT_TEST("cudax::async_buffer access and stream", "[container][async_buff { // Works without allocation Buffer buf{stream, resource}; - buf.stream().sync(); + stream.sync(); CUDAX_CHECK(buf.data() == nullptr); CUDAX_CHECK(cuda::std::as_const(buf).data() == nullptr); } { // Works with allocation Buffer buf{stream, resource, {T(1), T(42), T(1337), T(0)}}; - buf.stream().sync(); + stream.sync(); CUDAX_CHECK(buf.data() != nullptr); CUDAX_CHECK(cuda::std::as_const(buf).data() != nullptr); CUDAX_CHECK(cuda::std::as_const(buf).data() == buf.data()); @@ -89,17 +89,32 @@ C2H_CCCLRT_TEST("cudax::async_buffer access and stream", "[container][async_buff SECTION("cudax::async_buffer::stream") { - Buffer buf{stream, resource, {T(1), T(42), T(1337), T(0)}}; - CUDAX_CHECK(buf.stream() == stream); - { - cudax::stream other_stream{cuda::device_ref{0}}; - buf.set_stream(other_stream); - CUDAX_CHECK(buf.stream() == other_stream); - buf.set_stream(stream); + Buffer buf{stream, resource, {T(1), T(42), T(1337), T(0)}}; + CUDAX_CHECK(buf.stream().value() == stream); + + { + cudax::stream other_stream{cuda::device_ref{0}}; + buf.set_stream(other_stream); + CUDAX_CHECK(buf.stream().value() == other_stream); + buf.set_stream(stream); + } + + CUDAX_CHECK(buf.stream().value() == stream); + buf.destroy(stream); + + buf = cudax::make_async_buffer(stream, resource, {T(1), T(42), T(1337), T(0)}); + CUDAX_CHECK(buf.stream().value() == stream); } - CUDAX_CHECK(buf.stream() == stream); - buf.destroy(stream); + { + Buffer buf = cudax::make_async_buffer(stream, resource, {T(1), T(42), T(1337), T(0)}); + CUDAX_CHECK(buf.stream().value() == stream); + + stream.sync(); + buf.set_stream(cuda::std::nullopt); + CUDAX_CHECK(buf.stream() == cuda::std::nullopt); + // Buffer should handle destruction when stream is nullopt + } } } diff --git a/cudax/test/containers/async_buffer/conversion.cu b/cudax/test/containers/async_buffer/conversion.cu index e5b1852bf80..ad9fd9944d2 100644 --- a/cudax/test/containers/async_buffer/conversion.cu +++ b/cudax/test/containers/async_buffer/conversion.cu @@ -47,14 +47,14 @@ C2H_CCCLRT_TEST("cudax::async_buffer conversion", "[container][async_buffer]", t { { // can be copy constructed from empty input const MatchingBuffer input{stream, resource, 0, cudax::no_init}; - Buffer buf(input); + Buffer buf(stream, input); CUDAX_CHECK(buf.empty()); CUDAX_CHECK(input.empty()); } { // can be copy constructed from non-empty input const MatchingBuffer input{stream, resource, {T(1), T(42), T(1337), T(0), T(12), T(-1)}}; - Buffer buf(input); + Buffer buf(stream, input); CUDAX_CHECK(!buf.empty()); CUDAX_CHECK(equal_range(buf)); CUDAX_CHECK(equal_range(input)); @@ -62,7 +62,7 @@ C2H_CCCLRT_TEST("cudax::async_buffer conversion", "[container][async_buffer]", t { // can be move constructed with empty input MatchingBuffer input{stream, resource, 0, cudax::no_init}; - Buffer buf(cuda::std::move(input)); + Buffer buf(stream, cuda::std::move(input)); CUDAX_CHECK(buf.empty()); CUDAX_CHECK(input.empty()); } @@ -72,7 +72,7 @@ C2H_CCCLRT_TEST("cudax::async_buffer conversion", "[container][async_buffer]", t // ensure that we steal the data const auto* allocation = input.data(); - Buffer buf(cuda::std::move(input)); + Buffer buf(stream, cuda::std::move(input)); CUDAX_CHECK(buf.size() == 6); CUDAX_CHECK(buf.data() == allocation); CUDAX_CHECK(input.size() == 0); diff --git a/cudax/test/containers/async_buffer/copy.cu b/cudax/test/containers/async_buffer/copy.cu index 029c855bca7..e26dc2e3f9d 100644 --- a/cudax/test/containers/async_buffer/copy.cu +++ b/cudax/test/containers/async_buffer/copy.cu @@ -54,28 +54,28 @@ C2H_CCCLRT_TEST("cudax::async_buffer make_async_buffer", "[container][async_buff { { // empty input const Buffer input{stream, resource}; - const Buffer buf = cudax::make_async_buffer(input.stream(), input.memory_resource(), input); + const Buffer buf = cudax::make_async_buffer(input.stream().value(), input.memory_resource(), input); CUDAX_CHECK(buf.empty()); CUDAX_CHECK(buf.data() == nullptr); } { // non-empty input const Buffer input{stream, resource, {T(1), T(42), T(1337), T(0), T(12), T(-1)}}; - const Buffer buf = cudax::make_async_buffer(input.stream(), input.memory_resource(), input); + const Buffer buf = cudax::make_async_buffer(input.stream().value(), input.memory_resource(), input); CUDAX_CHECK(!buf.empty()); CUDAX_CHECK(equal_range(buf)); } { // empty input const Buffer input{stream, resource}; - const Buffer buf = cudax::make_async_buffer(input.stream(), input.memory_resource(), input); + const Buffer buf = cudax::make_async_buffer(input.stream().value(), input.memory_resource(), input); CUDAX_CHECK(buf.empty()); CUDAX_CHECK(buf.data() == nullptr); } { // non-empty input const Buffer input{stream, resource, {T(1), T(42), T(1337), T(0), T(12), T(-1)}}; - const Buffer buf = cudax::make_async_buffer(input.stream(), input.memory_resource(), input); + const Buffer buf = cudax::make_async_buffer(input.stream().value(), input.memory_resource(), input); CUDAX_CHECK(!buf.empty()); CUDAX_CHECK(equal_range(buf)); } @@ -164,8 +164,7 @@ C2H_CCCLRT_TEST("make_async_buffer variants", "[container][async_buffer]") {int(1), int(42), int(1337), int(0), int(12), int(-1)}}; // straight from a resource - auto buf = - cuda::experimental::make_async_buffer(input.stream(), cuda::device_default_memory_pool(cuda::device_ref{0}), input); + auto buf = cudax::make_async_buffer(stream, cuda::device_default_memory_pool(cuda::device_ref{0}), input); CUDAX_CHECK(equal_range(buf)); static_assert( ::cuda::mr::synchronous_resource_with); @@ -174,7 +173,7 @@ C2H_CCCLRT_TEST("make_async_buffer variants", "[container][async_buffer]") static_assert(!::cuda::mr::synchronous_resource_with); auto buf2 = cuda::experimental::make_async_buffer( - input.stream(), {cuda::device_default_memory_pool(cuda::device_ref{0})}, input); + stream, {cuda::device_default_memory_pool(cuda::device_ref{0})}, input); CUDAX_CHECK(equal_range(buf2)); static_assert( ::cuda::mr::synchronous_resource_with); @@ -185,7 +184,7 @@ C2H_CCCLRT_TEST("make_async_buffer variants", "[container][async_buffer]") // from any resource auto any_res = cuda::mr::any_resource( cuda::device_default_memory_pool(cuda::device_ref{0})); - auto buf3 = cudax::make_async_buffer(input.stream(), any_res, input); + auto buf3 = cudax::make_async_buffer(stream, any_res, input); CUDAX_CHECK(equal_range(buf3)); static_assert( ::cuda::mr::synchronous_resource_with); @@ -193,7 +192,7 @@ C2H_CCCLRT_TEST("make_async_buffer variants", "[container][async_buffer]") static_assert( !::cuda::mr::synchronous_resource_with); - auto buf4 = cudax::make_async_buffer(input.stream(), {any_res}, input); + auto buf4 = cudax::make_async_buffer(stream, {any_res}, input); CUDAX_CHECK(equal_range(buf4)); static_assert( ::cuda::mr::synchronous_resource_with); @@ -203,7 +202,7 @@ C2H_CCCLRT_TEST("make_async_buffer variants", "[container][async_buffer]") // from a resource reference auto res_ref = cuda::mr::resource_ref{any_res}; - auto buf5 = cudax::make_async_buffer(input.stream(), res_ref, input); + auto buf5 = cudax::make_async_buffer(stream, res_ref, input); CUDAX_CHECK(equal_range(buf5)); static_assert( ::cuda::mr::synchronous_resource_with); @@ -211,7 +210,7 @@ C2H_CCCLRT_TEST("make_async_buffer variants", "[container][async_buffer]") static_assert( !::cuda::mr::synchronous_resource_with); - auto buf6 = cudax::make_async_buffer(input.stream(), {res_ref}, input); + auto buf6 = cudax::make_async_buffer(stream, {res_ref}, input); CUDAX_CHECK(equal_range(buf6)); static_assert( ::cuda::mr::synchronous_resource_with); @@ -221,7 +220,7 @@ C2H_CCCLRT_TEST("make_async_buffer variants", "[container][async_buffer]") auto shared_res = cudax::make_shared_resource(cuda::device_default_memory_pool(cuda::device_ref{0})); - auto buf7 = cudax::make_async_buffer(input.stream(), shared_res, input); + auto buf7 = cudax::make_async_buffer(stream, shared_res, input); CUDAX_CHECK(equal_range(buf7)); static_assert( ::cuda::mr::synchronous_resource_with); @@ -229,7 +228,7 @@ C2H_CCCLRT_TEST("make_async_buffer variants", "[container][async_buffer]") static_assert( !::cuda::mr::synchronous_resource_with); - auto buf8 = cudax::make_async_buffer(input.stream(), {shared_res}, input); + auto buf8 = cudax::make_async_buffer(stream, {shared_res}, input); CUDAX_CHECK(equal_range(buf8)); static_assert( ::cuda::mr::synchronous_resource_with); diff --git a/cudax/test/containers/async_buffer/helper.h b/cudax/test/containers/async_buffer/helper.h index 71b99c41db5..8215002b36d 100644 --- a/cudax/test/containers/async_buffer/helper.h +++ b/cudax/test/containers/async_buffer/helper.h @@ -34,15 +34,17 @@ bool equal_range(const Buffer& buf) { if constexpr (!Buffer::properties_list::has_property(cuda::mr::device_accessible{})) { - buf.stream().sync(); + buf.stream().value().sync(); return cuda::std::equal(buf.begin(), buf.end(), cuda::std::begin(host_data), cuda::std::end(host_data)); } else { cuda::experimental::__ensure_current_device guard{cuda::device_ref{0}}; return buf.size() == cuda::std::size(device_data) - && thrust::equal( - thrust::cuda::par.on(buf.stream().get()), buf.begin(), buf.end(), cuda::get_device_address(device_data[0])); + && thrust::equal(thrust::cuda::par.on(buf.stream().value().get()), + buf.begin(), + buf.end(), + cuda::get_device_address(device_data[0])); } } @@ -111,7 +113,7 @@ bool equal_size_value(const Buffer& buf, const size_t size, const int value) { if constexpr (!Buffer::properties_list::has_property(cuda::mr::device_accessible{})) { - buf.stream().sync(); + buf.stream().value().sync(); return buf.size() == size && cuda::std::equal(buf.begin(), buf.end(), @@ -122,7 +124,7 @@ bool equal_size_value(const Buffer& buf, const size_t size, const int value) { cuda::experimental::__ensure_current_device guard{cuda::device_ref{0}}; return buf.size() == size - && thrust::equal(thrust::cuda::par.on(buf.stream().get()), + && thrust::equal(thrust::cuda::par.on(buf.stream().value().get()), buf.begin(), buf.end(), cuda::std::begin(device_data), @@ -136,14 +138,15 @@ bool equal_range(const Range1& range1, const Range2& range2) { if constexpr (!Range1::properties_list::has_property(cuda::mr::device_accessible{})) { - range1.stream().sync(); + range1.stream().value().sync(); return cuda::std::equal(range1.begin(), range1.end(), range2.begin(), range2.end()); } else { cuda::experimental::__ensure_current_device guard{cuda::device_ref{0}}; return range1.size() == range2.size() - && thrust::equal(thrust::cuda::par.on(range1.stream().get()), range1.begin(), range1.end(), range2.begin()); + && thrust::equal( + thrust::cuda::par.on(range1.stream().value().get()), range1.begin(), range1.end(), range2.begin()); } }