diff --git a/cudax/examples/simple_p2p.cu b/cudax/examples/simple_p2p.cu
index f1a5950dded..c92ad37ac62 100644
--- a/cudax/examples/simple_p2p.cu
+++ b/cudax/examples/simple_p2p.cu
@@ -30,10 +30,10 @@
  * Unified Virtual Address Space (UVA) features.
  */

+#include <cuda/algorithm>
 #include
 #include
-#include <cuda/experimental/algorithm.cuh>
 #include
 #include
 #include
@@ -91,11 +91,11 @@ void benchmark_cross_device_ping_pong_copy(
     // Ping-pong copy between GPUs
     if (i % 2 == 0)
     {
-      cudax::copy_bytes(dev1_stream, dev0_buffer, dev1_buffer);
+      cuda::copy_bytes(dev1_stream, dev0_buffer, dev1_buffer);
     }
     else
     {
-      cudax::copy_bytes(dev1_stream, dev1_buffer, dev0_buffer);
+      cuda::copy_bytes(dev1_stream, dev1_buffer, dev0_buffer);
     }
   }
@@ -126,7 +126,7 @@ void test_cross_device_access_from_kernel(
     return static_cast<float>((i++) % 4096);
   });

-  cudax::copy_bytes(dev0_stream, host_buffer, dev0_buffer);
+  cuda::copy_bytes(dev0_stream, host_buffer, dev0_buffer);
   dev1_stream.wait(dev0_stream);

   // Kernel launch configuration
@@ -151,7 +151,7 @@ void test_cross_device_access_from_kernel(
   // Copy data back to host and verify
   printf("Copy data back to host from GPU%d and verify results...\n", dev0.get());
-  cudax::copy_bytes(dev0_stream, dev0_buffer, host_buffer);
+  cuda::copy_bytes(dev0_stream, dev0_buffer, host_buffer);
   dev0_stream.sync();

   int error_count = 0;
diff --git a/cudax/include/cuda/experimental/__algorithm/common.cuh b/cudax/include/cuda/experimental/__algorithm/common.cuh
deleted file mode 100644
index e4f9461cf97..00000000000
--- a/cudax/include/cuda/experimental/__algorithm/common.cuh
+++ /dev/null
@@ -1,69 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of CUDA Experimental in CUDA C++ Core Libraries,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __CUDAX_ALGORITHM_COMMON
-#define __CUDAX_ALGORITHM_COMMON
-
-#include <cuda/__cccl_config>
-
-#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
-#  pragma GCC system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
-#  pragma clang system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
-#  pragma system_header
-#endif // no system header
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-
-#include <cuda/std/__cccl/prologue.h>
-
-namespace cuda::experimental
-{
-template <typename _Tp>
-using __as_span_t = ::cuda::std::span<::cuda::std::remove_reference_t<::cuda::std::ranges::range_reference_t<_Tp>>>;
-
-//! @brief A concept that checks if the type can be converted to a `cuda::std::span`.
-//! The type must be a contiguous range.
-template <typename _Tp>
-_CCCL_CONCEPT __spannable = _CCCL_REQUIRES_EXPR((_Tp))( //
-  requires(::cuda::std::ranges::contiguous_range<_Tp>), //
-  requires(::cuda::std::convertible_to<_Tp, __as_span_t<_Tp>>));
-
-template <typename _Tp>
-using __as_mdspan_t =
-  ::cuda::std::mdspan<typename ::cuda::std::decay_t<_Tp>::value_type,
-                      typename ::cuda::std::decay_t<_Tp>::extents_type,
-                      typename ::cuda::std::decay_t<_Tp>::layout_type,
-                      typename ::cuda::std::decay_t<_Tp>::accessor_type>;
-
-//! @brief A concept that checks if the type can be converted to a `cuda::std::mdspan`.
-//! The type must have a conversion to `__as_mdspan_t<_Tp>`.
-template <typename _Tp>
-_CCCL_CONCEPT __mdspannable =
-  _CCCL_REQUIRES_EXPR((_Tp))(requires(::cuda::std::convertible_to<_Tp, __as_mdspan_t<_Tp>>));
-
-template <typename _Tp>
-[[nodiscard]] _CCCL_HOST_API constexpr auto __as_mdspan(_Tp&& __value) noexcept -> __as_mdspan_t<_Tp>
-{
-  return ::cuda::std::forward<_Tp>(__value);
-}
-} // namespace cuda::experimental
-
-#include <cuda/std/__cccl/epilogue.h>
-
-#endif //__CUDAX_ALGORITHM_COMMON
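For reference, the `__spannable` concept deleted above survives in libcu++ with the same shape: it admits any contiguous range that converts to `cuda::std::span`. A minimal usage sketch of the relocated `cuda::copy_bytes` — the `<cuda/algorithm>` umbrella include is an assumption, since this diff only shows the detail headers:

```cpp
// Sketch: std::vector is a contiguous_range convertible to cuda::std::span,
// so it satisfies the __spannable constraint on cuda::copy_bytes.
// Pageable host memory is permitted for copies, but may make the call synchronous.
#include <cuda/algorithm> // assumed umbrella header
#include <cuda/stream_ref>

#include <vector>

void copy_sketch(cuda::stream_ref stream)
{
  std::vector<int> src(1024, 42);
  std::vector<int> dst(1024);
  cuda::copy_bytes(stream, src, dst); // bytewise copy of the underlying ranges
  // A std::list<int> would be rejected at compile time: not a contiguous_range.
}
```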
diff --git a/cudax/include/cuda/experimental/__algorithm/copy.cuh b/cudax/include/cuda/experimental/__algorithm/copy.cuh
deleted file mode 100644
index 1a235f26629..00000000000
--- a/cudax/include/cuda/experimental/__algorithm/copy.cuh
+++ /dev/null
@@ -1,94 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of CUDA Experimental in CUDA C++ Core Libraries,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __CUDAX_ALGORITHM_COPY
-#define __CUDAX_ALGORITHM_COPY
-
-#include <cuda/__cccl_config>
-
-#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
-#  pragma GCC system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
-#  pragma clang system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
-#  pragma system_header
-#endif // no system header
-
-#include
-#include
-#include
-#include
-
-#include
-
-#include <cuda/std/__cccl/prologue.h>
-
-namespace cuda::experimental
-{
-//! @brief Launches a bytewise memory copy from source to destination into the provided
-//! stream.
-//!
-//! Both source and destination needs to be or device_transform to a type that is a `contiguous_range` and converts to
-//! `cuda::std::span`. The element types of both the source and destination range is required to be trivially copyable.
-//!
-//! This call might be synchronous if either source or destination is pagable host memory.
-//! It will be synchronous if both destination and copy is located in host memory.
-//!
-//! @param __stream Stream that the copy should be inserted into
-//! @param __src Source to copy from
-//! @param __dst Destination to copy into
-//! @param __config Configuration for the copy
-_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
-_CCCL_REQUIRES(::cuda::__spannable<transformed_device_argument_t<_SrcTy>>
-                 _CCCL_AND ::cuda::__spannable<transformed_device_argument_t<_DstTy>>)
-_CCCL_HOST_API void
-copy_bytes(::cuda::stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst, copy_configuration __config = {})
-{
-  ::cuda::__detail::__copy_bytes_impl(
-    __stream,
-    ::cuda::std::span(device_transform(__stream, ::cuda::std::forward<_SrcTy>(__src))),
-    ::cuda::std::span(device_transform(__stream, ::cuda::std::forward<_DstTy>(__dst))),
-    __config);
-}
-
-//! @brief Launches a bytewise memory copy from source to destination into the provided
-//! stream.
-//!
-//! Both source and destination needs to be or device_transform to an instance of `cuda::std::mdspan`.
-//! They can also implicitly convert to `cuda::std::mdspan`, but the
-//! type needs to contain `mdspan` template arguments as member aliases named
-//! `value_type`, `extents_type`, `layout_type` and `accessor_type`. The resulting mdspan
-//! is required to be exhaustive. The element types of both the source and destination
-//! type are required to be trivially copyable.
-//!
-//! This call might be synchronous if either source or destination is pagable host memory.
-//! It will be synchronous if both destination and copy is located in host memory.
-//!
-//! @param __stream Stream that the copy should be inserted into
-//! @param __src Source to copy from
-//! @param __dst Destination to copy into
-//! @param __config Configuration for the copy
-_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
-_CCCL_REQUIRES(::cuda::__mdspannable<transformed_device_argument_t<_SrcTy>>
-                 _CCCL_AND ::cuda::__mdspannable<transformed_device_argument_t<_DstTy>>)
-_CCCL_HOST_API void
-copy_bytes(::cuda::stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst, copy_configuration __config = {})
-{
-  ::cuda::__detail::__copy_bytes_impl(
-    __stream,
-    ::cuda::__as_mdspan(device_transform(__stream, ::cuda::std::forward<_SrcTy>(__src))),
-    ::cuda::__as_mdspan(device_transform(__stream, ::cuda::std::forward<_DstTy>(__dst))),
-    __config);
-}
-} // namespace cuda::experimental
-
-#include <cuda/std/__cccl/epilogue.h>
-
-#endif // __CUDAX_ALGORITHM_COPY
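The mdspan overload's documented requirements (member aliases `value_type`, `extents_type`, `layout_type`, `accessor_type` plus an implicit conversion to the matching mdspan) are easiest to see on a concrete type. A hedged sketch — `my_matrix_view` is hypothetical, not a type from this diff:

```cpp
// Sketch of a user type accepted by the mdspan overload of cuda::copy_bytes:
// it exposes the four member aliases and converts to the matching mdspan.
#include <cuda/std/mdspan>

#include <cstddef>

struct my_matrix_view
{
  using value_type    = float;
  using extents_type  = cuda::std::dextents<std::size_t, 2>;
  using layout_type   = cuda::std::layout_right; // exhaustive layout
  using accessor_type = cuda::std::default_accessor<float>;

  float* data;
  extents_type extents;

  // Implicit conversion consumed by the library's __as_mdspan helper.
  operator cuda::std::mdspan<value_type, extents_type, layout_type, accessor_type>() const
  {
    return {data, extents};
  }
};
```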
diff --git a/cudax/include/cuda/experimental/__algorithm/fill.cuh b/cudax/include/cuda/experimental/__algorithm/fill.cuh
deleted file mode 100644
index 5490d21ccdc..00000000000
--- a/cudax/include/cuda/experimental/__algorithm/fill.cuh
+++ /dev/null
@@ -1,78 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of CUDA Experimental in CUDA C++ Core Libraries,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __CUDAX_ALGORITHM_FILL
-#define __CUDAX_ALGORITHM_FILL
-
-#include <cuda/__cccl_config>
-
-#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
-#  pragma GCC system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
-#  pragma clang system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
-#  pragma system_header
-#endif // no system header
-
-#include
-#include
-
-#include
-
-#include <cuda/std/__cccl/prologue.h>
-
-namespace cuda::experimental
-{
-//! @brief Launches an operation to bytewise fill the memory into the provided stream.
-//!
-//! The destination needs to either be a `contiguous_range` or transform into one. It can
-//! also implicitly convert to `cuda::std::span`, but it needs to contain a `value_type`
-//! member alias. The element type of the destination is required to be trivially
-//! copyable.
-//!
-//! The destination cannot reside in pagable host memory.
-//!
-//! @param __stream Stream that the copy should be inserted into
-//! @param __dst Destination memory to fill
-//! @param __value Value to fill into every byte in the destination
-_CCCL_TEMPLATE(typename _DstTy)
-_CCCL_REQUIRES(::cuda::__spannable<transformed_device_argument_t<_DstTy>>)
-_CCCL_HOST_API void fill_bytes(::cuda::stream_ref __stream, _DstTy&& __dst, ::cuda::std::uint8_t __value)
-{
-  ::cuda::__detail::__fill_bytes_impl(
-    __stream, ::cuda::std::span(device_transform(__stream, ::cuda::std::forward<_DstTy>(__dst))), __value);
-}
-
-//! @brief Launches an operation to bytewise fill the memory into the provided stream.
-//!
-//! Destination needs to either be an instance of `cuda::std::mdspan` or transform into
-//! one. It can also implicitly convert to `cuda::std::mdspan`, but the type needs to
-//! contain `mdspan` template arguments as member aliases named `value_type`,
-//! `extents_type`, `layout_type` and `accessor_type`. The resulting mdspan is required to
-//! be exhaustive. The element type of the destination is required to be trivially
-//! copyable.
-//!
-//! The destination cannot reside in pagable host memory.
-//!
-//! @param __stream Stream that the copy should be inserted into
-//! @param __dst Destination memory to fill
-//! @param __value Value to fill into every byte in the destination
-_CCCL_TEMPLATE(typename _DstTy)
-_CCCL_REQUIRES(::cuda::__mdspannable<transformed_device_argument_t<_DstTy>>)
-_CCCL_HOST_API void fill_bytes(::cuda::stream_ref __stream, _DstTy&& __dst, ::cuda::std::uint8_t __value)
-{
-  ::cuda::__detail::__fill_bytes_impl(
-    __stream, ::cuda::__as_mdspan(device_transform(__stream, ::cuda::std::forward<_DstTy>(__dst))), __value);
-}
-} // namespace cuda::experimental
-
-#include <cuda/std/__cccl/epilogue.h>
-
-#endif // __CUDAX_ALGORITHM_FILL
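Unlike `copy_bytes`, the fill destination must not be pageable host memory. A minimal usage sketch of the relocated `cuda::fill_bytes` under that assumption (the `<cuda/algorithm>` umbrella include is again assumed):

```cpp
// Sketch: fill_bytes writes the given byte into every byte of the destination.
// The span is assumed to view device (or pinned host) memory; a pageable
// std::vector would violate the documented precondition.
#include <cuda/algorithm> // assumed umbrella header
#include <cuda/std/span>
#include <cuda/stream_ref>

void fill_sketch(cuda::stream_ref stream, cuda::std::span<int> device_ints)
{
  cuda::fill_bytes(stream, device_ints, 0xFF); // every int becomes 0xFFFFFFFF
}
```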
diff --git a/cudax/include/cuda/experimental/__launch/launch.cuh b/cudax/include/cuda/experimental/__launch/launch.cuh
index c9573103942..18a51b07053 100644
--- a/cudax/include/cuda/experimental/__launch/launch.cuh
+++ b/cudax/include/cuda/experimental/__launch/launch.cuh
@@ -22,6 +22,7 @@
 #endif // no system header

 #include
+#include <cuda/__stream/device_transform.h>
 #include
 #include
 #include
@@ -38,7 +39,6 @@
 #include
 #include
 #include
-#include <cuda/experimental/__stream/device_transform.cuh>
 #include
 #include
diff --git a/cudax/include/cuda/experimental/algorithm.cuh b/cudax/include/cuda/experimental/algorithm.cuh
deleted file mode 100644
index 4c91213237c..00000000000
--- a/cudax/include/cuda/experimental/algorithm.cuh
+++ /dev/null
@@ -1,17 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of CUDA Experimental in CUDA C++ Core Libraries,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __CUDAX_ALGORITHM__
-#define __CUDAX_ALGORITHM__
-
-#include <cuda/experimental/__algorithm/copy.cuh>
-#include <cuda/experimental/__algorithm/fill.cuh>
-
-#endif // __CUDAX_ALGORITHM__
diff --git a/cudax/include/cuda/experimental/launch.cuh b/cudax/include/cuda/experimental/launch.cuh
index 158d99f6384..fe74b58727f 100644
--- a/cudax/include/cuda/experimental/launch.cuh
+++ b/cudax/include/cuda/experimental/launch.cuh
@@ -11,10 +11,11 @@
 #ifndef __CUDAX_LAUNCH___
 #define __CUDAX_LAUNCH___

+#include <cuda/__stream/device_transform.h>
+
 #include
 #include
 #include
 #include
-#include <cuda/experimental/__stream/device_transform.cuh>

 #endif // __CUDAX_LAUNCH___
diff --git a/cudax/test/algorithm/common.cuh b/cudax/test/algorithm/common.cuh
index 17d2b8c6424..b114dadeb7a 100644
--- a/cudax/test/algorithm/common.cuh
+++ b/cudax/test/algorithm/common.cuh
@@ -11,10 +11,10 @@
 #ifndef __ALGORITHM_COMMON__
 #define __ALGORITHM_COMMON__

+#include <cuda/algorithm>
 #include
 #include
-#include <cuda/experimental/algorithm.cuh>

 #include
 #include
diff --git a/cudax/test/algorithm/copy.cu b/cudax/test/algorithm/copy.cu
index 501b2ab1794..6fa26da8158 100644
--- a/cudax/test/algorithm/copy.cu
+++ b/cudax/test/algorithm/copy.cu
@@ -21,25 +21,25 @@ C2H_TEST("1d Copy", "[data_manipulation]")
   {
     cuda::__uninitialized_async_buffer buffer(device_resource, _stream, buffer_size);
-    cudax::fill_bytes(_stream, buffer, fill_byte);
+    cuda::fill_bytes(_stream, buffer, fill_byte);

-    cudax::copy_bytes(_stream, buffer, host_vector);
+    cuda::copy_bytes(_stream, buffer, host_vector);
     check_result_and_erase(_stream, host_vector);

-    cudax::copy_bytes(_stream, std::move(buffer), host_vector);
+    cuda::copy_bytes(_stream, std::move(buffer), host_vector);
     check_result_and_erase(_stream, host_vector);
   }
   {
     cuda::__uninitialized_async_buffer not_yet_const_buffer(
       device_resource, _stream, buffer_size);
-    cudax::fill_bytes(_stream, not_yet_const_buffer, fill_byte);
+    cuda::fill_bytes(_stream, not_yet_const_buffer, fill_byte);
     const auto& const_buffer = not_yet_const_buffer;

-    cudax::copy_bytes(_stream, const_buffer, host_vector);
+    cuda::copy_bytes(_stream, const_buffer, host_vector);
     check_result_and_erase(_stream, host_vector);

-    cudax::copy_bytes(_stream, const_buffer, cuda::std::span(host_vector));
+    cuda::copy_bytes(_stream, const_buffer, cuda::std::span(host_vector));
     check_result_and_erase(_stream, host_vector);
   }
 }
@@ -53,26 +53,26 @@ C2H_TEST("1d Copy", "[data_manipulation]")
     cudax::uninitialized_buffer host_buffer(host_resource, buffer_size);
     cudax::uninitialized_buffer device_buffer(managed_resource, buffer_size);

-    cudax::fill_bytes(_stream, host_buffer, fill_byte);
+    cuda::fill_bytes(_stream, host_buffer, fill_byte);

-    cudax::copy_bytes(_stream, host_buffer, device_buffer);
+    cuda::copy_bytes(_stream, host_buffer, device_buffer);
     check_result_and_erase(_stream, device_buffer);

-    cudax::copy_bytes(_stream, cuda::std::span(host_buffer), device_buffer);
+    cuda::copy_bytes(_stream, cuda::std::span(host_buffer), device_buffer);
     check_result_and_erase(_stream, device_buffer);
   }
   {
     cudax::uninitialized_buffer not_yet_const_host_buffer(host_resource, buffer_size);
     cudax::uninitialized_buffer device_buffer(managed_resource, buffer_size);
-    cudax::fill_bytes(_stream, not_yet_const_host_buffer, fill_byte);
+    cuda::fill_bytes(_stream, not_yet_const_host_buffer, fill_byte);
     const auto& const_host_buffer = not_yet_const_host_buffer;

-    cudax::copy_bytes(_stream, const_host_buffer, device_buffer);
+    cuda::copy_bytes(_stream, const_host_buffer, device_buffer);
     check_result_and_erase(_stream, device_buffer);

-    cudax::copy_bytes(_stream, cuda::std::span(const_host_buffer), device_buffer);
+    cuda::copy_bytes(_stream, cuda::std::span(const_host_buffer), device_buffer);
     check_result_and_erase(_stream, device_buffer);
   }
 }
@@ -84,7 +84,7 @@ C2H_TEST("1d Copy", "[data_manipulation]")

   memset(input.data, fill_byte, input.size * sizeof(int));

-  cudax::copy_bytes(_stream, input, output);
+  cuda::copy_bytes(_stream, input, output);
   check_result_and_erase(_stream, cuda::std::span(output.data, output.size));
 }

@@ -92,11 +92,11 @@ C2H_TEST("1d Copy", "[data_manipulation]")
 {
   cuda::legacy_pinned_memory_resource host_resource;
   cudax::uninitialized_buffer host_buffer(host_resource, 1);
-  cudax::fill_bytes(_stream, host_buffer, fill_byte);
+  cuda::fill_bytes(_stream, host_buffer, fill_byte);

   ::std::vector vec(buffer_size, 0xbeef);

-  cudax::copy_bytes(_stream, host_buffer, vec);
+  cuda::copy_bytes(_stream, host_buffer, vec);
   _stream.sync();

   CUDAX_REQUIRE(vec[0] == get_expected_value(fill_byte));
@@ -122,7 +122,7 @@ void test_mdspan_copy_bytes(
     src(0, i) = i;
   }

-  cudax::copy_bytes(stream, std::move(src), dst);
+  cuda::copy_bytes(stream, std::move(src), dst);
   stream.sync();

   for (int i = 0; i < static_cast<int>(dst.extent(1)); i++)
@@ -163,7 +163,7 @@ C2H_TEST("Mdspan copy", "[data_manipulation]")
     cudax::weird_buffer> buffer{
       host_resource, mdspan.mapping().required_span_size()};

-    cudax::copy_bytes(stream, mdspan, buffer);
+    cuda::copy_bytes(stream, mdspan, buffer);
     stream.sync();
     CUDAX_REQUIRE(!memcmp(mdspan_buffer.data(), buffer.data, mdspan_buffer.size()));
   }
@@ -177,7 +177,7 @@ C2H_TEST("Non exhaustive mdspan copy_bytes", "[data_manipulation]")

   try
   {
-    cudax::copy_bytes(stream, fake_strided_mdspan, fake_strided_mdspan);
+    cuda::copy_bytes(stream, fake_strided_mdspan, fake_strided_mdspan);
   }
   catch (const ::std::invalid_argument& e)
   {
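The last test above exercises a run-time precondition worth spelling out: a bytewise copy needs one contiguous allocation, so a non-exhaustive (e.g. strided) mdspan is rejected with `std::invalid_argument`. A condensed sketch of that pattern — the layout and extents are illustrative, not taken from the test:

```cpp
// Sketch: a layout_stride mdspan that skips elements is not exhaustive,
// so cuda::copy_bytes reports std::invalid_argument instead of copying.
#include <cuda/algorithm> // assumed umbrella header
#include <cuda/std/array>
#include <cuda/std/mdspan>
#include <cuda/stream_ref>

#include <cstddef>
#include <stdexcept>

void strided_copy_is_rejected(cuda::stream_ref stream, int* data)
{
  using ext_t = cuda::std::extents<std::size_t, 4>;
  cuda::std::layout_stride::mapping<ext_t> map{ext_t{}, cuda::std::array<std::size_t, 1>{2}};
  cuda::std::mdspan<int, ext_t, cuda::std::layout_stride> strided{data, map};
  try
  {
    cuda::copy_bytes(stream, strided, strided);
  }
  catch (const std::invalid_argument&)
  {
    // expected: the mapping is not exhaustive
  }
}
```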
diff --git a/cudax/test/algorithm/fill.cu b/cudax/test/algorithm/fill.cu
index 10dae14209c..c470f988770 100644
--- a/cudax/test/algorithm/fill.cu
+++ b/cudax/test/algorithm/fill.cu
@@ -18,7 +18,7 @@ C2H_TEST("Fill", "[data_manipulation]")
     cuda::legacy_pinned_memory_resource host_resource;
     cudax::uninitialized_buffer buffer(host_resource, buffer_size);

-    cudax::fill_bytes(_stream, buffer, fill_byte);
+    cuda::fill_bytes(_stream, buffer, fill_byte);

     check_result_and_erase(_stream, cuda::std::span(buffer));
   }
@@ -27,7 +27,7 @@ C2H_TEST("Fill", "[data_manipulation]")
   {
     cuda::device_memory_pool_ref device_resource = cuda::device_default_memory_pool(cuda::device_ref{0});
     cudax::uninitialized_buffer buffer(device_resource, buffer_size);
-    cudax::fill_bytes(_stream, buffer, fill_byte);
+    cuda::fill_bytes(_stream, buffer, fill_byte);

     std::vector host_vector(42);
     CUDART(cudaMemcpyAsync(
@@ -40,7 +40,7 @@ C2H_TEST("Fill", "[data_manipulation]")
     cuda::legacy_pinned_memory_resource host_resource;
     cudax::weird_buffer buffer(host_resource, buffer_size);

-    cudax::fill_bytes(_stream, buffer, fill_byte);
+    cuda::fill_bytes(_stream, buffer, fill_byte);

     check_result_and_erase(_stream, cuda::std::span(buffer.data, buffer.size));
   }
 }
@@ -53,7 +53,7 @@ C2H_TEST("Mdspan Fill", "[data_manipulation]")
     auto buffer = make_buffer_for_mdspan(dynamic_extents, 0);
     cuda::std::mdspan dynamic_mdspan(buffer.data(), dynamic_extents);

-    cudax::fill_bytes(stream, dynamic_mdspan, fill_byte);
+    cuda::fill_bytes(stream, dynamic_mdspan, fill_byte);
     check_result_and_erase(stream, cuda::std::span(buffer.data(), buffer.size()));
   }
   {
@@ -61,7 +61,7 @@ C2H_TEST("Mdspan Fill", "[data_manipulation]")
     auto buffer = make_buffer_for_mdspan(mixed_extents, 0);
     cuda::std::mdspan mixed_mdspan(buffer.data(), mixed_extents);

-    cudax::fill_bytes(stream, cuda::std::move(mixed_mdspan), fill_byte);
+    cuda::fill_bytes(stream, cuda::std::move(mixed_mdspan), fill_byte);
     check_result_and_erase(stream, cuda::std::span(buffer.data(), buffer.size()));
   }
   {
@@ -70,7 +70,7 @@ C2H_TEST("Mdspan Fill", "[data_manipulation]")
     auto size = cuda::std::layout_left::mapping().required_span_size();
     cudax::weird_buffer> buffer(host_resource, size);

-    cudax::fill_bytes(stream, buffer, fill_byte);
+    cuda::fill_bytes(stream, buffer, fill_byte);
     check_result_and_erase(stream, cuda::std::span(buffer.data, buffer.size));
   }
 }
@@ -83,7 +83,7 @@ C2H_TEST("Non exhaustive mdspan fill_bytes", "[data_manipulation]")

   try
   {
-    cudax::fill_bytes(stream, fake_strided_mdspan, fill_byte);
+    cuda::fill_bytes(stream, fake_strided_mdspan, fill_byte);
   }
   catch (const ::std::invalid_argument& e)
   {
diff --git a/cudax/test/containers/async_buffer/transform.cu b/cudax/test/containers/async_buffer/transform.cu
index c045beb446b..e7975d9f85b 100644
--- a/cudax/test/containers/async_buffer/transform.cu
+++ b/cudax/test/containers/async_buffer/transform.cu
@@ -13,12 +13,12 @@
 #include
 #include
+#include <cuda/algorithm>

 #include
 #include
 #include
 #include
-#include <cuda/experimental/algorithm.cuh>

 #include
 #include
@@ -90,7 +90,7 @@ C2H_CCCLRT_TEST("cudax::buffer launch transform", "[container][buffer]")
   cudax::launch(stream, cudax::make_config(cudax::grid_dims<1>, cudax::block_dims<32>), add_kernel{}, a, b);

   std::vector host_result(a.size());
-  cudax::copy_bytes(stream, a, host_result);
+  cuda::copy_bytes(stream, a, host_result);

   stream.sync();
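The launch test below exercises the extension point that ties these pieces together: an ADL-discovered `transform_device_argument(stream, arg)` overload lets a type substitute something else for itself when passed to `cudax::launch` or the `*_bytes` algorithms. A hedged sketch with invented names — `my_tracked_buffer` and `transform_device_argument`'s exact return conventions here are illustrative, not taken from the diff:

```cpp
// Sketch (hypothetical types): device_transform finds this overload by ADL
// and passes the returned view to the kernel / algorithm in place of the
// owning wrapper. If the result additionally had a .transformed_argument()
// member, that result would be forwarded instead (see the device_transform
// implementation further below).
#include <cuda/std/span>
#include <cuda/stream_ref>

#include <cstddef>

struct my_tracked_buffer
{
  int* device_ptr;
  std::size_t count;
};

cuda::std::span<int> transform_device_argument(cuda::stream_ref, const my_tracked_buffer& b)
{
  return {b.device_ptr, b.count};
}
```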
diff --git a/cudax/test/launch/launch_smoke.cu b/cudax/test/launch/launch_smoke.cu
index a0e78cce140..1920b3d27ad 100644
--- a/cudax/test/launch/launch_smoke.cu
+++ b/cudax/test/launch/launch_smoke.cu
@@ -130,7 +130,7 @@ struct launch_transform_to_int_convertible

     // Immovable to ensure that device_transform doesn't copy the returned
     // object
-    int_convertible(int_convertible&&) = delete;
+    int_convertible(int_convertible&&) noexcept = delete;

     ~int_convertible() noexcept
     {
diff --git a/libcudacxx/include/cuda/__algorithm/copy.h b/libcudacxx/include/cuda/__algorithm/copy.h
index 8cb81c9acaf..4ee8133c343 100644
--- a/libcudacxx/include/cuda/__algorithm/copy.h
+++ b/libcudacxx/include/cuda/__algorithm/copy.h
@@ -24,6 +24,7 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 #  include
+#  include <cuda/__stream/device_transform.h>
 #  include
 #  include
 #  include
@@ -149,13 +150,14 @@
 //! @param __dst Destination to copy into
 //! @param __config Configuration for the copy
 _CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
-_CCCL_REQUIRES(__spannable<_SrcTy> _CCCL_AND __spannable<_DstTy>)
+_CCCL_REQUIRES(
+  __spannable<transformed_device_argument_t<_SrcTy>> _CCCL_AND __spannable<transformed_device_argument_t<_DstTy>>)
 _CCCL_HOST_API void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst, copy_configuration __config = {})
 {
   ::cuda::__detail::__copy_bytes_impl(
     __stream,
-    ::cuda::std::span(::cuda::std::forward<_SrcTy>(__src)),
-    ::cuda::std::span(::cuda::std::forward<_DstTy>(__dst)),
+    ::cuda::std::span(device_transform(__stream, ::cuda::std::forward<_SrcTy>(__src))),
+    ::cuda::std::span(device_transform(__stream, ::cuda::std::forward<_DstTy>(__dst))),
     __config);
 }

@@ -177,13 +179,14 @@
 //! @param __dst Destination to copy into
 //! @param __config Configuration for the copy
 _CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
-_CCCL_REQUIRES(__mdspannable<_SrcTy> _CCCL_AND __mdspannable<_DstTy>)
+_CCCL_REQUIRES(
+  __mdspannable<transformed_device_argument_t<_SrcTy>> _CCCL_AND __mdspannable<transformed_device_argument_t<_DstTy>>)
 _CCCL_HOST_API void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst, copy_configuration __config = {})
 {
   ::cuda::__detail::__copy_bytes_impl(
     __stream,
-    ::cuda::__as_mdspan(::cuda::std::forward<_SrcTy>(__src)),
-    ::cuda::__as_mdspan(::cuda::std::forward<_DstTy>(__dst)),
+    ::cuda::__as_mdspan(device_transform(__stream, ::cuda::std::forward<_SrcTy>(__src))),
+    ::cuda::__as_mdspan(device_transform(__stream, ::cuda::std::forward<_DstTy>(__dst))),
     __config);
 }
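The substantive change here is that the constraints are now phrased over `transformed_device_argument_t`: an argument no longer needs to be spannable itself, only its `device_transform` result does. Reusing the hypothetical `my_tracked_buffer` from the sketch above:

```cpp
// Sketch: my_tracked_buffer is not a contiguous_range, but its
// transform_device_argument result (a cuda::std::span<int>) is spannable,
// so the relaxed constraint on cuda::copy_bytes now admits it directly.
#include <cuda/algorithm> // assumed umbrella header

#include <vector>

void copy_from_wrapper(cuda::stream_ref stream, const my_tracked_buffer& buf)
{
  std::vector<int> host(buf.count);
  cuda::copy_bytes(stream, buf, host); // wrapper is transformed to a span internally
  // synchronize the stream before reading `host`
}
```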
diff --git a/libcudacxx/include/cuda/__algorithm/fill.h b/libcudacxx/include/cuda/__algorithm/fill.h
index e5985a3556d..9db6d53c91b 100644
--- a/libcudacxx/include/cuda/__algorithm/fill.h
+++ b/libcudacxx/include/cuda/__algorithm/fill.h
@@ -24,6 +24,7 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 #  include
+#  include <cuda/__stream/device_transform.h>
 #  include
 #  include
@@ -62,7 +63,7 @@ _CCCL_HOST_API void __fill_bytes_impl(stream_ref __stream,
 //! @brief Launches an operation to bytewise fill the memory into the provided stream.
 //!
-//! The destination needs to be a `contiguous_range` and convert to `cuda::std::span`.
+//! The destination needs to be, or be transformable via `device_transform` into, a `contiguous_range` that converts to `cuda::std::span`.
 //! The element type of the destination is required to be trivially copyable.
 //!
 //! The destination cannot reside in pagable host memory.
@@ -71,15 +72,16 @@ _CCCL_HOST_API void __fill_bytes_impl(stream_ref __stream,
 //! @param __dst Destination memory to fill
 //! @param __value Value to fill into every byte in the destination
 _CCCL_TEMPLATE(typename _DstTy)
-_CCCL_REQUIRES(__spannable<_DstTy>)
+_CCCL_REQUIRES(__spannable<transformed_device_argument_t<_DstTy>>)
 _CCCL_HOST_API void fill_bytes(stream_ref __stream, _DstTy&& __dst, ::cuda::std::uint8_t __value)
 {
-  ::cuda::__detail::__fill_bytes_impl(__stream, ::cuda::std::span(::cuda::std::forward<_DstTy>(__dst)), __value);
+  ::cuda::__detail::__fill_bytes_impl(
+    __stream, ::cuda::std::span(device_transform(__stream, ::cuda::std::forward<_DstTy>(__dst))), __value);
 }

 //! @brief Launches an operation to bytewise fill the memory into the provided stream.
 //!
-//! Destination needs to be an instance of `cuda::std::mdspan`.
+//! The destination needs to be, or be transformable via `device_transform` into, an instance of `cuda::std::mdspan`.
 //! It can also convert to `cuda::std::mdspan`, but the type needs to
 //! contain `mdspan` template arguments as member aliases named `value_type`,
 //! `extents_type`, `layout_type` and `accessor_type`. The resulting mdspan is required to
@@ -92,10 +94,11 @@ _CCCL_HOST_API void fill_bytes(stream_ref __stream, _DstTy&& __dst, ::cuda::std:
 //! @param __dst Destination memory to fill
 //! @param __value Value to fill into every byte in the destination
 _CCCL_TEMPLATE(typename _DstTy)
-_CCCL_REQUIRES(__mdspannable<_DstTy>)
+_CCCL_REQUIRES(__mdspannable<transformed_device_argument_t<_DstTy>>)
 _CCCL_HOST_API void fill_bytes(stream_ref __stream, _DstTy&& __dst, ::cuda::std::uint8_t __value)
 {
-  ::cuda::__detail::__fill_bytes_impl(__stream, __as_mdspan(::cuda::std::forward<_DstTy>(__dst)), __value);
+  ::cuda::__detail::__fill_bytes_impl(
+    __stream, __as_mdspan(device_transform(__stream, ::cuda::std::forward<_DstTy>(__dst))), __value);
 }

 _CCCL_END_NAMESPACE_CUDA
diff --git a/cudax/include/cuda/experimental/__stream/device_transform.cuh b/libcudacxx/include/cuda/__stream/device_transform.h
similarity index 79%
rename from cudax/include/cuda/experimental/__stream/device_transform.cuh
rename to libcudacxx/include/cuda/__stream/device_transform.h
index c441edda88a..f7136646149 100644
--- a/cudax/include/cuda/experimental/__stream/device_transform.cuh
+++ b/libcudacxx/include/cuda/__stream/device_transform.h
@@ -1,6 +1,6 @@
 //===----------------------------------------------------------------------===//
 //
-// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// Part of libcu++, the C++ Standard Library for your entire system,
 // under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
@@ -8,8 +8,8 @@
 //
 //===----------------------------------------------------------------------===//

-#ifndef _CUDAX__STREAM_DEVICE_TRANSFORM_CUH
-#define _CUDAX__STREAM_DEVICE_TRANSFORM_CUH
+#ifndef _CUDA__STREAM_DEVICE_TRANSFORM_H
+#define _CUDA__STREAM_DEVICE_TRANSFORM_H

 #include

@@ -22,9 +22,11 @@
 #endif // no system header

 #include
+#include <cuda/__type_traits/is_instantiable_with.h>
 #include
 #include
 #include
+#include <cuda/std/optional>
 #include
 #include
 #include
@@ -32,19 +34,15 @@
 #include
 #include
-#include
-#include
-#include
-
 #include

-namespace cuda::experimental
-{
+_CCCL_BEGIN_NAMESPACE_CUDA
+
 namespace __detail
 {
 // This function turns rvalues into prvalues and leaves lvalues as is.
 template <typename _Tp>
-_CCCL_API constexpr auto __ixnay_xvalue(_Tp&& __value) noexcept(__nothrow_movable<_Tp>) -> _Tp
+_CCCL_API constexpr auto __ixnay_xvalue(_Tp&& __value) noexcept(::cuda::std::is_nothrow_move_constructible_v<_Tp>)
+  -> _Tp
 {
   return ::cuda::std::forward<_Tp>(__value);
 }
@@ -85,22 +83,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __device_transform_t
   using __transformed_argument_t =
     __remove_rvalue_reference_t<decltype(::cuda::std::declval<_Tp>().transformed_argument())>;

-#if _CCCL_COMPILER(MSVC)
-  // MSVC has a bug where it rejects the use of an immovable type in a defaulted function
-  // argument.
-  // https://developercommunity.visualstudio.com/t/rejects-valid-cannot-use-an-immovable/10932844
-  template <typename _Arg>
-  struct __optional : execution::__variant<_Arg>
-  {
-    __optional() = default;
-    __optional(__optional&&); // declared but not defined
-  };
-#else
-  template <typename _Arg>
-  using __optional = execution::__variant<_Arg>;
-#endif
-
-  // The use of `__variant` here is to move the destruction of the object returned from
+  // The use of `optional` here is to move the destruction of the object returned from
   // transform_device_argument into the caller's stack frame. Objects created for default arguments
   // are located in the caller's stack frame. This is so that a use of `device_transform`
   // such as:
@@ -120,24 +103,44 @@
   // auto operator()(::cuda::stream_ref stream, Arg&& arg, auto&& action = transform_device_argument(arg))
   //
   // but sadly that is not valid C++.
+  // TODO move to use __variant type once cuda/experimental/execution/__variant is moved to libcudacxx
+  // NOTE: The above seems to only apply if the type is not trivially destructible. To use the optional here I had to
+  // add a destructor.
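The comment above carries the core design reasoning, so a standalone illustration may help. This is an editor's sketch in plain C++, not code from the diff. One caveat the comment glosses over: [expr.call] makes it implementation-defined whether a parameter's lifetime ends at function return or at the end of the enclosing full-expression; the trick relies on toolchains where the latter holds (e.g., Itanium-ABI compilers).

```cpp
// Editor's sketch: why default-argument storage outlives the call.
// The parameter object for a default argument is materialized in the
// caller's frame; Itanium-ABI compilers destroy it at the end of the
// enclosing full-expression, which is the lifetime device_transform's
// __storage parameter relies on.
#include <cstdio>

struct noisy
{
  ~noisy()
  {
    std::puts("storage destroyed");
  }
};

int use(noisy = {}) // storage lives in the caller's frame
{
  return 42;
}

int main()
{
  std::printf("use() -> %d\n", use());
  // With Itanium-ABI compilers, "storage destroyed" prints after the
  // printf: the default argument survived the whole full-expression.
}
```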
+  template <typename _Tp>
+  struct __optional_with_a_destructor : ::cuda::std::optional<_Tp>
+  {
+    using ::cuda::std::optional<_Tp>::optional;
+    ~__optional_with_a_destructor() {}
+
+    template <typename _Fn>
+    _CCCL_API inline _CCCL_CONSTEXPR_CXX20 _Tp& __emplace_from_fn(_Fn&& __fn)
+    {
+      _CCCL_ASSERT(!this->has_value(), "__construct called for engaged __optional_storage");
+      new (::cuda::std::addressof(this->__get())) _Tp(::cuda::std::invoke(::cuda::std::forward<_Fn>(__fn)));
+      this->__set_engaged(true);
+      return this->__get();
+    }
+  };
+
   _CCCL_TEMPLATE(typename _Stream, typename _Arg)
   _CCCL_REQUIRES(::cuda::std::convertible_to<_Stream, ::cuda::stream_ref> _CCCL_AND(
     !::cuda::std::is_reference_v<__transform_result_t<_Arg>>))
-  [[nodiscard]] _CCCL_HOST_API auto
-  operator()(_Stream&& __stream, _Arg&& __arg, __optional<__transform_result_t<_Arg>> __storage = {}) const
-    -> decltype(auto)
+  [[nodiscard]] _CCCL_HOST_API auto operator()(
+    _Stream&& __stream,
+    _Arg&& __arg,
+    __optional_with_a_destructor<__transform_result_t<_Arg>> __storage = cuda::std::nullopt) const -> decltype(auto)
   {
     // Calls to transform_device_argument are intentionally unqualified so as to use ADL.
     if constexpr (__is_instantiable_with<__transformed_argument_t, __transform_result_t<_Arg>>)
     {
-      return _CCCL_MOVE(__storage.__emplace_from([&] {
+      return _CCCL_MOVE(__storage.__emplace_from_fn([&]() {
               return transform_device_argument(__stream, ::cuda::std::forward<_Arg>(__arg));
             }))
        .transformed_argument();
     }
     else
     {
-      return _CCCL_MOVE(__storage.__emplace_from([&] {
+      return _CCCL_MOVE(__storage.__emplace_from_fn([&]() {
        return transform_device_argument(__stream, ::cuda::std::forward<_Arg>(__arg));
      }));
     }
   }
@@ -182,8 +185,9 @@
 _CCCL_GLOBAL_CONSTANT auto device_transform = __tfx::__device_transform_t{};

 template <typename _Arg>
 using transformed_device_argument_t _CCCL_NODEBUG_ALIAS =
   __remove_rvalue_reference_t<::cuda::std::__call_result_t<__tfx::__device_transform_t, ::cuda::stream_ref, _Arg>>;
-} // namespace cuda::experimental
+
+_CCCL_END_NAMESPACE_CUDA

 #include

-#endif // !_CUDAX__STREAM_DEVICE_TRANSFORM_CUH
+#endif // _CUDA__STREAM_DEVICE_TRANSFORM_H
diff --git a/libcudacxx/include/cuda/__stream/internal_streams.h b/libcudacxx/include/cuda/__stream/internal_streams.h
index 94ea5132206..18c984e3b25 100644
--- a/libcudacxx/include/cuda/__stream/internal_streams.h
+++ b/libcudacxx/include/cuda/__stream/internal_streams.h
@@ -1,6 +1,6 @@
 //===----------------------------------------------------------------------===//
 //
-// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// Part of libcu++, the C++ Standard Library for your entire system,
 // under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcudacxx/include/cuda/__type_traits/is_instantiable_with.h b/libcudacxx/include/cuda/__type_traits/is_instantiable_with.h
new file mode 100644
index 00000000000..957aa264eba
--- /dev/null
+++ b/libcudacxx/include/cuda/__type_traits/is_instantiable_with.h
@@ -0,0 +1,47 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDA__TYPE_TRAITS_IS_INSTANTIABLE_WITH_H
+#define __CUDA__TYPE_TRAITS_IS_INSTANTIABLE_WITH_H
+
+#include
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include
+#include
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA
+
+#if _CCCL_HAS_CONCEPTS()
+
+template
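The diff is cut off inside the new trait's definition, but its use in `device_transform` shows the intent: `__is_instantiable_with<_AliasTemplate, _Args...>` reports whether the alias template can be instantiated with those arguments — here, detecting whether the transform result has a `.transformed_argument()` member. One conventional way to build such a detector under C++20 concepts; a sketch of the idea, not the file's actual contents:

```cpp
// Sketch of the detection idiom the trait name implies (hypothetical code).
template <template <class...> class _Alias, class... _Args>
inline constexpr bool is_instantiable_with_v = requires { typename _Alias<_Args...>; };

// With device_transform's alias
//   template <class _Tp>
//   using __transformed_argument_t = decltype(declval<_Tp>().transformed_argument());
// the trait is true exactly when the transform result exposes
// .transformed_argument(), selecting the first if-constexpr branch above.
```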