diff --git a/cudax/examples/simple_p2p.cu b/cudax/examples/simple_p2p.cu
index f1a5950dded..c92ad37ac62 100644
--- a/cudax/examples/simple_p2p.cu
+++ b/cudax/examples/simple_p2p.cu
@@ -30,10 +30,10 @@
  * Unified Virtual Address Space (UVA) features.
  */

+#include <cuda/algorithm>
 #include
 #include
-#include <cuda/experimental/algorithm.cuh>
 #include
 #include
 #include
@@ -91,11 +91,11 @@ void benchmark_cross_device_ping_pong_copy(
     // Ping-pong copy between GPUs
     if (i % 2 == 0)
     {
-      cudax::copy_bytes(dev1_stream, dev0_buffer, dev1_buffer);
+      cuda::copy_bytes(dev1_stream, dev0_buffer, dev1_buffer);
     }
     else
     {
-      cudax::copy_bytes(dev1_stream, dev1_buffer, dev0_buffer);
+      cuda::copy_bytes(dev1_stream, dev1_buffer, dev0_buffer);
     }
   }
@@ -126,7 +126,7 @@ void test_cross_device_access_from_kernel(
     return static_cast<float>((i++) % 4096);
   });

-  cudax::copy_bytes(dev0_stream, host_buffer, dev0_buffer);
+  cuda::copy_bytes(dev0_stream, host_buffer, dev0_buffer);
   dev1_stream.wait(dev0_stream);

   // Kernel launch configuration
@@ -151,7 +151,7 @@ void test_cross_device_access_from_kernel(
   // Copy data back to host and verify
   printf("Copy data back to host from GPU%d and verify results...\n", dev0.get());
-  cudax::copy_bytes(dev0_stream, dev0_buffer, host_buffer);
+  cuda::copy_bytes(dev0_stream, dev0_buffer, host_buffer);
   dev0_stream.sync();

   int error_count = 0;
diff --git a/cudax/include/cuda/experimental/__algorithm/common.cuh b/cudax/include/cuda/experimental/__algorithm/common.cuh
deleted file mode 100644
index e4f9461cf97..00000000000
--- a/cudax/include/cuda/experimental/__algorithm/common.cuh
+++ /dev/null
@@ -1,69 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of CUDA Experimental in CUDA C++ Core Libraries,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __CUDAX_ALGORITHM_COMMON
-#define __CUDAX_ALGORITHM_COMMON
-
-#include <cuda/__cccl_config>
-
-#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
-#  pragma GCC system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
-#  pragma clang system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
-#  pragma system_header
-#endif // no system header
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-
-#include <cuda/std/__cccl/prologue.h>
-
-namespace cuda::experimental
-{
-template <typename _Tp>
-using __as_span_t = ::cuda::std::span<::cuda::std::remove_reference_t<::cuda::std::ranges::range_reference_t<_Tp>>>;
-
-//! @brief A concept that checks if the type can be converted to a `cuda::std::span`.
-//! The type must be a contiguous range.
-template <typename _Tp>
-_CCCL_CONCEPT __spannable = _CCCL_REQUIRES_EXPR((_Tp))( //
-  requires(::cuda::std::ranges::contiguous_range<_Tp>), //
-  requires(::cuda::std::convertible_to<_Tp, __as_span_t<_Tp>>));
-
-template <typename _Tp>
-using __as_mdspan_t =
-  ::cuda::std::mdspan<typename ::cuda::std::decay_t<_Tp>::value_type,
-                      typename ::cuda::std::decay_t<_Tp>::extents_type,
-                      typename ::cuda::std::decay_t<_Tp>::layout_type,
-                      typename ::cuda::std::decay_t<_Tp>::accessor_type>;
-
-//! @brief A concept that checks if the type can be converted to a `cuda::std::mdspan`.
-//! The type must have a conversion to `__as_mdspan_t<_Tp>`.
-template <typename _Tp>
-_CCCL_CONCEPT __mdspannable =
-  _CCCL_REQUIRES_EXPR((_Tp))(requires(::cuda::std::convertible_to<_Tp, __as_mdspan_t<_Tp>>));
-
-template <typename _Tp>
-[[nodiscard]] _CCCL_HOST_API constexpr auto __as_mdspan(_Tp&& __value) noexcept -> __as_mdspan_t<_Tp>
-{
-  return ::cuda::std::forward<_Tp>(__value);
-}
-} // namespace cuda::experimental
-
-#include <cuda/std/__cccl/epilogue.h>
-
-#endif //__CUDAX_ALGORITHM_COMMON
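For reference, the `__spannable` concept deleted above survives in libcu++ with the same shape: it admits any contiguous range that converts to `cuda::std::span`. A minimal usage sketch of the relocated `cuda::copy_bytes` — the `<cuda/algorithm>` umbrella include is an assumption, since this diff only shows the detail headers:

```cpp
// Sketch: std::vector is a contiguous_range convertible to cuda::std::span,
// so it satisfies the __spannable constraint on cuda::copy_bytes.
// Pageable host memory is permitted for copies, but may make the call synchronous.
#include <cuda/algorithm> // assumed umbrella header
#include <cuda/stream_ref>

#include <vector>

void copy_sketch(cuda::stream_ref stream)
{
  std::vector<int> src(1024, 42);
  std::vector<int> dst(1024);
  cuda::copy_bytes(stream, src, dst); // bytewise copy of the underlying ranges
  // A std::list<int> would be rejected at compile time: not a contiguous_range.
}
```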
diff --git a/cudax/include/cuda/experimental/__algorithm/copy.cuh b/cudax/include/cuda/experimental/__algorithm/copy.cuh
deleted file mode 100644
index 1a235f26629..00000000000
--- a/cudax/include/cuda/experimental/__algorithm/copy.cuh
+++ /dev/null
@@ -1,94 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of CUDA Experimental in CUDA C++ Core Libraries,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __CUDAX_ALGORITHM_COPY
-#define __CUDAX_ALGORITHM_COPY
-
-#include <cuda/__cccl_config>
-
-#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
-#  pragma GCC system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
-#  pragma clang system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
-#  pragma system_header
-#endif // no system header
-
-#include
-#include
-#include
-#include
-
-#include
-
-#include <cuda/std/__cccl/prologue.h>
-
-namespace cuda::experimental
-{
-//! @brief Launches a bytewise memory copy from source to destination into the provided
-//! stream.
-//!
-//! Both source and destination needs to be or device_transform to a type that is a `contiguous_range` and converts to
-//! `cuda::std::span`. The element types of both the source and destination range is required to be trivially copyable.
-//!
-//! This call might be synchronous if either source or destination is pagable host memory.
-//! It will be synchronous if both destination and copy is located in host memory.
-//!
-//! @param __stream Stream that the copy should be inserted into
-//! @param __src Source to copy from
-//! @param __dst Destination to copy into
-//! @param __config Configuration for the copy
-_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
-_CCCL_REQUIRES(::cuda::__spannable<transformed_device_argument_t<_SrcTy>>
-                 _CCCL_AND ::cuda::__spannable<transformed_device_argument_t<_DstTy>>)
-_CCCL_HOST_API void
-copy_bytes(::cuda::stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst, copy_configuration __config = {})
-{
-  ::cuda::__detail::__copy_bytes_impl(
-    __stream,
-    ::cuda::std::span(device_transform(__stream, ::cuda::std::forward<_SrcTy>(__src))),
-    ::cuda::std::span(device_transform(__stream, ::cuda::std::forward<_DstTy>(__dst))),
-    __config);
-}
-
-//! @brief Launches a bytewise memory copy from source to destination into the provided
-//! stream.
-//!
-//! Both source and destination needs to be or device_transform to an instance of `cuda::std::mdspan`.
-//! They can also implicitly convert to `cuda::std::mdspan`, but the
-//! type needs to contain `mdspan` template arguments as member aliases named
-//! `value_type`, `extents_type`, `layout_type` and `accessor_type`. The resulting mdspan
-//! is required to be exhaustive. The element types of both the source and destination
-//! type are required to be trivially copyable.
-//!
-//! This call might be synchronous if either source or destination is pagable host memory.
-//! It will be synchronous if both destination and copy is located in host memory.
-//!
-//! @param __stream Stream that the copy should be inserted into
-//! @param __src Source to copy from
-//! @param __dst Destination to copy into
-//! @param __config Configuration for the copy
-_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
-_CCCL_REQUIRES(::cuda::__mdspannable<transformed_device_argument_t<_SrcTy>>
-                 _CCCL_AND ::cuda::__mdspannable<transformed_device_argument_t<_DstTy>>)
-_CCCL_HOST_API void
-copy_bytes(::cuda::stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst, copy_configuration __config = {})
-{
-  ::cuda::__detail::__copy_bytes_impl(
-    __stream,
-    ::cuda::__as_mdspan(device_transform(__stream, ::cuda::std::forward<_SrcTy>(__src))),
-    ::cuda::__as_mdspan(device_transform(__stream, ::cuda::std::forward<_DstTy>(__dst))),
-    __config);
-}
-} // namespace cuda::experimental
-
-#include <cuda/std/__cccl/epilogue.h>
-
-#endif // __CUDAX_ALGORITHM_COPY
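The mdspan overload's documented requirements (member aliases `value_type`, `extents_type`, `layout_type`, `accessor_type` plus an implicit conversion to the matching mdspan) are easiest to see on a concrete type. A hedged sketch — `my_matrix_view` is hypothetical, not a type from this diff:

```cpp
// Sketch of a user type accepted by the mdspan overload of cuda::copy_bytes:
// it exposes the four member aliases and converts to the matching mdspan.
#include <cuda/std/mdspan>

#include <cstddef>

struct my_matrix_view
{
  using value_type    = float;
  using extents_type  = cuda::std::dextents<std::size_t, 2>;
  using layout_type   = cuda::std::layout_right; // exhaustive layout
  using accessor_type = cuda::std::default_accessor<float>;

  float* data;
  extents_type extents;

  // Implicit conversion consumed by the library's __as_mdspan helper.
  operator cuda::std::mdspan<value_type, extents_type, layout_type, accessor_type>() const
  {
    return {data, extents};
  }
};
```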
diff --git a/cudax/include/cuda/experimental/__algorithm/fill.cuh b/cudax/include/cuda/experimental/__algorithm/fill.cuh
deleted file mode 100644
index 5490d21ccdc..00000000000
--- a/cudax/include/cuda/experimental/__algorithm/fill.cuh
+++ /dev/null
@@ -1,78 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of CUDA Experimental in CUDA C++ Core Libraries,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __CUDAX_ALGORITHM_FILL
-#define __CUDAX_ALGORITHM_FILL
-
-#include <cuda/__cccl_config>
-
-#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
-#  pragma GCC system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
-#  pragma clang system_header
-#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
-#  pragma system_header
-#endif // no system header
-
-#include
-#include
-
-#include
-
-#include <cuda/std/__cccl/prologue.h>
-
-namespace cuda::experimental
-{
-//! @brief Launches an operation to bytewise fill the memory into the provided stream.
-//!
-//! The destination needs to either be a `contiguous_range` or transform into one. It can
-//! also implicitly convert to `cuda::std::span`, but it needs to contain a `value_type`
-//! member alias. The element type of the destination is required to be trivially
-//! copyable.
-//!
-//! The destination cannot reside in pagable host memory.
-//!
-//! @param __stream Stream that the copy should be inserted into
-//! @param __dst Destination memory to fill
-//! @param __value Value to fill into every byte in the destination
-_CCCL_TEMPLATE(typename _DstTy)
-_CCCL_REQUIRES(::cuda::__spannable<transformed_device_argument_t<_DstTy>>)
-_CCCL_HOST_API void fill_bytes(::cuda::stream_ref __stream, _DstTy&& __dst, ::cuda::std::uint8_t __value)
-{
-  ::cuda::__detail::__fill_bytes_impl(
-    __stream, ::cuda::std::span(device_transform(__stream, ::cuda::std::forward<_DstTy>(__dst))), __value);
-}
-
-//! @brief Launches an operation to bytewise fill the memory into the provided stream.
-//!
-//! Destination needs to either be an instance of `cuda::std::mdspan` or transform into
-//! one. It can also implicitly convert to `cuda::std::mdspan`, but the type needs to
-//! contain `mdspan` template arguments as member aliases named `value_type`,
-//! `extents_type`, `layout_type` and `accessor_type`. The resulting mdspan is required to
-//! be exhaustive. The element type of the destination is required to be trivially
-//! copyable.
-//!
-//! The destination cannot reside in pagable host memory.
-//!
-//! @param __stream Stream that the copy should be inserted into
-//! @param __dst Destination memory to fill
-//! @param __value Value to fill into every byte in the destination
-_CCCL_TEMPLATE(typename _DstTy)
-_CCCL_REQUIRES(::cuda::__mdspannable<transformed_device_argument_t<_DstTy>>)
-_CCCL_HOST_API void fill_bytes(::cuda::stream_ref __stream, _DstTy&& __dst, ::cuda::std::uint8_t __value)
-{
-  ::cuda::__detail::__fill_bytes_impl(
-    __stream, ::cuda::__as_mdspan(device_transform(__stream, ::cuda::std::forward<_DstTy>(__dst))), __value);
-}
-} // namespace cuda::experimental
-
-#include <cuda/std/__cccl/epilogue.h>
-
-#endif // __CUDAX_ALGORITHM_FILL
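Unlike `copy_bytes`, the fill destination must not be pageable host memory. A minimal usage sketch of the relocated `cuda::fill_bytes` under that assumption (the `<cuda/algorithm>` umbrella include is again assumed):

```cpp
// Sketch: fill_bytes writes the given byte into every byte of the destination.
// The span is assumed to view device (or pinned host) memory; a pageable
// std::vector would violate the documented precondition.
#include <cuda/algorithm> // assumed umbrella header
#include <cuda/std/span>
#include <cuda/stream_ref>

void fill_sketch(cuda::stream_ref stream, cuda::std::span<int> device_ints)
{
  cuda::fill_bytes(stream, device_ints, 0xFF); // every int becomes 0xFFFFFFFF
}
```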
diff --git a/cudax/include/cuda/experimental/__launch/launch.cuh b/cudax/include/cuda/experimental/__launch/launch.cuh
index c9573103942..18a51b07053 100644
--- a/cudax/include/cuda/experimental/__launch/launch.cuh
+++ b/cudax/include/cuda/experimental/__launch/launch.cuh
@@ -22,6 +22,7 @@
 #endif // no system header

 #include
+#include <cuda/__stream/device_transform.h>
 #include
 #include
 #include
@@ -38,7 +39,6 @@
 #include
 #include
 #include
-#include <cuda/experimental/__stream/device_transform.cuh>
 #include
 #include
diff --git a/cudax/include/cuda/experimental/algorithm.cuh b/cudax/include/cuda/experimental/algorithm.cuh
deleted file mode 100644
index 4c91213237c..00000000000
--- a/cudax/include/cuda/experimental/algorithm.cuh
+++ /dev/null
@@ -1,17 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of CUDA Experimental in CUDA C++ Core Libraries,
-// under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __CUDAX_ALGORITHM__
-#define __CUDAX_ALGORITHM__
-
-#include <cuda/experimental/__algorithm/copy.cuh>
-#include <cuda/experimental/__algorithm/fill.cuh>
-
-#endif // __CUDAX_ALGORITHM__
diff --git a/cudax/include/cuda/experimental/launch.cuh b/cudax/include/cuda/experimental/launch.cuh
index 158d99f6384..fe74b58727f 100644
--- a/cudax/include/cuda/experimental/launch.cuh
+++ b/cudax/include/cuda/experimental/launch.cuh
@@ -11,10 +11,11 @@
 #ifndef __CUDAX_LAUNCH___
 #define __CUDAX_LAUNCH___

+#include <cuda/__stream/device_transform.h>
+
 #include
 #include
 #include
 #include
-#include <cuda/experimental/__stream/device_transform.cuh>

 #endif // __CUDAX_LAUNCH___
diff --git a/cudax/test/algorithm/common.cuh b/cudax/test/algorithm/common.cuh
index 17d2b8c6424..b114dadeb7a 100644
--- a/cudax/test/algorithm/common.cuh
+++ b/cudax/test/algorithm/common.cuh
@@ -11,10 +11,10 @@
 #ifndef __ALGORITHM_COMMON__
 #define __ALGORITHM_COMMON__

+#include <cuda/algorithm>
 #include
 #include
-#include <cuda/experimental/algorithm.cuh>

 #include
 #include
diff --git a/cudax/test/algorithm/copy.cu b/cudax/test/algorithm/copy.cu
index 501b2ab1794..6fa26da8158 100644
--- a/cudax/test/algorithm/copy.cu
+++ b/cudax/test/algorithm/copy.cu
@@ -21,25 +21,25 @@ C2H_TEST("1d Copy", "[data_manipulation]")
   {
     cuda::__uninitialized_async_buffer buffer(device_resource, _stream, buffer_size);
-    cudax::fill_bytes(_stream, buffer, fill_byte);
+    cuda::fill_bytes(_stream, buffer, fill_byte);

-    cudax::copy_bytes(_stream, buffer, host_vector);
+    cuda::copy_bytes(_stream, buffer, host_vector);
     check_result_and_erase(_stream, host_vector);

-    cudax::copy_bytes(_stream, std::move(buffer), host_vector);
+    cuda::copy_bytes(_stream, std::move(buffer), host_vector);
     check_result_and_erase(_stream, host_vector);
   }
   {
     cuda::__uninitialized_async_buffer not_yet_const_buffer(
       device_resource, _stream, buffer_size);
-    cudax::fill_bytes(_stream, not_yet_const_buffer, fill_byte);
+    cuda::fill_bytes(_stream, not_yet_const_buffer, fill_byte);
     const auto& const_buffer = not_yet_const_buffer;

-    cudax::copy_bytes(_stream, const_buffer, host_vector);
+    cuda::copy_bytes(_stream, const_buffer, host_vector);
     check_result_and_erase(_stream, host_vector);

-    cudax::copy_bytes(_stream, const_buffer, cuda::std::span(host_vector));
+    cuda::copy_bytes(_stream, const_buffer, cuda::std::span(host_vector));
     check_result_and_erase(_stream, host_vector);
   }
 }
@@ -53,26 +53,26 @@ C2H_TEST("1d Copy", "[data_manipulation]")
     cudax::uninitialized_buffer host_buffer(host_resource, buffer_size);
     cudax::uninitialized_buffer device_buffer(managed_resource, buffer_size);

-    cudax::fill_bytes(_stream, host_buffer, fill_byte);
+    cuda::fill_bytes(_stream, host_buffer, fill_byte);

-    cudax::copy_bytes(_stream, host_buffer, device_buffer);
+    cuda::copy_bytes(_stream, host_buffer, device_buffer);
     check_result_and_erase(_stream, device_buffer);

-    cudax::copy_bytes(_stream, cuda::std::span(host_buffer), device_buffer);
+    cuda::copy_bytes(_stream, cuda::std::span(host_buffer), device_buffer);
     check_result_and_erase(_stream, device_buffer);
   }
   {
     cudax::uninitialized_buffer not_yet_const_host_buffer(host_resource, buffer_size);
     cudax::uninitialized_buffer device_buffer(managed_resource, buffer_size);
-    cudax::fill_bytes(_stream, not_yet_const_host_buffer, fill_byte);
+    cuda::fill_bytes(_stream, not_yet_const_host_buffer, fill_byte);
     const auto& const_host_buffer = not_yet_const_host_buffer;

-    cudax::copy_bytes(_stream, const_host_buffer, device_buffer);
+    cuda::copy_bytes(_stream, const_host_buffer, device_buffer);
     check_result_and_erase(_stream, device_buffer);

-    cudax::copy_bytes(_stream, cuda::std::span(const_host_buffer), device_buffer);
+    cuda::copy_bytes(_stream, cuda::std::span(const_host_buffer), device_buffer);
     check_result_and_erase(_stream, device_buffer);
   }
 }
@@ -84,7 +84,7 @@ C2H_TEST("1d Copy", "[data_manipulation]")

   memset(input.data, fill_byte, input.size * sizeof(int));

-  cudax::copy_bytes(_stream, input, output);
+  cuda::copy_bytes(_stream, input, output);
   check_result_and_erase(_stream, cuda::std::span(output.data, output.size));
 }

@@ -92,11 +92,11 @@ C2H_TEST("1d Copy", "[data_manipulation]")
 {
   cuda::legacy_pinned_memory_resource host_resource;
   cudax::uninitialized_buffer host_buffer(host_resource, 1);
-  cudax::fill_bytes(_stream, host_buffer, fill_byte);
+  cuda::fill_bytes(_stream, host_buffer, fill_byte);

   ::std::vector vec(buffer_size, 0xbeef);

-  cudax::copy_bytes(_stream, host_buffer, vec);
+  cuda::copy_bytes(_stream, host_buffer, vec);
   _stream.sync();

   CUDAX_REQUIRE(vec[0] == get_expected_value(fill_byte));
@@ -122,7 +122,7 @@ void test_mdspan_copy_bytes(
     src(0, i) = i;
   }

-  cudax::copy_bytes(stream, std::move(src), dst);
+  cuda::copy_bytes(stream, std::move(src), dst);
   stream.sync();

   for (int i = 0; i < static_cast<int>(dst.extent(1)); i++)
@@ -163,7 +163,7 @@ C2H_TEST("Mdspan copy", "[data_manipulation]")
     cudax::weird_buffer> buffer{
       host_resource, mdspan.mapping().required_span_size()};

-    cudax::copy_bytes(stream, mdspan, buffer);
+    cuda::copy_bytes(stream, mdspan, buffer);
     stream.sync();
     CUDAX_REQUIRE(!memcmp(mdspan_buffer.data(), buffer.data, mdspan_buffer.size()));
   }
@@ -177,7 +177,7 @@ C2H_TEST("Non exhaustive mdspan copy_bytes", "[data_manipulation]")

   try
   {
-    cudax::copy_bytes(stream, fake_strided_mdspan, fake_strided_mdspan);
+    cuda::copy_bytes(stream, fake_strided_mdspan, fake_strided_mdspan);
   }
   catch (const ::std::invalid_argument& e)
   {
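The last test above exercises a run-time precondition worth spelling out: a bytewise copy needs one contiguous allocation, so a non-exhaustive (e.g. strided) mdspan is rejected with `std::invalid_argument`. A condensed sketch of that pattern — the layout and extents are illustrative, not taken from the test:

```cpp
// Sketch: a layout_stride mdspan that skips elements is not exhaustive,
// so cuda::copy_bytes reports std::invalid_argument instead of copying.
#include <cuda/algorithm> // assumed umbrella header
#include <cuda/std/array>
#include <cuda/std/mdspan>
#include <cuda/stream_ref>

#include <cstddef>
#include <stdexcept>

void strided_copy_is_rejected(cuda::stream_ref stream, int* data)
{
  using ext_t = cuda::std::extents<std::size_t, 4>;
  cuda::std::layout_stride::mapping<ext_t> map{ext_t{}, cuda::std::array<std::size_t, 1>{2}};
  cuda::std::mdspan<int, ext_t, cuda::std::layout_stride> strided{data, map};
  try
  {
    cuda::copy_bytes(stream, strided, strided);
  }
  catch (const std::invalid_argument&)
  {
    // expected: the mapping is not exhaustive
  }
}
```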
diff --git a/cudax/test/algorithm/fill.cu b/cudax/test/algorithm/fill.cu
index 10dae14209c..c470f988770 100644
--- a/cudax/test/algorithm/fill.cu
+++ b/cudax/test/algorithm/fill.cu
@@ -18,7 +18,7 @@ C2H_TEST("Fill", "[data_manipulation]")
     cuda::legacy_pinned_memory_resource host_resource;
     cudax::uninitialized_buffer buffer(host_resource, buffer_size);

-    cudax::fill_bytes(_stream, buffer, fill_byte);
+    cuda::fill_bytes(_stream, buffer, fill_byte);

     check_result_and_erase(_stream, cuda::std::span(buffer));
   }
@@ -27,7 +27,7 @@ C2H_TEST("Fill", "[data_manipulation]")
   {
     cuda::device_memory_pool_ref device_resource = cuda::device_default_memory_pool(cuda::device_ref{0});
     cudax::uninitialized_buffer buffer(device_resource, buffer_size);
-    cudax::fill_bytes(_stream, buffer, fill_byte);
+    cuda::fill_bytes(_stream, buffer, fill_byte);

     std::vector host_vector(42);
     CUDART(cudaMemcpyAsync(
@@ -40,7 +40,7 @@ C2H_TEST("Fill", "[data_manipulation]")
     cuda::legacy_pinned_memory_resource host_resource;
     cudax::weird_buffer buffer(host_resource, buffer_size);

-    cudax::fill_bytes(_stream, buffer, fill_byte);
+    cuda::fill_bytes(_stream, buffer, fill_byte);

     check_result_and_erase(_stream, cuda::std::span(buffer.data, buffer.size));
   }
 }
@@ -53,7 +53,7 @@ C2H_TEST("Mdspan Fill", "[data_manipulation]")
     auto buffer = make_buffer_for_mdspan(dynamic_extents, 0);
     cuda::std::mdspan dynamic_mdspan(buffer.data(), dynamic_extents);

-    cudax::fill_bytes(stream, dynamic_mdspan, fill_byte);
+    cuda::fill_bytes(stream, dynamic_mdspan, fill_byte);
     check_result_and_erase(stream, cuda::std::span(buffer.data(), buffer.size()));
   }
   {
@@ -61,7 +61,7 @@ C2H_TEST("Mdspan Fill", "[data_manipulation]")
     auto buffer = make_buffer_for_mdspan(mixed_extents, 0);
     cuda::std::mdspan mixed_mdspan(buffer.data(), mixed_extents);

-    cudax::fill_bytes(stream, cuda::std::move(mixed_mdspan), fill_byte);
+    cuda::fill_bytes(stream, cuda::std::move(mixed_mdspan), fill_byte);
     check_result_and_erase(stream, cuda::std::span(buffer.data(), buffer.size()));
   }
   {
@@ -70,7 +70,7 @@ C2H_TEST("Mdspan Fill", "[data_manipulation]")
     auto size = cuda::std::layout_left::mapping().required_span_size();
     cudax::weird_buffer> buffer(host_resource, size);

-    cudax::fill_bytes(stream, buffer, fill_byte);
+    cuda::fill_bytes(stream, buffer, fill_byte);
     check_result_and_erase(stream, cuda::std::span(buffer.data, buffer.size));
   }
 }
@@ -83,7 +83,7 @@ C2H_TEST("Non exhaustive mdspan fill_bytes", "[data_manipulation]")

   try
   {
-    cudax::fill_bytes(stream, fake_strided_mdspan, fill_byte);
+    cuda::fill_bytes(stream, fake_strided_mdspan, fill_byte);
   }
   catch (const ::std::invalid_argument& e)
   {
diff --git a/cudax/test/containers/async_buffer/transform.cu b/cudax/test/containers/async_buffer/transform.cu
index c045beb446b..e7975d9f85b 100644
--- a/cudax/test/containers/async_buffer/transform.cu
+++ b/cudax/test/containers/async_buffer/transform.cu
@@ -13,12 +13,12 @@
 #include
 #include
+#include <cuda/algorithm>

 #include
 #include
 #include
 #include
-#include <cuda/experimental/algorithm.cuh>

 #include
 #include
@@ -90,7 +90,7 @@ C2H_CCCLRT_TEST("cudax::buffer launch transform", "[container][buffer]")
   cudax::launch(stream, cudax::make_config(cudax::grid_dims<1>, cudax::block_dims<32>), add_kernel{}, a, b);

   std::vector host_result(a.size());
-  cudax::copy_bytes(stream, a, host_result);
+  cuda::copy_bytes(stream, a, host_result);

   stream.sync();
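The launch test below exercises the extension point that ties these pieces together: an ADL-discovered `transform_device_argument(stream, arg)` overload lets a type substitute something else for itself when passed to `cudax::launch` or the `*_bytes` algorithms. A hedged sketch with invented names — `my_tracked_buffer` and `transform_device_argument`'s exact return conventions here are illustrative, not taken from the diff:

```cpp
// Sketch (hypothetical types): device_transform finds this overload by ADL
// and passes the returned view to the kernel / algorithm in place of the
// owning wrapper. If the result additionally had a .transformed_argument()
// member, that result would be forwarded instead (see the device_transform
// implementation further below).
#include <cuda/std/span>
#include <cuda/stream_ref>

#include <cstddef>

struct my_tracked_buffer
{
  int* device_ptr;
  std::size_t count;
};

cuda::std::span<int> transform_device_argument(cuda::stream_ref, const my_tracked_buffer& b)
{
  return {b.device_ptr, b.count};
}
```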
diff --git a/cudax/test/launch/launch_smoke.cu b/cudax/test/launch/launch_smoke.cu
index a0e78cce140..1920b3d27ad 100644
--- a/cudax/test/launch/launch_smoke.cu
+++ b/cudax/test/launch/launch_smoke.cu
@@ -130,7 +130,7 @@ struct launch_transform_to_int_convertible

     // Immovable to ensure that device_transform doesn't copy the returned
     // object
-    int_convertible(int_convertible&&) = delete;
+    int_convertible(int_convertible&&) noexcept = delete;

     ~int_convertible() noexcept
     {
diff --git a/libcudacxx/include/cuda/__algorithm/copy.h b/libcudacxx/include/cuda/__algorithm/copy.h
index 8cb81c9acaf..4ee8133c343 100644
--- a/libcudacxx/include/cuda/__algorithm/copy.h
+++ b/libcudacxx/include/cuda/__algorithm/copy.h
@@ -24,6 +24,7 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 #  include
+#  include <cuda/__stream/device_transform.h>
 #  include
 #  include
 #  include
@@ -149,13 +150,14 @@
 //! @param __dst Destination to copy into
 //! @param __config Configuration for the copy
 _CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
-_CCCL_REQUIRES(__spannable<_SrcTy> _CCCL_AND __spannable<_DstTy>)
+_CCCL_REQUIRES(
+  __spannable<transformed_device_argument_t<_SrcTy>> _CCCL_AND __spannable<transformed_device_argument_t<_DstTy>>)
 _CCCL_HOST_API void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst, copy_configuration __config = {})
 {
   ::cuda::__detail::__copy_bytes_impl(
     __stream,
-    ::cuda::std::span(::cuda::std::forward<_SrcTy>(__src)),
-    ::cuda::std::span(::cuda::std::forward<_DstTy>(__dst)),
+    ::cuda::std::span(device_transform(__stream, ::cuda::std::forward<_SrcTy>(__src))),
+    ::cuda::std::span(device_transform(__stream, ::cuda::std::forward<_DstTy>(__dst))),
     __config);
 }

@@ -177,13 +179,14 @@
 //! @param __dst Destination to copy into
 //! @param __config Configuration for the copy
 _CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
-_CCCL_REQUIRES(__mdspannable<_SrcTy> _CCCL_AND __mdspannable<_DstTy>)
+_CCCL_REQUIRES(
+  __mdspannable<transformed_device_argument_t<_SrcTy>> _CCCL_AND __mdspannable<transformed_device_argument_t<_DstTy>>)
 _CCCL_HOST_API void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst, copy_configuration __config = {})
 {
   ::cuda::__detail::__copy_bytes_impl(
     __stream,
-    ::cuda::__as_mdspan(::cuda::std::forward<_SrcTy>(__src)),
-    ::cuda::__as_mdspan(::cuda::std::forward<_DstTy>(__dst)),
+    ::cuda::__as_mdspan(device_transform(__stream, ::cuda::std::forward<_SrcTy>(__src))),
+    ::cuda::__as_mdspan(device_transform(__stream, ::cuda::std::forward<_DstTy>(__dst))),
     __config);
 }
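The substantive change here is that the constraints are now phrased over `transformed_device_argument_t`: an argument no longer needs to be spannable itself, only its `device_transform` result does. Reusing the hypothetical `my_tracked_buffer` from the sketch above:

```cpp
// Sketch: my_tracked_buffer is not a contiguous_range, but its
// transform_device_argument result (a cuda::std::span<int>) is spannable,
// so the relaxed constraint on cuda::copy_bytes now admits it directly.
#include <cuda/algorithm> // assumed umbrella header

#include <vector>

void copy_from_wrapper(cuda::stream_ref stream, const my_tracked_buffer& buf)
{
  std::vector<int> host(buf.count);
  cuda::copy_bytes(stream, buf, host); // wrapper is transformed to a span internally
  // synchronize the stream before reading `host`
}
```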
diff --git a/libcudacxx/include/cuda/__algorithm/fill.h b/libcudacxx/include/cuda/__algorithm/fill.h
index e5985a3556d..9db6d53c91b 100644
--- a/libcudacxx/include/cuda/__algorithm/fill.h
+++ b/libcudacxx/include/cuda/__algorithm/fill.h
@@ -24,6 +24,7 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 #  include
+#  include <cuda/__stream/device_transform.h>
 #  include
 #  include
@@ -62,7 +63,7 @@ _CCCL_HOST_API void __fill_bytes_impl(stream_ref __stream,
 //! @brief Launches an operation to bytewise fill the memory into the provided stream.
 //!
-//! The destination needs to be a `contiguous_range` and convert to `cuda::std::span`.
+//! The destination needs to be, or be transformable via `device_transform` into, a `contiguous_range` that converts to `cuda::std::span`.
 //! The element type of the destination is required to be trivially copyable.
 //!
 //! The destination cannot reside in pagable host memory.
@@ -71,15 +72,16 @@ _CCCL_HOST_API void __fill_bytes_impl(stream_ref __stream,
 //! @param __dst Destination memory to fill
 //! @param __value Value to fill into every byte in the destination
 _CCCL_TEMPLATE(typename _DstTy)
-_CCCL_REQUIRES(__spannable<_DstTy>)
+_CCCL_REQUIRES(__spannable<transformed_device_argument_t<_DstTy>>)
 _CCCL_HOST_API void fill_bytes(stream_ref __stream, _DstTy&& __dst, ::cuda::std::uint8_t __value)
 {
-  ::cuda::__detail::__fill_bytes_impl(__stream, ::cuda::std::span(::cuda::std::forward<_DstTy>(__dst)), __value);
+  ::cuda::__detail::__fill_bytes_impl(
+    __stream, ::cuda::std::span(device_transform(__stream, ::cuda::std::forward<_DstTy>(__dst))), __value);
 }

 //! @brief Launches an operation to bytewise fill the memory into the provided stream.
 //!
-//! Destination needs to be an instance of `cuda::std::mdspan`.
+//! The destination needs to be, or be transformable via `device_transform` into, an instance of `cuda::std::mdspan`.
 //! It can also convert to `cuda::std::mdspan`, but the type needs to
 //! contain `mdspan` template arguments as member aliases named `value_type`,
 //! `extents_type`, `layout_type` and `accessor_type`. The resulting mdspan is required to
@@ -92,10 +94,11 @@ _CCCL_HOST_API void fill_bytes(stream_ref __stream, _DstTy&& __dst, ::cuda::std:
 //! @param __dst Destination memory to fill
 //! @param __value Value to fill into every byte in the destination
 _CCCL_TEMPLATE(typename _DstTy)
-_CCCL_REQUIRES(__mdspannable<_DstTy>)
+_CCCL_REQUIRES(__mdspannable<transformed_device_argument_t<_DstTy>>)
 _CCCL_HOST_API void fill_bytes(stream_ref __stream, _DstTy&& __dst, ::cuda::std::uint8_t __value)
 {
-  ::cuda::__detail::__fill_bytes_impl(__stream, __as_mdspan(::cuda::std::forward<_DstTy>(__dst)), __value);
+  ::cuda::__detail::__fill_bytes_impl(
+    __stream, __as_mdspan(device_transform(__stream, ::cuda::std::forward<_DstTy>(__dst))), __value);
 }

 _CCCL_END_NAMESPACE_CUDA
diff --git a/cudax/include/cuda/experimental/__stream/device_transform.cuh b/libcudacxx/include/cuda/__stream/device_transform.h
similarity index 79%
rename from cudax/include/cuda/experimental/__stream/device_transform.cuh
rename to libcudacxx/include/cuda/__stream/device_transform.h
index c441edda88a..f7136646149 100644
--- a/cudax/include/cuda/experimental/__stream/device_transform.cuh
+++ b/libcudacxx/include/cuda/__stream/device_transform.h
@@ -1,6 +1,6 @@
 //===----------------------------------------------------------------------===//
 //
-// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// Part of libcu++, the C++ Standard Library for your entire system,
 // under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
@@ -8,8 +8,8 @@
 //
 //===----------------------------------------------------------------------===//

-#ifndef _CUDAX__STREAM_DEVICE_TRANSFORM_CUH
-#define _CUDAX__STREAM_DEVICE_TRANSFORM_CUH
+#ifndef _CUDA__STREAM_DEVICE_TRANSFORM_H
+#define _CUDA__STREAM_DEVICE_TRANSFORM_H

 #include

@@ -22,9 +22,11 @@
 #endif // no system header

 #include
+#include <cuda/__type_traits/is_instantiable_with.h>
 #include
 #include
 #include
+#include <cuda/std/optional>
 #include
 #include
 #include
@@ -32,19 +34,15 @@
 #include
 #include
-#include
-#include
-#include
-
 #include

-namespace cuda::experimental
-{
+_CCCL_BEGIN_NAMESPACE_CUDA
+
 namespace __detail
 {
 // This function turns rvalues into prvalues and leaves lvalues as is.
 template <typename _Tp>
-_CCCL_API constexpr auto __ixnay_xvalue(_Tp&& __value) noexcept(__nothrow_movable<_Tp>) -> _Tp
+_CCCL_API constexpr auto __ixnay_xvalue(_Tp&& __value) noexcept(::cuda::std::is_nothrow_move_constructible_v<_Tp>)
+  -> _Tp
 {
   return ::cuda::std::forward<_Tp>(__value);
 }
@@ -85,22 +83,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __device_transform_t
   using __transformed_argument_t =
     __remove_rvalue_reference_t<decltype(::cuda::std::declval<_Tp>().transformed_argument())>;

-#if _CCCL_COMPILER(MSVC)
-  // MSVC has a bug where it rejects the use of an immovable type in a defaulted function
-  // argument.
-  // https://developercommunity.visualstudio.com/t/rejects-valid-cannot-use-an-immovable/10932844
-  template <typename _Arg>
-  struct __optional : execution::__variant<_Arg>
-  {
-    __optional() = default;
-    __optional(__optional&&); // declared but not defined
-  };
-#else
-  template <typename _Arg>
-  using __optional = execution::__variant<_Arg>;
-#endif
-
-  // The use of `__variant` here is to move the destruction of the object returned from
+  // The use of `optional` here is to move the destruction of the object returned from
   // transform_device_argument into the caller's stack frame. Objects created for default arguments
   // are located in the caller's stack frame. This is so that a use of `device_transform`
   // such as:
@@ -120,24 +103,44 @@
   // auto operator()(::cuda::stream_ref stream, Arg&& arg, auto&& action = transform_device_argument(arg))
   //
   // but sadly that is not valid C++.
+  // TODO move to use __variant type once cuda/experimental/execution/__variant is moved to libcudacxx
+  // NOTE: The above seems to only apply if the type is not trivially destructible. To use the optional here I had to
+  // add a destructor.
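The comment above carries the core design reasoning, so a standalone illustration may help. This is an editor's sketch in plain C++, not code from the diff. One caveat the comment glosses over: [expr.call] makes it implementation-defined whether a parameter's lifetime ends at function return or at the end of the enclosing full-expression; the trick relies on toolchains where the latter holds (e.g., Itanium-ABI compilers).

```cpp
// Editor's sketch: why default-argument storage outlives the call.
// The parameter object for a default argument is materialized in the
// caller's frame; Itanium-ABI compilers destroy it at the end of the
// enclosing full-expression, which is the lifetime device_transform's
// __storage parameter relies on.
#include <cstdio>

struct noisy
{
  ~noisy()
  {
    std::puts("storage destroyed");
  }
};

int use(noisy = {}) // storage lives in the caller's frame
{
  return 42;
}

int main()
{
  std::printf("use() -> %d\n", use());
  // With Itanium-ABI compilers, "storage destroyed" prints after the
  // printf: the default argument survived the whole full-expression.
}
```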
+  template <typename _Tp>
+  struct __optional_with_a_destructor : ::cuda::std::optional<_Tp>
+  {
+    using ::cuda::std::optional<_Tp>::optional;
+    ~__optional_with_a_destructor() {}
+
+    template <typename _Fn>
+    _CCCL_API inline _CCCL_CONSTEXPR_CXX20 _Tp& __emplace_from_fn(_Fn&& __fn)
+    {
+      _CCCL_ASSERT(!this->has_value(), "__construct called for engaged __optional_storage");
+      new (::cuda::std::addressof(this->__get())) _Tp(::cuda::std::invoke(::cuda::std::forward<_Fn>(__fn)));
+      this->__set_engaged(true);
+      return this->__get();
+    }
+  };
+
   _CCCL_TEMPLATE(typename _Stream, typename _Arg)
   _CCCL_REQUIRES(::cuda::std::convertible_to<_Stream, ::cuda::stream_ref> _CCCL_AND(
     !::cuda::std::is_reference_v<__transform_result_t<_Arg>>))
-  [[nodiscard]] _CCCL_HOST_API auto
-  operator()(_Stream&& __stream, _Arg&& __arg, __optional<__transform_result_t<_Arg>> __storage = {}) const
-    -> decltype(auto)
+  [[nodiscard]] _CCCL_HOST_API auto operator()(
+    _Stream&& __stream,
+    _Arg&& __arg,
+    __optional_with_a_destructor<__transform_result_t<_Arg>> __storage = cuda::std::nullopt) const -> decltype(auto)
   {
     // Calls to transform_device_argument are intentionally unqualified so as to use ADL.
     if constexpr (__is_instantiable_with<__transformed_argument_t, __transform_result_t<_Arg>>)
     {
-      return _CCCL_MOVE(__storage.__emplace_from([&] {
+      return _CCCL_MOVE(__storage.__emplace_from_fn([&]() {
               return transform_device_argument(__stream, ::cuda::std::forward<_Arg>(__arg));
             }))
        .transformed_argument();
     }
     else
     {
-      return _CCCL_MOVE(__storage.__emplace_from([&] {
+      return _CCCL_MOVE(__storage.__emplace_from_fn([&]() {
        return transform_device_argument(__stream, ::cuda::std::forward<_Arg>(__arg));
      }));
     }
   }
@@ -182,8 +185,9 @@
 _CCCL_GLOBAL_CONSTANT auto device_transform = __tfx::__device_transform_t{};

 template <typename _Arg>
 using transformed_device_argument_t _CCCL_NODEBUG_ALIAS =
   __remove_rvalue_reference_t<::cuda::std::__call_result_t<__tfx::__device_transform_t, ::cuda::stream_ref, _Arg>>;
-} // namespace cuda::experimental
+
+_CCCL_END_NAMESPACE_CUDA

 #include

-#endif // !_CUDAX__STREAM_DEVICE_TRANSFORM_CUH
+#endif // _CUDA__STREAM_DEVICE_TRANSFORM_H
diff --git a/libcudacxx/include/cuda/__stream/internal_streams.h b/libcudacxx/include/cuda/__stream/internal_streams.h
index 94ea5132206..18c984e3b25 100644
--- a/libcudacxx/include/cuda/__stream/internal_streams.h
+++ b/libcudacxx/include/cuda/__stream/internal_streams.h
@@ -1,6 +1,6 @@
 //===----------------------------------------------------------------------===//
 //
-// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// Part of libcu++, the C++ Standard Library for your entire system,
 // under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
diff --git a/libcudacxx/include/cuda/__type_traits/is_instantiable_with.h b/libcudacxx/include/cuda/__type_traits/is_instantiable_with.h
new file mode 100644
index 00000000000..957aa264eba
--- /dev/null
+++ b/libcudacxx/include/cuda/__type_traits/is_instantiable_with.h
@@ -0,0 +1,47 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDA__TYPE_TRAITS_IS_INSTANTIABLE_WITH_H
+#define __CUDA__TYPE_TRAITS_IS_INSTANTIABLE_WITH_H
+
+#include
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include
+#include
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA
+
+#if _CCCL_HAS_CONCEPTS()
+
+template
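The diff is cut off inside the new trait's definition, but its use in `device_transform` shows the intent: `__is_instantiable_with<_AliasTemplate, _Args...>` reports whether the alias template can be instantiated with those arguments — here, detecting whether the transform result has a `.transformed_argument()` member. One conventional way to build such a detector under C++20 concepts; a sketch of the idea, not the file's actual contents:

```cpp
// Sketch of the detection idiom the trait name implies (hypothetical code).
template <template <class...> class _Alias, class... _Args>
inline constexpr bool is_instantiable_with_v = requires { typename _Alias<_Args...>; };

// With device_transform's alias
//   template <class _Tp>
//   using __transformed_argument_t = decltype(declval<_Tp>().transformed_argument());
// the trait is true exactly when the transform result exposes
// .transformed_argument(), selecting the first if-constexpr branch above.
```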