Commit 5de62bd

pauleonix committed
Use BlockLoadToShared in DeviceMerge
1 parent cf3b2d5 commit 5de62bd

File tree: 3 files changed, +224 -53 lines
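Before the diff itself, a note on the pattern it introduces: the merge agent now batches asynchronous global-to-shared copies through cub::detail::BlockLoadToShared. The kernel below is a minimal standalone sketch of that lifecycle as it appears in agent_merge.cuh (carve a byte buffer for each input range out of one shared char array, CopyAsync each source span, Commit the batch, Wait before reading). The interface is inferred from the calls in this commit, so treat the exact names and signatures as assumptions rather than documented API.

#include <cub/block/block_load_to_shared.cuh>

#include <cuda/std/cstddef>
#include <cuda/std/span>

#include <cstdio>

template <int BlockThreads, int MaxItems>
__global__ void load_two_ranges(const int* in1, int n1, const int* in2, int n2)
{
  using loader_t = cub::detail::BlockLoadToShared<BlockThreads>;

  __shared__ typename loader_t::TempStorage temp;
  // One char array backs both dynamically sized buffers; the extra alignment padding
  // mirrors what buffer_t in agent_merge.cuh reserves for TMA.
  __shared__ alignas(loader_t::template SharedBufferAlignBytes<int>())
    char smem[loader_t::template SharedBufferSizeBytes<int>(MaxItems)
              + 2 * loader_t::template SharedBufferAlignBytes<char>()];

  loader_t load2sh{temp};
  ::cuda::std::span buffers{smem};
  auto buf1 = buffers.first(loader_t::template SharedBufferSizeBytes<int>(n1));
  auto buf2 = buffers.last(loader_t::template SharedBufferSizeBytes<int>(n2));

  auto sh1 = load2sh.CopyAsync(buf1, ::cuda::std::span{in1, static_cast<::cuda::std::size_t>(n1)});
  auto sh2 = load2sh.CopyAsync(buf2, ::cuda::std::span{in2, static_cast<::cuda::std::size_t>(n2)});
  load2sh.Commit(); // start the batched copies
  // ... independent work can overlap with the in-flight copies here ...
  load2sh.Wait(); // afterwards, the returned typed spans are readable block-wide

  if (threadIdx.x == 0 && n1 > 0 && n2 > 0)
  {
    printf("%d %d\n", data(sh1)[0], data(sh2)[0]);
  }
}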

cub/cub/agent/agent_merge.cuh

Lines changed: 188 additions & 42 deletions
@@ -14,15 +14,25 @@
 #endif // no system header

 #include <cub/agent/agent_merge_sort.cuh>
-#include <cub/block/block_load.cuh>
+#include <cub/block/block_load_to_shared.cuh>
 #include <cub/block/block_merge_sort.cuh>
 #include <cub/block/block_store.cuh>
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 #include <cub/util_namespace.cuh>
 #include <cub/util_type.cuh>

+#include <thrust/type_traits/is_contiguous_iterator.h>
+#include <thrust/type_traits/is_trivially_relocatable.h>
+#include <thrust/type_traits/unwrap_contiguous_iterator.h>
+
+#include <cuda/__memory/ptr_rebind.h>
 #include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__algorithm/min.h>
+#include <cuda/std/__type_traits/conditional.h>
+#include <cuda/std/__type_traits/is_same.h>
+#include <cuda/std/__type_traits/is_trivially_copyable.h>
+#include <cuda/std/cstddef>
+#include <cuda/std/span>

 CUB_NAMESPACE_BEGIN
 namespace detail
@@ -33,7 +43,8 @@ template <int ThreadsPerBlock,
           int ItemsPerThread,
           BlockLoadAlgorithm LoadAlgorithm,
           CacheLoadModifier LoadCacheModifier,
-          BlockStoreAlgorithm StoreAlgorithm>
+          BlockStoreAlgorithm StoreAlgorithm,
+          bool UseBlockLoadToShared = false>
 struct agent_policy_t
 {
   // do not change data member names, policy_wrapper_t depends on it
@@ -43,6 +54,7 @@ struct agent_policy_t
   static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = LoadAlgorithm;
   static constexpr CacheLoadModifier LOAD_MODIFIER = LoadCacheModifier;
   static constexpr BlockStoreAlgorithm STORE_ALGORITHM = StoreAlgorithm;
+  static constexpr bool use_block_load_to_shared = UseBlockLoadToShared;
 };

 // TODO(bgruber): can we unify this one with AgentMerge in agent_merge_sort.cuh?
@@ -54,48 +66,102 @@ template <typename Policy,
           typename KeysOutputIt,
           typename ItemsOutputIt,
           typename Offset,
-          typename CompareOp>
+          typename CompareOp,
+          bool AllowBlockLoadToShared>
 struct agent_t
 {
-  using policy = Policy;
+  using policy                           = Policy;
+  static constexpr int items_per_thread  = Policy::ITEMS_PER_THREAD;
+  static constexpr int threads_per_block = Policy::BLOCK_THREADS;
+  static constexpr Offset items_per_tile = Policy::ITEMS_PER_TILE;

   // key and value type are taken from the first input sequence (consistent with old Thrust behavior)
   using key_type  = it_value_t<KeysIt1>;
   using item_type = it_value_t<ItemsIt1>;

-  using keys_load_it1  = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt1>;
-  using keys_load_it2  = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt2>;
-  using items_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt1>;
-  using items_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt2>;
+  using block_load_to_shared = cub::detail::BlockLoadToShared<threads_per_block>;
+  using block_store_keys     = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type;
+  using block_store_items    = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type;
+
+  template <typename ValueT, typename Iter1, typename Iter2>
+  static constexpr bool use_block_load_to_shared =
+    Policy::use_block_load_to_shared && (sizeof(ValueT) == alignof(ValueT)) && AllowBlockLoadToShared
+    && THRUST_NS_QUALIFIER::is_trivially_relocatable_v<ValueT> //
+    && THRUST_NS_QUALIFIER::is_contiguous_iterator_v<Iter1> //
+    && THRUST_NS_QUALIFIER::is_contiguous_iterator_v<Iter2>
+    && ::cuda::std::is_same_v<ValueT, cub::detail::it_value_t<Iter1>>
+    && ::cuda::std::is_same_v<ValueT, cub::detail::it_value_t<Iter2>>;
+
+  static constexpr bool keys_use_block_load_to_shared  = use_block_load_to_shared<key_type, KeysIt1, KeysIt2>;
+  static constexpr bool items_use_block_load_to_shared = use_block_load_to_shared<item_type, ItemsIt1, ItemsIt2>;
+  static constexpr bool need_block_load_to_shared = keys_use_block_load_to_shared || items_use_block_load_to_shared;
+  static constexpr int load2sh_minimum_align      = block_load_to_shared::template SharedBufferAlignBytes<char>();
+
+  struct empty_t
+  {
+    struct TempStorage
+    {};
+    _CCCL_DEVICE _CCCL_FORCEINLINE empty_t(TempStorage) {}
+  };
+
+  using optional_load2sh_t = ::cuda::std::conditional_t<need_block_load_to_shared, block_load_to_shared, empty_t>;
+
+  using keys_load_it1 =
+    ::cuda::std::conditional_t<keys_use_block_load_to_shared,
+                               KeysIt1,
+                               try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt1>>;
+  using keys_load_it2 =
+    ::cuda::std::conditional_t<keys_use_block_load_to_shared,
+                               KeysIt2,
+                               try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt2>>;
+  using items_load_it1 =
+    ::cuda::std::conditional_t<items_use_block_load_to_shared,
+                               ItemsIt1,
+                               try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt1>>;
+  using items_load_it2 =
+    ::cuda::std::conditional_t<items_use_block_load_to_shared,
+                               ItemsIt2,
+                               try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt2>>;

-  using block_load_keys1  = typename BlockLoadType<Policy, keys_load_it1>::type;
-  using block_load_keys2  = typename BlockLoadType<Policy, keys_load_it2>::type;
-  using block_load_items1 = typename BlockLoadType<Policy, items_load_it1>::type;
-  using block_load_items2 = typename BlockLoadType<Policy, items_load_it2>::type;
+  template <typename ValueT, bool UseBlockLoadToShared>
+  struct alignas(UseBlockLoadToShared ? block_load_to_shared::template SharedBufferAlignBytes<ValueT>()
+                                      : alignof(ValueT)) buffer_t
+  {
+    // Need extra bytes of padding for TMA because this static buffer has to hold the two dynamically sized buffers.
+    char c_array[UseBlockLoadToShared
+                   ? (block_load_to_shared::template SharedBufferSizeBytes<ValueT>(items_per_tile + 1)
+                      + (alignof(ValueT) < load2sh_minimum_align ? 2 * load2sh_minimum_align : 0))
+                   : sizeof(ValueT) * (items_per_tile + 1)];
+  };

-  using block_store_keys  = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type;
-  using block_store_items = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type;
+  struct temp_storages_bl2sh
+  {
+    union
+    {
+      typename block_store_keys::TempStorage store_keys;
+      typename block_store_items::TempStorage store_items;
+      buffer_t<key_type, keys_use_block_load_to_shared> keys_shared;
+      buffer_t<item_type, items_use_block_load_to_shared> items_shared;
+    };
+    typename block_load_to_shared::TempStorage load2sh;
+  };

-  union temp_storages
+  union temp_storages_fallback
   {
-    typename block_load_keys1::TempStorage load_keys1;
-    typename block_load_keys2::TempStorage load_keys2;
-    typename block_load_items1::TempStorage load_items1;
-    typename block_load_items2::TempStorage load_items2;
     typename block_store_keys::TempStorage store_keys;
     typename block_store_items::TempStorage store_items;

-    key_type keys_shared[Policy::ITEMS_PER_TILE + 1];
-    item_type items_shared[Policy::ITEMS_PER_TILE + 1];
+    buffer_t<key_type, keys_use_block_load_to_shared> keys_shared;
+    buffer_t<item_type, items_use_block_load_to_shared> items_shared;
+
+    typename empty_t::TempStorage load2sh;
   };

+  using temp_storages =
+    ::cuda::std::conditional_t<need_block_load_to_shared, temp_storages_bl2sh, temp_storages_fallback>;
+
   struct TempStorage : Uninitialized<temp_storages>
   {};

-  static constexpr int items_per_thread  = Policy::ITEMS_PER_THREAD;
-  static constexpr int threads_per_block = Policy::BLOCK_THREADS;
-  static constexpr Offset items_per_tile = Policy::ITEMS_PER_TILE;
-
   // Per thread data
   temp_storages& storage;
   keys_load_it1 keys1_in;
@@ -128,18 +194,49 @@ struct agent_t
     const int num_keys1 = static_cast<int>(keys1_end - keys1_beg);
     const int num_keys2 = static_cast<int>(keys2_end - keys2_beg);

-    key_type keys_loc[items_per_thread];
-    merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
-      keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, num_keys1, num_keys2);
-    merge_sort::reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc);
-    __syncthreads();
+    optional_load2sh_t load2sh{storage.load2sh};
+
+    key_type* keys1_shared;
+    key_type* keys2_shared;
+    int keys2_offset;
+    if constexpr (keys_use_block_load_to_shared)
+    {
+      ::cuda::std::span keys1_src{THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(keys1_in + keys1_beg),
+                                  static_cast<::cuda::std::size_t>(num_keys1)};
+      ::cuda::std::span keys2_src{THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(keys2_in + keys2_beg),
+                                  static_cast<::cuda::std::size_t>(num_keys2)};
+      ::cuda::std::span keys_buffers{storage.keys_shared.c_array};
+      auto keys1_buffer = keys_buffers.first(block_load_to_shared::template SharedBufferSizeBytes<key_type>(num_keys1));
+      auto keys2_buffer = keys_buffers.last(block_load_to_shared::template SharedBufferSizeBytes<key_type>(num_keys2));
+      _CCCL_ASSERT(keys1_buffer.end() <= keys2_buffer.begin(),
+                   "Keys buffer needs to be appropriately sized (internal)");
+      auto keys1_sh = load2sh.CopyAsync(keys1_buffer, keys1_src);
+      auto keys2_sh = load2sh.CopyAsync(keys2_buffer, keys2_src);
+      load2sh.Commit();
+      keys1_shared = data(keys1_sh);
+      keys2_shared = data(keys2_sh);
+      // Needed for using keys1_shared as one big buffer including both ranges in SerialMerge
+      keys2_offset = static_cast<int>(keys2_shared - keys1_shared);
+      load2sh.Wait();
+    }
+    else
+    {
+      key_type keys_loc[items_per_thread];
+      merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
+        keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, num_keys1, num_keys2);
+      keys1_shared = &::cuda::ptr_rebind<key_type>(storage.keys_shared.c_array)[0];
+      // Needed for using keys1_shared as one big buffer including both ranges in SerialMerge
+      keys2_offset = num_keys1;
+      keys2_shared = keys1_shared + keys2_offset;
+      merge_sort::reg_to_shared<threads_per_block>(keys1_shared, keys_loc);
+      __syncthreads();
+    }

     // use binary search in shared memory to find merge path for each of thread.
     // we can use int type here, because the number of items in shared memory is limited
     const int diag0_loc = (::cuda::std::min) (num_keys1 + num_keys2, static_cast<int>(items_per_thread * threadIdx.x));

-    const int keys1_beg_loc =
-      MergePath(&storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_loc, compare_op);
+    const int keys1_beg_loc = MergePath(keys1_shared, keys2_shared, num_keys1, num_keys2, diag0_loc, compare_op);
     const int keys1_end_loc = num_keys1;
     const int keys2_beg_loc = diag0_loc - keys1_beg_loc;
     const int keys2_end_loc = num_keys2;
@@ -148,11 +245,12 @@
     const int num_keys2_loc = keys2_end_loc - keys2_beg_loc;

     // perform serial merge
+    key_type keys_loc[items_per_thread];
     int indices[items_per_thread];
     cub::SerialMerge(
-      &storage.keys_shared[0],
+      keys1_shared,
       keys1_beg_loc,
-      keys2_beg_loc + num_keys1,
+      keys2_offset + keys2_beg_loc,
       num_keys1_loc,
       num_keys2_loc,
       keys_loc,
@@ -174,19 +272,67 @@ struct agent_t
     static constexpr bool have_items = !::cuda::std::is_same_v<item_type, NullType>;
     if constexpr (have_items)
     {
-      item_type items_loc[items_per_thread];
-      merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
-        items_loc, items1_in + keys1_beg, items2_in + keys2_beg, num_keys1, num_keys2);
-      __syncthreads(); // block_store_keys above uses shared memory, so make sure all threads are done before we write
-                       // to it
-      merge_sort::reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc);
-      __syncthreads();
+      [[maybe_unused]] const auto translate_indices = [&](int items2_offset) -> void {
+        const int diff = items2_offset - keys2_offset;
+        _CCCL_PRAGMA_UNROLL_FULL()
+        for (int i = 0; i < items_per_thread; ++i)
+        {
+          if (indices[i] >= keys2_offset)
+          {
+            indices[i] += diff;
+          }
+        }
+      };
+
+      item_type* items1_shared;
+      int items2_offset;
+      if constexpr (items_use_block_load_to_shared)
+      {
+        ::cuda::std::span items1_src{THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(items1_in + keys1_beg),
+                                     static_cast<::cuda::std::size_t>(num_keys1)};
+        ::cuda::std::span items2_src{THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(items2_in + keys2_beg),
+                                     static_cast<::cuda::std::size_t>(num_keys2)};
+        ::cuda::std::span items_buffers{storage.items_shared.c_array};
+        auto items1_buffer =
+          items_buffers.first(block_load_to_shared::template SharedBufferSizeBytes<item_type>(num_keys1));
+        auto items2_buffer =
+          items_buffers.last(block_load_to_shared::template SharedBufferSizeBytes<item_type>(num_keys2));
+        _CCCL_ASSERT(items1_buffer.end() <= items2_buffer.begin(),
+                     "Items buffer needs to be appropriately sized (internal)");
+        // block_store_keys above uses shared memory, so make sure all threads are done before we write to it
+        __syncthreads();
+        auto items1_sh = load2sh.CopyAsync(items1_buffer, items1_src);
+        auto items2_sh = load2sh.CopyAsync(items2_buffer, items2_src);
+        load2sh.Commit();
+        items1_shared            = data(items1_sh);
+        item_type* items2_shared = data(items2_sh);
+        items2_offset = static_cast<int>(items2_shared - items1_shared);
+        translate_indices(items2_offset);
+        load2sh.Wait();
+      }
+      else
+      {
+        item_type items_loc[items_per_thread];
+        merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
+          items_loc, items1_in + keys1_beg, items2_in + keys2_beg, num_keys1, num_keys2);
+        __syncthreads(); // block_store_keys above uses shared memory, so make sure all threads are done before we
+                         // write to it
+        items1_shared = &::cuda::ptr_rebind<item_type>(storage.items_shared.c_array)[0];
+        items2_offset = num_keys1;
+        if constexpr (keys_use_block_load_to_shared)
+        {
+          translate_indices(items2_offset);
+        }
+        merge_sort::reg_to_shared<threads_per_block>(items1_shared, items_loc);
+        __syncthreads();
+      }

       // gather items from shared mem
+      item_type items_loc[items_per_thread];
       _CCCL_PRAGMA_UNROLL_FULL()
       for (int i = 0; i < items_per_thread; ++i)
       {
-        items_loc[i] = storage.items_shared[indices[i]];
+        items_loc[i] = items1_shared[indices[i]];
       }
       __syncthreads();
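Aside: the translate_indices step above is plain offset arithmetic. SerialMerge produced indices into one concatenated keys buffer in which range 2 starts at keys2_offset; when the items copy places range 2 at a different items2_offset (for instance due to TMA alignment), every range-2 index shifts by the difference. A standalone rendering with made-up offsets and indices:

#include <cstdio>

int main()
{
  const int keys2_offset  = 5; // where keys range 2 started in the keys buffer
  const int items2_offset = 8; // where items range 2 starts in the items buffer
  int indices[4] = {1, 4, 5, 7}; // first two fall into range 1, last two into range 2

  const int diff = items2_offset - keys2_offset;
  for (int& idx : indices)
  {
    if (idx >= keys2_offset) // only range-2 indices move
    {
      idx += diff;
    }
  }
  for (int idx : indices)
  {
    std::printf("%d ", idx); // prints: 1 4 8 10
  }
  std::printf("\n");
  return 0;
}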

cub/cub/device/dispatch/dispatch_merge.cuh

Lines changed: 24 additions & 10 deletions
@@ -34,16 +34,30 @@ inline constexpr int fallback_ITEMS_PER_THREAD = 1;
 template <typename DefaultPolicy, class... Args>
 class choose_merge_agent
 {
-  using default_agent_t = agent_t<DefaultPolicy, Args...>;
+  using default_load2sh_agent_t   = agent_t<DefaultPolicy, Args..., /* AllowBlockLoadToShared = */ true>;
+  using default_noload2sh_agent_t = agent_t<DefaultPolicy, Args..., /* AllowBlockLoadToShared = */ false>;
+
+  // Disallow BlockLoadToShared in the fallback: we want to keep the fallback's TempStorage minimal, and
+  // BlockLoadToShared needs additional padding. The fallback's restricted tile size might also combine badly with the
+  // expensive mbarrier setup even when no padding is needed.
   using fallback_agent_t =
-    agent_t<policy_wrapper_t<DefaultPolicy, fallback_BLOCK_THREADS, fallback_ITEMS_PER_THREAD>, Args...>;
+    agent_t<policy_wrapper_t<DefaultPolicy, fallback_BLOCK_THREADS, fallback_ITEMS_PER_THREAD>, Args..., false>;

-  // Use fallback if merge agent exceeds maximum shared memory, but the fallback agent still fits
-  static constexpr bool use_fallback = sizeof(typename default_agent_t::TempStorage) > max_smem_per_block
-                                    && sizeof(typename fallback_agent_t::TempStorage) <= max_smem_per_block;
+  static constexpr bool use_default_load2sh =
+    sizeof(typename default_load2sh_agent_t::TempStorage) <= max_smem_per_block;
+  static constexpr bool use_default_noload2sh =
+    sizeof(typename default_noload2sh_agent_t::TempStorage) <= max_smem_per_block;
+  // Use the fallback if the default agent exceeds the maximum shared memory but the fallback agent still fits;
+  // otherwise use the vsmem-compatible variant, i.e. the noload2sh agent.
+  static constexpr bool use_fallback = sizeof(typename fallback_agent_t::TempStorage) <= max_smem_per_block;

 public:
-  using type = ::cuda::std::conditional_t<use_fallback, fallback_agent_t, default_agent_t>;
+  using type = ::cuda::std::conditional_t<
+    use_default_load2sh,
+    default_load2sh_agent_t,
+    ::cuda::std::conditional_t<use_default_noload2sh,
+                               default_noload2sh_agent_t,
+                               ::cuda::std::conditional_t<use_fallback, fallback_agent_t, default_noload2sh_agent_t>>>;
 };

 // Computes the merge path intersections at equally wide intervals. The approach is outlined in the paper:
@@ -143,11 +157,11 @@ __launch_bounds__(
   auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage);
   MergeAgent{
     temp_storage.Alias(),
-    try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(keys1),
-    try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(items1),
+    keys1,
+    items1,
     num_keys1,
-    try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(keys2),
-    try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(items2),
+    keys2,
+    items2,
     num_keys2,
     keys_result,
     items_result,
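A reduced compile-time model of the choose_merge_agent selection above, with made-up TempStorage sizes: prefer the load2sh agent when it fits into shared memory, then the plain default agent, then the fallback; when even the fallback does not fit, take the noload2sh default and let virtual shared memory handle it.

#include <cuda/std/type_traits>

struct load2sh_agent   { char storage[64 * 1024]; }; // hypothetical TempStorage sizes
struct noload2sh_agent { char storage[40 * 1024]; };
struct fallback_agent  { char storage[8 * 1024]; };

inline constexpr int max_smem_per_block = 48 * 1024;

inline constexpr bool use_load2sh   = sizeof(load2sh_agent) <= max_smem_per_block;
inline constexpr bool use_noload2sh = sizeof(noload2sh_agent) <= max_smem_per_block;
inline constexpr bool use_fallback  = sizeof(fallback_agent) <= max_smem_per_block;

using chosen_t = ::cuda::std::conditional_t<
  use_load2sh,
  load2sh_agent,
  ::cuda::std::conditional_t<use_noload2sh,
                             noload2sh_agent,
                             ::cuda::std::conditional_t<use_fallback, fallback_agent, noload2sh_agent>>>;

// The 64 KiB load2sh agent does not fit into 48 KiB, the 40 KiB variant does:
static_assert(::cuda::std::is_same_v<chosen_t, noload2sh_agent>);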

cub/cub/device/dispatch/tuning/tuning_merge.cuh

Lines changed: 12 additions & 1 deletion
@@ -83,7 +83,18 @@ struct policy_hub
                    BLOCK_STORE_WARP_TRANSPOSE>;
   };

-  using max_policy = policy600;
+  struct policy800 : ChainedPolicy<800, policy800, policy600>
+  {
+    using merge_policy =
+      agent_policy_t<512,
+                     Nominal4BItemsToItems<tune_type>(15),
+                     BLOCK_LOAD_WARP_TRANSPOSE,
+                     LOAD_DEFAULT,
+                     BLOCK_STORE_WARP_TRANSPOSE,
+                     /* UseBlockLoadToShared = */ true>;
+  };
+
+  using max_policy = policy800;
 };
 } // namespace merge
 } // namespace detail
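For scale: policy800 requests 512 threads and a nominal 15 items per thread for 4-byte types. Nominal4BItemsToItems then shrinks that count for larger element types; the helper below re-states the usual CUB scaling rule (scale by 4/sizeof(T), clamp between 1 and the nominal count) as an assumption, not a quote of the library's implementation.

constexpr int nominal_4b_items_to_items(int nominal, int size_of_t)
{
  const int scaled = nominal * 4 / size_of_t;
  return scaled < 1 ? 1 : (scaled > nominal ? nominal : scaled);
}

static_assert(nominal_4b_items_to_items(15, 4) == 15); // int/float: 512 * 15 items per tile
static_assert(nominal_4b_items_to_items(15, 8) == 7);  // double: 512 * 7 items per tile
static_assert(nominal_4b_items_to_items(15, 1) == 15); // char: clamped at the nominal count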
