
Commit 9363b0e
Author: pauleonix
Use BlockLoadToShared in DeviceMerge
1 parent c4d38b7, commit 9363b0e

File tree: 3 files changed, +203 -46 lines changed


cub/cub/agent/agent_merge.cuh
Lines changed: 187 additions & 41 deletions
@@ -14,15 +14,24 @@
 #endif // no system header

 #include <cub/agent/agent_merge_sort.cuh>
-#include <cub/block/block_load.cuh>
+#include <cub/block/block_load_to_shared.cuh>
 #include <cub/block/block_merge_sort.cuh>
 #include <cub/block/block_store.cuh>
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 #include <cub/util_namespace.cuh>
 #include <cub/util_type.cuh>

+#include <thrust/type_traits/is_contiguous_iterator.h>
+#include <thrust/type_traits/unwrap_contiguous_iterator.h>
+
+#include <cuda/__memory/ptr_rebind.h>
 #include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__algorithm/min.h>
+#include <cuda/std/__type_traits/conditional.h>
+#include <cuda/std/__type_traits/is_same.h>
+#include <cuda/std/__type_traits/is_trivially_copyable.h>
+#include <cuda/std/cstddef>
+#include <cuda/std/span>

 CUB_NAMESPACE_BEGIN
 namespace detail
@@ -33,7 +42,8 @@ template <int ThreadsPerBlock,
           int ItemsPerThread,
           BlockLoadAlgorithm LoadAlgorithm,
           CacheLoadModifier LoadCacheModifier,
-          BlockStoreAlgorithm StoreAlgorithm>
+          BlockStoreAlgorithm StoreAlgorithm,
+          bool UseBlockLoadToShared = false>
 struct agent_policy_t
 {
   // do not change data member names, policy_wrapper_t depends on it
@@ -43,6 +53,7 @@ struct agent_policy_t
   static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = LoadAlgorithm;
   static constexpr CacheLoadModifier LOAD_MODIFIER = LoadCacheModifier;
   static constexpr BlockStoreAlgorithm STORE_ALGORITHM = StoreAlgorithm;
+  static constexpr bool use_block_load_to_shared = UseBlockLoadToShared;
 };

 // TODO(bgruber): can we unify this one with AgentMerge in agent_merge_sort.cuh?
@@ -57,45 +68,98 @@ template <typename Policy,
           typename CompareOp>
 struct agent_t
 {
-  using policy = Policy;
+  using policy = Policy;
+  static constexpr int items_per_thread = Policy::ITEMS_PER_THREAD;
+  static constexpr int threads_per_block = Policy::BLOCK_THREADS;
+  static constexpr Offset items_per_tile = Policy::ITEMS_PER_TILE;

   // key and value type are taken from the first input sequence (consistent with old Thrust behavior)
   using key_type = it_value_t<KeysIt1>;
   using item_type = it_value_t<ItemsIt1>;

-  using keys_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt1>;
-  using keys_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt2>;
-  using items_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt1>;
-  using items_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt2>;
+  using block_load_to_shared = cub::detail::BlockLoadToShared<threads_per_block>;
+  using block_store_keys = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type;
+  using block_store_items = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type;
+
+  template <typename ValueT, typename Iter1, typename Iter2>
+  static constexpr bool use_block_load_to_shared =
+    Policy::use_block_load_to_shared && (sizeof(ValueT) == alignof(ValueT))
+    && ::cuda::std::is_trivially_copyable_v<ValueT> //
+    && THRUST_NS_QUALIFIER::is_contiguous_iterator_v<Iter1> //
+    && THRUST_NS_QUALIFIER::is_contiguous_iterator_v<Iter2>
+    && ::cuda::std::is_same_v<ValueT, cub::detail::it_value_t<Iter1>>
+    && ::cuda::std::is_same_v<ValueT, cub::detail::it_value_t<Iter2>>;
+
+  static constexpr bool keys_use_block_load_to_shared = use_block_load_to_shared<key_type, KeysIt1, KeysIt2>;
+  static constexpr bool items_use_block_load_to_shared = use_block_load_to_shared<item_type, ItemsIt1, ItemsIt2>;
+  static constexpr bool need_block_load_to_shared = keys_use_block_load_to_shared || items_use_block_load_to_shared;
+  static constexpr int load2sh_minimum_align = block_load_to_shared::template SharedBufferAlignBytes<char>();
+
+  struct empty_t
+  {
+    struct TempStorage
+    {};
+    _CCCL_DEVICE _CCCL_FORCEINLINE empty_t(TempStorage) {}
+  };
+
+  using optional_load2sh_t = ::cuda::std::conditional_t<need_block_load_to_shared, block_load_to_shared, empty_t>;
+
+  using keys_load_it1 =
+    ::cuda::std::conditional_t<keys_use_block_load_to_shared,
+                               KeysIt1,
+                               try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt1>>;
+  using keys_load_it2 =
+    ::cuda::std::conditional_t<keys_use_block_load_to_shared,
+                               KeysIt2,
+                               try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt2>>;
+  using items_load_it1 =
+    ::cuda::std::conditional_t<items_use_block_load_to_shared,
+                               ItemsIt1,
+                               try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt1>>;
+  using items_load_it2 =
+    ::cuda::std::conditional_t<items_use_block_load_to_shared,
+                               ItemsIt2,
+                               try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt2>>;

-  using block_load_keys1 = typename BlockLoadType<Policy, keys_load_it1>::type;
-  using block_load_keys2 = typename BlockLoadType<Policy, keys_load_it2>::type;
-  using block_load_items1 = typename BlockLoadType<Policy, items_load_it1>::type;
-  using block_load_items2 = typename BlockLoadType<Policy, items_load_it2>::type;
+  template <typename ValueT, bool UseBlockLoadToShared>
+  struct alignas(UseBlockLoadToShared ? block_load_to_shared::template SharedBufferAlignBytes<ValueT>()
+                                      : alignof(ValueT)) buffer_t
+  {
+    // Need extra bytes of padding for TMA because this static buffer has to hold the two dynamically sized buffers.
+    char c_array[UseBlockLoadToShared ? (block_load_to_shared::template SharedBufferSizeBytes<ValueT>(items_per_tile + 1)
+                                         + 2 * load2sh_minimum_align)
+                                      : sizeof(ValueT) * (items_per_tile + 1)];
+  };

-  using block_store_keys = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type;
-  using block_store_items = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type;
+  struct temp_storages_bl2sh
+  {
+    union
+    {
+      typename block_store_keys::TempStorage store_keys;
+      typename block_store_items::TempStorage store_items;
+      buffer_t<key_type, keys_use_block_load_to_shared> keys_shared;
+      buffer_t<item_type, items_use_block_load_to_shared> items_shared;
+    };
+    typename block_load_to_shared::TempStorage load2sh;
+  };

-  union temp_storages
+  union temp_storages_fallback
   {
-    typename block_load_keys1::TempStorage load_keys1;
-    typename block_load_keys2::TempStorage load_keys2;
-    typename block_load_items1::TempStorage load_items1;
-    typename block_load_items2::TempStorage load_items2;
     typename block_store_keys::TempStorage store_keys;
     typename block_store_items::TempStorage store_items;

-    key_type keys_shared[Policy::ITEMS_PER_TILE + 1];
-    item_type items_shared[Policy::ITEMS_PER_TILE + 1];
+    buffer_t<key_type, keys_use_block_load_to_shared> keys_shared;
+    buffer_t<item_type, items_use_block_load_to_shared> items_shared;
+
+    typename empty_t::TempStorage load2sh;
   };

+  using temp_storages =
+    ::cuda::std::conditional_t<need_block_load_to_shared, temp_storages_bl2sh, temp_storages_fallback>;
+
   struct TempStorage : Uninitialized<temp_storages>
   {};

-  static constexpr int items_per_thread = Policy::ITEMS_PER_THREAD;
-  static constexpr int threads_per_block = Policy::BLOCK_THREADS;
-  static constexpr Offset items_per_tile = Policy::ITEMS_PER_TILE;
-
   // Per thread data
   temp_storages& storage;
   keys_load_it1 keys1_in;
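Note on the hunk above: the typed keys_shared/items_shared arrays become buffer_t, an over-aligned raw byte buffer that can back either path. The bulk-copy path needs the alignment and extra padding reported by BlockLoadToShared (the two sub-buffers are placed dynamically inside it), while the fallback path simply reinterprets the bytes via cuda::ptr_rebind. A minimal standalone sketch of that pattern, with a placeholder value type, block size, and tile size, and assuming the SharedBufferAlignBytes/SharedBufferSizeBytes helpers behave exactly as this diff uses them:

// Sketch only: value_t, the block size, and tile_items are placeholders; the
// BlockLoadToShared helpers are assumed to work as used in the diff above.
#include <cub/block/block_load_to_shared.cuh>
#include <cuda/__memory/ptr_rebind.h>

using loader_t = cub::detail::BlockLoadToShared<256 /* threads per block */>;
using value_t  = float;
constexpr int tile_items = 256 * 16;

// Over-aligned, type-erased storage: sized for the bulk-copy path (two dynamically
// placed sub-buffers plus alignment padding) but reusable as a plain value_t array.
struct alignas(loader_t::SharedBufferAlignBytes<value_t>()) tile_buffer_t
{
  char bytes[loader_t::SharedBufferSizeBytes<value_t>(tile_items + 1)
             + 2 * loader_t::SharedBufferAlignBytes<char>()];
};

__device__ void fallback_view(tile_buffer_t& buf)
{
  // Fallback path: reinterpret the bytes as value_t, as the diff does with
  // ::cuda::ptr_rebind on storage.keys_shared.c_array.
  value_t* typed = &::cuda::ptr_rebind<value_t>(buf.bytes)[0];
  typed[threadIdx.x] = value_t{};
}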
@@ -128,18 +192,49 @@ struct agent_t
     const int num_keys1 = static_cast<int>(keys1_end - keys1_beg);
     const int num_keys2 = static_cast<int>(keys2_end - keys2_beg);

-    key_type keys_loc[items_per_thread];
-    merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
-      keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, num_keys1, num_keys2);
-    merge_sort::reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc);
-    __syncthreads();
+    optional_load2sh_t load2sh{storage.load2sh};
+
+    key_type* keys1_shared;
+    key_type* keys2_shared;
+    int keys2_offset;
+    if constexpr (keys_use_block_load_to_shared)
+    {
+      ::cuda::std::span keys1_src{THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(keys1_in + keys1_beg),
+                                  static_cast<::cuda::std::size_t>(num_keys1)};
+      ::cuda::std::span keys2_src{THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(keys2_in + keys2_beg),
+                                  static_cast<::cuda::std::size_t>(num_keys2)};
+      ::cuda::std::span keys_buffers{storage.keys_shared.c_array};
+      auto keys1_buffer = keys_buffers.first(block_load_to_shared::template SharedBufferSizeBytes<key_type>(num_keys1));
+      auto keys2_buffer = keys_buffers.last(block_load_to_shared::template SharedBufferSizeBytes<key_type>(num_keys2));
+      _CCCL_ASSERT(keys1_buffer.end() <= keys2_buffer.begin(),
+                   "Keys buffer needs to be appropriately sized (internal)");
+      auto keys1_sh = load2sh.template CopyAsync<key_type>(keys1_buffer, keys1_src);
+      auto keys2_sh = load2sh.template CopyAsync<key_type>(keys2_buffer, keys2_src);
+      load2sh.Commit();
+      keys1_shared = data(keys1_sh);
+      keys2_shared = data(keys2_sh);
+      // Needed for using keys1_shared as one big buffer including both ranges in SerialMerge
+      keys2_offset = static_cast<int>(keys2_shared - keys1_shared);
+      load2sh.Wait();
+    }
+    else
+    {
+      key_type keys_loc[items_per_thread];
+      merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
+        keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, num_keys1, num_keys2);
+      keys1_shared = &::cuda::ptr_rebind<key_type>(storage.keys_shared.c_array)[0];
+      // Needed for using keys1_shared as one big buffer including both ranges in SerialMerge
+      keys2_offset = num_keys1;
+      keys2_shared = keys1_shared + keys2_offset;
+      merge_sort::reg_to_shared<threads_per_block>(keys1_shared, keys_loc);
+      __syncthreads();
+    }

     // use binary search in shared memory to find merge path for each of thread.
     // we can use int type here, because the number of items in shared memory is limited
     const int diag0_loc = (::cuda::std::min) (num_keys1 + num_keys2, static_cast<int>(items_per_thread * threadIdx.x));

-    const int keys1_beg_loc =
-      MergePath(&storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_loc, compare_op);
+    const int keys1_beg_loc = MergePath(keys1_shared, keys2_shared, num_keys1, num_keys2, diag0_loc, compare_op);
     const int keys1_end_loc = num_keys1;
     const int keys2_beg_loc = diag0_loc - keys1_beg_loc;
     const int keys2_end_loc = num_keys2;
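The load-to-shared branch above follows one pattern: carve two byte sub-buffers out of the shared tile buffer (range 1 at the front, range 2 at the back), issue one CopyAsync per input range, Commit to launch the bulk copies, and Wait before reading the data. A condensed sketch of that sequence outside the agent, with placeholder names and sizes, assuming the BlockLoadToShared interface exactly as it appears in this diff:

// Sketch only: shows the CopyAsync/Commit/Wait sequence from the hunk above in
// isolation; n1/n2 and the static buffer size are placeholders.
#include <cub/block/block_load_to_shared.cuh>
#include <cuda/std/cstddef>
#include <cuda/std/span>

using loader_t = cub::detail::BlockLoadToShared<256>;

__device__ int sum_of_heads(const int* in1, int n1, const int* in2, int n2)
{
  __shared__ typename loader_t::TempStorage tmp;
  __shared__ alignas(loader_t::SharedBufferAlignBytes<int>()) char raw[8 * 1024];

  loader_t loader{tmp};
  ::cuda::std::span<char> bytes{raw};
  // Range 1 at the front, range 2 at the back, mirroring first()/last() above.
  auto buf1 = bytes.first(loader_t::SharedBufferSizeBytes<int>(n1));
  auto buf2 = bytes.last(loader_t::SharedBufferSizeBytes<int>(n2));

  auto sh1 = loader.CopyAsync<int>(buf1, ::cuda::std::span<const int>{in1, static_cast<::cuda::std::size_t>(n1)});
  auto sh2 = loader.CopyAsync<int>(buf2, ::cuda::std::span<const int>{in2, static_cast<::cuda::std::size_t>(n2)});
  loader.Commit(); // start the asynchronous bulk copies
  // ...independent work could overlap here...
  loader.Wait();   // the shared-memory views are valid for the whole block after this
  return sh1[0] + sh2[0]; // assumes n1 > 0 and n2 > 0
}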
@@ -148,11 +243,12 @@ struct agent_t
     const int num_keys2_loc = keys2_end_loc - keys2_beg_loc;

     // perform serial merge
+    key_type keys_loc[items_per_thread];
     int indices[items_per_thread];
     cub::SerialMerge(
-      &storage.keys_shared[0],
+      keys1_shared,
       keys1_beg_loc,
-      keys2_beg_loc + num_keys1,
+      keys2_offset + keys2_beg_loc,
       num_keys1_loc,
       num_keys2_loc,
       keys_loc,
@@ -174,19 +270,69 @@ struct agent_t
     static constexpr bool have_items = !::cuda::std::is_same_v<item_type, NullType>;
     if constexpr (have_items)
     {
-      item_type items_loc[items_per_thread];
-      merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
-        items_loc, items1_in + keys1_beg, items2_in + keys2_beg, num_keys1, num_keys2);
-      __syncthreads(); // block_store_keys above uses shared memory, so make sure all threads are done before we write
-                       // to it
-      merge_sort::reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc);
-      __syncthreads();
+      const auto translate_indices = [&](int items2_offset) -> void {
+        const int diff = items2_offset - keys2_offset;
+        _CCCL_PRAGMA_UNROLL_FULL()
+        for (int i = 0; i < items_per_thread; ++i)
+        {
+          if (indices[i] >= keys2_offset)
+          {
+            indices[i] += diff;
+          }
+        }
+      };
+
+      item_type* items1_shared;
+      item_type* items2_shared;
+      int items2_offset;
+      if constexpr (keys_use_block_load_to_shared)
+      {
+        ::cuda::std::span items1_src{THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(items1_in + keys1_beg),
+                                     static_cast<::cuda::std::size_t>(num_keys1)};
+        ::cuda::std::span items2_src{THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(items2_in + keys2_beg),
+                                     static_cast<::cuda::std::size_t>(num_keys2)};
+        ::cuda::std::span items_buffers{storage.items_shared.c_array};
+        auto items1_buffer =
+          items_buffers.first(block_load_to_shared::template SharedBufferSizeBytes<item_type>(num_keys1));
+        auto items2_buffer =
+          items_buffers.last(block_load_to_shared::template SharedBufferSizeBytes<item_type>(num_keys2));
+        _CCCL_ASSERT(items1_buffer.end() <= items2_buffer.begin(),
+                     "Items buffer needs to be appropriately sized (internal)");
+        // block_store_keys above uses shared memory, so make sure all threads are done before we write
+        __syncthreads();
+        auto items1_sh = load2sh.template CopyAsync<item_type>(items1_buffer, items1_src);
+        auto items2_sh = load2sh.template CopyAsync<item_type>(items2_buffer, items2_src);
+        load2sh.Commit();
+        items1_shared = data(items1_sh);
+        items2_shared = data(items2_sh);
+        items2_offset = static_cast<int>(items2_shared - items1_shared);
+        translate_indices(items2_offset);
+        load2sh.Wait();
+      }
+      else
+      {
+        item_type items_loc[items_per_thread];
+        merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
+          items_loc, items1_in + keys1_beg, items2_in + keys2_beg, num_keys1, num_keys2);
+        __syncthreads(); // block_store_keys above uses shared memory, so make sure all threads are done before we write
+                         // to it
+        items1_shared = &::cuda::ptr_rebind<item_type>(storage.items_shared.c_array)[0];
+        items2_offset = num_keys1;
+        items2_shared = items1_shared + items2_offset;
+        if constexpr (keys_use_block_load_to_shared)
+        {
+          translate_indices(items2_offset);
+        }
+        merge_sort::reg_to_shared<threads_per_block>(items1_shared, items_loc);
+        __syncthreads();
+      }

       // gather items from shared mem
+      item_type items_loc[items_per_thread];
       _CCCL_PRAGMA_UNROLL_FULL()
       for (int i = 0; i < items_per_thread; ++i)
       {
-        items_loc[i] = storage.items_shared[indices[i]];
+        items_loc[i] = items1_shared[indices[i]];
       }
       __syncthreads();

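The translate_indices lambda in the hunk above exists because SerialMerge produces indices into the keys buffer, where range 2 starts at keys2_offset, while the values can land at a different relative offset (items2_offset) in their own buffer; any index that points into range 2 therefore has to be shifted by the difference. A small illustration of that arithmetic, with made-up offsets:

// Illustration only, with hypothetical offsets.
int translate(int idx, int keys2_offset, int items2_offset)
{
  // Indices below keys2_offset address range 1 and keep their position;
  // indices at or above it address range 2 and are shifted by the difference.
  return idx < keys2_offset ? idx : idx + (items2_offset - keys2_offset);
}
// Example: keys2_offset = 100, items2_offset = 128.
// idx = 42  -> 42   (an element of range 1, unchanged)
// idx = 105 -> 133  (6th element of range 2, moved to its items-buffer position)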
cub/cub/device/dispatch/dispatch_merge.cuh
Lines changed: 4 additions & 4 deletions
@@ -143,11 +143,11 @@ __launch_bounds__(
   auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage);
   MergeAgent{
     temp_storage.Alias(),
-    try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(keys1),
-    try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(items1),
+    keys1,
+    items1,
     num_keys1,
-    try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(keys2),
-    try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(items2),
+    keys2,
+    items2,
     num_keys2,
     keys_result,
     items_result,

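The kernel now passes the raw keys1/items1/keys2/items2 iterators because the choice of load iterator moved into the agent (see agent_merge.cuh above): the bulk-copy path needs the original contiguous iterators so they can be unwrapped to plain pointers, while the fallback path still wraps them in a cache-modified iterator. A sketch of that per-path selection in isolation, reusing the names from the agent diff rather than introducing new API:

// Sketch only: restates the conditional_t selection made inside the agent above;
// try_make_cache_modified_iterator_t is the helper used there (assumed to live in
// cub::detail, pulled in here via the agent header itself).
#include <cub/agent/agent_merge.cuh>

template <bool UseBulkCopy, cub::CacheLoadModifier Modifier, typename Iterator>
using merge_load_iterator_t =
  ::cuda::std::conditional_t<UseBulkCopy,
                             Iterator, // keep the raw contiguous iterator so it can be unwrapped to a pointer
                             cub::detail::try_make_cache_modified_iterator_t<Modifier, Iterator>>;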
cub/cub/device/dispatch/tuning/tuning_merge.cuh
Lines changed: 12 additions & 1 deletion
@@ -83,7 +83,18 @@ struct policy_hub
                   BLOCK_STORE_WARP_TRANSPOSE>;
   };

-  using max_policy = policy600;
+  struct policy800 : ChainedPolicy<800, policy800, policy600>
+  {
+    using merge_policy =
+      agent_policy_t<512,
+                     Nominal4BItemsToItems<tune_type>(15),
+                     BLOCK_LOAD_WARP_TRANSPOSE,
+                     LOAD_DEFAULT,
+                     BLOCK_STORE_WARP_TRANSPOSE,
+                     /* UseBlockLoadToShared = */ true>;
+  };
+
+  using max_policy = policy800;
 };
 } // namespace merge
 } // namespace detail

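policy800 only changes how the merge kernel is tuned and dispatched; callers keep using the existing cub::DeviceMerge entry points, and the BlockLoadToShared path is picked automatically when the code runs on SM 8.0 or newer and the inputs qualify (contiguous iterators, trivially copyable types, as gated in agent_merge.cuh). A minimal usage sketch, assuming the established two-phase DeviceMerge::MergeKeys interface and hypothetical device pointers:

// Hedged usage sketch: d_a, d_b, d_out are assumed to be device arrays, with
// d_a and d_b each sorted ascending. The call itself is the pre-existing
// DeviceMerge API; only the tuning policy selected underneath changes.
#include <cub/device/device_merge.cuh>
#include <cuda/std/functional>
#include <cuda_runtime.h>
#include <cstddef>

cudaError_t merge_sorted(const int* d_a, int num_a, const int* d_b, int num_b, int* d_out)
{
  void* d_temp_storage           = nullptr;
  std::size_t temp_storage_bytes = 0;
  // First call: query the required temporary storage size.
  cub::DeviceMerge::MergeKeys(d_temp_storage, temp_storage_bytes, d_a, num_a, d_b, num_b, d_out, ::cuda::std::less<int>{});
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Second call: run the merge; on SM 8.0+ this dispatches through policy800 above.
  const cudaError_t error =
    cub::DeviceMerge::MergeKeys(d_temp_storage, temp_storage_bytes, d_a, num_a, d_b, num_b, d_out, ::cuda::std::less<int>{});
  cudaFree(d_temp_storage);
  return error;
}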