From 1c6087b41275caf92ced77f7aed3f0d629648804 Mon Sep 17 00:00:00 2001
From: gonidelis <ggonidelis@nvidia.com>
Date: Wed, 5 Nov 2025 09:55:49 -0800
Subject: [PATCH] Move more block scan examples to separate test executable

---
 cub/cub/block/block_scan.cuh             | 226 +++++------------------
 cub/examples/block/example_block_scan.cu | 135 +++++++++++++-
 2 files changed, 175 insertions(+), 186 deletions(-)

diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh
index 6b774a96a2e..20b1a0901b6 100644
--- a/cub/cub/block/block_scan.cuh
+++ b/cub/cub/block/block_scan.cuh
@@ -362,56 +362,17 @@ public:
   //! prefix functor to maintain a running total between block-wide scans.  Each tile consists
   //! of 128 integer items that are partitioned across 128 threads.
   //!
-  //! .. code-block:: c++
-  //!
-  //!    #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
-  //!
-  //!    // A stateful callback functor that maintains a running prefix to be applied
-  //!    // during consecutive scan operations.
-  //!    struct BlockPrefixCallbackOp
-  //!    {
-  //!        // Running prefix
-  //!        int running_total;
-  //!
-  //!        // Constructor
-  //!        __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-  //!
-  //!        // Callback operator to be entered by the first warp of threads in the block.
-  //!        // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-  //!        __device__ int operator()(int block_aggregate)
-  //!        {
-  //!            int old_prefix = running_total;
-  //!            running_total += block_aggregate;
-  //!            return old_prefix;
-  //!        }
-  //!    };
-  //!
-  //!    __global__ void ExampleKernel(int *d_data, int num_items, ...)
-  //!    {
-  //!        // Specialize BlockScan for a 1D block of 128 threads
-  //!        using BlockScan = cub::BlockScan<int, 128>;
-  //!
-  //!        // Allocate shared memory for BlockScan
-  //!        __shared__ typename BlockScan::TempStorage temp_storage;
-  //!
-  //!        // Initialize running total
-  //!        BlockPrefixCallbackOp prefix_op(0);
-  //!
-  //!        // Have the block iterate over segments of items
-  //!        for (int block_offset = 0; block_offset < num_items; block_offset += 128)
-  //!        {
-  //!            // Load a segment of consecutive items that are blocked across threads
-  //!            int thread_data = d_data[block_offset + threadIdx.x];
-  //!
-  //!            // Collectively compute the block-wide exclusive prefix sum
-  //!            BlockScan(temp_storage).ExclusiveSum(
-  //!                thread_data, thread_data, prefix_op);
-  //!            __syncthreads();
+  //! .. literalinclude:: ../../examples/block/example_block_scan.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin block-prefix-callback-op
+  //!     :end-before: example-end block-prefix-callback-op
   //!
-  //!            // Store scanned items to output segment
-  //!            d_data[block_offset + threadIdx.x] = thread_data;
-  //!        }
-  //!    }
+  //! .. literalinclude:: ../../examples/block/example_block_scan.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin exclusive-sum-single-prefix-callback
+  //!     :end-before: example-end exclusive-sum-single-prefix-callback
   //!
   //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
   //! The corresponding output for the first segment will be ``0, 1, ..., 127``.
@@ -460,25 +421,11 @@ public:
   //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
   //! where each thread owns 4 consecutive items.
   //!
-  //! .. code-block:: c++
-  //!
-  //!    #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-  //!
-  //!    __global__ void ExampleKernel(...)
-  //!    {
-  //!        // Specialize BlockScan for a 1D block of 128 threads of type int
-  //!        using BlockScan = cub::BlockScan<int, 128>;
-  //!
-  //!        // Allocate shared memory for BlockScan
-  //!        __shared__ typename BlockScan::TempStorage temp_storage;
-  //!
-  //!        // Obtain a segment of consecutive items that are blocked across threads
-  //!        int thread_data[4];
-  //!        ...
-  //!
-  //!        // Collectively compute the block-wide exclusive prefix sum
-  //!        BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
-  //!    }
+  //! .. literalinclude:: ../../examples/block/example_block_scan.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin exclusive-sum-array
+  //!     :end-before: example-end exclusive-sum-array
   //!
   //! Suppose the set of input ``thread_data`` across the block of threads is
   //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
@@ -521,26 +468,11 @@ public:
   //! a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads where each thread owns
   //! 4 consecutive items.
   //!
-  //! .. code-block:: c++
-  //!
-  //!    #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
-  //!
-  //!    __global__ void ExampleKernel(...)
-  //!    {
-  //!        // Specialize BlockScan for a 1D block of 128 threads of type int
-  //!        using BlockScan = cub::BlockScan<int, 128>;
-  //!
-  //!        // Allocate shared memory for BlockScan
-  //!        __shared__ typename BlockScan::TempStorage temp_storage;
-  //!
-  //!        // Obtain a segment of consecutive items that are blocked across threads
-  //!        int thread_data[4];
-  //!        ...
-  //!
-  //!        // Collectively compute the block-wide exclusive prefix sum
-  //!        int block_aggregate;
-  //!        BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
-  //!    }
+  //! .. literalinclude:: ../../examples/block/example_block_scan.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin exclusive-sum-array-aggregate
+  //!     :end-before: example-end exclusive-sum-array-aggregate
   //!
   //! Suppose the set of input ``thread_data`` across the block of threads is
   //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
@@ -657,25 +589,11 @@ public:
   //! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
   //! are partitioned across 128 threads.
   //!
-  //! .. code-block:: c++
-  //!
-  //!    #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-  //!
-  //!    __global__ void ExampleKernel(...)
-  //!    {
-  //!        // Specialize BlockScan for a 1D block of 128 threads of type int
-  //!        using BlockScan = cub::BlockScan<int, 128>;
-  //!
-  //!        // Allocate shared memory for BlockScan
-  //!        __shared__ typename BlockScan::TempStorage temp_storage;
-  //!
-  //!        // Obtain input item for each thread
-  //!        int thread_data;
-  //!        ...
-  //!
-  //!        // Collectively compute the block-wide exclusive prefix max scan
-  //!        BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
-  //!    }
+  //! .. literalinclude:: ../../examples/block/example_block_scan.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin exclusive-scan-single
+  //!     :end-before: example-end exclusive-scan-single
   //!
   //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
   //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
@@ -1314,25 +1232,11 @@ public:
   //! The code snippet below illustrates an inclusive prefix sum of 128 integer items that
   //! are partitioned across 128 threads.
   //!
-  //! .. code-block:: c++
-  //!
-  //!    #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-  //!
-  //!    __global__ void ExampleKernel(...)
-  //!    {
-  //!        // Specialize BlockScan for a 1D block of 128 threads of type int
-  //!        using BlockScan = cub::BlockScan<int, 128>;
-  //!
-  //!        // Allocate shared memory for BlockScan
-  //!        __shared__ typename BlockScan::TempStorage temp_storage;
-  //!
-  //!        // Obtain input item for each thread
-  //!        int thread_data;
-  //!        ...
-  //!
-  //!        // Collectively compute the block-wide inclusive prefix sum
-  //!        int block_aggregate;
-  //!        BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
+  //! .. literalinclude:: ../../examples/block/example_block_scan.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin inclusive-sum-single-aggregate
+  //!     :end-before: example-end inclusive-sum-single-aggregate
   //!
   //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
   //! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``.
@@ -1526,25 +1430,11 @@ public:
   //! are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
   //! where each thread owns 4 consecutive items.
   //!
-  //! .. code-block:: c++
-  //!
-  //!    #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
-  //!
-  //!    __global__ void ExampleKernel(...)
-  //!    {
-  //!        // Specialize BlockScan for a 1D block of 128 threads of type int
-  //!        using BlockScan = cub::BlockScan<int, 128>;
-  //!
-  //!        // Allocate shared memory for BlockScan
-  //!        __shared__ typename BlockScan::TempStorage temp_storage;
-  //!
-  //!        // Obtain a segment of consecutive items that are blocked across threads
-  //!        int thread_data[4];
-  //!        ...
-  //!
-  //!        // Collectively compute the block-wide inclusive prefix sum
-  //!        int block_aggregate;
-  //!        BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
+  //! .. literalinclude:: ../../examples/block/example_block_scan.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin inclusive-sum-array-aggregate
+  //!     :end-before: example-end inclusive-sum-array-aggregate
   //!
   //! Suppose the set of input ``thread_data`` across the block of threads is
   //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The
@@ -1686,24 +1576,11 @@ public:
   //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
   //! are partitioned across 128 threads.
   //!
-  //! .. code-block:: c++
-  //!
-  //!    #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-  //!
-  //!    __global__ void ExampleKernel(...)
-  //!    {
-  //!        // Specialize BlockScan for a 1D block of 128 threads of type int
-  //!        using BlockScan = cub::BlockScan<int, 128>;
-  //!
-  //!        // Allocate shared memory for BlockScan
-  //!        __shared__ typename BlockScan::TempStorage temp_storage;
-  //!
-  //!        // Obtain input item for each thread
-  //!        int thread_data;
-  //!        ...
-  //!
-  //!        // Collectively compute the block-wide inclusive prefix max scan
-  //!        BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
+  //! .. literalinclude:: ../../examples/block/example_block_scan.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin inclusive-scan-single
+  //!     :end-before: example-end inclusive-scan-single
   //!
   //! Suppose the set of input ``thread_data`` across the block of threads is
   //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data``
@@ -1879,24 +1756,11 @@ public:
   //! are partitioned in a [<em>blocked arrangement</em>](../index.html#sec5sec3) across 128 threads
   //! where each thread owns 4 consecutive items.
   //!
-  //! .. code-block:: c++
-  //!
-  //!    #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-  //!
-  //!    __global__ void ExampleKernel(...)
-  //!    {
-  //!        // Specialize BlockScan for a 1D block of 128 threads of type int
-  //!        using BlockScan = cub::BlockScan<int, 128>;
-  //!
-  //!        // Allocate shared memory for BlockScan
-  //!        __shared__ typename BlockScan::TempStorage temp_storage;
-  //!
-  //!        // Obtain a segment of consecutive items that are blocked across threads
-  //!        int thread_data[4];
-  //!        ...
-  //!
-  //!        // Collectively compute the block-wide inclusive prefix max scan
-  //!        BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
+  //! .. literalinclude:: ../../examples/block/example_block_scan.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin inclusive-scan-array
+  //!     :end-before: example-end inclusive-scan-array
   //!
   //! Suppose the set of input ``thread_data`` across the block of threads is
   //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``.
diff --git a/cub/examples/block/example_block_scan.cu b/cub/examples/block/example_block_scan.cu
index 09e23d1c0d9..27b0cfe0b64 100644
--- a/cub/examples/block/example_block_scan.cu
+++ b/cub/examples/block/example_block_scan.cu
@@ -1,5 +1,5 @@
 // SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
-// SPDX-FileCopyrightText: Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+// SPDX-FileCopyrightText: Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved.
 // SPDX-License-Identifier: BSD-3
 
 /******************************************************************************
@@ -401,6 +401,26 @@ __global__ void ExclusiveSumPrefixCallbackKernel(int* d_data, int num_items)
 }
 // example-end exclusive-sum-prefix-callback
 
+// example-begin exclusive-scan-single
+__global__ void ExclusiveScanSingleKernel(int* d_data)
+{
+  // Specialize BlockScan for a 1D block of 128 threads of type int
+  using BlockScan = cub::BlockScan<int, 128>;
+
+  // Allocate shared memory for BlockScan
+  __shared__ typename BlockScan::TempStorage temp_storage;
+
+  // Obtain input item for each thread
+  int thread_data = d_data[threadIdx.x];
+
+  // Collectively compute the block-wide exclusive prefix max scan
+  BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
+
+  // Store result
+  d_data[threadIdx.x] = thread_data;
+}
+// example-end exclusive-scan-single
+
 // example-begin inclusive-scan-prefix-callback
 __global__ void InclusiveSumPrefixCallbackKernel(int* d_data, int num_items)
 {
@@ -503,7 +523,7 @@ __global__ void ExclusiveScanArrayKernel(int* d_data)
   }
 
   // Collectively compute the block-wide exclusive prefix max scan
-  BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, ::cuda::maximum<>{});
+  BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
 
   // Store results
   for (int i = 0; i < 4; i++)
@@ -527,7 +547,7 @@ __global__ void ExclusiveScanAggregateKernel(int* d_data)
 
   // Collectively compute the block-wide exclusive prefix max scan
   int block_aggregate;
-  BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, ::cuda::maximum<>{}, block_aggregate);
+  BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{}, block_aggregate);
 
   // Store result
   d_data[threadIdx.x] = thread_data;
@@ -596,6 +616,62 @@ __global__ void ExclusiveScanPrefixCallbackKernel(int* d_data, int num_items)
 }
 // example-end exclusive-scan-prefix-callback
 
+// example-begin exclusive-sum-single-prefix-callback
+__global__ void ExclusiveSumSinglePrefixCallbackKernel(int* d_data, int num_items)
+{
+  // Specialize BlockScan for a 1D block of 128 threads
+  using BlockScan = cub::BlockScan<int, 128>;
+
+  // Allocate shared memory for BlockScan
+  __shared__ typename BlockScan::TempStorage temp_storage;
+
+  // Initialize running total
+  BlockPrefixCallbackOp prefix_op(0);
+
+  // Have the block iterate over segments of items
+  for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+  {
+    // Load a segment of consecutive items that are blocked across threads
+    int thread_data = d_data[block_offset + threadIdx.x];
+
+    // Collectively compute the block-wide exclusive prefix sum
+    BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, prefix_op);
+    __syncthreads();
+
+    // Store scanned items to output segment
+    d_data[block_offset + threadIdx.x] = thread_data;
+  }
+}
+// example-end exclusive-sum-single-prefix-callback
+
+// example-begin exclusive-sum-array-aggregate
+__global__ void ExclusiveSumArrayAggregateKernel(int* d_data)
+{
+  // Specialize BlockScan for a 1D block of 128 threads of type int
+  using BlockScan = cub::BlockScan<int, 128>;
+
+  // Allocate shared memory for BlockScan
+  __shared__ typename BlockScan::TempStorage temp_storage;
+
+  // Obtain a segment of consecutive items that are blocked across threads
+  int thread_data[4];
+  for (int i = 0; i < 4; i++)
+  {
+    thread_data[i] = d_data[threadIdx.x * 4 + i];
+  }
+
+  // Collectively compute the block-wide exclusive prefix sum
+  int block_aggregate;
+  BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
+
+  // Store results
+  for (int i = 0; i < 4; i++)
+  {
+    d_data[threadIdx.x * 4 + i] = thread_data[i];
+  }
+}
+// example-end exclusive-sum-array-aggregate
+
 // example-begin inclusive-scan-array
 __global__ void InclusiveScanArrayKernel(int* d_data)
 {
@@ -636,7 +712,7 @@ __global__ void InclusiveScanSingleKernel(int* d_data)
   int thread_data = d_data[threadIdx.x];
 
   // Collectively compute the block-wide inclusive prefix max scan
-  BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, ::cuda::maximum<>{});
+  BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{});
 
   // Store result
   d_data[threadIdx.x] = thread_data;
@@ -671,7 +747,7 @@ __global__ void InclusiveScanPrefixCallbackKernel(int* d_data, int num_items)
     __syncthreads();
 
     // Collectively compute the block-wide inclusive prefix max
-    BlockScanT(temp_storage.scan).InclusiveScan(thread_data, thread_data, ::cuda::maximum<>{}, prefix_op);
+    BlockScanT(temp_storage.scan).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}, prefix_op);
     __syncthreads();
 
     // Store scanned items to output segment
@@ -681,6 +757,55 @@ __global__ void InclusiveScanPrefixCallbackKernel(int* d_data, int num_items)
 }
 // example-end inclusive-scan-prefix-callback-max
 
+// example-begin inclusive-sum-array-aggregate
+__global__ void InclusiveSumArrayAggregateKernel(int* d_data)
+{
+  // Specialize BlockScan for a 1D block of 128 threads of type int
+  using BlockScan = cub::BlockScan<int, 128>;
+
+  // Allocate shared memory for BlockScan
+  __shared__ typename BlockScan::TempStorage temp_storage;
+
+  // Obtain a segment of consecutive items that are blocked across threads
+  int thread_data[4];
+  for (int i = 0; i < 4; i++)
+  {
+    thread_data[i] = d_data[threadIdx.x * 4 + i];
+  }
+
+  // Collectively compute the block-wide inclusive prefix sum
+  int block_aggregate;
+  BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
+
+  // Store results
+  for (int i = 0; i < 4; i++)
+  {
+    d_data[threadIdx.x * 4 + i] = thread_data[i];
+  }
+}
+// example-end inclusive-sum-array-aggregate
+
+// example-begin inclusive-sum-single-aggregate
+__global__ void InclusiveSumSingleAggregateKernel(int* d_data)
+{
+  // Specialize BlockScan for a 1D block of 128 threads of type int
+  using BlockScan = cub::BlockScan<int, 128>;
+
+  // Allocate shared memory for BlockScan
+  __shared__ typename BlockScan::TempStorage temp_storage;
+
+  // Obtain input item for each thread
+  int thread_data = d_data[threadIdx.x];
+
+  // Collectively compute the block-wide inclusive prefix sum
+  int block_aggregate;
+  BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
+
+  // Store result
+  d_data[threadIdx.x] = thread_data;
+}
+// example-end inclusive-sum-single-aggregate
+
 /**
  * Test documentation example kernels
  */