From 1c6087b41275caf92ced77f7aed3f0d629648804 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Wed, 5 Nov 2025 09:55:49 -0800 Subject: [PATCH] Move more block scan examples to separate test executable --- cub/cub/block/block_scan.cuh | 226 +++++------------------ cub/examples/block/example_block_scan.cu | 135 +++++++++++++- 2 files changed, 175 insertions(+), 186 deletions(-) diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh index 6b774a96a2e..20b1a0901b6 100644 --- a/cub/cub/block/block_scan.cuh +++ b/cub/cub/block/block_scan.cuh @@ -362,56 +362,17 @@ public: //! prefix functor to maintain a running total between block-wide scans. Each tile consists //! of 128 integer items that are partitioned across 128 threads. //! - //! .. code-block:: c++ - //! - //! #include // or equivalently - //! - //! // A stateful callback functor that maintains a running prefix to be applied - //! // during consecutive scan operations. - //! struct BlockPrefixCallbackOp - //! { - //! // Running prefix - //! int running_total; - //! - //! // Constructor - //! __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - //! - //! // Callback operator to be entered by the first warp of threads in the block. - //! // Thread-0 is responsible for returning a value for seeding the block-wide scan. - //! __device__ int operator()(int block_aggregate) - //! { - //! int old_prefix = running_total; - //! running_total += block_aggregate; - //! return old_prefix; - //! } - //! }; - //! - //! __global__ void ExampleKernel(int *d_data, int num_items, ...) - //! { - //! // Specialize BlockScan for a 1D block of 128 threads - //! using BlockScan = cub::BlockScan; - //! - //! // Allocate shared memory for BlockScan - //! __shared__ typename BlockScan::TempStorage temp_storage; - //! - //! // Initialize running total - //! BlockPrefixCallbackOp prefix_op(0); - //! - //! // Have the block iterate over segments of items - //! for (int block_offset = 0; block_offset < num_items; block_offset += 128) - //! { - //! // Load a segment of consecutive items that are blocked across threads - //! int thread_data = d_data[block_offset + threadIdx.x]; - //! - //! // Collectively compute the block-wide exclusive prefix sum - //! BlockScan(temp_storage).ExclusiveSum( - //! thread_data, thread_data, prefix_op); - //! __syncthreads(); + //! .. literalinclude:: ../../examples/block/example_block_scan.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin block-prefix-callback-op + //! :end-before: example-end block-prefix-callback-op //! - //! // Store scanned items to output segment - //! d_data[block_offset + threadIdx.x] = thread_data; - //! } - //! } + //! .. literalinclude:: ../../examples/block/example_block_scan.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin exclusive-sum-single-prefix-callback + //! :end-before: example-end exclusive-sum-single-prefix-callback //! //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``. //! The corresponding output for the first segment will be ``0, 1, ..., 127``. @@ -460,25 +421,11 @@ public: //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! - //! .. code-block:: c++ - //! - //! #include // or equivalently - //! - //! __global__ void ExampleKernel(...) - //! { - //! // Specialize BlockScan for a 1D block of 128 threads of type int - //! using BlockScan = cub::BlockScan; - //! - //! // Allocate shared memory for BlockScan - //! __shared__ typename BlockScan::TempStorage temp_storage; - //! - //! // Obtain a segment of consecutive items that are blocked across threads - //! int thread_data[4]; - //! ... - //! - //! // Collectively compute the block-wide exclusive prefix sum - //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); - //! } + //! .. literalinclude:: ../../examples/block/example_block_scan.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin exclusive-sum-array + //! :end-before: example-end exclusive-sum-array //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. @@ -521,26 +468,11 @@ public: //! a :ref:`blocked arrangement ` across 128 threads where each thread owns //! 4 consecutive items. //! - //! .. code-block:: c++ - //! - //! #include // or equivalently - //! - //! __global__ void ExampleKernel(...) - //! { - //! // Specialize BlockScan for a 1D block of 128 threads of type int - //! using BlockScan = cub::BlockScan; - //! - //! // Allocate shared memory for BlockScan - //! __shared__ typename BlockScan::TempStorage temp_storage; - //! - //! // Obtain a segment of consecutive items that are blocked across threads - //! int thread_data[4]; - //! ... - //! - //! // Collectively compute the block-wide exclusive prefix sum - //! int block_aggregate; - //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); - //! } + //! .. literalinclude:: ../../examples/block/example_block_scan.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin exclusive-sum-array-aggregate + //! :end-before: example-end exclusive-sum-array-aggregate //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. @@ -657,25 +589,11 @@ public: //! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that //! are partitioned across 128 threads. //! - //! .. code-block:: c++ - //! - //! #include // or equivalently - //! - //! __global__ void ExampleKernel(...) - //! { - //! // Specialize BlockScan for a 1D block of 128 threads of type int - //! using BlockScan = cub::BlockScan; - //! - //! // Allocate shared memory for BlockScan - //! __shared__ typename BlockScan::TempStorage temp_storage; - //! - //! // Obtain input item for each thread - //! int thread_data; - //! ... - //! - //! // Collectively compute the block-wide exclusive prefix max scan - //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{}); - //! } + //! .. literalinclude:: ../../examples/block/example_block_scan.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin exclusive-scan-single + //! :end-before: example-end exclusive-scan-single //! //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``. //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``. @@ -1314,25 +1232,11 @@ public: //! The code snippet below illustrates an inclusive prefix sum of 128 integer items that //! are partitioned across 128 threads. //! - //! .. code-block:: c++ - //! - //! #include // or equivalently - //! - //! __global__ void ExampleKernel(...) - //! { - //! // Specialize BlockScan for a 1D block of 128 threads of type int - //! using BlockScan = cub::BlockScan; - //! - //! // Allocate shared memory for BlockScan - //! __shared__ typename BlockScan::TempStorage temp_storage; - //! - //! // Obtain input item for each thread - //! int thread_data; - //! ... - //! - //! // Collectively compute the block-wide inclusive prefix sum - //! int block_aggregate; - //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + //! .. literalinclude:: ../../examples/block/example_block_scan.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin inclusive-sum-single-aggregate + //! :end-before: example-end inclusive-sum-single-aggregate //! //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``. //! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``. @@ -1526,25 +1430,11 @@ public: //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! - //! .. code-block:: c++ - //! - //! #include // or equivalently - //! - //! __global__ void ExampleKernel(...) - //! { - //! // Specialize BlockScan for a 1D block of 128 threads of type int - //! using BlockScan = cub::BlockScan; - //! - //! // Allocate shared memory for BlockScan - //! __shared__ typename BlockScan::TempStorage temp_storage; - //! - //! // Obtain a segment of consecutive items that are blocked across threads - //! int thread_data[4]; - //! ... - //! - //! // Collectively compute the block-wide inclusive prefix sum - //! int block_aggregate; - //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + //! .. literalinclude:: ../../examples/block/example_block_scan.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin inclusive-sum-array-aggregate + //! :end-before: example-end inclusive-sum-array-aggregate //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The @@ -1686,24 +1576,11 @@ public: //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that //! are partitioned across 128 threads. //! - //! .. code-block:: c++ - //! - //! #include // or equivalently - //! - //! __global__ void ExampleKernel(...) - //! { - //! // Specialize BlockScan for a 1D block of 128 threads of type int - //! using BlockScan = cub::BlockScan; - //! - //! // Allocate shared memory for BlockScan - //! __shared__ typename BlockScan::TempStorage temp_storage; - //! - //! // Obtain input item for each thread - //! int thread_data; - //! ... - //! - //! // Collectively compute the block-wide inclusive prefix max scan - //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}); + //! .. literalinclude:: ../../examples/block/example_block_scan.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin inclusive-scan-single + //! :end-before: example-end inclusive-scan-single //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data`` @@ -1879,24 +1756,11 @@ public: //! are partitioned in a [blocked arrangement](../index.html#sec5sec3) across 128 threads //! where each thread owns 4 consecutive items. //! - //! .. code-block:: c++ - //! - //! #include // or equivalently - //! - //! __global__ void ExampleKernel(...) - //! { - //! // Specialize BlockScan for a 1D block of 128 threads of type int - //! using BlockScan = cub::BlockScan; - //! - //! // Allocate shared memory for BlockScan - //! __shared__ typename BlockScan::TempStorage temp_storage; - //! - //! // Obtain a segment of consecutive items that are blocked across threads - //! int thread_data[4]; - //! ... - //! - //! // Collectively compute the block-wide inclusive prefix max scan - //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}); + //! .. literalinclude:: ../../examples/block/example_block_scan.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin inclusive-scan-array + //! :end-before: example-end inclusive-scan-array //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``. diff --git a/cub/examples/block/example_block_scan.cu b/cub/examples/block/example_block_scan.cu index 09e23d1c0d9..27b0cfe0b64 100644 --- a/cub/examples/block/example_block_scan.cu +++ b/cub/examples/block/example_block_scan.cu @@ -1,5 +1,5 @@ // SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved. -// SPDX-FileCopyrightText: Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3 /****************************************************************************** @@ -401,6 +401,26 @@ __global__ void ExclusiveSumPrefixCallbackKernel(int* d_data, int num_items) } // example-end exclusive-sum-prefix-callback +// example-begin exclusive-scan-single +__global__ void ExclusiveScanSingleKernel(int* d_data) +{ + // Specialize BlockScan for a 1D block of 128 threads of type int + using BlockScan = cub::BlockScan; + + // Allocate shared memory for BlockScan + __shared__ typename BlockScan::TempStorage temp_storage; + + // Obtain input item for each thread + int thread_data = d_data[threadIdx.x]; + + // Collectively compute the block-wide exclusive prefix max scan + BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{}); + + // Store result + d_data[threadIdx.x] = thread_data; +} +// example-end exclusive-scan-single + // example-begin inclusive-scan-prefix-callback __global__ void InclusiveSumPrefixCallbackKernel(int* d_data, int num_items) { @@ -503,7 +523,7 @@ __global__ void ExclusiveScanArrayKernel(int* d_data) } // Collectively compute the block-wide exclusive prefix max scan - BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, ::cuda::maximum<>{}); + BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{}); // Store results for (int i = 0; i < 4; i++) @@ -527,7 +547,7 @@ __global__ void ExclusiveScanAggregateKernel(int* d_data) // Collectively compute the block-wide exclusive prefix max scan int block_aggregate; - BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, ::cuda::maximum<>{}, block_aggregate); + BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{}, block_aggregate); // Store result d_data[threadIdx.x] = thread_data; @@ -596,6 +616,62 @@ __global__ void ExclusiveScanPrefixCallbackKernel(int* d_data, int num_items) } // example-end exclusive-scan-prefix-callback +// example-begin exclusive-sum-single-prefix-callback +__global__ void ExclusiveSumSinglePrefixCallbackKernel(int* d_data, int num_items) +{ + // Specialize BlockScan for a 1D block of 128 threads + using BlockScan = cub::BlockScan; + + // Allocate shared memory for BlockScan + __shared__ typename BlockScan::TempStorage temp_storage; + + // Initialize running total + BlockPrefixCallbackOp prefix_op(0); + + // Have the block iterate over segments of items + for (int block_offset = 0; block_offset < num_items; block_offset += 128) + { + // Load a segment of consecutive items that are blocked across threads + int thread_data = d_data[block_offset + threadIdx.x]; + + // Collectively compute the block-wide exclusive prefix sum + BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, prefix_op); + __syncthreads(); + + // Store scanned items to output segment + d_data[block_offset + threadIdx.x] = thread_data; + } +} +// example-end exclusive-sum-single-prefix-callback + +// example-begin exclusive-sum-array-aggregate +__global__ void ExclusiveSumArrayAggregateKernel(int* d_data) +{ + // Specialize BlockScan for a 1D block of 128 threads of type int + using BlockScan = cub::BlockScan; + + // Allocate shared memory for BlockScan + __shared__ typename BlockScan::TempStorage temp_storage; + + // Obtain a segment of consecutive items that are blocked across threads + int thread_data[4]; + for (int i = 0; i < 4; i++) + { + thread_data[i] = d_data[threadIdx.x * 4 + i]; + } + + // Collectively compute the block-wide exclusive prefix sum + int block_aggregate; + BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); + + // Store results + for (int i = 0; i < 4; i++) + { + d_data[threadIdx.x * 4 + i] = thread_data[i]; + } +} +// example-end exclusive-sum-array-aggregate + // example-begin inclusive-scan-array __global__ void InclusiveScanArrayKernel(int* d_data) { @@ -636,7 +712,7 @@ __global__ void InclusiveScanSingleKernel(int* d_data) int thread_data = d_data[threadIdx.x]; // Collectively compute the block-wide inclusive prefix max scan - BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, ::cuda::maximum<>{}); + BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}); // Store result d_data[threadIdx.x] = thread_data; @@ -671,7 +747,7 @@ __global__ void InclusiveScanPrefixCallbackKernel(int* d_data, int num_items) __syncthreads(); // Collectively compute the block-wide inclusive prefix max - BlockScanT(temp_storage.scan).InclusiveScan(thread_data, thread_data, ::cuda::maximum<>{}, prefix_op); + BlockScanT(temp_storage.scan).InclusiveScan(thread_data, thread_data, cuda::maximum<>{}, prefix_op); __syncthreads(); // Store scanned items to output segment @@ -681,6 +757,55 @@ __global__ void InclusiveScanPrefixCallbackKernel(int* d_data, int num_items) } // example-end inclusive-scan-prefix-callback-max +// example-begin inclusive-sum-array-aggregate +__global__ void InclusiveSumArrayAggregateKernel(int* d_data) +{ + // Specialize BlockScan for a 1D block of 128 threads of type int + using BlockScan = cub::BlockScan; + + // Allocate shared memory for BlockScan + __shared__ typename BlockScan::TempStorage temp_storage; + + // Obtain a segment of consecutive items that are blocked across threads + int thread_data[4]; + for (int i = 0; i < 4; i++) + { + thread_data[i] = d_data[threadIdx.x * 4 + i]; + } + + // Collectively compute the block-wide inclusive prefix sum + int block_aggregate; + BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + + // Store results + for (int i = 0; i < 4; i++) + { + d_data[threadIdx.x * 4 + i] = thread_data[i]; + } +} +// example-end inclusive-sum-array-aggregate + +// example-begin inclusive-sum-single-aggregate +__global__ void InclusiveSumSingleAggregateKernel(int* d_data) +{ + // Specialize BlockScan for a 1D block of 128 threads of type int + using BlockScan = cub::BlockScan; + + // Allocate shared memory for BlockScan + __shared__ typename BlockScan::TempStorage temp_storage; + + // Obtain input item for each thread + int thread_data = d_data[threadIdx.x]; + + // Collectively compute the block-wide inclusive prefix sum + int block_aggregate; + BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + + // Store result + d_data[threadIdx.x] = thread_data; +} +// example-end inclusive-sum-single-aggregate + /** * Test documentation example kernels */