diff --git a/thrust/examples/run_length_decoding_single_pass.cmake b/thrust/examples/run_length_decoding_single_pass.cmake new file mode 100644 index 00000000000..4c23704dafc --- /dev/null +++ b/thrust/examples/run_length_decoding_single_pass.cmake @@ -0,0 +1,41 @@ +# This example uses a single-pass algorithm that performs side effects during scan. +# The algorithm relies on the scan functor being called exactly once per element with the final +# scan value, which is guaranteed by CUDA but not by CPU-parallel backends like OMP/TBB where +# the functor may be called multiple times with intermediate values during parallel reduction. +# Therefore, disable this example for non-CUDA backends. +if (NOT "CUDA" STREQUAL "${config_device}") + set_target_properties(${example_target} PROPERTIES EXCLUDE_FROM_ALL TRUE) + set_tests_properties(${example_target} PROPERTIES DISABLED TRUE) + return() +endif() + +# GCC 7 has issues with capturing lambdas in non-CUDA backends triggering unused parameter warnings +# in libcudacxx's __destroy_at. Disable this example for GCC 7. +if ( + "GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}" + AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8 +) + set_target_properties(${example_target} PROPERTIES EXCLUDE_FROM_ALL TRUE) + set_tests_properties(${example_target} PROPERTIES DISABLED TRUE) +endif() + +target_compile_options( + ${example_target} + PRIVATE $<$: --extended-lambda> +) +# This check is actually not correct, because we must check the host compiler, not the CXX compiler. +# We rely on these usually being the same ;) +if ( + "Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}" + AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13 +) + # When clang >= 13 is used as host compiler, we get the following warning: + # nvcc_internal_extended_lambda_implementation:312:22: error: definition of implicit copy constructor for '__nv_hdl_wrapper_t, int (const int &)>' is deprecated because it has a user-declared copy assignment operator [-Werror,-Wdeprecated-copy] + # 312 | __nv_hdl_wrapper_t & operator=(const __nv_hdl_wrapper_t &in) = delete; + # | ^ + # Let's suppress it until NVBug 4980157 is resolved. + target_compile_options( + ${example_target} + PRIVATE $<$: -Wno-deprecated-copy> + ) +endif() diff --git a/thrust/examples/run_length_decoding_single_pass.cu b/thrust/examples/run_length_decoding_single_pass.cu new file mode 100644 index 00000000000..fe53496ca46 --- /dev/null +++ b/thrust/examples/run_length_decoding_single_pass.cu @@ -0,0 +1,156 @@ +// This example demonstrates a single-pass run-length decoding algorithm using thrust::inclusive_scan. +// The algorithm processes run-length encoded data and expands it in a single pass by computing +// offsets during the scan and filling the output array as the scan proceeds. The size of the output +// has to be known ahead of time. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +template +struct run +{ + ValueType value{}; + CountType count{0}; + CountType offset{0}; // Offset where we should begin outputting the run. + CountType run_id{1}; // 1-index of the run in the input sequence; subtract by 1 to get the 0-index. + + __host__ __device__ friend run operator+(run l, run r) + { + return run{r.value, r.count, l.offset + l.count + r.offset, l.run_id + r.run_id}; + } +}; + +template +__host__ __device__ run(ValueType, CountType) -> run; + +template +struct expand +{ + OutputIterator out; + CountType out_size; + CountType runs_size; + ExpandedSizeIterator expanded_size; + + template + __host__ __device__ CountType operator()(run r) const + { + cuda::std::size_t end = cuda::std::min(r.offset + r.count, out_size); + cuda::std::fill(out + r.offset, out + end, r.value); + if (r.run_id == runs_size) // If we're the last run, write the expanded size. + { + *expanded_size = end; + } + return end; + } +}; + +template +__host__ __device__ expand(OutputIterator, CountType, CountType, ExpandedSizeIterator) + -> expand; + +template +OutputIterator run_length_decode( + ValueIterator values, CountIterator counts, CountType runs_size, OutputIterator out, CountType out_size) +{ + using ValueType = typename ValueIterator::value_type; + + // Zip the values and counts together and convert the resulting tuples to a named struct. + auto runs = cuda::zip_transform_iterator( + [] __host__ __device__(ValueType v, CountType c) { + return run{v, c}; + }, + values, + counts); + + // Allocate storage for the expanded size. If we were using CUB directly, we could read this + // from the final ScanTileState. + thrust::device_vector expanded_size(1); + + // Iterator that writes the expanded sequence to the output and discards the actual scan results. + auto expand_out = thrust::make_transform_output_iterator( + thrust::make_discard_iterator(), expand{out, out_size, runs_size, expanded_size.begin()}); + + // Scan to compute output offsets and then write the expanded sequence. + thrust::inclusive_scan(thrust::device, runs, runs + runs_size, expand_out); + + // Calculate the end of the output sequence. + return out + expanded_size[0]; +} + +int main() +{ + using ValueType = cuda::std::int32_t; + using CountType = cuda::std::size_t; + + CountType size = 8192; + CountType repeat = 4; + + thrust::device_vector values(size); + thrust::sequence(thrust::device, values.begin(), values.end(), 0); + + thrust::device_vector counts(size); + thrust::fill(thrust::device, counts.begin(), counts.end(), repeat); + + // Print first 32 elements of compressed input + thrust::host_vector values_h(32); + thrust::host_vector counts_h(32); + thrust::copy(values.begin(), values.begin() + 32, values_h.begin()); + thrust::copy(counts.begin(), counts.begin() + 32, counts_h.begin()); + + std::cout << "Compressed input (first 32 runs):" << std::endl; + for (CountType i = 0; i < 32; ++i) + { + std::cout << "(" << values_h[i] << "," << counts_h[i] << ") "; + } + std::cout << std::endl << std::endl; + + thrust::device_vector output(size * repeat); + + auto out_end = run_length_decode(values.begin(), counts.begin(), values.size(), output.begin(), output.size()); + + if (CountType(out_end - output.begin()) != size * repeat) + { + std::cerr << "Error: Output size mismatch. Expected " << size * repeat << " but got " + << static_cast(out_end - output.begin()) << std::endl; + return 1; + } + + thrust::host_vector observed(size * repeat); + thrust::copy(output.begin(), out_end, observed.begin()); + + // Print first 32 elements of decompressed output + std::cout << "Decompressed output (first 32 elements):" << std::endl; + for (CountType i = 0; i < 32; ++i) + { + std::cout << observed[i] << " "; + } + std::cout << std::endl; + + auto gold = cuda::std::views::iota(CountType{0}, size * repeat) | cuda::std::views::transform([=](CountType idx) { + return ValueType(idx / repeat); + }); + + if (!cuda::std::equal(observed.begin(), observed.end(), gold.begin())) + { + std::cerr << "Error: Output does not match expected values" << std::endl; + return 1; + } +} diff --git a/thrust/internal/test/thrust.example.run_length_decoding_single_pass.filecheck b/thrust/internal/test/thrust.example.run_length_decoding_single_pass.filecheck new file mode 100644 index 00000000000..95ef218992b --- /dev/null +++ b/thrust/internal/test/thrust.example.run_length_decoding_single_pass.filecheck @@ -0,0 +1,5 @@ + CHECK: Compressed input (first 32 runs): +CHECK-NEXT: (0,4) (1,4) (2,4) (3,4) (4,4) (5,4) (6,4) (7,4) (8,4) (9,4) (10,4) (11,4) (12,4) (13,4) (14,4) (15,4) (16,4) (17,4) (18,4) (19,4) (20,4) (21,4) (22,4) (23,4) (24,4) (25,4) (26,4) (27,4) (28,4) (29,4) (30,4) (31,4) +CHECK-NEXT: {{^$}} +CHECK-NEXT: Decompressed output (first 32 elements): +CHECK-NEXT: 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5 6 6 6 6 7 7 7 7