Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions thrust/examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,20 @@ function(thrust_add_example target_name_var example_name example_src thrust_targ
if (("OMP" IN_LIST config_systems) OR ("TBB" IN_LIST config_systems))
set_tests_properties(${example_target} PROPERTIES RUN_SERIAL ON)
endif()

# Check for an optional per-example script named after the example source
# (<example>.cmake). If it exists it is include()d in the current scope so it
# can customize properties of ${example_target} (disable configs, add options).
get_filename_component(example_cmake_script "${example_src}" NAME_WLE)
set(example_cmake_script "${CMAKE_CURRENT_LIST_DIR}/${example_cmake_script}.cmake")
# Use a glob so we can detect if this changes:
# (CONFIGURE_DEPENDS makes CMake reconfigure when the script is added/removed.)
file(GLOB example_cmake_script
RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
CONFIGURE_DEPENDS
"${example_cmake_script}"
)
if (example_cmake_script) # Will be non-empty only if the script exists
include("${example_cmake_script}")
endif()
endfunction()

file(GLOB example_srcs
Expand Down
36 changes: 36 additions & 0 deletions thrust/examples/run_length_decoding_single_pass.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Per-example configuration script, included from thrust_add_example() scope.
# Expects ${example_target}, ${config_dialect}, ${config_device} to be set by
# the includer.

# This example uses C++20 ranges, so it requires C++20.
if (NOT "20" STREQUAL "${config_dialect}")
set_target_properties(${example_target} PROPERTIES EXCLUDE_FROM_ALL TRUE)
set_tests_properties(${example_target} PROPERTIES DISABLED TRUE)
return()
endif()

# nvcc incompatibilities with GCC 14's libstdc++ ranges implementation.
# NOTE(review): the comment originally said "CUDA 13", but the condition below
# disables the example for ANY CUDA < 14 when combined with GCC >= 14 —
# confirm whether older CUDA versions are intentionally included.
if ("CUDA" STREQUAL "${config_device}" AND "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "14.0")
if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14)
set_target_properties(${example_target} PROPERTIES EXCLUDE_FROM_ALL TRUE)
set_tests_properties(${example_target} PROPERTIES DISABLED TRUE)
return()
endif()
endif()

# GCC 7 has issues with capturing lambdas in non-CUDA backends triggering unused parameter warnings
# in libcudacxx's __destroy_at. Disable this example for GCC 7.
if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8)
set_target_properties(${example_target} PROPERTIES EXCLUDE_FROM_ALL TRUE)
set_tests_properties(${example_target} PROPERTIES DISABLED TRUE)
return()
endif()

# The example uses __host__ __device__ lambdas, which require --extended-lambda.
target_compile_options(${example_target} PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>: --extended-lambda>)
# This check is actually not correct, because we must check the host compiler, not the CXX compiler.
# We rely on these usually being the same ;)
if ("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
# When clang >= 13 is used as host compiler, we get the following warning:
# nvcc_internal_extended_lambda_implementation:312:22: error: definition of implicit copy constructor for '__nv_hdl_wrapper_t<false, true, false, __nv_dl_tag<void (*)(), &TestAddressStabilityLambda, 2>, int (const int &)>' is deprecated because it has a user-declared copy assignment operator [-Werror,-Wdeprecated-copy]
# 312 | __nv_hdl_wrapper_t & operator=(const __nv_hdl_wrapper_t &in) = delete;
# | ^
# Let's suppress it until NVBug 4980157 is resolved.
target_compile_options(${example_target} PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>: -Wno-deprecated-copy>)
endif ()
169 changes: 169 additions & 0 deletions thrust/examples/run_length_decoding_single_pass.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
// This example demonstrates a single-pass run-length decoding algorithm using thrust::inclusive_scan.
// The algorithm processes run-length encoded data and expands it in a single pass by computing
// offsets during the scan and filling the output array as the scan proceeds. The size of the output
// has to be known ahead of time.

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/iterator_traits.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/scan.h>
#include <thrust/sequence.h>

#include <cuda/functional>
#include <cuda/std/algorithm>
#include <cuda/std/cstdint>

#include <algorithm>
#include <iostream>
#include <ranges>

// One run of run-length-encoded data, plus the bookkeeping carried through the scan.
// Acts as the element type of an inclusive_scan whose operator+ accumulates
// output offsets and run indices.
template <typename ValueType, typename CountType>
struct run
{
  ValueType value;  // The value this run repeats.
  CountType count;  // Number of repetitions of `value`.
  CountType offset; // Offset where we should begin outputting the run.
  CountType run_id; // 1-index of the run in the input sequence; subtract by 1 to get the 0-index.

  __host__ __device__ run(ValueType value = ValueType{}, CountType count = 0, CountType offset = 0, CountType run_id = 1)
      : value(value)
      , count(count)
      , offset(offset)
      , run_id(run_id)
  {}
  // Rule of Zero: the previously defaulted copy constructor/assignment were
  // redundant and suppressed the implicit move operations; the compiler-
  // generated special members are correct for this plain aggregate-like type.

  // Scan combiner: the right operand keeps its value/count (it is the current
  // run); offsets and run ids accumulate from the left. Not commutative, but
  // associative, which is all inclusive_scan requires.
  __host__ __device__ friend run operator+(run l, run r)
  {
    return run{r.value, r.count, l.offset + l.count + r.offset, l.run_id + r.run_id};
  }
};

// Transform functor applied to each scan result: writes the decoded run into
// the output range and returns how far the output has been filled so far.
template <typename OutputIterator, typename CountType, typename ExpandedSizeIterator>
struct expand
{
  OutputIterator out;                 // Start of the decoded output range.
  CountType out_size;                 // Capacity of the output; writes are clamped to it.
  CountType runs_size;                // Total number of runs, used to detect the last one.
  ExpandedSizeIterator expanded_size; // Single-element destination for the final decoded size.

  __host__ __device__
  expand(OutputIterator out, CountType out_size, CountType runs_size, ExpandedSizeIterator expanded_size)
      : out(out)
      , out_size(out_size)
      , runs_size(runs_size)
      , expanded_size(expanded_size)
  {}

  // Fill [out + r.offset, out + end) with r.value, where `end` is the run's
  // end clamped to the output capacity, and return `end`. The last run also
  // records the total expanded size.
  template <typename ValueType>
  __host__ __device__ CountType operator()(run<ValueType, CountType> r) const
  {
    // cuda::std::min expresses the clamp more directly than cuda::minimum()(...).
    cuda::std::size_t end = cuda::std::min<CountType>(r.offset + r.count, out_size);
    thrust::fill(thrust::seq, out + r.offset, out + end, r.value);
    if (r.run_id == runs_size) // If we're the last run, write the expanded size.
    {
      *expanded_size = end;
    }
    return end;
  }
};

// Decode a run-length encoded sequence (values[i] repeated counts[i] times)
// into `out` with a single inclusive_scan over the runs.
//
// values:    iterator over the run values
// counts:    iterator over the run lengths
// runs_size: number of runs
// out:       output iterator with capacity for out_size elements
// out_size:  output capacity; decoding is clamped to it
//
// Returns an iterator one past the last decoded element.
template <typename ValueIterator, typename CountIterator, typename OutputIterator, typename CountType>
OutputIterator run_length_decode(
  ValueIterator values, CountIterator counts, CountType runs_size, OutputIterator out, CountType out_size)
{
  // Use iterator_traits instead of ValueIterator::value_type so raw (device)
  // pointers work as inputs, not only fancy iterators.
  using ValueType = typename thrust::iterator_traits<ValueIterator>::value_type;

  // Zip the values and counts together and convert the resulting tuples to a named struct.
  auto runs = thrust::make_transform_iterator(
    thrust::make_zip_iterator(values, counts), [] __host__ __device__(thrust::tuple<ValueType, CountType> tup) {
      return run{thrust::get<0>(tup), thrust::get<1>(tup)};
    });

  // Allocate storage for the expanded size. If we were using CUB directly, we could read this
  // from the final ScanTileState.
  thrust::device_vector<CountType> expanded_size(1);

  // Iterator that writes the expanded sequence to the output and discards the actual scan results.
  auto expand_out = thrust::make_transform_output_iterator(
    thrust::make_discard_iterator(), expand(out, out_size, runs_size, expanded_size.begin()));

  // Scan to compute output offsets and then write the expanded sequence.
  thrust::inclusive_scan(thrust::device, runs, runs + runs_size, expand_out);

  // Calculate the end of the output sequence (one device-to-host copy).
  return out + expanded_size[0];
}

int main()
{
using ValueType = cuda::std::int32_t;
using CountType = cuda::std::size_t;

CountType size = 8192;
CountType repeat = 4;

thrust::device_vector<ValueType> values(size);
thrust::sequence(thrust::device, values.begin(), values.end(), 0);

thrust::device_vector<CountType> counts(size);
thrust::fill(thrust::device, counts.begin(), counts.end(), repeat);
Comment on lines +114 to +115
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should rather be

Suggested change
thrust::device_vector<CountType> counts(size);
thrust::fill(thrust::device, counts.begin(), counts.end(), repeat);
thrust::device_vector<CountType> counts(size, repeat);


// Print first 32 elements of compressed input
thrust::host_vector<ValueType> values_h(32);
thrust::host_vector<CountType> counts_h(32);
thrust::copy(values.begin(), values.begin() + 32, values_h.begin());
thrust::copy(counts.begin(), counts.begin() + 32, counts_h.begin());

std::cout << "Compressed input (first 32 runs):" << std::endl;
for (CountType i = 0; i < 32; ++i)
{
std::cout << "(" << values_h[i] << "," << counts_h[i] << ")";
if (i < 31)
{
std::cout << " ";
}
}
std::cout << std::endl << std::endl;

thrust::device_vector<ValueType> output(size * repeat);

auto out_end = run_length_decode(values.begin(), counts.begin(), values.size(), output.begin(), output.size());

if (static_cast<CountType>(out_end - output.begin()) != size * repeat)
{
throw int{};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Important: let's not show bad practices like throwing exceptions not derived from std::exception. Since this is in main, can we just print an error message and return something nonzero?

Applies one more time below.

}

thrust::host_vector<ValueType> observed(size * repeat);
thrust::copy(output.begin(), out_end, observed.begin());

// Print first 32 elements of decompressed output
std::cout << "Decompressed output (first 32 elements):" << std::endl;
for (CountType i = 0; i < 32; ++i)
{
std::cout << observed[i];
if (i < 31)
{
std::cout << " ";
}
}
std::cout << std::endl;

auto gold = std::views::iota(CountType{0}) | std::views::transform([=](auto x) {
Copy link
Contributor

@miscco miscco Nov 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do have cuda::std::iota_view and cuda::std::transform_view at home

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are even available in C++17

return std::views::iota(CountType{0}, repeat) | std::views::transform([=](auto) {
return x;
});
})
| std::views::join;

if (!std::equal(observed.begin(), observed.end(), gold.begin()))
{
throw int{};
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
CHECK: Compressed input (first 32 runs):
CHECK-NEXT: (0,4) (1,4) (2,4) (3,4) (4,4) (5,4) (6,4) (7,4) (8,4) (9,4) (10,4) (11,4) (12,4) (13,4) (14,4) (15,4) (16,4) (17,4) (18,4) (19,4) (20,4) (21,4) (22,4) (23,4) (24,4) (25,4) (26,4) (27,4) (28,4) (29,4) (30,4) (31,4)
CHECK-NEXT: {{^$}}
CHECK-NEXT: Decompressed output (first 32 elements):
CHECK-NEXT: 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5 6 6 6 6 7 7 7 7