Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions thrust/examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,20 @@ function(thrust_add_example target_name_var example_name example_src thrust_targ
if (("OMP" IN_LIST config_systems) OR ("TBB" IN_LIST config_systems))
set_tests_properties(${example_target} PROPERTIES RUN_SERIAL ON)
endif()

# Check for an optional per-example script named after the example source
# (<example>.cmake). If it exists it is include()d in the current scope so it
# can customize properties of ${example_target} (disable configs, add options).
get_filename_component(example_cmake_script "${example_src}" NAME_WLE)
set(example_cmake_script "${CMAKE_CURRENT_LIST_DIR}/${example_cmake_script}.cmake")
# Use a glob so we can detect if this changes:
# (CONFIGURE_DEPENDS makes CMake reconfigure when the script is added/removed.)
file(GLOB example_cmake_script
RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
CONFIGURE_DEPENDS
"${example_cmake_script}"
)
if (example_cmake_script) # Will be non-empty only if the script exists
include("${example_cmake_script}")
endif()
endfunction()

file(GLOB example_srcs
Expand Down
36 changes: 36 additions & 0 deletions thrust/examples/run_length_decoding_single_pass.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Per-example configuration script, included from thrust_add_example() scope.
# Expects ${example_target}, ${config_dialect}, ${config_device} to be set by
# the includer.

# This example uses C++20 ranges, so it requires C++20.
if (NOT "20" STREQUAL "${config_dialect}")
set_target_properties(${example_target} PROPERTIES EXCLUDE_FROM_ALL TRUE)
set_tests_properties(${example_target} PROPERTIES DISABLED TRUE)
return()
endif()

# nvcc incompatibilities with GCC 14's libstdc++ ranges implementation.
# NOTE(review): the comment originally said "CUDA 13", but the condition below
# disables the example for ANY CUDA < 14 when combined with GCC >= 14 —
# confirm whether older CUDA versions are intentionally included.
if ("CUDA" STREQUAL "${config_device}" AND "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "14.0")
if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14)
set_target_properties(${example_target} PROPERTIES EXCLUDE_FROM_ALL TRUE)
set_tests_properties(${example_target} PROPERTIES DISABLED TRUE)
return()
endif()
endif()

# GCC 7 has issues with capturing lambdas in non-CUDA backends triggering unused parameter warnings
# in libcudacxx's __destroy_at. Disable this example for GCC 7.
if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8)
set_target_properties(${example_target} PROPERTIES EXCLUDE_FROM_ALL TRUE)
set_tests_properties(${example_target} PROPERTIES DISABLED TRUE)
return()
endif()

# The example uses __host__ __device__ lambdas, which require --extended-lambda.
target_compile_options(${example_target} PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>: --extended-lambda>)
# This check is actually not correct, because we must check the host compiler, not the CXX compiler.
# We rely on these usually being the same ;)
if ("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
# When clang >= 13 is used as host compiler, we get the following warning:
# nvcc_internal_extended_lambda_implementation:312:22: error: definition of implicit copy constructor for '__nv_hdl_wrapper_t<false, true, false, __nv_dl_tag<void (*)(), &TestAddressStabilityLambda, 2>, int (const int &)>' is deprecated because it has a user-declared copy assignment operator [-Werror,-Wdeprecated-copy]
# 312 | __nv_hdl_wrapper_t & operator=(const __nv_hdl_wrapper_t &in) = delete;
# | ^
# Let's suppress it until NVBug 4980157 is resolved.
target_compile_options(${example_target} PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>: -Wno-deprecated-copy>)
endif ()
169 changes: 169 additions & 0 deletions thrust/examples/run_length_decoding_single_pass.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
// This example demonstrates a single-pass run-length decoding algorithm using thrust::inclusive_scan.
// The algorithm processes run-length encoded data and expands it in a single pass by computing
// offsets during the scan and filling the output array as the scan proceeds. The size of the output
// has to be known ahead of time.

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/iterator_traits.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/scan.h>
#include <thrust/sequence.h>

#include <cuda/functional>
#include <cuda/std/algorithm>
#include <cuda/std/cstdint>

#include <algorithm>
#include <iostream>
#include <ranges>

// One run of run-length-encoded data, plus the bookkeeping carried through the scan.
// Acts as the element type of an inclusive_scan whose operator+ accumulates
// output offsets and run indices.
template <typename ValueType, typename CountType>
struct run
{
  ValueType value;  // The value this run repeats.
  CountType count;  // Number of repetitions of `value`.
  CountType offset; // Offset where we should begin outputting the run.
  CountType run_id; // 1-index of the run in the input sequence; subtract by 1 to get the 0-index.

  __host__ __device__ run(ValueType value = ValueType{}, CountType count = 0, CountType offset = 0, CountType run_id = 1)
      : value(value)
      , count(count)
      , offset(offset)
      , run_id(run_id)
  {}
  // Rule of Zero: the previously defaulted copy constructor/assignment were
  // redundant and suppressed the implicit move operations; the compiler-
  // generated special members are correct for this plain aggregate-like type.

  // Scan combiner: the right operand keeps its value/count (it is the current
  // run); offsets and run ids accumulate from the left. Not commutative, but
  // associative, which is all inclusive_scan requires.
  __host__ __device__ friend run operator+(run l, run r)
  {
    return run{r.value, r.count, l.offset + l.count + r.offset, l.run_id + r.run_id};
  }
};

// Transform functor applied to each scan result: writes the decoded run into
// the output range and returns how far the output has been filled so far.
template <typename OutputIterator, typename CountType, typename ExpandedSizeIterator>
struct expand
{
  OutputIterator out;                 // Start of the decoded output range.
  CountType out_size;                 // Capacity of the output; writes are clamped to it.
  CountType runs_size;                // Total number of runs, used to detect the last one.
  ExpandedSizeIterator expanded_size; // Single-element destination for the final decoded size.

  __host__ __device__
  expand(OutputIterator out, CountType out_size, CountType runs_size, ExpandedSizeIterator expanded_size)
      : out(out)
      , out_size(out_size)
      , runs_size(runs_size)
      , expanded_size(expanded_size)
  {}

  // Fill [out + r.offset, out + end) with r.value, where `end` is the run's
  // end clamped to the output capacity, and return `end`. The last run also
  // records the total expanded size.
  template <typename ValueType>
  __host__ __device__ CountType operator()(run<ValueType, CountType> r) const
  {
    // cuda::std::min expresses the clamp more directly than cuda::minimum()(...).
    cuda::std::size_t end = cuda::std::min<CountType>(r.offset + r.count, out_size);
    thrust::fill(thrust::seq, out + r.offset, out + end, r.value);
    if (r.run_id == runs_size) // If we're the last run, write the expanded size.
    {
      *expanded_size = end;
    }
    return end;
  }
};

// Decode a run-length encoded sequence (values[i] repeated counts[i] times)
// into `out` with a single inclusive_scan over the runs.
//
// values:    iterator over the run values
// counts:    iterator over the run lengths
// runs_size: number of runs
// out:       output iterator with capacity for out_size elements
// out_size:  output capacity; decoding is clamped to it
//
// Returns an iterator one past the last decoded element.
template <typename ValueIterator, typename CountIterator, typename OutputIterator, typename CountType>
OutputIterator run_length_decode(
  ValueIterator values, CountIterator counts, CountType runs_size, OutputIterator out, CountType out_size)
{
  // Use iterator_traits instead of ValueIterator::value_type so raw (device)
  // pointers work as inputs, not only fancy iterators.
  using ValueType = typename thrust::iterator_traits<ValueIterator>::value_type;

  // Zip the values and counts together and convert the resulting tuples to a named struct.
  auto runs = thrust::make_transform_iterator(
    thrust::make_zip_iterator(values, counts), [] __host__ __device__(thrust::tuple<ValueType, CountType> tup) {
      return run{thrust::get<0>(tup), thrust::get<1>(tup)};
    });

  // Allocate storage for the expanded size. If we were using CUB directly, we could read this
  // from the final ScanTileState.
  thrust::device_vector<CountType> expanded_size(1);

  // Iterator that writes the expanded sequence to the output and discards the actual scan results.
  auto expand_out = thrust::make_transform_output_iterator(
    thrust::make_discard_iterator(), expand(out, out_size, runs_size, expanded_size.begin()));

  // Scan to compute output offsets and then write the expanded sequence.
  thrust::inclusive_scan(thrust::device, runs, runs + runs_size, expand_out);

  // Calculate the end of the output sequence (one device-to-host copy).
  return out + expanded_size[0];
}

int main()
{
using ValueType = cuda::std::int32_t;
using CountType = cuda::std::size_t;

CountType size = 8192;
CountType repeat = 4;

thrust::device_vector<ValueType> values(size);
thrust::sequence(thrust::device, values.begin(), values.end(), 0);

thrust::device_vector<CountType> counts(size);
thrust::fill(thrust::device, counts.begin(), counts.end(), repeat);
Comment on lines +114 to +115
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should rather be

Suggested change
thrust::device_vector<CountType> counts(size);
thrust::fill(thrust::device, counts.begin(), counts.end(), repeat);
thrust::device_vector<CountType> counts(size, repeat);


// Print first 32 elements of compressed input
thrust::host_vector<ValueType> values_h(32);
thrust::host_vector<CountType> counts_h(32);
thrust::copy(values.begin(), values.begin() + 32, values_h.begin());
thrust::copy(counts.begin(), counts.begin() + 32, counts_h.begin());

std::cout << "Compressed input (first 32 runs):" << std::endl;
for (CountType i = 0; i < 32; ++i)
{
std::cout << "(" << values_h[i] << "," << counts_h[i] << ")";
if (i < 31)
{
std::cout << " ";
}
}
std::cout << std::endl << std::endl;

thrust::device_vector<ValueType> output(size * repeat);

auto out_end = run_length_decode(values.begin(), counts.begin(), values.size(), output.begin(), output.size());

if (static_cast<CountType>(out_end - output.begin()) != size * repeat)
{
throw int{};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Important: let's not show bad practices like throwing exceptions not derived from std::exception. Since this is in main, can we just print an error message and return something nonzero?

Applies one more time below.

}

thrust::host_vector<ValueType> observed(size * repeat);
thrust::copy(output.begin(), out_end, observed.begin());

// Print first 32 elements of decompressed output
std::cout << "Decompressed output (first 32 elements):" << std::endl;
for (CountType i = 0; i < 32; ++i)
{
std::cout << observed[i];
if (i < 31)
{
std::cout << " ";
}
}
std::cout << std::endl;

auto gold = std::views::iota(CountType{0}) | std::views::transform([=](auto x) {
Copy link
Contributor

@miscco miscco Nov 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do have cuda::std::iota_view and cuda::std::transform_view at home

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are even available in C++17

return std::views::iota(CountType{0}, repeat) | std::views::transform([=](auto) {
return x;
});
})
| std::views::join;

if (!std::equal(observed.begin(), observed.end(), gold.begin()))
{
throw int{};
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
CHECK: Compressed input (first 32 runs):
CHECK-NEXT: (0,4) (1,4) (2,4) (3,4) (4,4) (5,4) (6,4) (7,4) (8,4) (9,4) (10,4) (11,4) (12,4) (13,4) (14,4) (15,4) (16,4) (17,4) (18,4) (19,4) (20,4) (21,4) (22,4) (23,4) (24,4) (25,4) (26,4) (27,4) (28,4) (29,4) (30,4) (31,4)
CHECK-NEXT: {{^$}}
CHECK-NEXT: Decompressed output (first 32 elements):
CHECK-NEXT: 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5 6 6 6 6 7 7 7 7