-
Notifications
You must be signed in to change notification settings - Fork 287
[Thrust]: New single-pass run length decoding example #6470
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
8c60a06
5b31a3c
832d700
dc68e5a
1930824
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,36 @@ | ||
| # This example uses C++20 ranges, so it requires C++20. | ||
| if (NOT "20" STREQUAL "${config_dialect}") | ||
| set_target_properties(${example_target} PROPERTIES EXCLUDE_FROM_ALL TRUE) | ||
| set_tests_properties(${example_target} PROPERTIES DISABLED TRUE) | ||
| return() | ||
| endif() | ||
|
|
||
| # CUDA 13 nvcc has incompatibilities with GCC 14's libstdc++ ranges implementation. | ||
| # Disable this example for CUDA 13 + GCC 14 combination. | ||
| if ("CUDA" STREQUAL "${config_device}" AND "${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "14.0") | ||
| if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) | ||
| set_target_properties(${example_target} PROPERTIES EXCLUDE_FROM_ALL TRUE) | ||
| set_tests_properties(${example_target} PROPERTIES DISABLED TRUE) | ||
| return() | ||
| endif() | ||
| endif() | ||
|
|
||
| # GCC 7 has issues with capturing lambdas in non-CUDA backends triggering unused parameter warnings | ||
| # in libcudacxx's __destroy_at. Disable this example for GCC 7. | ||
| if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8) | ||
| set_target_properties(${example_target} PROPERTIES EXCLUDE_FROM_ALL TRUE) | ||
| set_tests_properties(${example_target} PROPERTIES DISABLED TRUE) | ||
| return() | ||
| endif() | ||
|
|
||
| target_compile_options(${example_target} PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>: --extended-lambda>) | ||
| # This check is actually not correct, because we must check the host compiler, not the CXX compiler. | ||
| # We rely on these usually being the same ;) | ||
| if ("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13) | ||
| # When clang >= 13 is used as host compiler, we get the following warning: | ||
| # nvcc_internal_extended_lambda_implementation:312:22: error: definition of implicit copy constructor for '__nv_hdl_wrapper_t<false, true, false, __nv_dl_tag<void (*)(), &TestAddressStabilityLambda, 2>, int (const int &)>' is deprecated because it has a user-declared copy assignment operator [-Werror,-Wdeprecated-copy] | ||
| # 312 | __nv_hdl_wrapper_t & operator=(const __nv_hdl_wrapper_t &in) = delete; | ||
| # | ^ | ||
| # Let's suppress it until NVBug 4980157 is resolved. | ||
| target_compile_options(${example_target} PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>: -Wno-deprecated-copy>) | ||
| endif () |
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,169 @@ | ||||||||||||||||||
| // This example demonstrates a single-pass run-length decoding algorithm using thrust::inclusive_scan. | ||||||||||||||||||
| // The algorithm processes run-length encoded data and expands it in a single pass by computing | ||||||||||||||||||
| // offsets during the scan and filling the output array as the scan proceeds. The size of the output | ||||||||||||||||||
| // has to be known ahead of time. | ||||||||||||||||||
|
|
||||||||||||||||||
| #include <thrust/device_vector.h> | ||||||||||||||||||
| #include <thrust/execution_policy.h> | ||||||||||||||||||
| #include <thrust/fill.h> | ||||||||||||||||||
| #include <thrust/host_vector.h> | ||||||||||||||||||
| #include <thrust/iterator/discard_iterator.h> | ||||||||||||||||||
| #include <thrust/iterator/transform_iterator.h> | ||||||||||||||||||
| #include <thrust/iterator/transform_output_iterator.h> | ||||||||||||||||||
| #include <thrust/iterator/zip_iterator.h> | ||||||||||||||||||
| #include <thrust/scan.h> | ||||||||||||||||||
| #include <thrust/sequence.h> | ||||||||||||||||||
|
|
||||||||||||||||||
| #include <cuda/functional> | ||||||||||||||||||
| #include <cuda/std/cstdint> | ||||||||||||||||||
|
|
||||||||||||||||||
| #include <algorithm> | ||||||||||||||||||
| #include <iostream> | ||||||||||||||||||
| #include <ranges> | ||||||||||||||||||
|
|
||||||||||||||||||
| template <typename ValueType, typename CountType> | ||||||||||||||||||
| struct run | ||||||||||||||||||
| { | ||||||||||||||||||
| ValueType value; | ||||||||||||||||||
| CountType count; | ||||||||||||||||||
| CountType offset; // Offset where we should begin outputting the run. | ||||||||||||||||||
| CountType run_id; // 1-index of the run in the input sequence; subtract by 1 to get the 0-index. | ||||||||||||||||||
|
|
||||||||||||||||||
| __host__ __device__ run(ValueType value = ValueType{}, CountType count = 0, CountType offset = 0, CountType run_id = 1) | ||||||||||||||||||
| : value(value) | ||||||||||||||||||
| , count(count) | ||||||||||||||||||
| , offset(offset) | ||||||||||||||||||
| , run_id(run_id) | ||||||||||||||||||
| {} | ||||||||||||||||||
| run(run const& other) = default; | ||||||||||||||||||
| run& operator=(run const& other) = default; | ||||||||||||||||||
|
Comment on lines
+32
to
+39
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suggestion: delete special ops and just use aggregate init |
||||||||||||||||||
|
|
||||||||||||||||||
| __host__ __device__ friend run operator+(run l, run r) | ||||||||||||||||||
| { | ||||||||||||||||||
| return run{r.value, r.count, l.offset + l.count + r.offset, l.run_id + r.run_id}; | ||||||||||||||||||
| } | ||||||||||||||||||
| }; | ||||||||||||||||||
|
|
||||||||||||||||||
| template <typename OutputIterator, typename CountType, typename ExpandedSizeIterator> | ||||||||||||||||||
| struct expand | ||||||||||||||||||
| { | ||||||||||||||||||
| OutputIterator out; | ||||||||||||||||||
| CountType out_size; | ||||||||||||||||||
| CountType runs_size; | ||||||||||||||||||
| ExpandedSizeIterator expanded_size; | ||||||||||||||||||
|
|
||||||||||||||||||
| __host__ __device__ | ||||||||||||||||||
| expand(OutputIterator out, CountType out_size, CountType runs_size, ExpandedSizeIterator expanded_size) | ||||||||||||||||||
| : out(out) | ||||||||||||||||||
| , out_size(out_size) | ||||||||||||||||||
| , runs_size(runs_size) | ||||||||||||||||||
| , expanded_size(expanded_size) | ||||||||||||||||||
| {} | ||||||||||||||||||
|
Comment on lines
+55
to
+61
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here |
||||||||||||||||||
|
|
||||||||||||||||||
| template <typename ValueType> | ||||||||||||||||||
| __host__ __device__ CountType operator()(run<ValueType, CountType> r) const | ||||||||||||||||||
| { | ||||||||||||||||||
| cuda::std::size_t end = cuda::minimum()(r.offset + r.count, out_size); | ||||||||||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This could just be
Suggested change
|
||||||||||||||||||
| thrust::fill(thrust::seq, out + r.offset, out + end, r.value); | ||||||||||||||||||
| if (r.run_id == runs_size) // If we're the last run, write the expanded size. | ||||||||||||||||||
| { | ||||||||||||||||||
| *expanded_size = end; | ||||||||||||||||||
| } | ||||||||||||||||||
| return end; | ||||||||||||||||||
| } | ||||||||||||||||||
| }; | ||||||||||||||||||
|
|
||||||||||||||||||
| template <typename ValueIterator, typename CountIterator, typename OutputIterator, typename CountType> | ||||||||||||||||||
| OutputIterator run_length_decode( | ||||||||||||||||||
| ValueIterator values, CountIterator counts, CountType runs_size, OutputIterator out, CountType out_size) | ||||||||||||||||||
| { | ||||||||||||||||||
| using ValueType = typename ValueIterator::value_type; | ||||||||||||||||||
|
|
||||||||||||||||||
| // Zip the values and counts together and convert the resulting tuples to a named struct. | ||||||||||||||||||
| auto runs = thrust::make_transform_iterator( | ||||||||||||||||||
| thrust::make_zip_iterator(values, counts), [] __host__ __device__(thrust::tuple<ValueType, CountType> tup) { | ||||||||||||||||||
| return run{thrust::get<0>(tup), thrust::get<1>(tup)}; | ||||||||||||||||||
| }); | ||||||||||||||||||
|
Comment on lines
+83
to
+86
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We do have
Comment on lines
+83
to
+86
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suggestion: use
Suggested change
|
||||||||||||||||||
|
|
||||||||||||||||||
| // Allocate storage for the expanded size. If we were using CUB directly, we could read this | ||||||||||||||||||
| // from the final ScanTileState. | ||||||||||||||||||
| thrust::device_vector<CountType> expanded_size(1); | ||||||||||||||||||
|
|
||||||||||||||||||
| // Iterator that writes the expanded sequence to the output and discards the actual scan results. | ||||||||||||||||||
| auto expand_out = thrust::make_transform_output_iterator( | ||||||||||||||||||
| thrust::make_discard_iterator(), expand(out, out_size, runs_size, expanded_size.begin())); | ||||||||||||||||||
|
Comment on lines
+93
to
+94
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suggestion: I would find a braced-init list easier to read here:
Suggested change
Plus, you can drop the ctor of |
||||||||||||||||||
|
|
||||||||||||||||||
| // Scan to compute output offsets and then write the expanded sequence. | ||||||||||||||||||
| thrust::inclusive_scan(thrust::device, runs, runs + runs_size, expand_out); | ||||||||||||||||||
|
|
||||||||||||||||||
| // Calculate the end of the output sequence. | ||||||||||||||||||
| return out + expanded_size[0]; | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
| int main() | ||||||||||||||||||
| { | ||||||||||||||||||
| using ValueType = cuda::std::int32_t; | ||||||||||||||||||
| using CountType = cuda::std::size_t; | ||||||||||||||||||
|
|
||||||||||||||||||
| CountType size = 8192; | ||||||||||||||||||
| CountType repeat = 4; | ||||||||||||||||||
|
|
||||||||||||||||||
| thrust::device_vector<ValueType> values(size); | ||||||||||||||||||
| thrust::sequence(thrust::device, values.begin(), values.end(), 0); | ||||||||||||||||||
|
|
||||||||||||||||||
| thrust::device_vector<CountType> counts(size); | ||||||||||||||||||
| thrust::fill(thrust::device, counts.begin(), counts.end(), repeat); | ||||||||||||||||||
|
Comment on lines
+114
to
+115
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should rather be
Suggested change
|
||||||||||||||||||
|
|
||||||||||||||||||
| // Print first 32 elements of compressed input | ||||||||||||||||||
| thrust::host_vector<ValueType> values_h(32); | ||||||||||||||||||
| thrust::host_vector<CountType> counts_h(32); | ||||||||||||||||||
| thrust::copy(values.begin(), values.begin() + 32, values_h.begin()); | ||||||||||||||||||
| thrust::copy(counts.begin(), counts.begin() + 32, counts_h.begin()); | ||||||||||||||||||
|
|
||||||||||||||||||
| std::cout << "Compressed input (first 32 runs):" << std::endl; | ||||||||||||||||||
| for (CountType i = 0; i < 32; ++i) | ||||||||||||||||||
| { | ||||||||||||||||||
| std::cout << "(" << values_h[i] << "," << counts_h[i] << ")"; | ||||||||||||||||||
| if (i < 31) | ||||||||||||||||||
| { | ||||||||||||||||||
| std::cout << " "; | ||||||||||||||||||
| } | ||||||||||||||||||
| } | ||||||||||||||||||
| std::cout << std::endl << std::endl; | ||||||||||||||||||
|
|
||||||||||||||||||
| thrust::device_vector<ValueType> output(size * repeat); | ||||||||||||||||||
|
|
||||||||||||||||||
| auto out_end = run_length_decode(values.begin(), counts.begin(), values.size(), output.begin(), output.size()); | ||||||||||||||||||
|
|
||||||||||||||||||
| if (static_cast<CountType>(out_end - output.begin()) != size * repeat) | ||||||||||||||||||
| { | ||||||||||||||||||
| throw int{}; | ||||||||||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Important: let's not show bad practices like throwing exceptions not derived from Applies one more time below. |
||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
| thrust::host_vector<ValueType> observed(size * repeat); | ||||||||||||||||||
| thrust::copy(output.begin(), out_end, observed.begin()); | ||||||||||||||||||
|
|
||||||||||||||||||
| // Print first 32 elements of decompressed output | ||||||||||||||||||
| std::cout << "Decompressed output (first 32 elements):" << std::endl; | ||||||||||||||||||
| for (CountType i = 0; i < 32; ++i) | ||||||||||||||||||
| { | ||||||||||||||||||
| std::cout << observed[i]; | ||||||||||||||||||
| if (i < 31) | ||||||||||||||||||
| { | ||||||||||||||||||
| std::cout << " "; | ||||||||||||||||||
| } | ||||||||||||||||||
| } | ||||||||||||||||||
| std::cout << std::endl; | ||||||||||||||||||
|
|
||||||||||||||||||
| auto gold = std::views::iota(CountType{0}) | std::views::transform([=](auto x) { | ||||||||||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We do have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. They are even available in C++17 |
||||||||||||||||||
| return std::views::iota(CountType{0}, repeat) | std::views::transform([=](auto) { | ||||||||||||||||||
| return x; | ||||||||||||||||||
| }); | ||||||||||||||||||
| }) | ||||||||||||||||||
| | std::views::join; | ||||||||||||||||||
|
|
||||||||||||||||||
| if (!std::equal(observed.begin(), observed.end(), gold.begin())) | ||||||||||||||||||
| { | ||||||||||||||||||
| throw int{}; | ||||||||||||||||||
| } | ||||||||||||||||||
| } | ||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| CHECK: Compressed input (first 32 runs): | ||
| CHECK-NEXT: (0,4) (1,4) (2,4) (3,4) (4,4) (5,4) (6,4) (7,4) (8,4) (9,4) (10,4) (11,4) (12,4) (13,4) (14,4) (15,4) (16,4) (17,4) (18,4) (19,4) (20,4) (21,4) (22,4) (23,4) (24,4) (25,4) (26,4) (27,4) (28,4) (29,4) (30,4) (31,4) | ||
| CHECK-NEXT: {{^$}} | ||
| CHECK-NEXT: Decompressed output (first 32 elements): | ||
| CHECK-NEXT: 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5 6 6 6 6 7 7 7 7 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nitpick: What is the purpose here? do you want to delete move operations?