diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh index 351cea2dd28..7dceb902758 100644 --- a/cub/cub/agent/single_pass_scan_operators.cuh +++ b/cub/cub/agent/single_pass_scan_operators.cuh @@ -41,8 +41,8 @@ #include #include #include -#include #include +#include #include diff --git a/cub/cub/cub.cuh b/cub/cub/cub.cuh index 18dc2c0034d..c5cac22cf14 100644 --- a/cub/cub/cub.cuh +++ b/cub/cub/cub.cuh @@ -106,4 +106,5 @@ #include "util_device.cuh" #include "util_macro.cuh" #include "util_ptx.cuh" +#include "util_temporary_storage.cuh" #include "util_type.cuh" diff --git a/cub/cub/detail/temporary_storage.cuh b/cub/cub/detail/temporary_storage.cuh index 005a3763ef5..51cb3cc855c 100644 --- a/cub/cub/detail/temporary_storage.cuh +++ b/cub/cub/detail/temporary_storage.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include CUB_NAMESPACE_BEGIN diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index b393d3c8245..4621be04003 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -38,6 +38,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 994adc0954c..e3402b6d8c9 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -46,6 +46,7 @@ #include #include #include +#include #include diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh index f53ae3a27ba..2b595e91e79 100644 --- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -30,6 +30,7 @@ #include #include #include +#include #include CUB_NAMESPACE_BEGIN diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index 49bc19974e9..e06ddb5019e 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -33,6 +33,7 @@ #include #include #include +#include #include CUB_NAMESPACE_BEGIN diff --git a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh index 80282592915..d7ad21c808f 100644 --- a/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh @@ -34,6 +34,7 @@ #include #include #include +#include #include CUB_NAMESPACE_BEGIN diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index 0ea5c41ad62..eff4f29fcd2 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -33,6 +33,7 @@ #include #include #include +#include #include CUB_NAMESPACE_BEGIN diff --git a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh index 63897248644..92e4931fe81 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh @@ -34,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh index 27247ca70e5..41e5b34f5ae 100644 --- a/cub/cub/device/dispatch/tuning/tuning_select_if.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_select_if.cuh @@ -32,6 +32,7 @@ #include #include #include +#include #include CUB_NAMESPACE_BEGIN diff --git a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh index 8c8fabe79e0..51371dd461c 100644 --- a/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_three_way_partition.cuh @@ -32,6 +32,7 @@ #include #include #include +#include #include #include diff --git a/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh index b0a40205d98..3f63d897e90 100644 --- a/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_unique_by_key.cuh @@ -32,6 +32,7 @@ #include #include #include +#include #include #include diff --git a/cub/cub/iterator/arg_index_input_iterator.cuh b/cub/cub/iterator/arg_index_input_iterator.cuh index 822869a5a54..7ea860981e6 100644 --- a/cub/cub/iterator/arg_index_input_iterator.cuh +++ b/cub/cub/iterator/arg_index_input_iterator.cuh @@ -39,7 +39,6 @@ #include "../config.cuh" #include "../thread/thread_load.cuh" #include "../thread/thread_store.cuh" -#include "../util_device.cuh" #include diff --git a/cub/cub/iterator/cache_modified_input_iterator.cuh b/cub/cub/iterator/cache_modified_input_iterator.cuh index 6dcf2c8c02f..9a5936d5ac6 100644 --- a/cub/cub/iterator/cache_modified_input_iterator.cuh +++ b/cub/cub/iterator/cache_modified_input_iterator.cuh @@ -39,7 +39,6 @@ #include "../config.cuh" #include "../thread/thread_load.cuh" #include "../thread/thread_store.cuh" -#include "../util_device.cuh" #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer diff --git a/cub/cub/iterator/cache_modified_output_iterator.cuh b/cub/cub/iterator/cache_modified_output_iterator.cuh index be08dbce0bf..91d4fc91a7e 100644 --- a/cub/cub/iterator/cache_modified_output_iterator.cuh +++ b/cub/cub/iterator/cache_modified_output_iterator.cuh @@ -39,7 +39,6 @@ #include "../thread/thread_load.cuh" #include "../thread/thread_store.cuh" #include "../config.cuh" -#include "../util_device.cuh" #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer diff --git a/cub/cub/iterator/counting_input_iterator.cuh b/cub/cub/iterator/counting_input_iterator.cuh index 0272543710e..700455f420c 100644 --- a/cub/cub/iterator/counting_input_iterator.cuh +++ b/cub/cub/iterator/counting_input_iterator.cuh @@ -39,7 +39,6 @@ #include "../thread/thread_load.cuh" #include "../thread/thread_store.cuh" #include "../config.cuh" -#include "../util_device.cuh" #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer diff --git a/cub/cub/iterator/tex_obj_input_iterator.cuh b/cub/cub/iterator/tex_obj_input_iterator.cuh index 6ac249ce318..cd3d015aab8 100644 --- a/cub/cub/iterator/tex_obj_input_iterator.cuh +++ b/cub/cub/iterator/tex_obj_input_iterator.cuh @@ -37,7 +37,6 @@ #include #include #include -#include #include diff --git a/cub/cub/iterator/transform_input_iterator.cuh b/cub/cub/iterator/transform_input_iterator.cuh index 2ef7269c676..0b3350e88a4 100644 --- a/cub/cub/iterator/transform_input_iterator.cuh +++ b/cub/cub/iterator/transform_input_iterator.cuh @@ -39,7 +39,6 @@ #include "../thread/thread_load.cuh" #include "../thread/thread_store.cuh" #include "../config.cuh" -#include "../util_device.cuh" #if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index cb798372fc3..cc371826a91 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -28,7 +28,11 @@ /** * \file - * Properties of a given CUDA device and the corresponding PTX bundle + * Properties of a given CUDA device and the corresponding PTX bundle. + * + * \note + * This file contains __host__ only functions and utilities, and should not be + * included in code paths that could be online-compiled (ex: using NVRTC). */ #pragma once @@ -42,6 +46,8 @@ #include #include #include +// for backward compatibility +#include #include @@ -60,55 +66,6 @@ CUB_NAMESPACE_BEGIN #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document -/** - * \brief Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). - */ -template -__host__ __device__ __forceinline__ -cudaError_t AliasTemporaries( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Size in bytes of \t d_temp_storage allocation - void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed - size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed -{ - const int ALIGN_BYTES = 256; - const int ALIGN_MASK = ~(ALIGN_BYTES - 1); - - // Compute exclusive prefix sum over allocation requests - size_t allocation_offsets[ALLOCATIONS]; - size_t bytes_needed = 0; - for (int i = 0; i < ALLOCATIONS; ++i) - { - size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK; - allocation_offsets[i] = bytes_needed; - bytes_needed += allocation_bytes; - } - bytes_needed += ALIGN_BYTES - 1; - - // Check if the caller is simply requesting the size of the storage allocation - if (!d_temp_storage) - { - temp_storage_bytes = bytes_needed; - return cudaSuccess; - } - - // Check if enough storage provided - if (temp_storage_bytes < bytes_needed) - { - return CubDebug(cudaErrorInvalidValue); - } - - // Alias - d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK); - for (int i = 0; i < ALLOCATIONS; ++i) - { - allocations[i] = static_cast(d_temp_storage) + allocation_offsets[i]; - } - - return cudaSuccess; -} - - /** * \brief Empty kernel for querying PTX manifest metadata (e.g., version) for the current device */ diff --git a/cub/cub/util_temporary_storage.cuh b/cub/cub/util_temporary_storage.cuh new file mode 100644 index 00000000000..780efcd366e --- /dev/null +++ b/cub/cub/util_temporary_storage.cuh @@ -0,0 +1,101 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Utilities for device-accessible temporary storages. + */ + +#pragma once + +#include +#include + + +CUB_NAMESPACE_BEGIN + +/** + * \addtogroup UtilMgmt + * @{ + */ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * \brief Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). + */ +template +__host__ __device__ __forceinline__ +cudaError_t AliasTemporaries( + void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Size in bytes of \t d_temp_storage allocation + void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed + size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed +{ + const int ALIGN_BYTES = 256; + const int ALIGN_MASK = ~(ALIGN_BYTES - 1); + + // Compute exclusive prefix sum over allocation requests + size_t allocation_offsets[ALLOCATIONS]; + size_t bytes_needed = 0; + for (int i = 0; i < ALLOCATIONS; ++i) + { + size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK; + allocation_offsets[i] = bytes_needed; + bytes_needed += allocation_bytes; + } + bytes_needed += ALIGN_BYTES - 1; + + // Check if the caller is simply requesting the size of the storage allocation + if (!d_temp_storage) + { + temp_storage_bytes = bytes_needed; + return cudaSuccess; + } + + // Check if enough storage provided + if (temp_storage_bytes < bytes_needed) + { + return CubDebug(cudaErrorInvalidValue); + } + + // Alias + d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK); + for (int i = 0; i < ALLOCATIONS; ++i) + { + allocations[i] = static_cast(d_temp_storage) + allocation_offsets[i]; + } + + return cudaSuccess; +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** @} */ // end group UtilMgmt + +CUB_NAMESPACE_END diff --git a/thrust/thrust/system/cuda/detail/core/util.h b/thrust/thrust/system/cuda/detail/core/util.h index e2f5f8299e4..3741aec2848 100644 --- a/thrust/thrust/system/cuda/detail/core/util.h +++ b/thrust/thrust/system/cuda/detail/core/util.h @@ -36,6 +36,7 @@ #include #include #include +#include #include