diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 65347bd6689..5e79204a558 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -674,6 +674,7 @@ add_library( src/utilities/linked_column.cpp src/utilities/logger.cpp src/utilities/pinned_memory.cpp + src/utilities/prefetch.cpp src/utilities/stacktrace.cpp src/utilities/stream_pool.cpp src/utilities/traits.cpp diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 134e835911f..03352fdce13 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,9 @@ #pragma once #include +#include #include +#include #include #include #include @@ -72,7 +74,7 @@ class column_view_base { CUDF_ENABLE_IF(std::is_same_v or is_rep_layout_compatible())> T const* head() const noexcept { - return static_cast(_data); + return static_cast(get_data()); } /** @@ -225,6 +227,17 @@ class column_view_base { [[nodiscard]] size_type offset() const noexcept { return _offset; } protected: + /** + * @brief Returns pointer to the base device memory allocation. + * + * The primary purpose of this function is to allow derived classes to + * override the fundamental properties of memory accesses without needing to + * change all of the different accessors for the underlying pointer. 
+ * + * @return Untyped (`void const*`) pointer to the underlying data + */ + virtual void const* get_data() const noexcept { return _data; } + data_type _type{type_id::EMPTY}; ///< Element type size_type _size{}; ///< Number of elements void const* _data{}; ///< Pointer to device memory containing elements @@ -236,7 +249,7 @@ class column_view_base { ///< Enables zero-copy slicing column_view_base() = default; - ~column_view_base() = default; + virtual ~column_view_base() = default; column_view_base(column_view_base const&) = default; ///< Copy constructor column_view_base(column_view_base&&) = default; ///< Move constructor /** @@ -283,11 +296,6 @@ class column_view_base { size_type null_count, size_type offset = 0); }; - -class mutable_column_view_base : public column_view_base { - public: - protected: -}; } // namespace detail /** @@ -323,7 +331,7 @@ class column_view : public detail::column_view_base { #ifdef __CUDACC__ #pragma nv_exec_check_disable #endif - ~column_view() = default; + ~column_view() override = default; #ifdef __CUDACC__ #pragma nv_exec_check_disable #endif @@ -447,6 +455,18 @@ class column_view : public detail::column_view_base { return device_span(data(), size()); } + protected: + /** + * @brief Returns pointer to the base device memory allocation. + * + * The primary purpose of this function is to allow derived classes to + * override the fundamental properties of memory accesses without needing to + * change all of the different accessors for the underlying pointer. 
+ * + * @return Untyped (`void const*`) pointer to the underlying data + */ + void const* get_data() const noexcept override; + private: friend column_view bit_cast(column_view const& input, data_type type); @@ -478,7 +498,7 @@ class mutable_column_view : public detail::column_view_base { public: mutable_column_view() = default; - ~mutable_column_view(){ + ~mutable_column_view() override{ // Needed so that the first instance of the implicit destructor for any TU isn't 'constructed' // from a host+device function marking the implicit version also as host+device }; @@ -572,7 +592,7 @@ class mutable_column_view : public detail::column_view_base { } /** - * @brief Return first element (accounting for offset) when underlying data is + * @brief Return first element (accounting for offset) after underlying data is * casted to the specified type. * * This function does not participate in overload resolution if `is_rep_layout_compatible` is @@ -665,6 +685,18 @@ class mutable_column_view : public detail::column_view_base { */ operator column_view() const; + protected: + /** + * @brief Returns pointer to the base device memory allocation. + * + * The primary purpose of this function is to allow derived classes to + * override the fundamental properties of memory accesses without needing to + * change all of the different accessors for the underlying pointer. 
+ * + * @return Untyped (`void const*`) pointer to the underlying data + */ + void const* get_data() const noexcept override; + private: friend mutable_column_view bit_cast(mutable_column_view const& input, data_type type); diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp index aabfff746ea..b4ec5f2cc69 100644 --- a/cpp/include/cudf/detail/join.hpp +++ b/cpp/include/cudf/detail/join.hpp @@ -40,9 +40,6 @@ class preprocessed_table; namespace cudf { namespace detail { -// Forward declaration -class cuco_allocator; - constexpr int DEFAULT_JOIN_CG_SIZE = 2; enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN }; diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index fcd74bebfe8..4369de317b3 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -18,11 +18,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include @@ -230,7 +232,8 @@ rmm::device_uvector gather_chars(StringIterator strings_begin, if (output_count == 0) return rmm::device_uvector(0, stream, mr); auto chars_data = rmm::device_uvector(chars_bytes, stream, mr); - auto d_chars = chars_data.data(); + cudf::experimental::prefetch::detail::prefetch("gather", chars_data, stream); + auto d_chars = chars_data.data(); constexpr int warps_per_threadblock = 4; // String parallel strategy will be used if average string length is above this threshold. 
@@ -312,6 +315,8 @@ std::unique_ptr gather(strings_column_view const& strings, // build chars column auto const offsets_view = cudf::detail::offsetalator_factory::make_input_iterator(out_offsets_column->view()); + cudf::experimental::prefetch::detail::prefetch( + "gather", strings.chars_begin(stream), strings.chars_size(stream), stream); auto out_chars_data = gather_chars( d_strings->begin(), begin, end, offsets_view, total_bytes, stream, mr); diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index f5f3982a5d6..55b59dd4ff2 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -186,6 +187,7 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, // Now build the chars column rmm::device_uvector chars(bytes, stream, mr); + cudf::experimental::prefetch::detail::prefetch("gather", chars, stream); size_and_exec_fn.d_chars = chars.data(); // Execute the function fn again to fill in the chars data. diff --git a/cpp/include/cudf/utilities/prefetch.hpp b/cpp/include/cudf/utilities/prefetch.hpp new file mode 100644 index 00000000000..5ca6fd6f4b0 --- /dev/null +++ b/cpp/include/cudf/utilities/prefetch.hpp @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +#include +#include +#include + +namespace cudf::experimental::prefetch { + +namespace detail { + +/** + * @brief A singleton class that manages the prefetching configuration. + */ +class PrefetchConfig { + public: + PrefetchConfig& operator=(const PrefetchConfig&) = delete; + PrefetchConfig(const PrefetchConfig&) = delete; + + /** + * @brief Get the singleton instance of the prefetching configuration. + * + * @return The singleton instance of the prefetching configuration. + */ + static PrefetchConfig& instance(); + + /** + * @brief Get the value of a configuration key. + * + * @param key The configuration key. + * @return The value of the configuration key, or false if the key was never set. + */ + bool get(std::string_view key); + /** + * @brief Set the value of a configuration key. + * + * @param key The configuration key. + * @param value The value to set. + */ + void set(std::string_view key, bool value); + /** + * @brief Enable or disable debug mode. + * + * In debug mode, the pointers being prefetched are printed to stderr. + */ + bool debug{false}; + + private: + PrefetchConfig() = default; ///< Private constructor to enforce singleton pattern + std::map config_values; ///< Map of configuration keys to values +}; + +/** + * @brief Prefetch a memory region if prefetching is enabled for the given key. + * + * @param key The configuration key that must be enabled for the prefetch to be issued. + * @param ptr The pointer to prefetch. + * @param size The size in bytes of the memory region to prefetch. + * @param stream The stream to prefetch on. + * @param device_id The device to prefetch on. + */ +void prefetch(std::string_view key, + void const* ptr, + std::size_t size, + rmm::cuda_stream_view stream, + rmm::cuda_device_id device_id = rmm::get_current_cuda_device()); + +/** + * @brief Prefetch a memory region if prefetching is enabled for the given key. + * + * @note This function will not throw exceptions, so it is safe to call in + * noexcept contexts. If an error occurs, the error code is returned. 
This + * function primarily exists for [mutable_]column_view::get_data and should be + * removed once a method for stream-ordered data pointer access is added to + * those data structures. + * + * @param key The configuration key that must be enabled for the prefetch to be issued. + * @param ptr The pointer to prefetch. + * @param size The size in bytes of the memory region to prefetch. + * @param stream The stream to prefetch on. + * @param device_id The device to prefetch on. + * @return The CUDA error code from the prefetch, or cudaSuccess if the key is disabled. + */ +cudaError_t prefetch_noexcept( + std::string_view key, + void const* ptr, + std::size_t size, + rmm::cuda_stream_view stream, + rmm::cuda_device_id device_id = rmm::get_current_cuda_device()) noexcept; + +/** + * @brief Prefetch the data in a device_uvector. + * + * @note `v.size()` is an element count; it is scaled by the element size to get bytes. + * + * @param key The configuration key that must be enabled for the prefetch to be issued. + * @param v The device_uvector to prefetch. + * @param stream The stream to prefetch on. + * @param device_id The device to prefetch on. + */ +template +void prefetch(std::string_view key, + rmm::device_uvector const& v, + rmm::cuda_stream_view stream, + rmm::cuda_device_id device_id = rmm::get_current_cuda_device()) +{ + if (v.is_empty()) { return; } + prefetch(key, v.data(), v.size() * sizeof(*v.data()), stream, device_id); +} + +} // namespace detail + +/** + * @brief Enable prefetching for a particular structure or algorithm. + * + * @param key The key to enable prefetching for. + */ +void enable_prefetching(std::string_view key); + +/** + * @brief Disable prefetching for a particular structure or algorithm. + * + * @param key The key to disable prefetching for. + */ +void disable_prefetching(std::string_view key); + +/** + * @brief Enable or disable debug mode. + * + * In debug mode, the pointers being prefetched are printed to stderr. + * + * @param enable Whether to enable or disable debug mode. 
+ */ +void prefetch_debugging(bool enable); + +} // namespace cudf::experimental::prefetch diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 4d16298c605..a9605efb362 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -15,8 +15,10 @@ */ #include +#include #include #include +#include #include #include #include @@ -27,10 +29,42 @@ #include #include #include +#include #include namespace cudf { namespace detail { +namespace { + +/** + * @brief Prefetch a column's data buffer when prefetching is enabled for `key`. + * + * Fixed-width and string columns are prefetched; other types are only reported on stderr. + */ +template +void prefetch_col_data(ColumnView& col, void const* data_ptr, std::string_view key) noexcept +{ + if (cudf::experimental::prefetch::detail::PrefetchConfig::instance().get(key)) { + if (cudf::is_fixed_width(col.type())) { + cudf::experimental::prefetch::detail::prefetch_noexcept( + key, data_ptr, col.size() * size_of(col.type()), cudf::get_default_stream()); + } else if (col.type().id() == type_id::STRING) { + strings_column_view scv{col}; + + cudf::experimental::prefetch::detail::prefetch_noexcept( + key, + data_ptr, + scv.chars_size(cudf::get_default_stream()) * sizeof(char), + cudf::get_default_stream()); + } else { + std::cerr << key << ": Unsupported type: " << static_cast(col.type().id()) + << std::endl; + } + } +} + +} // namespace + column_view_base::column_view_base(data_type type, size_type size, void const* data, @@ -126,6 +160,7 @@ bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs) { return shallow_equivalent_impl(lhs, rhs); } + } // namespace detail // Immutable view constructor @@ -175,6 +210,18 @@ mutable_column_view::operator column_view() const return column_view{_type, _size, _data, _null_mask, _null_count, _offset, std::move(child_views)}; } +void const* column_view::get_data() const noexcept +{ + detail::prefetch_col_data(*this, _data, "column_view::get_data"); + return _data; +} + +void const* mutable_column_view::get_data() const noexcept +{ + detail::prefetch_col_data(*this, _data, "mutable_column_view::get_data"); + return 
_data; +} + size_type count_descendants(column_view parent) { auto descendants = [](auto const& child) { return count_descendants(child); }; diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index b0184ff6a86..eb9b687630b 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -185,6 +185,8 @@ probe_join_hash_table( auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); + cudf::experimental::prefetch::detail::prefetch("hash_join", *left_indices, stream); + cudf::experimental::prefetch::detail::prefetch("hash_join", *right_indices, stream); auto const probe_nulls = cudf::nullate::DYNAMIC{has_nulls}; diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp new file mode 100644 index 00000000000..21f2e40c82a --- /dev/null +++ b/cpp/src/utilities/prefetch.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include + +#include + +namespace cudf::experimental::prefetch { + +namespace detail { + +PrefetchConfig& PrefetchConfig::instance() +{ + static PrefetchConfig instance; + return instance; +} + +bool PrefetchConfig::get(std::string_view key) +{ + auto const it = config_values.find(std::string{key}); // data() need not be null-terminated + return it != config_values.end() && it->second; // unset keys default to not prefetching +} +void PrefetchConfig::set(std::string_view key, bool value) +{ + config_values[std::string{key}] = value; +} + +cudaError_t prefetch_noexcept(std::string_view key, + void const* ptr, + std::size_t size, + rmm::cuda_stream_view stream, + rmm::cuda_device_id device_id) noexcept +{ + if (PrefetchConfig::instance().get(key)) { + if (PrefetchConfig::instance().debug) { + std::cerr << "Prefetching " << size << " bytes for key " << key << " at location " << ptr + << std::endl; + } + auto result = cudaMemPrefetchAsync(ptr, size, device_id.value(), stream.value()); + // Need to flush the CUDA error so that the context is not corrupted. + if (result == cudaErrorInvalidValue) { cudaGetLastError(); } + return result; + } + return cudaSuccess; +} + +void prefetch(std::string_view key, + void const* ptr, + std::size_t size, + rmm::cuda_stream_view stream, + rmm::cuda_device_id device_id) +{ + auto result = prefetch_noexcept(key, ptr, size, stream, device_id); + // Ignore cudaErrorInvalidValue because that will be raised if prefetching is + // attempted on unmanaged memory. 
+ if ((result != cudaErrorInvalidValue) && (result != cudaSuccess)) { + std::cerr << "Prefetch failed" << std::endl; + CUDF_CUDA_TRY(result); + } +} + +} // namespace detail + +void enable_prefetching(std::string_view key) { detail::PrefetchConfig::instance().set(key, true); } + +void disable_prefetching(std::string_view key) +{ + detail::PrefetchConfig::instance().set(key, false); +} + +void prefetch_debugging(bool enable) { detail::PrefetchConfig::instance().debug = enable; } +} // namespace cudf::experimental::prefetch diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 0800fa18e94..df4591baa71 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -20,6 +20,7 @@ set(cython_sources concatenate.pyx copying.pyx datetime.pyx + experimental.pyx expressions.pyx filling.pyx gpumemoryview.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 26e89b818d3..71f523fc3cd 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -8,6 +8,7 @@ from . 
cimport ( concatenate, copying, datetime, + experimental, expressions, filling, groupby, @@ -48,6 +49,8 @@ __all__ = [ "concatenate", "copying", "datetime", + "experimental", + "expressions", "filling", "gpumemoryview", "groupby", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index e89a5ed9f96..9705eba84b1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -7,6 +7,7 @@ concatenate, copying, datetime, + experimental, expressions, filling, groupby, @@ -48,6 +49,8 @@ "concatenate", "copying", "datetime", + "experimental", + "expressions", "filling", "gpumemoryview", "groupby", diff --git a/python/cudf/cudf/_lib/pylibcudf/experimental.pxd b/python/cudf/cudf/_lib/pylibcudf/experimental.pxd new file mode 100644 index 00000000000..107c91c8365 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/experimental.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool + + +cpdef enable_prefetching(str key) + +cpdef disable_prefetching(str key) + +cpdef prefetch_debugging(bool enable) diff --git a/python/cudf/cudf/_lib/pylibcudf/experimental.pyx b/python/cudf/cudf/_lib/pylibcudf/experimental.pyx new file mode 100644 index 00000000000..1e2a682d879 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/experimental.pyx @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.string cimport string + +from cudf._lib.pylibcudf.libcudf cimport experimental as cpp_experimental + + +cpdef enable_prefetching(str key): + """Turn on prefetch instructions for the given key. + + Parameters + ---------- + key : str + The key to enable prefetching for. + """ + cdef string c_key = key.encode("utf-8") + cpp_experimental.enable_prefetching(c_key) + + +cpdef disable_prefetching(str key): + """Turn off prefetch instructions for the given key. 
+ + Parameters + ---------- + key : str + The key to disable prefetching for. + """ + cdef string c_key = key.encode("utf-8") + cpp_experimental.disable_prefetching(c_key) + + +cpdef prefetch_debugging(bool enable): + """Enable or disable prefetch debugging. + + When enabled, every prefetch that is issued is printed to stderr. + + Parameters + ---------- + enable : bool + Whether to enable or disable prefetch debugging. + """ + cpp_experimental.prefetch_debugging(enable) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/experimental.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/experimental.pxd new file mode 100644 index 00000000000..f280a382a04 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/experimental.pxd @@ -0,0 +1,16 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.string cimport string + + +cdef extern from "cudf/utilities/prefetch.hpp" \ + namespace "cudf::experimental::prefetch" nogil: + # Not technically the right signature, but it's good enough to let Cython + # generate valid C++ code. It just means we'll make one extra copy of a host + # string, but that's OK. If we care we could generate string_view bindings, + # but there's no real rush so if we go that route we might as well + # contribute them upstream to Cython itself. + void enable_prefetching(string key) + void disable_prefetching(string key) + void prefetch_debugging(bool enable)