From e56e0a48e71b798cfd36078b4c2f1a54b852153a Mon Sep 17 00:00:00 2001 From: Anja Kefala Date: Mon, 21 Oct 2024 15:46:48 -0700 Subject: [PATCH] GH-34535: [C++] Move ChunkResolver to the public API (#44357) ### Rationale for this change Adopting https://github.com/apache/arrow/pull/40226. The creation and return of a shared_ptr does result in some performance overhead, that makes a difference for a performance-sensitive application. If someone could use ChunkResolver to learn the indices, they could then instead access the data directly. ### What changes are included in this PR? - [X] Updates to documentation (thanks to @ SChakravorti21 ) - [X] Moving `ChunkResolver` to public API, and updating all references to it in the code ### Are these changes tested? There seemed to be comprehensive tests already: https://github.com/apache/arrow/blob/main/cpp/src/arrow/chunked_array_test.cc#L324 If an edgecase is missing, I'd be happy to add it. ### Are there any user-facing changes? `ChunkResolver` and `TypedChunkLocation` are now in the public API. * GitHub Issue: #34535 Lead-authored-by: Anja Kefala Co-authored-by: anjakefala Co-authored-by: Bryce Mecum Co-authored-by: SChakravorti21 Co-authored-by: SChakravorti21 Co-authored-by: Jacob Wujciak-Jens Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/chunk_resolver.cc | 4 +- cpp/src/arrow/chunk_resolver.h | 40 +++++++++++-------- cpp/src/arrow/chunk_resolver_benchmark.cc | 3 -- cpp/src/arrow/chunked_array.h | 2 +- cpp/src/arrow/chunked_array_test.cc | 4 -- .../arrow/compute/kernels/chunked_internal.h | 2 +- cpp/src/arrow/compute/kernels/vector_sort.cc | 3 +- .../compute/kernels/vector_sort_internal.h | 4 +- cpp/src/arrow/stl_iterator.h | 2 +- docs/source/cpp/api/array.rst | 14 +++++++ r/src/altrep.cpp | 14 +++++-- 11 files changed, 55 insertions(+), 37 deletions(-) diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index bda6b17810299..ca74ffa06c820 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -26,7 +26,7 @@ #include "arrow/array.h" #include "arrow/record_batch.h" -namespace arrow::internal { +namespace arrow { namespace { template @@ -167,4 +167,4 @@ void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint64_t* logical_i logical_index_vec, out_chunk_location_vec, chunk_hint); } -} // namespace arrow::internal +} // namespace arrow diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index 4a5e27c05361f..ab0e753d0040e 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -27,12 +27,12 @@ #include "arrow/type_fwd.h" #include "arrow/util/macros.h" -namespace arrow::internal { +namespace arrow { -struct ChunkResolver; +class ChunkResolver; template -struct TypedChunkLocation { +struct ARROW_EXPORT TypedChunkLocation { /// \brief Index of the chunk in the array of chunks /// /// The value is always in the range `[0, chunks.size()]`. `chunks.size()` is used @@ -41,7 +41,7 @@ struct TypedChunkLocation { /// \brief Index of the value in the chunk /// - /// The value is UNDEFINED if chunk_index >= chunks.size() + /// The value is UNDEFINED if `chunk_index >= chunks.size()` IndexType index_in_chunk = 0; TypedChunkLocation() = default; @@ -61,7 +61,7 @@ using ChunkLocation = TypedChunkLocation; /// \brief An utility that incrementally resolves logical indices into /// physical indices in a chunked array. -struct ARROW_EXPORT ChunkResolver { +class ARROW_EXPORT ChunkResolver { private: /// \brief Array containing `chunks.size() + 1` offsets. /// @@ -76,14 +76,11 @@ struct ARROW_EXPORT ChunkResolver { public: explicit ChunkResolver(const ArrayVector& chunks) noexcept; + explicit ChunkResolver(const std::vector& chunks) noexcept; + explicit ChunkResolver(const RecordBatchVector& batches) noexcept; - /// \brief Construct a ChunkResolver from a vector of chunks.size() + 1 offsets. - /// - /// The first offset must be 0 and the last offset must be the logical length of the - /// chunked array. Each offset before the last represents the starting logical index of - /// the corresponding chunk. explicit ChunkResolver(std::vector offsets) noexcept : offsets_(std::move(offsets)), cached_chunk_(0) { #ifndef NDEBUG @@ -115,11 +112,11 @@ struct ARROW_EXPORT ChunkResolver { /// The returned ChunkLocation contains the chunk index and the within-chunk index /// equivalent to the logical index. /// - /// \pre index >= 0 - /// \post location.chunk_index in [0, chunks.size()] + /// \pre `index >= 0` + /// \post `location.chunk_index` in `[0, chunks.size()]` /// \param index The logical index to resolve /// \return ChunkLocation with a valid chunk_index if index is within - /// bounds, or with chunk_index == chunks.size() if logical index is + /// bounds, or with `chunk_index == chunks.size()` if logical index is /// `>= chunked_array.length()`. inline ChunkLocation Resolve(int64_t index) const { const auto cached_chunk = cached_chunk_.load(std::memory_order_relaxed); @@ -133,13 +130,13 @@ struct ARROW_EXPORT ChunkResolver { /// The returned ChunkLocation contains the chunk index and the within-chunk index /// equivalent to the logical index. /// - /// \pre index >= 0 - /// \post location.chunk_index in [0, chunks.size()] + /// \pre `index >= 0` + /// \post `location.chunk_index` in `[0, chunks.size()]` /// \param index The logical index to resolve /// \param hint ChunkLocation{} or the last ChunkLocation returned by /// this ChunkResolver. /// \return ChunkLocation with a valid chunk_index if index is within - /// bounds, or with chunk_index == chunks.size() if logical index is + /// bounds, or with `chunk_index == chunks.size()` if logical index is /// `>= chunked_array.length()`. inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint) const { assert(hint.chunk_index < static_cast(offsets_.size())); @@ -281,4 +278,13 @@ struct ARROW_EXPORT ChunkResolver { } }; -} // namespace arrow::internal +// Explicitly instantiate template base struct, for DLL linking on Windows +template struct TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; +} // namespace arrow diff --git a/cpp/src/arrow/chunk_resolver_benchmark.cc b/cpp/src/arrow/chunk_resolver_benchmark.cc index 0756de3fbe930..a6f539a444bbc 100644 --- a/cpp/src/arrow/chunk_resolver_benchmark.cc +++ b/cpp/src/arrow/chunk_resolver_benchmark.cc @@ -28,9 +28,6 @@ namespace arrow { -using internal::ChunkResolver; -using internal::TypedChunkLocation; - namespace { int64_t constexpr kChunkedArrayLength = std::numeric_limits::max(); diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index c65b6cb6e227f..02bcd0f9026bc 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -199,7 +199,7 @@ class ARROW_EXPORT ChunkedArray { private: template friend class ::arrow::stl::ChunkedArrayIterator; - internal::ChunkResolver chunk_resolver_; + ChunkResolver chunk_resolver_; ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); }; diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index f98dde689c237..b3944fd1b1927 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -36,10 +36,6 @@ namespace arrow { -using internal::ChunkLocation; -using internal::ChunkResolver; -using internal::TypedChunkLocation; - class TestChunkedArray : public ::testing::Test { protected: virtual void Construct() { diff --git a/cpp/src/arrow/compute/kernels/chunked_internal.h b/cpp/src/arrow/compute/kernels/chunked_internal.h index 2b72e0ab3109e..f7cb615f3ed81 100644 --- a/cpp/src/arrow/compute/kernels/chunked_internal.h +++ b/cpp/src/arrow/compute/kernels/chunked_internal.h @@ -52,7 +52,7 @@ struct ResolvedChunk { class ChunkedArrayResolver { private: - ::arrow::internal::ChunkResolver resolver_; + ChunkResolver resolver_; std::vector chunks_; public: diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index 8766ca3baac96..395ed86a06b4a 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -24,7 +24,6 @@ namespace arrow { using internal::checked_cast; -using internal::ChunkLocation; namespace compute { namespace internal { @@ -852,7 +851,7 @@ class TableSorter { const RecordBatchVector batches_; const SortOptions& options_; const NullPlacement null_placement_; - const ::arrow::internal::ChunkResolver left_resolver_, right_resolver_; + const ::arrow::ChunkResolver left_resolver_, right_resolver_; const std::vector sort_keys_; uint64_t* indices_begin_; uint64_t* indices_end_; diff --git a/cpp/src/arrow/compute/kernels/vector_sort_internal.h b/cpp/src/arrow/compute/kernels/vector_sort_internal.h index 564afb8c087d2..bee7f838a05da 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort_internal.h +++ b/cpp/src/arrow/compute/kernels/vector_sort_internal.h @@ -749,9 +749,9 @@ struct ResolvedTableSortKey { order(order), null_count(null_count) {} - using LocationType = ::arrow::internal::ChunkLocation; + using LocationType = ::arrow::ChunkLocation; - ResolvedChunk GetChunk(::arrow::internal::ChunkLocation loc) const { + ResolvedChunk GetChunk(::arrow::ChunkLocation loc) const { return {chunks[loc.chunk_index], loc.index_in_chunk}; } diff --git a/cpp/src/arrow/stl_iterator.h b/cpp/src/arrow/stl_iterator.h index 5f2acfb071b29..577066cba0fcd 100644 --- a/cpp/src/arrow/stl_iterator.h +++ b/cpp/src/arrow/stl_iterator.h @@ -237,7 +237,7 @@ class ChunkedArrayIterator { } private: - arrow::internal::ChunkLocation GetChunkLocation(int64_t index) const { + arrow::ChunkLocation GetChunkLocation(int64_t index) const { assert(chunked_array_); return chunked_array_->chunk_resolver_.Resolve(index); } diff --git a/docs/source/cpp/api/array.rst b/docs/source/cpp/api/array.rst index a7e5d0cf07e0a..133a432884d3b 100644 --- a/docs/source/cpp/api/array.rst +++ b/docs/source/cpp/api/array.rst @@ -19,6 +19,9 @@ Arrays ====== +Base classes +============ + .. doxygenclass:: arrow::ArrayData :project: arrow_cpp :members: @@ -85,6 +88,17 @@ Chunked Arrays :project: arrow_cpp :members: +.. doxygenstruct:: arrow::ChunkLocation + :project: arrow_cpp + :members: + +.. doxygenstruct:: arrow::TypedChunkLocation + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::ChunkResolver + :project: arrow_cpp + :members: Utilities ========= diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp index bdaac0a9ce5d2..90a459e19cb6d 100644 --- a/r/src/altrep.cpp +++ b/r/src/altrep.cpp @@ -80,6 +80,14 @@ void DeletePointer(std::shared_ptr* ptr) { template using Pointer = cpp11::external_pointer, DeletePointer>; +#if ARROW_VERSION_MAJOR >= 18 +using ChunkResolver = arrow::ChunkResolver; +using ChunkLocation = arrow::ChunkLocation; +#else +using ChunkResolver = arrow::internal::ChunkResolver; +using ChunkLocation = arrow::internal::ChunkLocation; +#endif + class ArrowAltrepData { public: explicit ArrowAltrepData(const std::shared_ptr& chunked_array) @@ -87,13 +95,11 @@ class ArrowAltrepData { const std::shared_ptr& chunked_array() { return chunked_array_; } - arrow::internal::ChunkLocation locate(int64_t index) { - return resolver_.Resolve(index); - } + ChunkLocation locate(int64_t index) { return resolver_.Resolve(index); } private: std::shared_ptr chunked_array_; - arrow::internal::ChunkResolver resolver_; + ChunkResolver resolver_; }; // the ChunkedArray that is being wrapped by the altrep object