Skip to content

Commit

Permalink
GH-34535: [C++] Move ChunkResolver to the public API (#44357)
Browse files Browse the repository at this point in the history
### Rationale for this change

Adopting #40226. 

 The creation and return of a shared_ptr does result in some performance overhead, that makes a difference for a performance-sensitive application.

If someone could use ChunkResolver to learn the indices, they could then instead access the data directly. 

### What changes are included in this PR?

- [X] Updates to documentation (thanks to @ SChakravorti21 )
- [X] Moving `ChunkResolver` to public API, and updating all references to it in the code

### Are these changes tested?

There seemed to be comprehensive tests already: https://github.com/apache/arrow/blob/main/cpp/src/arrow/chunked_array_test.cc#L324 If an edgecase is missing, I'd be happy to add it.

### Are there any user-facing changes?

`ChunkResolver` and `TypedChunkLocation` are now in the public API.
* GitHub Issue: #34535

Lead-authored-by: Anja Kefala <[email protected]>
Co-authored-by: anjakefala <[email protected]>
Co-authored-by: Bryce Mecum <[email protected]>
Co-authored-by: SChakravorti21 <[email protected]>
Co-authored-by: SChakravorti21<[email protected]>
Co-authored-by: Jacob Wujciak-Jens <[email protected]>
Signed-off-by: Felipe Oliveira Carvalho <[email protected]>
  • Loading branch information
4 people authored Oct 21, 2024
1 parent 375b21f commit 8208774
Show file tree
Hide file tree
Showing 11 changed files with 55 additions and 37 deletions.
4 changes: 2 additions & 2 deletions cpp/src/arrow/chunk_resolver.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#include "arrow/array.h"
#include "arrow/record_batch.h"

namespace arrow::internal {
namespace arrow {

namespace {
template <typename T>
Expand Down Expand Up @@ -167,4 +167,4 @@ void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint64_t* logical_i
logical_index_vec, out_chunk_location_vec, chunk_hint);
}

} // namespace arrow::internal
} // namespace arrow
40 changes: 23 additions & 17 deletions cpp/src/arrow/chunk_resolver.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"

namespace arrow::internal {
namespace arrow {

struct ChunkResolver;
class ChunkResolver;

template <typename IndexType>
struct TypedChunkLocation {
struct ARROW_EXPORT TypedChunkLocation {
/// \brief Index of the chunk in the array of chunks
///
/// The value is always in the range `[0, chunks.size()]`. `chunks.size()` is used
Expand All @@ -41,7 +41,7 @@ struct TypedChunkLocation {

/// \brief Index of the value in the chunk
///
/// The value is UNDEFINED if chunk_index >= chunks.size()
/// The value is UNDEFINED if `chunk_index >= chunks.size()`
IndexType index_in_chunk = 0;

TypedChunkLocation() = default;
Expand All @@ -61,7 +61,7 @@ using ChunkLocation = TypedChunkLocation<int64_t>;

/// \brief An utility that incrementally resolves logical indices into
/// physical indices in a chunked array.
struct ARROW_EXPORT ChunkResolver {
class ARROW_EXPORT ChunkResolver {
private:
/// \brief Array containing `chunks.size() + 1` offsets.
///
Expand All @@ -76,14 +76,11 @@ struct ARROW_EXPORT ChunkResolver {

public:
explicit ChunkResolver(const ArrayVector& chunks) noexcept;

explicit ChunkResolver(const std::vector<const Array*>& chunks) noexcept;

explicit ChunkResolver(const RecordBatchVector& batches) noexcept;

/// \brief Construct a ChunkResolver from a vector of chunks.size() + 1 offsets.
///
/// The first offset must be 0 and the last offset must be the logical length of the
/// chunked array. Each offset before the last represents the starting logical index of
/// the corresponding chunk.
explicit ChunkResolver(std::vector<int64_t> offsets) noexcept
: offsets_(std::move(offsets)), cached_chunk_(0) {
#ifndef NDEBUG
Expand Down Expand Up @@ -115,11 +112,11 @@ struct ARROW_EXPORT ChunkResolver {
/// The returned ChunkLocation contains the chunk index and the within-chunk index
/// equivalent to the logical index.
///
/// \pre index >= 0
/// \post location.chunk_index in [0, chunks.size()]
/// \pre `index >= 0`
/// \post `location.chunk_index` in `[0, chunks.size()]`
/// \param index The logical index to resolve
/// \return ChunkLocation with a valid chunk_index if index is within
/// bounds, or with chunk_index == chunks.size() if logical index is
/// bounds, or with `chunk_index == chunks.size()` if logical index is
/// `>= chunked_array.length()`.
inline ChunkLocation Resolve(int64_t index) const {
const auto cached_chunk = cached_chunk_.load(std::memory_order_relaxed);
Expand All @@ -133,13 +130,13 @@ struct ARROW_EXPORT ChunkResolver {
/// The returned ChunkLocation contains the chunk index and the within-chunk index
/// equivalent to the logical index.
///
/// \pre index >= 0
/// \post location.chunk_index in [0, chunks.size()]
/// \pre `index >= 0`
/// \post `location.chunk_index` in `[0, chunks.size()]`
/// \param index The logical index to resolve
/// \param hint ChunkLocation{} or the last ChunkLocation returned by
/// this ChunkResolver.
/// \return ChunkLocation with a valid chunk_index if index is within
/// bounds, or with chunk_index == chunks.size() if logical index is
/// bounds, or with `chunk_index == chunks.size()` if logical index is
/// `>= chunked_array.length()`.
inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint) const {
assert(hint.chunk_index < static_cast<uint32_t>(offsets_.size()));
Expand Down Expand Up @@ -281,4 +278,13 @@ struct ARROW_EXPORT ChunkResolver {
}
};

} // namespace arrow::internal
// Explicitly instantiate template base struct, for DLL linking on Windows
template struct TypedChunkLocation<int32_t>;
template struct TypedChunkLocation<int16_t>;
template struct TypedChunkLocation<int8_t>;
template struct TypedChunkLocation<uint8_t>;
template struct TypedChunkLocation<uint16_t>;
template struct TypedChunkLocation<uint32_t>;
template struct TypedChunkLocation<int64_t>;
template struct TypedChunkLocation<uint64_t>;
} // namespace arrow
3 changes: 0 additions & 3 deletions cpp/src/arrow/chunk_resolver_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,6 @@

namespace arrow {

using internal::ChunkResolver;
using internal::TypedChunkLocation;

namespace {

int64_t constexpr kChunkedArrayLength = std::numeric_limits<uint16_t>::max();
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/chunked_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ class ARROW_EXPORT ChunkedArray {
private:
template <typename T, typename V>
friend class ::arrow::stl::ChunkedArrayIterator;
internal::ChunkResolver chunk_resolver_;
ChunkResolver chunk_resolver_;
ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray);
};

Expand Down
4 changes: 0 additions & 4 deletions cpp/src/arrow/chunked_array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,6 @@

namespace arrow {

using internal::ChunkLocation;
using internal::ChunkResolver;
using internal::TypedChunkLocation;

class TestChunkedArray : public ::testing::Test {
protected:
virtual void Construct() {
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/kernels/chunked_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ struct ResolvedChunk {

class ChunkedArrayResolver {
private:
::arrow::internal::ChunkResolver resolver_;
ChunkResolver resolver_;
std::vector<const Array*> chunks_;

public:
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/arrow/compute/kernels/vector_sort.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
namespace arrow {

using internal::checked_cast;
using internal::ChunkLocation;

namespace compute {
namespace internal {
Expand Down Expand Up @@ -852,7 +851,7 @@ class TableSorter {
const RecordBatchVector batches_;
const SortOptions& options_;
const NullPlacement null_placement_;
const ::arrow::internal::ChunkResolver left_resolver_, right_resolver_;
const ::arrow::ChunkResolver left_resolver_, right_resolver_;
const std::vector<ResolvedSortKey> sort_keys_;
uint64_t* indices_begin_;
uint64_t* indices_end_;
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/compute/kernels/vector_sort_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -749,9 +749,9 @@ struct ResolvedTableSortKey {
order(order),
null_count(null_count) {}

using LocationType = ::arrow::internal::ChunkLocation;
using LocationType = ::arrow::ChunkLocation;

ResolvedChunk GetChunk(::arrow::internal::ChunkLocation loc) const {
ResolvedChunk GetChunk(::arrow::ChunkLocation loc) const {
return {chunks[loc.chunk_index], loc.index_in_chunk};
}

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/stl_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ class ChunkedArrayIterator {
}

private:
arrow::internal::ChunkLocation GetChunkLocation(int64_t index) const {
arrow::ChunkLocation GetChunkLocation(int64_t index) const {
assert(chunked_array_);
return chunked_array_->chunk_resolver_.Resolve(index);
}
Expand Down
14 changes: 14 additions & 0 deletions docs/source/cpp/api/array.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
Arrays
======

Base classes
============

.. doxygenclass:: arrow::ArrayData
:project: arrow_cpp
:members:
Expand Down Expand Up @@ -85,6 +88,17 @@ Chunked Arrays
:project: arrow_cpp
:members:

.. doxygenstruct:: arrow::ChunkLocation
:project: arrow_cpp
:members:

.. doxygenstruct:: arrow::TypedChunkLocation
:project: arrow_cpp
:members:

.. doxygenclass:: arrow::ChunkResolver
:project: arrow_cpp
:members:

Utilities
=========
Expand Down
14 changes: 10 additions & 4 deletions r/src/altrep.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,20 +80,26 @@ void DeletePointer(std::shared_ptr<T>* ptr) {
template <typename T>
using Pointer = cpp11::external_pointer<std::shared_ptr<T>, DeletePointer<T>>;

#if ARROW_VERSION_MAJOR >= 18
using ChunkResolver = arrow::ChunkResolver;
using ChunkLocation = arrow::ChunkLocation;
#else
using ChunkResolver = arrow::internal::ChunkResolver;
using ChunkLocation = arrow::internal::ChunkLocation;
#endif

class ArrowAltrepData {
public:
explicit ArrowAltrepData(const std::shared_ptr<ChunkedArray>& chunked_array)
: chunked_array_(chunked_array), resolver_(chunked_array->chunks()) {}

const std::shared_ptr<ChunkedArray>& chunked_array() { return chunked_array_; }

arrow::internal::ChunkLocation locate(int64_t index) {
return resolver_.Resolve(index);
}
ChunkLocation locate(int64_t index) { return resolver_.Resolve(index); }

private:
std::shared_ptr<ChunkedArray> chunked_array_;
arrow::internal::ChunkResolver resolver_;
ChunkResolver resolver_;
};

// the ChunkedArray that is being wrapped by the altrep object
Expand Down

0 comments on commit 8208774

Please sign in to comment.