diff --git a/cpp/src/arrow/compute/kernels/chunked_internal.cc b/cpp/src/arrow/compute/kernels/chunked_internal.cc
index 187590a4b705d..8f72acc878162 100644
--- a/cpp/src/arrow/compute/kernels/chunked_internal.cc
+++ b/cpp/src/arrow/compute/kernels/chunked_internal.cc
@@ -49,28 +49,28 @@ std::vector<int64_t> ChunkedIndexMapper::GetChunkLengths(
   return chunk_lengths;
 }
 
-Result<std::pair<ResolvedChunkIndex*, ResolvedChunkIndex*>>
+Result<std::pair<CompressedChunkLocation*, CompressedChunkLocation*>>
 ChunkedIndexMapper::LogicalToPhysical() {
-  // Check that indices would fall in bounds for ResolvedChunkIndex
+  // Check that indices would fall in bounds for CompressedChunkLocation
   if (ARROW_PREDICT_FALSE(chunk_lengths_.size() >
-                          ResolvedChunkIndex::kMaxChunkIndex + 1)) {
+                          CompressedChunkLocation::kMaxChunkIndex + 1)) {
     return Status::NotImplemented("Chunked array has more than ",
-                                  ResolvedChunkIndex::kMaxChunkIndex + 1, " chunks");
+                                  CompressedChunkLocation::kMaxChunkIndex + 1, " chunks");
   }
   for (int64_t chunk_length : chunk_lengths_) {
     if (ARROW_PREDICT_FALSE(static_cast<uint64_t>(chunk_length) >
-                            ResolvedChunkIndex::kMaxIndexInChunk + 1)) {
+                            CompressedChunkLocation::kMaxIndexInChunk + 1)) {
       return Status::NotImplemented("Individual chunk in chunked array has more than ",
-                                    ResolvedChunkIndex::kMaxIndexInChunk + 1,
+                                    CompressedChunkLocation::kMaxIndexInChunk + 1,
                                     " elements");
     }
   }
 
   const int64_t num_indices = static_cast<int64_t>(indices_end_ - indices_begin_);
-  ResolvedChunkIndex* physical_begin =
-      reinterpret_cast<ResolvedChunkIndex*>(indices_begin_);
+  CompressedChunkLocation* physical_begin =
+      reinterpret_cast<CompressedChunkLocation*>(indices_begin_);
   DCHECK_EQ(physical_begin + num_indices,
-            reinterpret_cast<ResolvedChunkIndex*>(indices_end_));
+            reinterpret_cast<CompressedChunkLocation*>(indices_end_));
 
   int64_t chunk_offset = 0;
   for (int64_t chunk_index = 0; chunk_index < static_cast<int64_t>(chunk_lengths_.size());
@@ -80,7 +80,7 @@ ChunkedIndexMapper::LogicalToPhysical() {
       DCHECK_GE(indices_begin_[chunk_offset + i], static_cast<uint64_t>(chunk_offset));
       DCHECK_LT(indices_begin_[chunk_offset + i],
                 static_cast<uint64_t>(chunk_offset + chunk_length));
-      physical_begin[chunk_offset + i] = ResolvedChunkIndex{
+      physical_begin[chunk_offset + i] = CompressedChunkLocation{
          static_cast<uint64_t>(chunk_index),
          indices_begin_[chunk_offset + i] - static_cast<uint64_t>(chunk_offset)};
     }
@@ -101,8 +101,8 @@ Status ChunkedIndexMapper::PhysicalToLogical() {
   }
 
   const int64_t num_indices = static_cast<int64_t>(indices_end_ - indices_begin_);
-  ResolvedChunkIndex* physical_begin =
-      reinterpret_cast<ResolvedChunkIndex*>(indices_begin_);
+  CompressedChunkLocation* physical_begin =
+      reinterpret_cast<CompressedChunkLocation*>(indices_begin_);
   for (int64_t i = 0; i < num_indices; ++i) {
     const auto loc = physical_begin[i];
     DCHECK_LT(loc.chunk_index(), chunk_offsets.size());
diff --git a/cpp/src/arrow/compute/kernels/chunked_internal.h b/cpp/src/arrow/compute/kernels/chunked_internal.h
index b317ee4477b54..5bc8233016f30 100644
--- a/cpp/src/arrow/compute/kernels/chunked_internal.h
+++ b/cpp/src/arrow/compute/kernels/chunked_internal.h
@@ -60,19 +60,20 @@ struct ResolvedChunk {
 // The goal of compression is to make it fit in 64 bits, allowing in place
 // replacement of logical uint64_t indices with physical indices.
 // (see ChunkedIndexMapper)
-struct ResolvedChunkIndex {
+struct CompressedChunkLocation {
   static constexpr int kChunkIndexBits = 24;
   static constexpr int KIndexInChunkBits = 64 - kChunkIndexBits;
   static constexpr uint64_t kMaxChunkIndex = (1ULL << kChunkIndexBits) - 1;
   static constexpr uint64_t kMaxIndexInChunk = (1ULL << KIndexInChunkBits) - 1;
 
-  ResolvedChunkIndex() = default;
+  CompressedChunkLocation() = default;
 
   constexpr uint64_t chunk_index() const { return data_ & kMaxChunkIndex; }
   constexpr uint64_t index_in_chunk() const { return data_ >> kChunkIndexBits; }
 
-  explicit constexpr ResolvedChunkIndex(uint64_t chunk_index, uint64_t index_in_chunk)
+  explicit constexpr CompressedChunkLocation(uint64_t chunk_index,
+                                             uint64_t index_in_chunk)
       : data_((index_in_chunk << kChunkIndexBits) | chunk_index) {}
 
   template
@@ -85,7 +86,7 @@ struct ResolvedChunkIndex {
   uint64_t data_;
 };
 
-static_assert(sizeof(uint64_t) == sizeof(ResolvedChunkIndex));
+static_assert(sizeof(uint64_t) == sizeof(CompressedChunkLocation));
 
 class ChunkedArrayResolver {
  private:
@@ -144,7 +145,8 @@ class ChunkedIndexMapper {
   // PhysicalToLogical() is called.
   //
   // This assumes that the logical indices are originally chunk-partitioned.
-  Result<std::pair<ResolvedChunkIndex*, ResolvedChunkIndex*>> LogicalToPhysical();
+  Result<std::pair<CompressedChunkLocation*, CompressedChunkLocation*>>
+  LogicalToPhysical();
 
   // Turn the physical indices back into logical, making the uint64_t indices
   // usable again.
diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc
index b4f972a172257..9185bfb72477f 100644
--- a/cpp/src/arrow/compute/kernels/vector_sort.cc
+++ b/cpp/src/arrow/compute/kernels/vector_sort.cc
@@ -118,18 +118,18 @@ class ChunkedArraySorter : public TypeVisitor {
                              sorted[i], indices_begin_, chunked_indices_begin);
       }
 
-      auto merge_nulls = [&](ResolvedChunkIndex* nulls_begin,
-                             ResolvedChunkIndex* nulls_middle,
-                             ResolvedChunkIndex* nulls_end,
-                             ResolvedChunkIndex* temp_indices, int64_t null_count) {
+      auto merge_nulls = [&](CompressedChunkLocation* nulls_begin,
+                             CompressedChunkLocation* nulls_middle,
+                             CompressedChunkLocation* nulls_end,
+                             CompressedChunkLocation* temp_indices, int64_t null_count) {
        if (has_null_like_values::value) {
          PartitionNullsOnly(nulls_begin, nulls_end, arrays, null_count,
                             null_placement_);
        }
      };
      auto merge_non_nulls =
-          [&](ResolvedChunkIndex* range_begin, ResolvedChunkIndex* range_middle,
-              ResolvedChunkIndex* range_end, ResolvedChunkIndex* temp_indices) {
+          [&](CompressedChunkLocation* range_begin, CompressedChunkLocation* range_middle,
+              CompressedChunkLocation* range_end, CompressedChunkLocation* temp_indices) {
            MergeNonNulls(range_begin, range_middle, range_end, arrays,
                          temp_indices);
          };
@@ -175,20 +175,21 @@ class ChunkedArraySorter : public TypeVisitor {
   }
 
   template
-  void MergeNonNulls(ResolvedChunkIndex* range_begin, ResolvedChunkIndex* range_middle,
-                     ResolvedChunkIndex* range_end, span arrays,
-                     ResolvedChunkIndex* temp_indices) {
+  void MergeNonNulls(CompressedChunkLocation* range_begin,
+                     CompressedChunkLocation* range_middle,
+                     CompressedChunkLocation* range_end, span arrays,
+                     CompressedChunkLocation* temp_indices) {
     using ArrowType = typename ArrayType::TypeClass;
     if (order_ == SortOrder::Ascending) {
       std::merge(range_begin, range_middle, range_middle, range_end, temp_indices,
-                 [&](ResolvedChunkIndex left, ResolvedChunkIndex right) {
+                 [&](CompressedChunkLocation left, CompressedChunkLocation right) {
                    return ChunkValue(arrays, left) <
                           ChunkValue(arrays, right);
                  });
     } else {
       std::merge(range_begin, range_middle, range_middle, range_end,
                  temp_indices,
-                 [&](ResolvedChunkIndex left, ResolvedChunkIndex right) {
+                 [&](CompressedChunkLocation left, CompressedChunkLocation right) {
                    // We don't use 'left > right' here to reduce required
                    // operator. If we use 'right < left' here, '<' is only
                    // required.
@@ -201,7 +202,7 @@ class ChunkedArraySorter : public TypeVisitor {
   }
 
   template
-  auto ChunkValue(span arrays, ResolvedChunkIndex loc) const {
+  auto ChunkValue(span arrays, CompressedChunkLocation loc) const {
     return ResolvedChunk(arrays[loc.chunk_index()],
                          static_cast<int64_t>(loc.index_in_chunk()))
        .template Value();
  }
@@ -744,16 +745,16 @@ class TableSorter {
   template
   Status MergeInternal(std::vector* sorted,
                        int64_t null_count) {
-    auto merge_nulls = [&](ResolvedChunkIndex* nulls_begin,
-                           ResolvedChunkIndex* nulls_middle,
-                           ResolvedChunkIndex* nulls_end,
-                           ResolvedChunkIndex* temp_indices, int64_t null_count) {
+    auto merge_nulls = [&](CompressedChunkLocation* nulls_begin,
+                           CompressedChunkLocation* nulls_middle,
+                           CompressedChunkLocation* nulls_end,
+                           CompressedChunkLocation* temp_indices, int64_t null_count) {
      MergeNulls(nulls_begin, nulls_middle, nulls_end, temp_indices,
                 null_count);
    };
    auto merge_non_nulls =
-        [&](ResolvedChunkIndex* range_begin, ResolvedChunkIndex* range_middle,
-            ResolvedChunkIndex* range_end, ResolvedChunkIndex* temp_indices) {
+        [&](CompressedChunkLocation* range_begin, CompressedChunkLocation* range_middle,
+            CompressedChunkLocation* range_end, CompressedChunkLocation* temp_indices) {
          MergeNonNulls(range_begin, range_middle, range_end, temp_indices);
        };
 
@@ -779,16 +780,17 @@ class TableSorter {
   }
 
   template
-  void MergeNulls(ResolvedChunkIndex* nulls_begin, ResolvedChunkIndex* nulls_middle,
-                  ResolvedChunkIndex* nulls_end, ResolvedChunkIndex* temp_indices,
-                  int64_t null_count) {
+  void MergeNulls(CompressedChunkLocation* nulls_begin,
+                  CompressedChunkLocation* nulls_middle,
+                  CompressedChunkLocation* nulls_end,
+                  CompressedChunkLocation* temp_indices, int64_t null_count) {
     if constexpr (has_null_like_values::value) {
       // Merge rows with a null or a null-like in the first sort key
       auto& comparator = comparator_;
       const auto& first_sort_key = sort_keys_[0];
 
       std::merge(nulls_begin, nulls_middle, nulls_middle, nulls_end, temp_indices,
-                 [&](ResolvedChunkIndex left, ResolvedChunkIndex right) {
+                 [&](CompressedChunkLocation left, CompressedChunkLocation right) {
                    // First column is either null or nan
                    const auto left_loc = ChunkLocation{left};
                    const auto right_loc = ChunkLocation{right};
@@ -811,14 +813,15 @@ class TableSorter {
     }
   }
 
-  void MergeNullsOnly(ResolvedChunkIndex* nulls_begin, ResolvedChunkIndex* nulls_middle,
-                      ResolvedChunkIndex* nulls_end, ResolvedChunkIndex* temp_indices,
-                      int64_t null_count) {
+  void MergeNullsOnly(CompressedChunkLocation* nulls_begin,
+                      CompressedChunkLocation* nulls_middle,
+                      CompressedChunkLocation* nulls_end,
+                      CompressedChunkLocation* temp_indices, int64_t null_count) {
     // Untyped implementation
    auto& comparator = comparator_;
 
    std::merge(nulls_begin, nulls_middle, nulls_middle, nulls_end, temp_indices,
-               [&](ResolvedChunkIndex left, ResolvedChunkIndex right) {
+               [&](CompressedChunkLocation left, CompressedChunkLocation right) {
                  // First column is always null
                  return comparator.Compare(ChunkLocation{left}, ChunkLocation{right}, 1);
                });
  }
@@ -831,13 +834,13 @@ class TableSorter {
   //
   template
   enable_if_t::value> MergeNonNulls(
-      ResolvedChunkIndex* range_begin, ResolvedChunkIndex* range_middle,
-      ResolvedChunkIndex* range_end, ResolvedChunkIndex* temp_indices) {
+      CompressedChunkLocation* range_begin, CompressedChunkLocation* range_middle,
+      CompressedChunkLocation* range_end, CompressedChunkLocation* temp_indices) {
     auto& comparator = comparator_;
     const auto& first_sort_key = sort_keys_[0];
 
     std::merge(range_begin, range_middle, range_middle, range_end, temp_indices,
-               [&](ResolvedChunkIndex left, ResolvedChunkIndex right) {
+               [&](CompressedChunkLocation left, CompressedChunkLocation right) {
                  // Both values are never null nor NaN.
                  const auto left_loc = ChunkLocation{left};
                  const auto right_loc = ChunkLocation{right};
@@ -867,10 +870,10 @@ class TableSorter {
   }
 
   template
-  enable_if_null MergeNonNulls(ResolvedChunkIndex* range_begin,
-                               ResolvedChunkIndex* range_middle,
-                               ResolvedChunkIndex* range_end,
-                               ResolvedChunkIndex* temp_indices) {
+  enable_if_null MergeNonNulls(CompressedChunkLocation* range_begin,
+                               CompressedChunkLocation* range_middle,
+                               CompressedChunkLocation* range_end,
+                               CompressedChunkLocation* temp_indices) {
     const int64_t null_count = range_end - range_begin;
     MergeNullsOnly(range_begin, range_middle, range_end, temp_indices, null_count);
   }
diff --git a/cpp/src/arrow/compute/kernels/vector_sort_internal.h b/cpp/src/arrow/compute/kernels/vector_sort_internal.h
index 119b65537a70e..e7ff870782c8a 100644
--- a/cpp/src/arrow/compute/kernels/vector_sort_internal.h
+++ b/cpp/src/arrow/compute/kernels/vector_sort_internal.h
@@ -209,7 +209,7 @@ struct GenericNullPartitionResult {
 };
 
 using NullPartitionResult = GenericNullPartitionResult<uint64_t>;
-using ChunkedNullPartitionResult = GenericNullPartitionResult<ResolvedChunkIndex>;
+using ChunkedNullPartitionResult = GenericNullPartitionResult<CompressedChunkLocation>;
 
 // Move nulls (not null-like values) to end of array.
 //
@@ -289,7 +289,7 @@ NullPartitionResult PartitionNulls(uint64_t* indices_begin, uint64_t* indices_en
 //
 // Null partitioning on chunked arrays, in two flavors:
 // 1) with uint64_t indices and ChunkedArrayResolver
-// 2) with ResolvedChunkIndex and span of chunks
+// 2) with CompressedChunkLocation and span of chunks
 //
 
 template
@@ -316,8 +316,8 @@ NullPartitionResult PartitionNullsOnly(uint64_t* indices_begin, uint64_t* indice
 }
 
 template
-ChunkedNullPartitionResult PartitionNullsOnly(ResolvedChunkIndex* locations_begin,
-                                              ResolvedChunkIndex* locations_end,
+ChunkedNullPartitionResult PartitionNullsOnly(CompressedChunkLocation* locations_begin,
+                                              CompressedChunkLocation* locations_end,
                                               util::span chunks,
                                               int64_t null_count,
                                               NullPlacement null_placement) {
@@ -328,7 +328,7 @@ ChunkedNullPartitionResult PartitionNullsOnly(ResolvedChunkIndex* locations_begi
   Partitioner partitioner;
   if (null_placement == NullPlacement::AtStart) {
     auto nulls_end =
-        partitioner(locations_begin, locations_end, [&](ResolvedChunkIndex loc) {
+        partitioner(locations_begin, locations_end, [&](CompressedChunkLocation loc) {
          return chunks[loc.chunk_index()]->IsNull(
              static_cast<int64_t>(loc.index_in_chunk()));
        });
@@ -336,7 +336,7 @@ ChunkedNullPartitionResult PartitionNullsOnly(ResolvedChunkIndex* locations_begi
                                                   nulls_end);
   } else {
     auto nulls_begin =
-        partitioner(locations_begin, locations_end, [&](ResolvedChunkIndex loc) {
+        partitioner(locations_begin, locations_end, [&](CompressedChunkLocation loc) {
          return !chunks[loc.chunk_index()]->IsNull(
              static_cast<int64_t>(loc.index_in_chunk()));
        });
@@ -501,7 +501,8 @@ struct GenericMergeImpl {
 };
 
 using MergeImpl = GenericMergeImpl<uint64_t, NullPartitionResult>;
-using ChunkedMergeImpl = GenericMergeImpl<ResolvedChunkIndex, ChunkedNullPartitionResult>;
+using ChunkedMergeImpl =
+    GenericMergeImpl<CompressedChunkLocation, ChunkedNullPartitionResult>;
 
 // TODO make this usable if indices are non trivial on input
 // (see ConcreteRecordBatchColumnSorter)
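
Note (reviewer sketch, not part of the patch): the renamed CompressedChunkLocation packs the chunk index into the low 24 bits of a uint64_t and the index within that chunk into the remaining 40 bits. Because the packed struct is exactly 64 bits wide, LogicalToPhysical() can reinterpret the logical uint64_t index buffer in place, and it returns NotImplemented when the input has more than 2^24 chunks or a chunk with more than 2^40 elements. The standalone sketch below mirrors that scheme under those assumptions; PackedLocation and main() are illustrative names, not Arrow code.

#include <cassert>
#include <cstdint>

// Illustrative packed location: low 24 bits = chunk index,
// high 40 bits = index within that chunk (mirrors the constants in the diff).
struct PackedLocation {
  static constexpr int kChunkIndexBits = 24;
  static constexpr uint64_t kMaxChunkIndex = (1ULL << kChunkIndexBits) - 1;

  uint64_t data;

  static constexpr PackedLocation Make(uint64_t chunk_index, uint64_t index_in_chunk) {
    return {(index_in_chunk << kChunkIndexBits) | chunk_index};
  }
  constexpr uint64_t chunk_index() const { return data & kMaxChunkIndex; }
  constexpr uint64_t index_in_chunk() const { return data >> kChunkIndexBits; }
};

// Same size as uint64_t, so an index buffer can be rewritten in place.
static_assert(sizeof(PackedLocation) == sizeof(uint64_t));

int main() {
  // Pack a (chunk_index, index_in_chunk) pair and read it back.
  PackedLocation loc = PackedLocation::Make(/*chunk_index=*/3, /*index_in_chunk=*/1000);
  assert(loc.chunk_index() == 3);
  assert(loc.index_in_chunk() == 1000);
  return 0;
}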