Skip to content

Commit

Permalink
ChunkResolver: Handle vectors of signed logical indices
Browse files Browse the repository at this point in the history
  • Loading branch information
felipecrv committed May 14, 2024
1 parent dbfc796 commit ce7840b
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 11 deletions.
29 changes: 23 additions & 6 deletions cpp/src/arrow/chunk_resolver.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <cassert>
#include <cstdint>
#include <limits>
#include <type_traits>
#include <vector>

#include "arrow/type_fwd.h"
Expand Down Expand Up @@ -141,13 +142,15 @@ struct ARROW_EXPORT ChunkResolver {

/// \brief Resolve `n` logical indices to chunk indices.
///
/// \pre logical_index_vec[i] < n (for valid chunk index results)
/// \pre out_chunk_index_vec has space for `n` elements
/// \pre 0 <= logical_index_vec[i] < n (for well-defined and valid chunk index results)
/// \pre out_chunk_index_vec has space for `n_indices`
/// \post chunk_hint in [0, chunks.size()]
/// \post out_chunk_index_vec[i] in [0, chunks.size()] for i in [0, n)
/// \post if logical_index_vec[i] >= chunked_array.length(), then
/// out_chunk_index_vec[i] == chunks.size()
/// and out_index_in_chunk_vec[i] is undefined (can be out-of-bounds)
/// and out_index_in_chunk_vec[i] is UNDEFINED (can be out-of-bounds)
/// \post if logical_index_vec[i] < 0, then both out_chunk_index_vec[i] and
/// out_index_in_chunk_vec[i] are UNDEFINED
///
/// \param n_indices The number of logical indices to resolve
/// \param logical_index_vec The logical indices to resolve
Expand All @@ -160,7 +163,6 @@ struct ARROW_EXPORT ChunkResolver {
[[nodiscard]] bool ResolveMany(int64_t n_indices, const IndexType* logical_index_vec,
IndexType* out_chunk_index_vec, IndexType chunk_hint = 0,
IndexType* out_index_in_chunk_vec = NULLPTR) const {
static_assert(std::is_unsigned_v<IndexType>);
if constexpr (sizeof(IndexType) < sizeof(uint64_t)) {
// The max value returned by Bisect is `offsets.size() - 1` (= chunks.size()).
constexpr uint64_t kMaxIndexTypeValue = std::numeric_limits<IndexType>::max();
Expand All @@ -174,8 +176,23 @@ struct ARROW_EXPORT ChunkResolver {
// Since an index-in-chunk cannot possibly exceed the logical index being
// queried, we don't have to worry about these values not fitting on IndexType.
}
ResolveManyImpl(n_indices, logical_index_vec, out_chunk_index_vec, chunk_hint,
out_index_in_chunk_vec);
if constexpr (std::is_signed_v<IndexType>) {
// We interpret signed integers as unsigned and avoid having to generate double
// the amount of binary code to handle each integer width.
//
// Negative logical indices can become large values when cast to unsigned, but
// they are gracefully handled by ResolveManyImpl. Although both the chunk index
// and the index in chunk values will be undefined in these cases.
using U = std::make_unsigned_t<IndexType>;
ResolveManyImpl(n_indices, reinterpret_cast<const U*>(logical_index_vec),
reinterpret_cast<U*>(out_chunk_index_vec),
static_cast<U>(chunk_hint),
reinterpret_cast<U*>(out_index_in_chunk_vec));
} else {
static_assert(std::is_unsigned_v<IndexType>);
ResolveManyImpl(n_indices, logical_index_vec, out_chunk_index_vec, chunk_hint,
out_index_in_chunk_vec);
}
return true;
}

Expand Down
32 changes: 27 additions & 5 deletions cpp/src/arrow/chunked_array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,8 @@ TEST_F(TestChunkedArray, GetScalar) {

// ChunkResolver tests

using IndexTypes = ::testing::Types<uint8_t, uint16_t, uint32_t, uint64_t>;
using IndexTypes = ::testing::Types<uint8_t, uint16_t, uint32_t, uint64_t, int8_t,
int16_t, int32_t, int64_t>;

TEST(TestChunkResolver, Resolve) {
ChunkResolver empty(std::vector<int64_t>({0})); // []
Expand Down Expand Up @@ -444,22 +445,43 @@ class TestChunkResolverMany : public ::testing::Test {
for (size_t i = 0; i < logical_index_vec.size(); i++) {
ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks());
}

if constexpr (std::is_signed_v<IndexType>) {
std::vector<IndexType> logical_index_vec = {-1, -2, -3, -4, -5, -127 - 1};
ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, logical_index_vec));
EXPECT_EQ(logical_index_vec.size(), locations.size());
for (size_t i = 0; i < logical_index_vec.size(); i++) {
using U = std::make_unsigned_t<IndexType>;
auto unsigned_logical_index = static_cast<U>(logical_index_vec[i]);
// When negative values are passed, the results should be treated as
// undefined because they are the result of resolving whathever the
// logical index becomes when interpreted as an unsigned integer.
auto expected = resolver.Resolve(static_cast<int64_t>(unsigned_logical_index));
if (expected.chunk_index == resolver.num_chunks()) {
ASSERT_EQ(locations[i].chunk_index, expected.chunk_index);
} else {
ASSERT_EQ(locations[i], expected);
}
}
}
}

void TestOverflow() {
std::vector<IndexType> logical_index_vec = {0, 1, 2, 127, 255};
const int64_t kMaxIndex = std::is_signed_v<IndexType> ? 127 : 255;
std::vector<IndexType> logical_index_vec = {0, 1, 2,
static_cast<IndexType>(kMaxIndex)};

// Offsets are rare because to even make it possible, we need more chunks
// Overflows are rare because to make them possible, we need more chunks
// than logical elements in the ChunkedArray. That requires at least one
// empty chunk.
std::vector<int64_t> offsets;
for (int64_t i = 0; i < 256; i++) {
for (int64_t i = 0; i <= kMaxIndex; i++) {
offsets.push_back(i);
}
ChunkResolver resolver{offsets};
ASSERT_OK(ResolveMany(resolver, logical_index_vec));

offsets.push_back(255); // adding an empty chunk
offsets.push_back(kMaxIndex); // adding an empty chunk
ChunkResolver resolver_with_empty{offsets};
if (sizeof(IndexType) == 1) {
ASSERT_NOT_OK(ResolveMany(resolver_with_empty, logical_index_vec));
Expand Down

0 comments on commit ce7840b

Please sign in to comment.