diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index ecb0df20b1947..400ede9cfa416 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "arrow/type_fwd.h" @@ -141,13 +142,15 @@ struct ARROW_EXPORT ChunkResolver { /// \brief Resolve `n` logical indices to chunk indices. /// - /// \pre logical_index_vec[i] < n (for valid chunk index results) - /// \pre out_chunk_index_vec has space for `n` elements + /// \pre 0 <= logical_index_vec[i] < n (for well-defined and valid chunk index results) + /// \pre out_chunk_index_vec has space for `n_indices` /// \post chunk_hint in [0, chunks.size()] /// \post out_chunk_index_vec[i] in [0, chunks.size()] for i in [0, n) /// \post if logical_index_vec[i] >= chunked_array.length(), then /// out_chunk_index_vec[i] == chunks.size() - /// and out_index_in_chunk_vec[i] is undefined (can be out-of-bounds) + /// and out_index_in_chunk_vec[i] is UNDEFINED (can be out-of-bounds) + /// \post if logical_index_vec[i] < 0, then both out_chunk_index_vec[i] and + /// out_index_in_chunk_vec[i] are UNDEFINED /// /// \param n_indices The number of logical indices to resolve /// \param logical_index_vec The logical indices to resolve @@ -160,7 +163,6 @@ struct ARROW_EXPORT ChunkResolver { [[nodiscard]] bool ResolveMany(int64_t n_indices, const IndexType* logical_index_vec, IndexType* out_chunk_index_vec, IndexType chunk_hint = 0, IndexType* out_index_in_chunk_vec = NULLPTR) const { - static_assert(std::is_unsigned_v); if constexpr (sizeof(IndexType) < sizeof(uint64_t)) { // The max value returned by Bisect is `offsets.size() - 1` (= chunks.size()). constexpr uint64_t kMaxIndexTypeValue = std::numeric_limits::max(); @@ -174,8 +176,23 @@ struct ARROW_EXPORT ChunkResolver { // Since an index-in-chunk cannot possibly exceed the logical index being // queried, we don't have to worry about these values not fitting on IndexType. } - ResolveManyImpl(n_indices, logical_index_vec, out_chunk_index_vec, chunk_hint, - out_index_in_chunk_vec); + if constexpr (std::is_signed_v) { + // We interpret signed integers as unsigned and avoid having to generate double + // the amount of binary code to handle each integer width. + // + // Negative logical indices can become large values when cast to unsigned, but + // they are gracefully handled by ResolveManyImpl. Although both the chunk index + // and the index in chunk values will be undefined in these cases. + using U = std::make_unsigned_t; + ResolveManyImpl(n_indices, reinterpret_cast(logical_index_vec), + reinterpret_cast(out_chunk_index_vec), + static_cast(chunk_hint), + reinterpret_cast(out_index_in_chunk_vec)); + } else { + static_assert(std::is_unsigned_v); + ResolveManyImpl(n_indices, logical_index_vec, out_chunk_index_vec, chunk_hint, + out_index_in_chunk_vec); + } return true; } diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index f146b43b5a84c..af1feae538b16 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -316,7 +316,8 @@ TEST_F(TestChunkedArray, GetScalar) { // ChunkResolver tests -using IndexTypes = ::testing::Types; +using IndexTypes = ::testing::Types; TEST(TestChunkResolver, Resolve) { ChunkResolver empty(std::vector({0})); // [] @@ -444,22 +445,43 @@ class TestChunkResolverMany : public ::testing::Test { for (size_t i = 0; i < logical_index_vec.size(); i++) { ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks()); } + + if constexpr (std::is_signed_v) { + std::vector logical_index_vec = {-1, -2, -3, -4, -5, -127 - 1}; + ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, logical_index_vec)); + EXPECT_EQ(logical_index_vec.size(), locations.size()); + for (size_t i = 0; i < logical_index_vec.size(); i++) { + using U = std::make_unsigned_t; + auto unsigned_logical_index = static_cast(logical_index_vec[i]); + // When negative values are passed, the results should be treated as + // undefined because they are the result of resolving whathever the + // logical index becomes when interpreted as an unsigned integer. + auto expected = resolver.Resolve(static_cast(unsigned_logical_index)); + if (expected.chunk_index == resolver.num_chunks()) { + ASSERT_EQ(locations[i].chunk_index, expected.chunk_index); + } else { + ASSERT_EQ(locations[i], expected); + } + } + } } void TestOverflow() { - std::vector logical_index_vec = {0, 1, 2, 127, 255}; + const int64_t kMaxIndex = std::is_signed_v ? 127 : 255; + std::vector logical_index_vec = {0, 1, 2, + static_cast(kMaxIndex)}; - // Offsets are rare because to even make it possible, we need more chunks + // Overflows are rare because to make them possible, we need more chunks // than logical elements in the ChunkedArray. That requires at least one // empty chunk. std::vector offsets; - for (int64_t i = 0; i < 256; i++) { + for (int64_t i = 0; i <= kMaxIndex; i++) { offsets.push_back(i); } ChunkResolver resolver{offsets}; ASSERT_OK(ResolveMany(resolver, logical_index_vec)); - offsets.push_back(255); // adding an empty chunk + offsets.push_back(kMaxIndex); // adding an empty chunk ChunkResolver resolver_with_empty{offsets}; if (sizeof(IndexType) == 1) { ASSERT_NOT_OK(ResolveMany(resolver_with_empty, logical_index_vec));