diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 2420270f3ab..1f249db470e 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -674,12 +674,22 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const ARROW_DCHECK_LT(static_cast(count) * 8, internal::max_size_for_v); // Count Already divided by 8 for byte size calculations - const auto bytes_read = header_bytes + static_cast(count) * value_bit_width_; + auto bytes_read = header_bytes + static_cast(count) * value_bit_width_; + auto values_count = static_cast(count * 8); if (ARROW_PREDICT_FALSE(bytes_read > data_size_)) { - // Bit-packed run would overflow data buffer - return {0, ControlFlow::Break}; + // Bit-packed run would overflow data buffer, but we might still be able + // to return a truncated bit-packed run, such as those generated by some non-compliant + // encoders. + // Example in GH-47981: column contains 25 5-bit values, has a single + // bit-packed run with count=4 (theoretically 32 values), but only 17 + // bytes of RLE-bit-packed data (including the one-byte header). 
+ bytes_read = data_size_; + values_count = + static_cast((bytes_read - header_bytes) * 8 / value_bit_width_); + if (values_count < 1) { + return {0, ControlFlow::Break}; + } } - const auto values_count = static_cast(count * 8); auto control = handler.OnBitPackedRun( BitPackedRun(data_ + header_bytes, values_count, value_bit_width_)); @@ -1210,7 +1220,8 @@ auto RleBitPackedDecoder::GetBatchWithDict(const V* dictionary, rle_size_t batch_size) -> rle_size_t { using ControlFlow = RleBitPackedParser::ControlFlow; - if (ARROW_PREDICT_FALSE(batch_size <= 0)) { + if (ARROW_PREDICT_FALSE(batch_size <= 0 || dictionary_length == 0)) { + // Either empty batch or invalid dictionary return 0; } @@ -1279,6 +1290,17 @@ auto RleBitPackedDecoder::GetBatchWithDictSpaced( if (null_count == 0) { return GetBatchWithDict(dictionary, dictionary_length, out, batch_size); } + if (null_count == batch_size) { + // All nulls, avoid instantiating DictionaryConverter as dictionary_length + // could be 0. + std::fill(out, out + batch_size, V{}); + return batch_size; + } + if (ARROW_PREDICT_FALSE(batch_size <= 0 || dictionary_length == 0)) { + // Either empty batch or invalid dictionary + return 0; + } + internal::DictionaryConverter converter{dictionary, dictionary_length}; return GetSpaced(converter, out, batch_size, valid_bits, valid_bits_offset, null_count);