Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 27 additions & 5 deletions cpp/src/arrow/util/rle_encoding_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -674,12 +674,22 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const
ARROW_DCHECK_LT(static_cast<uint64_t>(count) * 8,
internal::max_size_for_v<rle_size_t>);
// Count is already divided by 8 for byte size calculations
const auto bytes_read = header_bytes + static_cast<int64_t>(count) * value_bit_width_;
auto bytes_read = header_bytes + static_cast<int64_t>(count) * value_bit_width_;
auto values_count = static_cast<rle_size_t>(count * 8);
if (ARROW_PREDICT_FALSE(bytes_read > data_size_)) {
// Bit-packed run would overflow data buffer
return {0, ControlFlow::Break};
// Bit-packed run would overflow data buffer, but we might still be able
// to return a truncated bit-packed run, such as those generated by some
// non-compliant encoders.
// Example in GH-47981: column contains 25 5-bit values, has a single
// bit-packed run with count=4 (theoretically 32 values), but only 17
// bytes of RLE-bit-packed data (including the one-byte header).
bytes_read = data_size_;
values_count =
static_cast<rle_size_t>((bytes_read - header_bytes) * 8 / value_bit_width_);
if (values_count < 1) {
return {0, ControlFlow::Break};
}
}
const auto values_count = static_cast<rle_size_t>(count * 8);

auto control = handler.OnBitPackedRun(
BitPackedRun(data_ + header_bytes, values_count, value_bit_width_));
Expand Down Expand Up @@ -1210,7 +1220,8 @@ auto RleBitPackedDecoder<T>::GetBatchWithDict(const V* dictionary,
rle_size_t batch_size) -> rle_size_t {
using ControlFlow = RleBitPackedParser::ControlFlow;

if (ARROW_PREDICT_FALSE(batch_size <= 0)) {
if (ARROW_PREDICT_FALSE(batch_size <= 0 || dictionary_length == 0)) {
// Either empty batch or invalid dictionary
return 0;
}

Expand Down Expand Up @@ -1279,6 +1290,17 @@ auto RleBitPackedDecoder<T>::GetBatchWithDictSpaced(
if (null_count == 0) {
return GetBatchWithDict<V>(dictionary, dictionary_length, out, batch_size);
}
if (null_count == batch_size) {
// All nulls, avoid instantiating DictionaryConverter as dictionary_length
// could be 0.
std::fill(out, out + batch_size, V{});
return batch_size;
}
if (ARROW_PREDICT_FALSE(batch_size <= 0 || dictionary_length == 0)) {
// Either empty batch or invalid dictionary
return 0;
}

internal::DictionaryConverter<V, value_type> converter{dictionary, dictionary_length};

return GetSpaced(converter, out, batch_size, valid_bits, valid_bits_offset, null_count);
Expand Down
Loading