From 1102eca04f36ee45e84d97648d61785718d5ce33 Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 7 May 2024 16:23:56 +0800 Subject: [PATCH] basic fixing for Byte Stream split --- cpp/src/parquet/encoding.cc | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 3da5c64ace5dd..ab3df43131145 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -3694,12 +3694,23 @@ class ByteStreamSplitDecoderBase : public DecoderImpl, ByteStreamSplitDecoderBase(const ColumnDescriptor* descr, int byte_width) : DecoderImpl(descr, Encoding::BYTE_STREAM_SPLIT), byte_width_(byte_width) {} - void SetData(int num_values, const uint8_t* data, int len) override { - if (static_cast(num_values) * byte_width_ != len) { + void SetData(int num_values, const uint8_t* data, int len) final { + // Check that the data size is consistent with the number of values + // The spec requires that the data size is a multiple of the number of values, + // see: https://github.com/apache/parquet-format/pull/192 . + // GH-41562: passed in `num_values` may include nulls, so we need to check and + // adjust the number of values. + if (static_cast(num_values) * byte_width_ < len) { throw ParquetException("Data size (" + std::to_string(len) + ") does not match number of values in BYTE_STREAM_SPLIT (" + std::to_string(num_values) + ")"); } + if (len % byte_width_ != 0) { + throw ParquetException("ByteStreamSplit data size " + std::to_string(len) + + " not aligned with type " + TypeToString(DType::type_num) + + " and byte_width: " + std::to_string(byte_width_)); + } + num_values = len / byte_width_; DecoderImpl::SetData(num_values, data, len); stride_ = num_values_; }