From a50f3fab569bc99004ad50ba06cdbf7fd25aa57b Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 19 Nov 2023 15:37:47 +0800 Subject: [PATCH] GH-38432: [C++][Parquet] Encoding: Dict Arrow Decoder tiny regression fix --- cpp/src/parquet/encoding.cc | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 1bb487c20d3e2..6e311067f3e6a 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -904,7 +904,12 @@ class DecoderImpl : virtual public Decoder { protected: explicit DecoderImpl(const ColumnDescriptor* descr, Encoding::type encoding) - : descr_(descr), encoding_(encoding), num_values_(0), data_(NULLPTR), len_(0) {} + : descr_(descr), + encoding_(encoding), + num_values_(0), + data_(NULLPTR), + len_(0), + type_length_(0) {} // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY const ColumnDescriptor* descr_; @@ -1196,6 +1201,10 @@ struct ArrowBinaryHelper { chunk_space_remaining_(::arrow::kBinaryMemoryLimit - acc_->builder->value_data_length()) {} + // Prepare will Reserve the number of entries remaining in the current chunk. + // If estimated_data_length is provided, it will also Reserve the estimated data length, + // and the caller should remember to call `UnsafeAppend` instead of `Append` to avoid + // double counting the data length. Status Prepare(std::optional<int64_t> estimated_data_length = {}) { RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); if (estimated_data_length.has_value()) { @@ -1205,6 +1214,9 @@ struct ArrowBinaryHelper { return Status::OK(); } + // If estimated_remaining_data_length is provided, it will also Reserve the estimated + // data length, and the caller should remember to call `UnsafeAppend` instead of + // `Append` to avoid double counting the data length. 
Status PrepareNextInput(int64_t next_value_length, std::optional<int64_t> estimated_remaining_data_length = {}) { if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) { @@ -1983,7 +1995,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int values_decoded = 0; ArrowBinaryHelper helper(out, num_values); - RETURN_NOT_OK(helper.Prepare(len_)); + RETURN_NOT_OK(helper.Prepare()); auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());