From a7633d053c4c074e8f0cf9fdd6075e04832986b6 Mon Sep 17 00:00:00 2001 From: mwish Date: Mon, 20 Nov 2023 11:36:49 +0800 Subject: [PATCH 01/10] GH-38432: [C++][Parquet] Encoding: Dict Arrow Decoder tiny regression fix --- cpp/src/parquet/encoding.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 1bb487c20d3e2..905e73ae43bb5 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1196,6 +1196,10 @@ struct ArrowBinaryHelper { chunk_space_remaining_(::arrow::kBinaryMemoryLimit - acc_->builder->value_data_length()) {} + // Prepare will Reserve the number of entries remaining in the current chunk. + // If estimated_data_length is provided, it will also Reserve the estimated data length, + // and the caller should remember to call `UnsafeAppend` instead of `Append` to avoid + // double counting the data length. Status Prepare(std::optional estimated_data_length = {}) { RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); if (estimated_data_length.has_value()) { @@ -1205,6 +1209,9 @@ struct ArrowBinaryHelper { return Status::OK(); } + // If estimated_remaining_data_length is provided, it will also Reserve the estimated + // data length, and the caller should remember to call `UnsafeAppend` instead of + // `Append` to avoid double counting the data length. Status PrepareNextInput(int64_t next_value_length, std::optional estimated_remaining_data_length = {}) { if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) { @@ -1983,7 +1990,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int values_decoded = 0; ArrowBinaryHelper helper(out, num_values); - RETURN_NOT_OK(helper.Prepare(len_)); + RETURN_NOT_OK(helper.Prepare()); auto dict_values = reinterpret_cast(dictionary_->data()); From f74b4c14cb84fae0fec5829f7a1ef6054b942e74 Mon Sep 17 00:00:00 2001 From: mwish <1506118561@qq.com> Date: Mon, 20 Nov 2023 13:53:12 +0800 Subject: [PATCH 02/10] Apply suggestions from code review Co-authored-by: Gang Wu --- cpp/src/parquet/encoding.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 905e73ae43bb5..553d610ddaafe 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1196,7 +1196,7 @@ struct ArrowBinaryHelper { chunk_space_remaining_(::arrow::kBinaryMemoryLimit - acc_->builder->value_data_length()) {} - // Prepare will Reserve the number of entries remaining in the current chunk. + // Prepare will reserve the number of entries remaining in the current chunk. // If estimated_data_length is provided, it will also Reserve the estimated data length, // and the caller should remember to call `UnsafeAppend` instead of `Append` to avoid // double counting the data length. @@ -1209,7 +1209,7 @@ struct ArrowBinaryHelper { return Status::OK(); } - // If estimated_remaining_data_length is provided, it will also Reserve the estimated + // If estimated_remaining_data_length is provided, it will also reserve the estimated // data length, and the caller should remember to call `UnsafeAppend` instead of // `Append` to avoid double counting the data length. Status PrepareNextInput(int64_t next_value_length, From 7686572e303dfa34a9de7aa588f65824fe92a32e Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 21 Nov 2023 22:30:57 +0800 Subject: [PATCH 03/10] tiny update --- cpp/src/parquet/encoding.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 553d610ddaafe..51f0162e299e4 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1197,9 +1197,9 @@ struct ArrowBinaryHelper { acc_->builder->value_data_length()) {} // Prepare will reserve the number of entries remaining in the current chunk. - // If estimated_data_length is provided, it will also Reserve the estimated data length, - // and the caller should remember to call `UnsafeAppend` instead of `Append` to avoid - // double counting the data length. + // If estimated_data_length is provided, it will also reserve the estimated data length, + // and the caller should better call `UnsafeAppend` instead of `Append` to avoid + // double-checking the data length. Status Prepare(std::optional estimated_data_length = {}) { RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); if (estimated_data_length.has_value()) { @@ -1210,8 +1210,8 @@ struct ArrowBinaryHelper { } // If estimated_remaining_data_length is provided, it will also reserve the estimated - // data length, and the caller should remember to call `UnsafeAppend` instead of - // `Append` to avoid double counting the data length. + // data length, and the caller should better call `UnsafeAppend` instead of + // `Append` to avoid double-checking the data length. Status PrepareNextInput(int64_t next_value_length, std::optional estimated_remaining_data_length = {}) { if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) { From 5f1287633f462f23bfbf02f12c8d2675b8ef50fe Mon Sep 17 00:00:00 2001 From: mwish Date: Tue, 21 Nov 2023 23:31:50 +0800 Subject: [PATCH 04/10] add comment for ByteArray dict Prepare --- cpp/src/parquet/encoding.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 51f0162e299e4..a310a25a4a692 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1922,6 +1922,9 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int32_t indices[kBufferSize]; ArrowBinaryHelper helper(out, num_values); + // The `len_` in the ByteArrayDictDecoder is the total length of the + // RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve + // space for binary data. RETURN_NOT_OK(helper.Prepare()); auto dict_values = reinterpret_cast(dictionary_->data()); @@ -1990,6 +1993,9 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int values_decoded = 0; ArrowBinaryHelper helper(out, num_values); + // The `len_` in the ByteArrayDictDecoder is the total length of the + // RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve + // space for binary data. RETURN_NOT_OK(helper.Prepare()); auto dict_values = reinterpret_cast(dictionary_->data()); From 51e6fb79a05301622de43f03409379cfc299c29f Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 23 Nov 2023 22:06:14 +0800 Subject: [PATCH 05/10] update the allocation length --- cpp/src/parquet/encoding.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index a310a25a4a692..89785d4e675ae 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1921,7 +1921,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, constexpr int32_t kBufferSize = 1024; int32_t indices[kBufferSize]; - ArrowBinaryHelper helper(out, num_values); + ArrowBinaryHelper helper(out, std::min(num_values, this->num_values_)); // The `len_` in the ByteArrayDictDecoder is the total length of the // RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve // space for binary data. @@ -1992,7 +1992,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int32_t indices[kBufferSize]; int values_decoded = 0; - ArrowBinaryHelper helper(out, num_values); + ArrowBinaryHelper helper(out, std::min(num_values, this->num_values_)); // The `len_` in the ByteArrayDictDecoder is the total length of the // RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve // space for binary data. From 0333b417407a9c6fa3d70d0e08265b49eca1b1cf Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 24 Nov 2023 00:02:00 +0800 Subject: [PATCH 06/10] [Update] Update maximum logic for Prepare --- cpp/src/parquet/encoding.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 89785d4e675ae..4fccb162e4fb6 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1204,7 +1204,7 @@ struct ArrowBinaryHelper { RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); if (estimated_data_length.has_value()) { RETURN_NOT_OK(acc_->builder->ReserveData( - std::min(*estimated_data_length, ::arrow::kBinaryMemoryLimit))); + std::min(*estimated_data_length, this->chunk_space_remaining_))); } return Status::OK(); } @@ -1921,7 +1921,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, constexpr int32_t kBufferSize = 1024; int32_t indices[kBufferSize]; - ArrowBinaryHelper helper(out, std::min(num_values, this->num_values_)); + ArrowBinaryHelper helper(out, num_values); // The `len_` in the ByteArrayDictDecoder is the total length of the // RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve // space for binary data. @@ -1992,7 +1992,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, int32_t indices[kBufferSize]; int values_decoded = 0; - ArrowBinaryHelper helper(out, std::min(num_values, this->num_values_)); + ArrowBinaryHelper helper(out, num_values); // The `len_` in the ByteArrayDictDecoder is the total length of the // RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve // space for binary data. From 4f26c9843229861e94f2403af4628f0c5d724a16 Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 24 Nov 2023 10:45:45 +0800 Subject: [PATCH 07/10] [Update] Trying to disable "Prepare" --- cpp/src/parquet/encoding.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 4fccb162e4fb6..9e09bf70670d2 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1925,7 +1925,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, // The `len_` in the ByteArrayDictDecoder is the total length of the // RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve // space for binary data. - RETURN_NOT_OK(helper.Prepare()); + // RETURN_NOT_OK(helper.Prepare()); auto dict_values = reinterpret_cast(dictionary_->data()); int values_decoded = 0; @@ -1996,7 +1996,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, // The `len_` in the ByteArrayDictDecoder is the total length of the // RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve // space for binary data. - RETURN_NOT_OK(helper.Prepare()); + // RETURN_NOT_OK(helper.Prepare()); auto dict_values = reinterpret_cast(dictionary_->data()); From e759a902c520f3b6f8816ea4157d0731a6adc930 Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 24 Nov 2023 16:44:20 +0800 Subject: [PATCH 08/10] Update the impl --- cpp/src/parquet/encoding.cc | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 9e09bf70670d2..c529ea47aceff 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1209,18 +1209,26 @@ struct ArrowBinaryHelper { return Status::OK(); } + Status PrepareNextInput(int64_t next_value_length) { + if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) { + // This element would exceed the capacity of a chunk + RETURN_NOT_OK(PushChunk()); + } + return Status::OK(); + } + // If estimated_remaining_data_length is provided, it will also reserve the estimated // data length, and the caller should better call `UnsafeAppend` instead of // `Append` to avoid double-checking the data length. Status PrepareNextInput(int64_t next_value_length, - std::optional estimated_remaining_data_length = {}) { + int64_t estimated_remaining_data_length) { if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) { // This element would exceed the capacity of a chunk RETURN_NOT_OK(PushChunk()); RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); - if (estimated_remaining_data_length.has_value()) { + if (estimated_remaining_data_length) { RETURN_NOT_OK(acc_->builder->ReserveData( - std::min(*estimated_remaining_data_length, chunk_space_remaining_))); + std::min(estimated_remaining_data_length, chunk_space_remaining_))); } } return Status::OK(); From 60cdb808c9ab6d25b2e00474a6307a8bf35b7b24 Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 24 Nov 2023 23:01:17 +0800 Subject: [PATCH 09/10] resume logic for PrepareNextInput --- cpp/src/parquet/encoding.cc | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index c529ea47aceff..7bdcf9a2aaad1 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1213,6 +1213,7 @@ struct ArrowBinaryHelper { if (ARROW_PREDICT_FALSE(!CanFit(next_value_length))) { // This element would exceed the capacity of a chunk RETURN_NOT_OK(PushChunk()); + RETURN_NOT_OK(acc_->builder->Reserve(entries_remaining_)); } return Status::OK(); } @@ -1286,8 +1287,12 @@ struct ArrowBinaryHelper { return acc_->Reserve(entries_remaining_); } + Status PrepareNextInput(int64_t next_value_length) { + return Status::OK(); + } + Status PrepareNextInput(int64_t next_value_length, - std::optional estimated_remaining_data_length = {}) { + int64_t estimated_remaining_data_length) { return Status::OK(); } @@ -1933,7 +1938,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, // The `len_` in the ByteArrayDictDecoder is the total length of the // RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve // space for binary data. - // RETURN_NOT_OK(helper.Prepare()); + RETURN_NOT_OK(helper.Prepare()); auto dict_values = reinterpret_cast(dictionary_->data()); int values_decoded = 0; @@ -2004,7 +2009,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl, // The `len_` in the ByteArrayDictDecoder is the total length of the // RLE/Bit-pack encoded data size, so, we cannot use `len_` to reserve // space for binary data. - // RETURN_NOT_OK(helper.Prepare()); + RETURN_NOT_OK(helper.Prepare()); auto dict_values = reinterpret_cast(dictionary_->data()); From 3dc536c745340366f580c137fe95b666d4a3d5fb Mon Sep 17 00:00:00 2001 From: mwish Date: Fri, 24 Nov 2023 23:11:03 +0800 Subject: [PATCH 10/10] fix format --- cpp/src/parquet/encoding.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 7bdcf9a2aaad1..04c6935d2de6e 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -1287,9 +1287,7 @@ struct ArrowBinaryHelper { return acc_->Reserve(entries_remaining_); } - Status PrepareNextInput(int64_t next_value_length) { - return Status::OK(); - } + Status PrepareNextInput(int64_t next_value_length) { return Status::OK(); } Status PrepareNextInput(int64_t next_value_length, int64_t estimated_remaining_data_length) {