From cf5d7c7ec3cf45d82aad2b2ff67b799d078f0619 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Tue, 16 Jul 2024 22:48:21 +0800 Subject: [PATCH] GH-43202: [C++][Compute] Detect and explicit error for offset overflow in row table (#43226) ### Rationale for this change There are two places for the offset in a row table to possibly overflow: 1) Encoding columns into a row table; 2) Appending to a row table from another row table. They are particularly bad because they can cause silent wrong result for the computation. ### What changes are included in this PR? Detect such overflows in the aforementioned places and report an explicit error when an overflow is detected. ### Are these changes tested? UT included. ### Are there any user-facing changes? User code that used to be fake passing could see an explicit error after this change. **This PR contains a "Critical Fix".** * GitHub Issue: #43202 Authored-by: Ruoxi Sun Signed-off-by: Antoine Pitrou --- cpp/src/arrow/compute/row/encode_internal.cc | 26 ++-- cpp/src/arrow/compute/row/encode_internal.h | 6 +- cpp/src/arrow/compute/row/row_internal.cc | 12 +- cpp/src/arrow/compute/row/row_test.cc | 129 +++++++++++++++++++ 4 files changed, 160 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc index 88ab5b81b1e0a..658e0dffcac68 100644 --- a/cpp/src/arrow/compute/row/encode_internal.cc +++ b/cpp/src/arrow/compute/row/encode_internal.cc @@ -17,6 +17,7 @@ #include "arrow/compute/row/encode_internal.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/int_util_overflow.h" namespace arrow { namespace compute { @@ -160,8 +161,8 @@ Status RowTableEncoder::EncodeSelected(RowTableImpl* rows, uint32_t num_selected /*num_extra_bytes_to_append=*/static_cast(0))); // Then populate the offsets of the var-length columns, which will be used as the target // size of the var-length buffers resizing below. - EncoderOffsets::GetRowOffsetsSelected(rows, batch_varbinary_cols_, num_selected, - selection); + RETURN_NOT_OK(EncoderOffsets::GetRowOffsetsSelected(rows, batch_varbinary_cols_, + num_selected, selection)); // Last AppendEmpty with zero rows and zero extra bytes to resize the var-length buffers // based on the populated offsets. RETURN_NOT_OK( @@ -667,12 +668,12 @@ void EncoderOffsets::Decode(uint32_t start_row, uint32_t num_rows, } } -void EncoderOffsets::GetRowOffsetsSelected(RowTableImpl* rows, - const std::vector& cols, - uint32_t num_selected, - const uint16_t* selection) { +Status EncoderOffsets::GetRowOffsetsSelected(RowTableImpl* rows, + const std::vector& cols, + uint32_t num_selected, + const uint16_t* selection) { if (rows->metadata().is_fixed_length) { - return; + return Status::OK(); } uint32_t* row_offsets = rows->mutable_offsets(); @@ -713,9 +714,18 @@ void EncoderOffsets::GetRowOffsetsSelected(RowTableImpl* rows, uint32_t length = row_offsets[i]; length += RowTableMetadata::padding_for_alignment(length, row_alignment); row_offsets[i] = sum; - sum += length; + uint32_t sum_maybe_overflow = 0; + if (ARROW_PREDICT_FALSE( + arrow::internal::AddWithOverflow(sum, length, &sum_maybe_overflow))) { + return Status::Invalid( + "Offset overflow detected in EncoderOffsets::GetRowOffsetsSelected for row ", i, + " of length ", length, " bytes, current length in total is ", sum, " bytes"); + } + sum = sum_maybe_overflow; } row_offsets[num_selected] = sum; + + return Status::OK(); } template diff --git a/cpp/src/arrow/compute/row/encode_internal.h b/cpp/src/arrow/compute/row/encode_internal.h index 2afc150530b9e..0618ddd8e4b96 100644 --- a/cpp/src/arrow/compute/row/encode_internal.h +++ b/cpp/src/arrow/compute/row/encode_internal.h @@ -227,9 +227,9 @@ class EncoderBinaryPair { class EncoderOffsets { public: - static void GetRowOffsetsSelected(RowTableImpl* rows, - const std::vector& cols, - uint32_t num_selected, const uint16_t* selection); + static Status GetRowOffsetsSelected(RowTableImpl* rows, + const std::vector& cols, + uint32_t num_selected, const uint16_t* selection); static void EncodeSelected(RowTableImpl* rows, const std::vector& cols, uint32_t num_selected, const uint16_t* selection); diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc index 0d418fdecf488..2365ef5632cce 100644 --- a/cpp/src/arrow/compute/row/row_internal.cc +++ b/cpp/src/arrow/compute/row/row_internal.cc @@ -18,6 +18,7 @@ #include "arrow/compute/row/row_internal.h" #include "arrow/compute/util.h" +#include "arrow/util/int_util_overflow.h" namespace arrow { namespace compute { @@ -325,14 +326,21 @@ Status RowTableImpl::AppendSelectionFrom(const RowTableImpl& from, // Varying-length rows auto from_offsets = reinterpret_cast(from.offsets_->data()); auto to_offsets = reinterpret_cast(offsets_->mutable_data()); - // TODO(GH-43202): The following two variables are possibly overflowing. uint32_t total_length = to_offsets[num_rows_]; uint32_t total_length_to_append = 0; for (uint32_t i = 0; i < num_rows_to_append; ++i) { uint16_t row_id = source_row_ids ? source_row_ids[i] : i; uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id]; total_length_to_append += length; - to_offsets[num_rows_ + i + 1] = total_length + total_length_to_append; + uint32_t to_offset_maybe_overflow = 0; + if (ARROW_PREDICT_FALSE(arrow::internal::AddWithOverflow( + total_length, total_length_to_append, &to_offset_maybe_overflow))) { + return Status::Invalid( + "Offset overflow detected in RowTableImpl::AppendSelectionFrom for row ", + num_rows_ + i, " of length ", length, " bytes, current length in total is ", + to_offsets[num_rows_ + i], " bytes"); + } + to_offsets[num_rows_ + i + 1] = to_offset_maybe_overflow; } RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(total_length_to_append)); diff --git a/cpp/src/arrow/compute/row/row_test.cc b/cpp/src/arrow/compute/row/row_test.cc index 2c1a60dfb231c..679ad519a9ef2 100644 --- a/cpp/src/arrow/compute/row/row_test.cc +++ b/cpp/src/arrow/compute/row/row_test.cc @@ -125,5 +125,134 @@ TEST(RowTableMemoryConsumption, Encode) { } } +// GH-43202: Ensure that when offset overflow happens in encoding the row table, an +// explicit error is raised instead of a silent wrong result. +TEST(RowTableOffsetOverflow, LARGE_MEMORY_TEST(Encode)) { + if constexpr (sizeof(void*) == 4) { + GTEST_SKIP() << "Test only works on 64-bit platforms"; + } + + // Use 8 512MB var-length rows (occupies 4GB+) to overflow the offset in the row table. + constexpr int64_t num_rows = 8; + constexpr int64_t length_per_binary = 512 * 1024 * 1024; + constexpr int64_t row_alignment = sizeof(uint32_t); + constexpr int64_t var_length_alignment = sizeof(uint32_t); + + MemoryPool* pool = default_memory_pool(); + + // The column to encode. + std::vector columns; + std::vector values; + ASSERT_OK_AND_ASSIGN( + auto value, ::arrow::gen::Constant( + std::make_shared(std::string(length_per_binary, 'X'))) + ->Generate(1)); + values.push_back(std::move(value)); + ExecBatch batch = ExecBatch(std::move(values), 1); + ASSERT_OK(ColumnArraysFromExecBatch(batch, &columns)); + + // The row table. + std::vector column_metadatas; + ASSERT_OK(ColumnMetadatasFromExecBatch(batch, &column_metadatas)); + RowTableMetadata table_metadata; + table_metadata.FromColumnMetadataVector(column_metadatas, row_alignment, + var_length_alignment); + RowTableImpl row_table; + ASSERT_OK(row_table.Init(pool, table_metadata)); + RowTableEncoder row_encoder; + row_encoder.Init(column_metadatas, row_alignment, var_length_alignment); + + // The rows to encode. + std::vector row_ids(num_rows, 0); + + // Encoding 7 rows should be fine. + { + row_encoder.PrepareEncodeSelected(0, num_rows - 1, columns); + ASSERT_OK(row_encoder.EncodeSelected(&row_table, static_cast(num_rows - 1), + row_ids.data())); + } + + // Encoding 8 rows should overflow. + { + int64_t length_per_row = table_metadata.fixed_length + length_per_binary; + std::stringstream expected_error_message; + expected_error_message << "Invalid: Offset overflow detected in " + "EncoderOffsets::GetRowOffsetsSelected for row " + << num_rows - 1 << " of length " << length_per_row + << " bytes, current length in total is " + << length_per_row * (num_rows - 1) << " bytes"; + row_encoder.PrepareEncodeSelected(0, num_rows, columns); + ASSERT_RAISES_WITH_MESSAGE( + Invalid, expected_error_message.str(), + row_encoder.EncodeSelected(&row_table, static_cast(num_rows), + row_ids.data())); + } +} + +// GH-43202: Ensure that when offset overflow happens in appending to the row table, an +// explicit error is raised instead of a silent wrong result. +TEST(RowTableOffsetOverflow, LARGE_MEMORY_TEST(AppendFrom)) { + if constexpr (sizeof(void*) == 4) { + GTEST_SKIP() << "Test only works on 64-bit platforms"; + } + + // Use 8 512MB var-length rows (occupies 4GB+) to overflow the offset in the row table. + constexpr int64_t num_rows = 8; + constexpr int64_t length_per_binary = 512 * 1024 * 1024; + constexpr int64_t num_rows_seed = 1; + constexpr int64_t row_alignment = sizeof(uint32_t); + constexpr int64_t var_length_alignment = sizeof(uint32_t); + + MemoryPool* pool = default_memory_pool(); + + // The column to encode. + std::vector columns; + std::vector values; + ASSERT_OK_AND_ASSIGN( + auto value, ::arrow::gen::Constant( + std::make_shared(std::string(length_per_binary, 'X'))) + ->Generate(num_rows_seed)); + values.push_back(std::move(value)); + ExecBatch batch = ExecBatch(std::move(values), num_rows_seed); + ASSERT_OK(ColumnArraysFromExecBatch(batch, &columns)); + + // The seed row table. + std::vector column_metadatas; + ASSERT_OK(ColumnMetadatasFromExecBatch(batch, &column_metadatas)); + RowTableMetadata table_metadata; + table_metadata.FromColumnMetadataVector(column_metadatas, row_alignment, + var_length_alignment); + RowTableImpl row_table_seed; + ASSERT_OK(row_table_seed.Init(pool, table_metadata)); + RowTableEncoder row_encoder; + row_encoder.Init(column_metadatas, row_alignment, var_length_alignment); + row_encoder.PrepareEncodeSelected(0, num_rows_seed, columns); + std::vector row_ids(num_rows_seed, 0); + ASSERT_OK(row_encoder.EncodeSelected( + &row_table_seed, static_cast(num_rows_seed), row_ids.data())); + + // The target row table. + RowTableImpl row_table; + ASSERT_OK(row_table.Init(pool, table_metadata)); + + // Appending the seed 7 times should be fine. + for (int i = 0; i < num_rows - 1; ++i) { + ASSERT_OK(row_table.AppendSelectionFrom(row_table_seed, num_rows_seed, + /*source_row_ids=*/NULLPTR)); + } + + // Appending the seed the 8-th time should overflow. + int64_t length_per_row = table_metadata.fixed_length + length_per_binary; + std::stringstream expected_error_message; + expected_error_message + << "Invalid: Offset overflow detected in RowTableImpl::AppendSelectionFrom for row " + << num_rows - 1 << " of length " << length_per_row + << " bytes, current length in total is " << length_per_row * (num_rows - 1) + << " bytes"; + ASSERT_RAISES_WITH_MESSAGE(Invalid, expected_error_message.str(), + row_table.AppendSelectionFrom(row_table_seed, num_rows_seed, + /*source_row_ids=*/NULLPTR)); +} + } // namespace compute } // namespace arrow