Skip to content

Commit

Permalink
GH-43202: [C++][Compute] Detect and explicit error for offset overflo…
Browse files Browse the repository at this point in the history
…w in row table (#43226)

### Rationale for this change

There are two places for the offset in a row table to possibly overflow: 1) Encoding columns into a row table; 2) Appending to a row table from another row table.

They are particularly bad because they can cause silent wrong result for the computation.

### What changes are included in this PR?

Detect such overflows in the aforementioned places and report an explicit error when an overflow is detected.

### Are these changes tested?

UT included.

### Are there any user-facing changes?

User code that used to be fake passing could see an explicit error after this change.

**This PR contains a "Critical Fix".**
* GitHub Issue: #43202

Authored-by: Ruoxi Sun <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
zanmato1984 authored Jul 16, 2024
1 parent c3aad6a commit cf5d7c7
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 13 deletions.
26 changes: 18 additions & 8 deletions cpp/src/arrow/compute/row/encode_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include "arrow/compute/row/encode_internal.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/int_util_overflow.h"

namespace arrow {
namespace compute {
Expand Down Expand Up @@ -160,8 +161,8 @@ Status RowTableEncoder::EncodeSelected(RowTableImpl* rows, uint32_t num_selected
/*num_extra_bytes_to_append=*/static_cast<uint32_t>(0)));
// Then populate the offsets of the var-length columns, which will be used as the target
// size of the var-length buffers resizing below.
EncoderOffsets::GetRowOffsetsSelected(rows, batch_varbinary_cols_, num_selected,
selection);
RETURN_NOT_OK(EncoderOffsets::GetRowOffsetsSelected(rows, batch_varbinary_cols_,
num_selected, selection));
// Last AppendEmpty with zero rows and zero extra bytes to resize the var-length buffers
// based on the populated offsets.
RETURN_NOT_OK(
Expand Down Expand Up @@ -667,12 +668,12 @@ void EncoderOffsets::Decode(uint32_t start_row, uint32_t num_rows,
}
}

void EncoderOffsets::GetRowOffsetsSelected(RowTableImpl* rows,
const std::vector<KeyColumnArray>& cols,
uint32_t num_selected,
const uint16_t* selection) {
Status EncoderOffsets::GetRowOffsetsSelected(RowTableImpl* rows,
const std::vector<KeyColumnArray>& cols,
uint32_t num_selected,
const uint16_t* selection) {
if (rows->metadata().is_fixed_length) {
return;
return Status::OK();
}

uint32_t* row_offsets = rows->mutable_offsets();
Expand Down Expand Up @@ -713,9 +714,18 @@ void EncoderOffsets::GetRowOffsetsSelected(RowTableImpl* rows,
uint32_t length = row_offsets[i];
length += RowTableMetadata::padding_for_alignment(length, row_alignment);
row_offsets[i] = sum;
sum += length;
uint32_t sum_maybe_overflow = 0;
if (ARROW_PREDICT_FALSE(
arrow::internal::AddWithOverflow(sum, length, &sum_maybe_overflow))) {
return Status::Invalid(
"Offset overflow detected in EncoderOffsets::GetRowOffsetsSelected for row ", i,
" of length ", length, " bytes, current length in total is ", sum, " bytes");
}
sum = sum_maybe_overflow;
}
row_offsets[num_selected] = sum;

return Status::OK();
}

template <bool has_nulls, bool is_first_varbinary>
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/arrow/compute/row/encode_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,9 @@ class EncoderBinaryPair {

class EncoderOffsets {
public:
static void GetRowOffsetsSelected(RowTableImpl* rows,
const std::vector<KeyColumnArray>& cols,
uint32_t num_selected, const uint16_t* selection);
static Status GetRowOffsetsSelected(RowTableImpl* rows,
const std::vector<KeyColumnArray>& cols,
uint32_t num_selected, const uint16_t* selection);
static void EncodeSelected(RowTableImpl* rows, const std::vector<KeyColumnArray>& cols,
uint32_t num_selected, const uint16_t* selection);

Expand Down
12 changes: 10 additions & 2 deletions cpp/src/arrow/compute/row/row_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "arrow/compute/row/row_internal.h"

#include "arrow/compute/util.h"
#include "arrow/util/int_util_overflow.h"

namespace arrow {
namespace compute {
Expand Down Expand Up @@ -325,14 +326,21 @@ Status RowTableImpl::AppendSelectionFrom(const RowTableImpl& from,
// Varying-length rows
auto from_offsets = reinterpret_cast<const uint32_t*>(from.offsets_->data());
auto to_offsets = reinterpret_cast<uint32_t*>(offsets_->mutable_data());
// TODO(GH-43202): The following two variables are possibly overflowing.
uint32_t total_length = to_offsets[num_rows_];
uint32_t total_length_to_append = 0;
for (uint32_t i = 0; i < num_rows_to_append; ++i) {
uint16_t row_id = source_row_ids ? source_row_ids[i] : i;
uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id];
total_length_to_append += length;
to_offsets[num_rows_ + i + 1] = total_length + total_length_to_append;
uint32_t to_offset_maybe_overflow = 0;
if (ARROW_PREDICT_FALSE(arrow::internal::AddWithOverflow(
total_length, total_length_to_append, &to_offset_maybe_overflow))) {
return Status::Invalid(
"Offset overflow detected in RowTableImpl::AppendSelectionFrom for row ",
num_rows_ + i, " of length ", length, " bytes, current length in total is ",
to_offsets[num_rows_ + i], " bytes");
}
to_offsets[num_rows_ + i + 1] = to_offset_maybe_overflow;
}

RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(total_length_to_append));
Expand Down
129 changes: 129 additions & 0 deletions cpp/src/arrow/compute/row/row_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -125,5 +125,134 @@ TEST(RowTableMemoryConsumption, Encode) {
}
}

// GH-43202: Ensure that when offset overflow happens in encoding the row table, an
// explicit error is raised instead of a silent wrong result.
TEST(RowTableOffsetOverflow, LARGE_MEMORY_TEST(Encode)) {
if constexpr (sizeof(void*) == 4) {
GTEST_SKIP() << "Test only works on 64-bit platforms";
}

// Use 8 512MB var-length rows (occupies 4GB+) to overflow the offset in the row table.
constexpr int64_t num_rows = 8;
constexpr int64_t length_per_binary = 512 * 1024 * 1024;
constexpr int64_t row_alignment = sizeof(uint32_t);
constexpr int64_t var_length_alignment = sizeof(uint32_t);

MemoryPool* pool = default_memory_pool();

// The column to encode.
std::vector<KeyColumnArray> columns;
std::vector<Datum> values;
ASSERT_OK_AND_ASSIGN(
auto value, ::arrow::gen::Constant(
std::make_shared<BinaryScalar>(std::string(length_per_binary, 'X')))
->Generate(1));
values.push_back(std::move(value));
ExecBatch batch = ExecBatch(std::move(values), 1);
ASSERT_OK(ColumnArraysFromExecBatch(batch, &columns));

// The row table.
std::vector<KeyColumnMetadata> column_metadatas;
ASSERT_OK(ColumnMetadatasFromExecBatch(batch, &column_metadatas));
RowTableMetadata table_metadata;
table_metadata.FromColumnMetadataVector(column_metadatas, row_alignment,
var_length_alignment);
RowTableImpl row_table;
ASSERT_OK(row_table.Init(pool, table_metadata));
RowTableEncoder row_encoder;
row_encoder.Init(column_metadatas, row_alignment, var_length_alignment);

// The rows to encode.
std::vector<uint16_t> row_ids(num_rows, 0);

// Encoding 7 rows should be fine.
{
row_encoder.PrepareEncodeSelected(0, num_rows - 1, columns);
ASSERT_OK(row_encoder.EncodeSelected(&row_table, static_cast<uint32_t>(num_rows - 1),
row_ids.data()));
}

// Encoding 8 rows should overflow.
{
int64_t length_per_row = table_metadata.fixed_length + length_per_binary;
std::stringstream expected_error_message;
expected_error_message << "Invalid: Offset overflow detected in "
"EncoderOffsets::GetRowOffsetsSelected for row "
<< num_rows - 1 << " of length " << length_per_row
<< " bytes, current length in total is "
<< length_per_row * (num_rows - 1) << " bytes";
row_encoder.PrepareEncodeSelected(0, num_rows, columns);
ASSERT_RAISES_WITH_MESSAGE(
Invalid, expected_error_message.str(),
row_encoder.EncodeSelected(&row_table, static_cast<uint32_t>(num_rows),
row_ids.data()));
}
}

// GH-43202: Ensure that when offset overflow happens in appending to the row table, an
// explicit error is raised instead of a silent wrong result.
TEST(RowTableOffsetOverflow, LARGE_MEMORY_TEST(AppendFrom)) {
if constexpr (sizeof(void*) == 4) {
GTEST_SKIP() << "Test only works on 64-bit platforms";
}

// Use 8 512MB var-length rows (occupies 4GB+) to overflow the offset in the row table.
constexpr int64_t num_rows = 8;
constexpr int64_t length_per_binary = 512 * 1024 * 1024;
constexpr int64_t num_rows_seed = 1;
constexpr int64_t row_alignment = sizeof(uint32_t);
constexpr int64_t var_length_alignment = sizeof(uint32_t);

MemoryPool* pool = default_memory_pool();

// The column to encode.
std::vector<KeyColumnArray> columns;
std::vector<Datum> values;
ASSERT_OK_AND_ASSIGN(
auto value, ::arrow::gen::Constant(
std::make_shared<BinaryScalar>(std::string(length_per_binary, 'X')))
->Generate(num_rows_seed));
values.push_back(std::move(value));
ExecBatch batch = ExecBatch(std::move(values), num_rows_seed);
ASSERT_OK(ColumnArraysFromExecBatch(batch, &columns));

// The seed row table.
std::vector<KeyColumnMetadata> column_metadatas;
ASSERT_OK(ColumnMetadatasFromExecBatch(batch, &column_metadatas));
RowTableMetadata table_metadata;
table_metadata.FromColumnMetadataVector(column_metadatas, row_alignment,
var_length_alignment);
RowTableImpl row_table_seed;
ASSERT_OK(row_table_seed.Init(pool, table_metadata));
RowTableEncoder row_encoder;
row_encoder.Init(column_metadatas, row_alignment, var_length_alignment);
row_encoder.PrepareEncodeSelected(0, num_rows_seed, columns);
std::vector<uint16_t> row_ids(num_rows_seed, 0);
ASSERT_OK(row_encoder.EncodeSelected(
&row_table_seed, static_cast<uint32_t>(num_rows_seed), row_ids.data()));

// The target row table.
RowTableImpl row_table;
ASSERT_OK(row_table.Init(pool, table_metadata));

// Appending the seed 7 times should be fine.
for (int i = 0; i < num_rows - 1; ++i) {
ASSERT_OK(row_table.AppendSelectionFrom(row_table_seed, num_rows_seed,
/*source_row_ids=*/NULLPTR));
}

// Appending the seed the 8-th time should overflow.
int64_t length_per_row = table_metadata.fixed_length + length_per_binary;
std::stringstream expected_error_message;
expected_error_message
<< "Invalid: Offset overflow detected in RowTableImpl::AppendSelectionFrom for row "
<< num_rows - 1 << " of length " << length_per_row
<< " bytes, current length in total is " << length_per_row * (num_rows - 1)
<< " bytes";
ASSERT_RAISES_WITH_MESSAGE(Invalid, expected_error_message.str(),
row_table.AppendSelectionFrom(row_table_seed, num_rows_seed,
/*source_row_ids=*/NULLPTR));
}

} // namespace compute
} // namespace arrow

0 comments on commit cf5d7c7

Please sign in to comment.