From 5a26064ebdb68addf1dd9b3fdea383b42559fb9a Mon Sep 17 00:00:00 2001 From: Xiaoxuan Meng Date: Wed, 18 Dec 2024 15:52:36 -0800 Subject: [PATCH] fix: Fix columns stats when insert serialized rows (#11910) Summary: We don't update column stats when insert serialized rows which could cause problem when extract data from the row container as deserialized vector which depends on the column stats in the row container. This PR adds column stats update for serialized row insertion Reviewed By: Yuhta Differential Revision: D67419701 --- velox/exec/RowContainer.cpp | 24 ++++++++++++++++++++++-- velox/exec/RowContainer.h | 3 +++ velox/exec/tests/RowContainerTest.cpp | 25 +++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 2 deletions(-) diff --git a/velox/exec/RowContainer.cpp b/velox/exec/RowContainer.cpp index 74d49266b0fd..9e9ace14ee0f 100644 --- a/velox/exec/RowContainer.cpp +++ b/velox/exec/RowContainer.cpp @@ -520,6 +520,25 @@ void RowContainer::updateColumnStats( } } +void RowContainer::updateColumnStats(char* row, int32_t columnIndex) { + const bool nullColumn = isNullAt(row, rowColumns_[columnIndex]); + updateColumnHasNulls(columnIndex, nullColumn); + + if (rowColumnsStats_.empty()) { + // Column stats have been invalidated. + return; + } + + auto& columnStats = rowColumnsStats_[columnIndex]; + if (nullColumn) { + columnStats.addNullCell(); + } else if (types_[columnIndex]->isFixedWidth()) { + columnStats.addCellSize(fixedSizeAt(columnIndex)); + } else { + columnStats.addCellSize(variableSizeAt(row, columnIndex)); + } +} + void RowContainer::store( const DecodedVector& decoded, vector_size_t rowIndex, @@ -596,7 +615,7 @@ std::unique_ptr RowContainer::prepareRead( int32_t RowContainer::variableSizeAt(const char* row, column_index_t column) const { - const auto rowColumn = rowColumns_[column]; + const auto& rowColumn = rowColumns_[column]; if (isNullAt(row, rowColumn)) { return 0; @@ -766,7 +785,7 @@ void RowContainer::storeSerializedRow( vector_size_t index, char* row) { VELOX_CHECK(!vector.isNullAt(index)); - auto serialized = vector.valueAt(index); + const auto serialized = vector.valueAt(index); size_t offset = 0; ::memcpy(row + rowColumns_[0].nullByte(), serialized.data(), flagBytes_); @@ -783,6 +802,7 @@ void RowContainer::storeSerializedRow( const auto size = storeVariableSizeAt(serialized.data() + offset, row, i); offset += size; } + updateColumnStats(row, i); } } diff --git a/velox/exec/RowContainer.h b/velox/exec/RowContainer.h index 72cd08e276dd..2d3f65346717 100644 --- a/velox/exec/RowContainer.h +++ b/velox/exec/RowContainer.h @@ -1466,6 +1466,9 @@ class RowContainer { char* row, int32_t columnIndex); + // Updates column stats for serialized row. + inline void updateColumnStats(char* row, int32_t columnIndex); + // Light weight aggregated column stats does not support row erasures. This // method is called whenever a row is erased. void invalidateColumnStats(); diff --git a/velox/exec/tests/RowContainerTest.cpp b/velox/exec/tests/RowContainerTest.cpp index a2040721f664..4189adc5de72 100644 --- a/velox/exec/tests/RowContainerTest.cpp +++ b/velox/exec/tests/RowContainerTest.cpp @@ -1953,6 +1953,20 @@ TEST_F(RowContainerTest, extractSerializedRow) { auto rowType = fuzzer.randRowType(); auto data = fuzzer.fuzzInputRow(rowType); + std::vector expectedColumnHasNulls(data->size(), false); + for (int col = 0; col < rowType->size(); ++col) { + const auto child = data->childAt(col); + if (!child->mayHaveNulls()) { + expectedColumnHasNulls[col] = false; + } else { + for (auto row = 0; row < child->size(); ++row) { + if (child->isNullAt(row)) { + expectedColumnHasNulls[col] = true; + break; + } + } + } + } SCOPED_TRACE(data->toString()); @@ -1965,9 +1979,20 @@ TEST_F(RowContainerTest, extractSerializedRow) { VARBINARY(), data->size(), pool()); rowContainer.extractSerializedRows( folly::Range(rows.data(), rows.size()), serialized); + for (int col = 0; col < rowType->size(); ++col) { + ASSERT_EQ(rowContainer.columnHasNulls(col), expectedColumnHasNulls[col]); + } rowContainer.clear(); + // TODO: fix these once we reset the column stats when clear the row + // container. + for (int col = 0; col < rowType->size(); ++col) { + ASSERT_EQ(rowContainer.columnHasNulls(col), expectedColumnHasNulls[col]); + } rows.clear(); + for (int col = 0; col < rowType->size(); ++col) { + ASSERT_EQ(rowContainer.columnHasNulls(col), expectedColumnHasNulls[col]); + } // Load serialized rows back. for (auto i = 0; i < data->size(); ++i) {