diff --git a/velox/exec/RowContainer.cpp b/velox/exec/RowContainer.cpp index 74d49266b0fd..9e9ace14ee0f 100644 --- a/velox/exec/RowContainer.cpp +++ b/velox/exec/RowContainer.cpp @@ -520,6 +520,25 @@ void RowContainer::updateColumnStats( } } +void RowContainer::updateColumnStats(char* row, int32_t columnIndex) { + const bool nullColumn = isNullAt(row, rowColumns_[columnIndex]); + updateColumnHasNulls(columnIndex, nullColumn); + + if (rowColumnsStats_.empty()) { + // Column stats have been invalidated. + return; + } + + auto& columnStats = rowColumnsStats_[columnIndex]; + if (nullColumn) { + columnStats.addNullCell(); + } else if (types_[columnIndex]->isFixedWidth()) { + columnStats.addCellSize(fixedSizeAt(columnIndex)); + } else { + columnStats.addCellSize(variableSizeAt(row, columnIndex)); + } +} + void RowContainer::store( const DecodedVector& decoded, vector_size_t rowIndex, @@ -596,7 +615,7 @@ std::unique_ptr RowContainer::prepareRead( int32_t RowContainer::variableSizeAt(const char* row, column_index_t column) const { - const auto rowColumn = rowColumns_[column]; + const auto& rowColumn = rowColumns_[column]; if (isNullAt(row, rowColumn)) { return 0; @@ -766,7 +785,7 @@ void RowContainer::storeSerializedRow( vector_size_t index, char* row) { VELOX_CHECK(!vector.isNullAt(index)); - auto serialized = vector.valueAt(index); + const auto serialized = vector.valueAt(index); size_t offset = 0; ::memcpy(row + rowColumns_[0].nullByte(), serialized.data(), flagBytes_); @@ -783,6 +802,7 @@ void RowContainer::storeSerializedRow( const auto size = storeVariableSizeAt(serialized.data() + offset, row, i); offset += size; } + updateColumnStats(row, i); } } diff --git a/velox/exec/RowContainer.h b/velox/exec/RowContainer.h index 72cd08e276dd..2d3f65346717 100644 --- a/velox/exec/RowContainer.h +++ b/velox/exec/RowContainer.h @@ -1466,6 +1466,9 @@ class RowContainer { char* row, int32_t columnIndex); + // Updates column stats for serialized row. + inline void updateColumnStats(char* row, int32_t columnIndex); + // Light weight aggregated column stats does not support row erasures. This // method is called whenever a row is erased. void invalidateColumnStats(); diff --git a/velox/exec/tests/RowContainerTest.cpp b/velox/exec/tests/RowContainerTest.cpp index a2040721f664..4189adc5de72 100644 --- a/velox/exec/tests/RowContainerTest.cpp +++ b/velox/exec/tests/RowContainerTest.cpp @@ -1953,6 +1953,20 @@ TEST_F(RowContainerTest, extractSerializedRow) { auto rowType = fuzzer.randRowType(); auto data = fuzzer.fuzzInputRow(rowType); + std::vector expectedColumnHasNulls(data->size(), false); + for (int col = 0; col < rowType->size(); ++col) { + const auto child = data->childAt(col); + if (!child->mayHaveNulls()) { + expectedColumnHasNulls[col] = false; + } else { + for (auto row = 0; row < child->size(); ++row) { + if (child->isNullAt(row)) { + expectedColumnHasNulls[col] = true; + break; + } + } + } + } SCOPED_TRACE(data->toString()); @@ -1965,9 +1979,20 @@ TEST_F(RowContainerTest, extractSerializedRow) { VARBINARY(), data->size(), pool()); rowContainer.extractSerializedRows( folly::Range(rows.data(), rows.size()), serialized); + for (int col = 0; col < rowType->size(); ++col) { + ASSERT_EQ(rowContainer.columnHasNulls(col), expectedColumnHasNulls[col]); + } rowContainer.clear(); + // TODO: fix these once we reset the column stats when clear the row + // container. + for (int col = 0; col < rowType->size(); ++col) { + ASSERT_EQ(rowContainer.columnHasNulls(col), expectedColumnHasNulls[col]); + } rows.clear(); + for (int col = 0; col < rowType->size(); ++col) { + ASSERT_EQ(rowContainer.columnHasNulls(col), expectedColumnHasNulls[col]); + } // Load serialized rows back. for (auto i = 0; i < data->size(); ++i) {