Skip to content

Commit

Permalink
fix: Fix columns stats when insert serialized rows (#11910)
Browse files Browse the repository at this point in the history
Summary:

We don't update column stats when insert serialized rows which could cause problem when extract data
from the row container as deserialized vector which depends on the column stats in the row container.

This PR adds column stats update for serialized row insertion

Reviewed By: Yuhta

Differential Revision: D67419701
  • Loading branch information
xiaoxmeng authored and facebook-github-bot committed Dec 18, 2024
1 parent 4ffaf9c commit 5a26064
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 2 deletions.
24 changes: 22 additions & 2 deletions velox/exec/RowContainer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,25 @@ void RowContainer::updateColumnStats(
}
}

void RowContainer::updateColumnStats(char* row, int32_t columnIndex) {
const bool nullColumn = isNullAt(row, rowColumns_[columnIndex]);
updateColumnHasNulls(columnIndex, nullColumn);

if (rowColumnsStats_.empty()) {
// Column stats have been invalidated.
return;
}

auto& columnStats = rowColumnsStats_[columnIndex];
if (nullColumn) {
columnStats.addNullCell();
} else if (types_[columnIndex]->isFixedWidth()) {
columnStats.addCellSize(fixedSizeAt(columnIndex));
} else {
columnStats.addCellSize(variableSizeAt(row, columnIndex));
}
}

void RowContainer::store(
const DecodedVector& decoded,
vector_size_t rowIndex,
Expand Down Expand Up @@ -596,7 +615,7 @@ std::unique_ptr<ByteInputStream> RowContainer::prepareRead(

int32_t RowContainer::variableSizeAt(const char* row, column_index_t column)
const {
const auto rowColumn = rowColumns_[column];
const auto& rowColumn = rowColumns_[column];

if (isNullAt(row, rowColumn)) {
return 0;
Expand Down Expand Up @@ -766,7 +785,7 @@ void RowContainer::storeSerializedRow(
vector_size_t index,
char* row) {
VELOX_CHECK(!vector.isNullAt(index));
auto serialized = vector.valueAt(index);
const auto serialized = vector.valueAt(index);
size_t offset = 0;

::memcpy(row + rowColumns_[0].nullByte(), serialized.data(), flagBytes_);
Expand All @@ -783,6 +802,7 @@ void RowContainer::storeSerializedRow(
const auto size = storeVariableSizeAt(serialized.data() + offset, row, i);
offset += size;
}
updateColumnStats(row, i);
}
}

Expand Down
3 changes: 3 additions & 0 deletions velox/exec/RowContainer.h
Original file line number Diff line number Diff line change
Expand Up @@ -1466,6 +1466,9 @@ class RowContainer {
char* row,
int32_t columnIndex);

// Updates column stats for serialized row.
inline void updateColumnStats(char* row, int32_t columnIndex);

// Light weight aggregated column stats does not support row erasures. This
// method is called whenever a row is erased.
void invalidateColumnStats();
Expand Down
25 changes: 25 additions & 0 deletions velox/exec/tests/RowContainerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1953,6 +1953,20 @@ TEST_F(RowContainerTest, extractSerializedRow) {

auto rowType = fuzzer.randRowType();
auto data = fuzzer.fuzzInputRow(rowType);
std::vector<bool> expectedColumnHasNulls(data->size(), false);
for (int col = 0; col < rowType->size(); ++col) {
const auto child = data->childAt(col);
if (!child->mayHaveNulls()) {
expectedColumnHasNulls[col] = false;
} else {
for (auto row = 0; row < child->size(); ++row) {
if (child->isNullAt(row)) {
expectedColumnHasNulls[col] = true;
break;
}
}
}
}

SCOPED_TRACE(data->toString());

Expand All @@ -1965,9 +1979,20 @@ TEST_F(RowContainerTest, extractSerializedRow) {
VARBINARY(), data->size(), pool());
rowContainer.extractSerializedRows(
folly::Range(rows.data(), rows.size()), serialized);
for (int col = 0; col < rowType->size(); ++col) {
ASSERT_EQ(rowContainer.columnHasNulls(col), expectedColumnHasNulls[col]);
}

rowContainer.clear();
// TODO: fix these once we reset the column stats when clear the row
// container.
for (int col = 0; col < rowType->size(); ++col) {
ASSERT_EQ(rowContainer.columnHasNulls(col), expectedColumnHasNulls[col]);
}
rows.clear();
for (int col = 0; col < rowType->size(); ++col) {
ASSERT_EQ(rowContainer.columnHasNulls(col), expectedColumnHasNulls[col]);
}

// Load serialized rows back.
for (auto i = 0; i < data->size(); ++i) {
Expand Down

0 comments on commit 5a26064

Please sign in to comment.