From 67f316256ea84e3ac03959117222d8284102ce25 Mon Sep 17 00:00:00 2001 From: Xiaoxuan Meng Date: Tue, 17 Dec 2024 12:24:40 -0800 Subject: [PATCH] fix: Use the right row container to extract rows in grouping set Summary: Currently we always extract rows from grouping set through the default row container in the grouping set. It can cause potential performance and even data correctness problem in spill path as the row extract depends on the column stats recorded in the row container to decide whether apply fast no null path or slow path with nulls. If we use default row container to extract rows from the row container used by spill, the column stats is not accurate. If spill merge row container has null, then we get random non-null value for non column. This is correctness problem. If spill merge row container has no null, then we get slow path with null handling This PR fixes by always use the right container to extract rows Differential Revision: D67352373 --- velox/exec/GroupingSet.cpp | 12 +++++++----- velox/exec/GroupingSet.h | 5 ++++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/velox/exec/GroupingSet.cpp b/velox/exec/GroupingSet.cpp index b0ba52cd9828..9f15ea440c91 100644 --- a/velox/exec/GroupingSet.cpp +++ b/velox/exec/GroupingSet.cpp @@ -760,22 +760,23 @@ bool GroupingSet::getOutput( } return false; } - extractGroups(folly::Range(groups, numGroups), result); + extractGroups( + table_->rows(), folly::Range(groups, numGroups), result); return true; } void GroupingSet::extractGroups( + RowContainer* rowContainer, folly::Range groups, const RowVectorPtr& result) { result->resize(groups.size()); if (groups.empty()) { return; } - RowContainer& rows = *table_->rows(); - const auto totalKeys = rows.keyTypes().size(); + const auto totalKeys = rowContainer->keyTypes().size(); for (int32_t i = 0; i < totalKeys; ++i) { auto& keyVector = result->childAt(i); - rows.extractColumn( + rowContainer->extractColumn( groups.data(), groups.size(), groupingKeyOutputProjections_[i], @@ -1252,7 +1253,8 @@ void GroupingSet::extractSpillResult(const RowVectorPtr& result) { mergeRows_->listRows( &iter, rows.size(), RowContainer::kUnlimited, rows.data()); } - extractGroups(folly::Range(rows.data(), rows.size()), result); + extractGroups( + mergeRows_.get(), folly::Range(rows.data(), rows.size()), result); mergeRows_->clear(); } diff --git a/velox/exec/GroupingSet.h b/velox/exec/GroupingSet.h index fda8f4eea02f..9edb3cfac34d 100644 --- a/velox/exec/GroupingSet.h +++ b/velox/exec/GroupingSet.h @@ -203,7 +203,10 @@ class GroupingSet { // Copies the grouping keys and aggregates for 'groups' into 'result' If // partial output, extracts the intermediate type for aggregates, final result // otherwise. - void extractGroups(folly::Range groups, const RowVectorPtr& result); + void extractGroups( + RowContainer* container, + folly::Range groups, + const RowVectorPtr& result); // Produces output in if spilling has occurred. First produces data // from non-spilled partitions, then merges spill runs and unspilled data