From 2d2cdbd37396257055055e9ae7da861a7e010c26 Mon Sep 17 00:00:00 2001 From: Xiaoxuan Meng Date: Tue, 17 Dec 2024 15:48:36 -0800 Subject: [PATCH] fix: Use the right row container to extract rows in grouping set (#11899) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/11899 Currently we always extract rows from grouping set through the default row container in the grouping set. It can cause potential performance and even data correctness problem in spill path as the row extract depends on the column stats recorded in the row container to decide whether apply fast no null path or slow path with nulls. If we use default row container to extract rows from the row container used by spill, the column stats is not accurate. If spill merge row container has null, then we get random non-null value for non column. This is correctness problem. If spill merge row container has no null, then we get slow path with null handling This PR fixes by always use the right container to extract rows Reviewed By: Yuhta Differential Revision: D67352373 fbshipit-source-id: 9d48857c4b3ca270bfc6777e0c3e4fbc03ee1e03 --- velox/exec/GroupingSet.cpp | 12 +++++++----- velox/exec/GroupingSet.h | 5 ++++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/velox/exec/GroupingSet.cpp b/velox/exec/GroupingSet.cpp index b0ba52cd9828..9f15ea440c91 100644 --- a/velox/exec/GroupingSet.cpp +++ b/velox/exec/GroupingSet.cpp @@ -760,22 +760,23 @@ bool GroupingSet::getOutput( } return false; } - extractGroups(folly::Range(groups, numGroups), result); + extractGroups( + table_->rows(), folly::Range(groups, numGroups), result); return true; } void GroupingSet::extractGroups( + RowContainer* rowContainer, folly::Range groups, const RowVectorPtr& result) { result->resize(groups.size()); if (groups.empty()) { return; } - RowContainer& rows = *table_->rows(); - const auto totalKeys = rows.keyTypes().size(); + const auto totalKeys = rowContainer->keyTypes().size(); for (int32_t i = 0; i < totalKeys; ++i) { auto& keyVector = result->childAt(i); - rows.extractColumn( + rowContainer->extractColumn( groups.data(), groups.size(), groupingKeyOutputProjections_[i], @@ -1252,7 +1253,8 @@ void GroupingSet::extractSpillResult(const RowVectorPtr& result) { mergeRows_->listRows( &iter, rows.size(), RowContainer::kUnlimited, rows.data()); } - extractGroups(folly::Range(rows.data(), rows.size()), result); + extractGroups( + mergeRows_.get(), folly::Range(rows.data(), rows.size()), result); mergeRows_->clear(); } diff --git a/velox/exec/GroupingSet.h b/velox/exec/GroupingSet.h index fda8f4eea02f..9edb3cfac34d 100644 --- a/velox/exec/GroupingSet.h +++ b/velox/exec/GroupingSet.h @@ -203,7 +203,10 @@ class GroupingSet { // Copies the grouping keys and aggregates for 'groups' into 'result' If // partial output, extracts the intermediate type for aggregates, final result // otherwise. - void extractGroups(folly::Range groups, const RowVectorPtr& result); + void extractGroups( + RowContainer* container, + folly::Range groups, + const RowVectorPtr& result); // Produces output in if spilling has occurred. First produces data // from non-spilled partitions, then merges spill runs and unspilled data