-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix Window operator with NaN as the frame bound #11293
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -349,6 +349,7 @@ vector_size_t WindowPartition::linearSearchFrameValue( | |
return end == numRows() ? numRows() + 1 : -1; | ||
} | ||
|
||
template <typename T> | ||
void WindowPartition::updateKRangeFrameBounds( | ||
bool firstMatch, | ||
bool isPreceding, | ||
|
@@ -357,21 +358,34 @@ void WindowPartition::updateKRangeFrameBounds( | |
vector_size_t numRows, | ||
column_index_t frameColumn, | ||
const vector_size_t* rawPeerBounds, | ||
vector_size_t* rawFrameBounds) const { | ||
vector_size_t* rawFrameBounds, | ||
SelectivityVector& validFrames) const { | ||
column_index_t orderByColumn = sortKeyInfo_[0].first; | ||
column_index_t mappedFrameColumn = inputMapping_[frameColumn]; | ||
|
||
vector_size_t start = 0; | ||
vector_size_t end; | ||
// frameColumn is a column index into the original input rows, while | ||
// orderByColumn is a column index into rows in data_ after the columns are | ||
// reordered as per inputMapping_. | ||
// orderByColumn and mappedFrameColumn are column indices into rows in data_ | ||
// after the columns are reordered as per inputMapping_. | ||
VELOX_DEBUG_ONLY RowColumn frameRowColumn = columns_[frameColumn]; | ||
RowColumn orderByRowColumn = data_->columnAt(orderByColumn); | ||
RowColumn mappedFrameRowColumn = data_->columnAt(mappedFrameColumn); | ||
for (auto i = 0; i < numRows; i++) { | ||
auto currentRow = startRow + i; | ||
auto* partitionRow = partition_[currentRow]; | ||
|
||
// Mark the frame invalid if the frame bound is NaN, except if NaN in the | ||
// frame column is derived from NaN in the order-by column. | ||
// https://github.com/facebookincubator/velox/pull/11293#issuecomment-2475391888 | ||
if constexpr (std::is_floating_point_v<T>) { | ||
if (data_->isNanAt<T>(partitionRow, mappedFrameRowColumn) && | ||
!data_->isNanAt<T>(partitionRow, orderByRowColumn)) { | ||
validFrames.setValid(currentRow, false); | ||
continue; | ||
} | ||
} | ||
|
||
// The user is expected to set the frame column equal to NULL when the | ||
// ORDER BY value is NULL and not in any other case. Validate this | ||
// assumption. | ||
|
@@ -423,20 +437,48 @@ void WindowPartition::computeKRangeFrameBounds( | |
vector_size_t startRow, | ||
vector_size_t numRows, | ||
const vector_size_t* rawPeerBuffer, | ||
vector_size_t* rawFrameBounds) const { | ||
vector_size_t* rawFrameBounds, | ||
SelectivityVector& validFrames) const { | ||
CompareFlags flags; | ||
flags.ascending = sortKeyInfo_[0].second.isAscending(); | ||
flags.nullsFirst = sortKeyInfo_[0].second.isNullsFirst(); | ||
|
||
// Start bounds require first match. End bounds require last match. | ||
updateKRangeFrameBounds( | ||
isStartBound, | ||
isPreceding, | ||
flags, | ||
startRow, | ||
numRows, | ||
frameColumn, | ||
rawPeerBuffer, | ||
rawFrameBounds); | ||
const auto frameType = data_->columnTypes()[inputMapping_[frameColumn]]; | ||
if (frameType->isReal()) { | ||
updateKRangeFrameBounds<float>( | ||
isStartBound, | ||
isPreceding, | ||
flags, | ||
startRow, | ||
numRows, | ||
frameColumn, | ||
rawPeerBuffer, | ||
rawFrameBounds, | ||
validFrames); | ||
} else if (frameType->isDouble()) { | ||
updateKRangeFrameBounds<double>( | ||
isStartBound, | ||
isPreceding, | ||
flags, | ||
startRow, | ||
numRows, | ||
frameColumn, | ||
rawPeerBuffer, | ||
rawFrameBounds, | ||
validFrames); | ||
} else { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we use VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH_ALL here instead of passing void? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @xiaoxmeng, I intentionally attempted to avoid using VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH_ALL. This will cause the code instantiation for every type, but we don't really differentiate the code for non-floating-point types. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we use VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH_ALL here instead of passing void? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we use VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH_ALL here instead of passing void for non-float or double type? |
||
updateKRangeFrameBounds<void>( | ||
isStartBound, | ||
isPreceding, | ||
flags, | ||
startRow, | ||
numRows, | ||
frameColumn, | ||
rawPeerBuffer, | ||
rawFrameBounds, | ||
validFrames); | ||
} | ||
} | ||
|
||
} // namespace facebook::velox::exec |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -692,5 +692,65 @@ DEBUG_ONLY_TEST_F(WindowTest, reserveMemorySort) { | |
} | ||
} | ||
|
||
TEST_F(WindowTest, NaNFrameBound) { | ||
const auto kNan = std::numeric_limits<double>::quiet_NaN(); | ||
auto data = makeRowVector( | ||
{"c0", "s0", "off0", "off1"}, | ||
{ | ||
makeFlatVector<int64_t>({1, 2, 3, 4}), | ||
makeFlatVector<double>({1.0, 2.0, 3.0, kNan}), | ||
makeFlatVector<double>({0.1, 2.0, 1.9, kNan}), | ||
makeFlatVector<double>({kNan, 2.0, kNan, kNan}), | ||
}); | ||
|
||
const auto makeFrames = [](const std::string& call) { | ||
std::vector<std::string> frames; | ||
|
||
std::vector<std::string> orders{"asc", "desc"}; | ||
std::vector<std::string> bounds{"preceding", "following"}; | ||
for (const std::string& order : orders) { | ||
for (const std::string& startBound : bounds) { | ||
for (const std::string& endBound : bounds) { | ||
// Frames starting from following and ending at preceding are not | ||
// allowed. | ||
if (startBound == "following" && endBound == "preceding") { | ||
continue; | ||
} | ||
frames.push_back(fmt::format( | ||
"{} over (order by s0 {} range between off0 {} and off1 {})", | ||
call, | ||
order, | ||
startBound, | ||
endBound)); | ||
frames.push_back(fmt::format( | ||
"{} over (order by s0 {} range between off1 {} and off0 {})", | ||
call, | ||
order, | ||
startBound, | ||
endBound)); | ||
} | ||
} | ||
} | ||
return frames; | ||
}; | ||
|
||
auto expected = makeRowVector( | ||
{makeNullableFlatVector<int64_t>({std::nullopt, 2, std::nullopt, 4})}); | ||
for (const auto& frame : makeFrames("sum(c0)")) { | ||
auto plan = | ||
PlanBuilder().values({data}).window({frame}).project({"w0"}).planNode(); | ||
AssertQueryBuilder(plan).assertResults(expected); | ||
} | ||
|
||
// rank() should not be affected by the frames, so added this test to ensure | ||
// rank() produces correct results even if the frame bounds contain NaN. | ||
expected = makeRowVector({makeFlatVector<int64_t>({1, 2, 3, 4})}); | ||
for (const auto& frame : makeFrames("rank()")) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @kagamiori : rank() is not affected by frame values. Only value functions (first_value, last_value, nth_value) and aggregate functions are affected by frame boundaries. MIght be better to use one of those functions in the test. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @aditi-pandit, I intentionally added the test case of rank() to ensure it is not affected by the handling of NaN frame bound as you point out. The regular case is tested by the sum() function above. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @kagamiori : I see. Maybe then add a comment to indicate this is for a function not affected by the Nan frames. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @aditi-pandit, sure, will do. Thanks for the suggestion! |
||
auto plan = | ||
PlanBuilder().values({data}).window({frame}).project({"w0"}).planNode(); | ||
AssertQueryBuilder(plan).assertResults(expected); | ||
} | ||
} | ||
|
||
} // namespace | ||
} // namespace facebook::velox::exec |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add a comment to describe this edge case as chat offline? thanks!