From ce035c87de4b207b1afd1b3e0f8959b3d0e2cfe4 Mon Sep 17 00:00:00 2001 From: Pramod Date: Mon, 7 Oct 2024 17:19:42 -0700 Subject: [PATCH] Add kRange preceding/following frames in window fuzzer (#10006) Summary: 1. Adds support for kRange preceding/following frames in window fuzzer 2. Adds [reference query runner context](https://ibm.box.com/s/9tuk22hfp13imjwq9xelnz1dogsdgdh1) for PrestoSql frame clause for kRange preceding/following frames. Resolves https://github.com/facebookincubator/velox/issues/9572 . Pull Request resolved: https://github.com/facebookincubator/velox/pull/10006 Reviewed By: xiaoxmeng Differential Revision: D61882004 Pulled By: kagamiori fbshipit-source-id: 343201fc36e5c3c01779d73ff0888cd0597ba13c --- velox/exec/fuzzer/AggregationFuzzerBase.cpp | 31 +- velox/exec/fuzzer/AggregationFuzzerBase.h | 6 +- velox/exec/fuzzer/PrestoQueryRunner.cpp | 53 +-- velox/exec/fuzzer/PrestoQueryRunner.h | 10 + velox/exec/fuzzer/WindowFuzzer.cpp | 445 ++++++++++++++---- velox/exec/fuzzer/WindowFuzzer.h | 100 +++- velox/exec/tests/PrestoQueryRunnerTest.cpp | 23 +- .../fuzzer/ApproxPercentileResultVerifier.h | 52 +- velox/vector/fuzzer/VectorFuzzer.h | 6 + 9 files changed, 527 insertions(+), 199 deletions(-) diff --git a/velox/exec/fuzzer/AggregationFuzzerBase.cpp b/velox/exec/fuzzer/AggregationFuzzerBase.cpp index db6796d5bdea..95e6cb9d125b 100644 --- a/velox/exec/fuzzer/AggregationFuzzerBase.cpp +++ b/velox/exec/fuzzer/AggregationFuzzerBase.cpp @@ -241,14 +241,35 @@ std::vector AggregationFuzzerBase::generateKeys( std::vector AggregationFuzzerBase::generateSortingKeys( const std::string& prefix, std::vector& names, - std::vector& types) { + std::vector& types, + bool rangeFrame) { std::vector keys; - auto numKeys = boost::random::uniform_int_distribution(1, 5)(rng_); + vector_size_t numKeys; + vector_size_t maxDepth; + std::vector sortingKeyTypes = defaultScalarTypes(); + + // If frame has k-RANGE bound, only one sorting key should be present, and it + // should be a scalar type which supports '+', '-' arithmetic operations. + if (rangeFrame) { + numKeys = 1; + sortingKeyTypes = { + TINYINT(), + SMALLINT(), + INTEGER(), + BIGINT(), + HUGEINT(), + REAL(), + DOUBLE()}; + maxDepth = 0; + } else { + numKeys = randInt(1, 5); + // Pick random, possibly complex, type. + maxDepth = 2; + } + for (auto i = 0; i < numKeys; ++i) { keys.push_back(fmt::format("{}{}", prefix, i)); - - // Pick random, possibly complex, type. - types.push_back(vectorFuzzer_.randOrderableType(2)); + types.push_back(vectorFuzzer_.randOrderableType(sortingKeyTypes, maxDepth)); names.push_back(keys.back()); } diff --git a/velox/exec/fuzzer/AggregationFuzzerBase.h b/velox/exec/fuzzer/AggregationFuzzerBase.h index 3a314605db52..6d6bb6f5fbe9 100644 --- a/velox/exec/fuzzer/AggregationFuzzerBase.h +++ b/velox/exec/fuzzer/AggregationFuzzerBase.h @@ -183,11 +183,13 @@ class AggregationFuzzerBase { std::vector& types); // Similar to generateKeys, but restricts types to orderable types (i.e. no - // maps). + // maps). For k-RANGE frame bounds, rangeFrame must be set to true so only + // one sorting key is generated. std::vector generateSortingKeys( const std::string& prefix, std::vector& names, - std::vector& types); + std::vector& types, + bool rangeFrame = false); std::pair pickSignature(); diff --git a/velox/exec/fuzzer/PrestoQueryRunner.cpp b/velox/exec/fuzzer/PrestoQueryRunner.cpp index 4aa20ca8c4a9..956e7e235943 100644 --- a/velox/exec/fuzzer/PrestoQueryRunner.cpp +++ b/velox/exec/fuzzer/PrestoQueryRunner.cpp @@ -154,6 +154,7 @@ PrestoQueryRunner::PrestoQueryRunner( timeout_(timeout) { eventBaseThread_.start("PrestoQueryRunner"); pool_ = aggregatePool()->addLeafChild("leaf"); + queryRunnerContext_ = std::make_shared(); } std::optional PrestoQueryRunner::toSql( @@ -352,56 +353,6 @@ std::optional PrestoQueryRunner::toSql( return sql.str(); } -namespace { - -void appendWindowFrame( - const core::WindowNode::Frame& frame, - std::stringstream& sql) { - // TODO: Add support for k Range Frames by retrieving the original range bound - // from WindowNode. - switch (frame.type) { - case core::WindowNode::WindowType::kRange: - sql << " RANGE"; - break; - case core::WindowNode::WindowType::kRows: - sql << " ROWS"; - break; - default: - VELOX_UNREACHABLE(); - } - sql << " BETWEEN"; - - auto appendBound = [&sql]( - const core::WindowNode::BoundType& bound, - const core::TypedExprPtr& value) { - switch (bound) { - case core::WindowNode::BoundType::kUnboundedPreceding: - sql << " UNBOUNDED PRECEDING"; - break; - case core::WindowNode::BoundType::kUnboundedFollowing: - sql << " UNBOUNDED FOLLOWING"; - break; - case core::WindowNode::BoundType::kCurrentRow: - sql << " CURRENT ROW"; - break; - case core::WindowNode::BoundType::kPreceding: - sql << " " << value->toString() << " PRECEDING"; - break; - case core::WindowNode::BoundType::kFollowing: - sql << " " << value->toString() << " FOLLOWING"; - break; - default: - VELOX_UNREACHABLE(); - } - }; - - appendBound(frame.startType, frame.startValue); - sql << " AND"; - appendBound(frame.endType, frame.endValue); -} - -} // namespace - std::optional PrestoQueryRunner::toSql( const std::shared_ptr& windowNode) { if (!isSupportedDwrfType(windowNode->sources()[0]->outputType())) { @@ -446,7 +397,7 @@ std::optional PrestoQueryRunner::toSql( } } - appendWindowFrame(functions[i].frame, sql); + sql << " " << queryRunnerContext_->windowFrames_.at(windowNode->id()).at(i); sql << ")"; } diff --git a/velox/exec/fuzzer/PrestoQueryRunner.h b/velox/exec/fuzzer/PrestoQueryRunner.h index b5a20b4d1cbe..cc4ec047eef9 100644 --- a/velox/exec/fuzzer/PrestoQueryRunner.h +++ b/velox/exec/fuzzer/PrestoQueryRunner.h @@ -31,6 +31,11 @@ T extractSingleValue(const std::vector& data) { return simpleVector->valueAt(0); } +class QueryRunnerContext { + public: + std::unordered_map> windowFrames_; +}; + class PrestoQueryRunner : public velox::exec::test::ReferenceQueryRunner { public: /// @param coordinatorUri Presto REST API endpoint, e.g. http://127.0.0.1:8080 @@ -99,6 +104,10 @@ class PrestoQueryRunner : public velox::exec::test::ReferenceQueryRunner { const std::vector& buildInput, const RowTypePtr& resultType) override; + std::shared_ptr queryRunnerContext() { + return queryRunnerContext_; + } + private: memory::MemoryPool* pool() { return pool_.get(); @@ -141,6 +150,7 @@ class PrestoQueryRunner : public velox::exec::test::ReferenceQueryRunner { const std::chrono::milliseconds timeout_; folly::EventBaseThread eventBaseThread_{false}; std::shared_ptr pool_; + std::shared_ptr queryRunnerContext_; }; } // namespace facebook::velox::exec::test diff --git a/velox/exec/fuzzer/WindowFuzzer.cpp b/velox/exec/fuzzer/WindowFuzzer.cpp index 66e722f6e674..d73318452fa7 100644 --- a/velox/exec/fuzzer/WindowFuzzer.cpp +++ b/velox/exec/fuzzer/WindowFuzzer.cpp @@ -17,9 +17,9 @@ #include "velox/exec/fuzzer/WindowFuzzer.h" #include -#include "velox/common/base/Portability.h" #include "velox/exec/tests/utils/PlanBuilder.h" #include "velox/exec/tests/utils/TempDirectoryPath.h" +#include "velox/expression/ScopedVarSetter.h" DEFINE_bool( enable_window_reference_verification, @@ -44,6 +44,12 @@ bool supportIgnoreNulls(const std::string& name) { return supportedFunctions.count(name) > 0; } +bool isKBoundFrame(const core::WindowNode::BoundType& boundType) { + return ( + boundType == core::WindowNode::BoundType::kPreceding || + boundType == core::WindowNode::BoundType::kFollowing); +} + } // namespace void WindowFuzzer::addWindowFunctionSignatures( @@ -60,27 +66,23 @@ void WindowFuzzer::addWindowFunctionSignatures( } } -std::string WindowFuzzer::generateFrameClause( +std::string WindowFuzzer::generateKRowsFrameBound( std::vector& argNames, std::vector& argTypes, - bool& isRowsFrame) { - auto frameType = [](int value) -> const std::string { - switch (value) { - case 0: - return "RANGE"; - case 1: - return "ROWS"; - default: - VELOX_UNREACHABLE("Unknown value for frame type generation"); - } - }; - isRowsFrame = boost::random::uniform_int_distribution(0, 1)(rng_); - auto frameTypeString = frameType(isRowsFrame); + const std::string& colName) { + // Generating column bounded frames 50% of the time and constant bounded the + // rest of the times. The column bounded frames are of type INTEGER and only + // have positive values as Window functions reject negative values for them. + if (vectorFuzzer_.coinToss(0.5)) { + argTypes.push_back(INTEGER()); + argNames.push_back(colName); + return colName; + } constexpr int64_t kMax = std::numeric_limits::max(); constexpr int64_t kMin = std::numeric_limits::min(); - // For frames with kPreceding, kFollowing bounds, pick a valid k, in the range - // of 1 to 10, 70% of times. Test for random k values remaining times. + // For frames with kPreceding, kFollowing bounds, pick a valid k, in the + // range of 1 to 10, 70% of times. Test for random k values remaining times. int64_t minKValue, maxKValue; if (vectorFuzzer_.coinToss(0.7)) { minKValue = 1; @@ -89,78 +91,135 @@ std::string WindowFuzzer::generateFrameClause( minKValue = kMin; maxKValue = kMax; } + const auto kValue = boost::random::uniform_int_distribution( + minKValue, maxKValue)(rng_); + return std::to_string(kValue); +} + +WindowFuzzer::FrameMetadata WindowFuzzer::generateFrameClause( + std::vector& argNames, + std::vector& argTypes, + const std::vector& kBoundColumnNames) { + FrameMetadata frameMetadata; + // Randomly select if ROWS or RANGE frame + frameMetadata.windowType = vectorFuzzer_.coinToss(0.5) + ? core::WindowNode::WindowType::kRows + : core::WindowNode::WindowType::kRange; + const std::vector commonBoundTypes = { + core::WindowNode::BoundType::kPreceding, + core::WindowNode::BoundType::kCurrentRow, + core::WindowNode::BoundType::kFollowing}; + std::vector startBoundOptions = { + core::WindowNode::BoundType::kUnboundedPreceding}; + startBoundOptions.insert( + startBoundOptions.end(), + commonBoundTypes.begin(), + commonBoundTypes.end()); + auto endBoundOptions = commonBoundTypes; + endBoundOptions.emplace_back( + core::WindowNode::BoundType::kUnboundedFollowing); + + // End bound option should not be greater than start bound option as this + // would result in an invalid frame. + auto startBoundIndex = boost::random::uniform_int_distribution( + 0, startBoundOptions.size() - 1)(rng_); + frameMetadata.startBoundType = startBoundOptions[startBoundIndex]; + auto endBoundMinIdx = std::max(0, static_cast(startBoundIndex) - 1); + auto endBoundIndex = boost::random::uniform_int_distribution( + endBoundMinIdx, endBoundOptions.size() - 1)(rng_); + frameMetadata.endBoundType = endBoundOptions[endBoundIndex]; + + // Generate the frame bounds for kRows frames. The frame bounds for kRange + // frames can be generated only after we get the sorting keys, as the frame + // bound value depends on the ORDER-BY column value. + if (frameMetadata.windowType == core::WindowNode::WindowType::kRows) { + if (isKBoundFrame(frameMetadata.startBoundType)) { + frameMetadata.startBoundString = + generateKRowsFrameBound(argNames, argTypes, kBoundColumnNames[0]); + } + if (isKBoundFrame(frameMetadata.endBoundType)) { + frameMetadata.endBoundString = + generateKRowsFrameBound(argNames, argTypes, kBoundColumnNames[1]); + } + } - // Generating column bounded frames 50% of the time and constant - // bounded the rest of the times. The column bounded frames are of - // type INTEGER and only have positive values as Window functions - // reject negative values for them. - auto frameBound = [this, minKValue, maxKValue, &argNames, &argTypes]( - core::WindowNode::BoundType boundType, - const std::string& colName) -> const std::string { - auto kValue = boost::random::uniform_int_distribution( - minKValue, maxKValue)(rng_); + return frameMetadata; +} + +std::string WindowFuzzer::frameClauseString( + const FrameMetadata& frameMetadata, + const std::vector& kRangeOffsetColumns) { + auto frameType = [&](const core::WindowNode::BoundType boundType, + bool isStartBound) -> std::string { + const auto boundTypeString = core::WindowNode::boundTypeName(boundType); switch (boundType) { case core::WindowNode::BoundType::kUnboundedPreceding: - return "UNBOUNDED PRECEDING"; - case core::WindowNode::BoundType::kPreceding: - if (vectorFuzzer_.coinToss(0.5)) { - argTypes.push_back(INTEGER()); - argNames.push_back(colName); - return fmt::format("{} PRECEDING", colName); - } - return fmt::format("{} PRECEDING", kValue); case core::WindowNode::BoundType::kCurrentRow: - return "CURRENT ROW"; - case core::WindowNode::BoundType::kFollowing: - if (vectorFuzzer_.coinToss(0.5)) { - argTypes.push_back(INTEGER()); - argNames.push_back(colName); - return fmt::format("{} FOLLOWING", colName); - } - return fmt::format("{} FOLLOWING", kValue); case core::WindowNode::BoundType::kUnboundedFollowing: - return "UNBOUNDED FOLLOWING"; + return boundTypeString; + case core::WindowNode::BoundType::kPreceding: + case core::WindowNode::BoundType::kFollowing: { + std::string frameBound; + if (kRangeOffsetColumns.empty()) { + frameBound = isStartBound ? frameMetadata.startBoundString + : frameMetadata.endBoundString; + } else { + frameBound = + isStartBound ? kRangeOffsetColumns[0] : kRangeOffsetColumns[1]; + } + return fmt::format("{} {}", frameBound, boundTypeString); + } default: VELOX_UNREACHABLE("Unknown option for frame clause generation"); + return ""; } }; - // Generating k PRECEDING and k FOLLOWING frames only for ROWS type. - // k RANGE frames require more work as we have to generate columns with the - // frame bound values. - std::vector startBoundOptions, endBoundOptions; - if (isRowsFrame) { - startBoundOptions = { - core::WindowNode::BoundType::kUnboundedPreceding, - core::WindowNode::BoundType::kPreceding, - core::WindowNode::BoundType::kCurrentRow, - core::WindowNode::BoundType::kFollowing}; - endBoundOptions = { - core::WindowNode::BoundType::kPreceding, - core::WindowNode::BoundType::kCurrentRow, - core::WindowNode::BoundType::kFollowing, - core::WindowNode::BoundType::kUnboundedFollowing}; - } else { - startBoundOptions = { - core::WindowNode::BoundType::kUnboundedPreceding, - core::WindowNode::BoundType::kCurrentRow}; - endBoundOptions = { - core::WindowNode::BoundType::kCurrentRow, - core::WindowNode::BoundType::kUnboundedFollowing}; + return fmt::format( + " {} BETWEEN {} AND {}", + core::WindowNode::windowTypeName(frameMetadata.windowType), + frameType(frameMetadata.startBoundType, true), + frameType(frameMetadata.endBoundType, false)); +} + +template +const T WindowFuzzer::genOffsetAtIdx( + const T& orderByValue, + const T& frameBound, + const core::WindowNode::BoundType& frameBoundType, + const core::SortOrder& sortOrder) { + constexpr bool isIntegerType = + (std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v); + auto isPreceding = [&](core::WindowNode::BoundType boundType) { + return boundType == core::WindowNode::BoundType::kPreceding; + }; + + if ((isPreceding(frameBoundType) && sortOrder.isAscending()) || + (!isPreceding(frameBoundType) && !sortOrder.isAscending())) { + if constexpr (std::is_same_v || std::is_same_v) { + return orderByValue - frameBound; + } else if constexpr (isIntegerType) { + return checkedMinus(orderByValue, frameBound); + } } - // End bound option should not be greater than start bound option as this - // would result in an invalid frame. - auto startBoundIndex = boost::random::uniform_int_distribution( - 0, startBoundOptions.size() - 1)(rng_); - auto endBoundMinIdx = std::max(0, static_cast(startBoundIndex) - 1); - auto endBoundIndex = boost::random::uniform_int_distribution( - endBoundMinIdx, endBoundOptions.size() - 1)(rng_); - auto frameStartBound = frameBound(startBoundOptions[startBoundIndex], "k0"); - auto frameEndBound = frameBound(endBoundOptions[endBoundIndex], "k1"); + if ((!isPreceding(frameBoundType) && sortOrder.isAscending()) || + (isPreceding(frameBoundType) && !sortOrder.isAscending())) { + if constexpr (std::is_same_v || std::is_same_v) { + return orderByValue + frameBound; + } else if constexpr (isIntegerType) { + return checkedPlus(orderByValue, frameBound); + } + } - return frameTypeString + " BETWEEN " + frameStartBound + " AND " + - frameEndBound; + VELOX_UNREACHABLE( + "Offset cannot be generated: orderBy key type: {}, sortOrder ascending {}, frameBoundType {}", + CppToType::name, + sortOrder.toString(), + core::WindowNode::boundTypeName(frameBoundType)); + return T{}; } std::string WindowFuzzer::generateOrderByClause( @@ -187,15 +246,16 @@ std::string WindowFuzzer::getFrame( if (!sortingKeysAndOrders.empty()) { frame << generateOrderByClause(sortingKeysAndOrders); } - frame << " " << frameClause; + frame << frameClause; return frame.str(); } std::vector WindowFuzzer::generateSortingKeysAndOrders( const std::string& prefix, std::vector& names, - std::vector& types) { - auto keys = generateSortingKeys(prefix, names, types); + std::vector& types, + bool isKRangeFrame) { + auto keys = generateSortingKeys(prefix, names, types, isKRangeFrame); std::vector results; for (auto i = 0; i < keys.size(); ++i) { auto asc = vectorFuzzer_.coinToss(0.5); @@ -205,6 +265,135 @@ std::vector WindowFuzzer::generateSortingKeysAndOrders( return results; } +template +VectorPtr WindowFuzzer::buildKRangeColumn( + const VectorPtr& frameBound, + const VectorPtr& orderByCol, + const core::WindowNode::BoundType& frameBoundType, + const core::SortOrder& sortOrder, + bool isColumnBound, + bool isOffsetColumn) { + auto type = CppToType::create(); + const auto size = vectorFuzzer_.getOptions().vectorSize; + const SelectivityVector allRows(size); + DecodedVector decodedVector(*orderByCol, allRows); + BufferPtr values = AlignedBuffer::allocate(size, pool_.get()); + BufferPtr nulls = allocateNulls(size, pool_.get()); + auto rawValues = values->asMutableRange(); + auto* rawNulls = nulls->asMutable(); + + for (auto j = 0; j < size; j++) { + if (decodedVector.isNullAt(j)) { + bits::setNull(rawNulls, j, true); + } else { + const auto orderByValueAtIdx = decodedVector.valueAt(j); + if constexpr (std::is_same_v || std::is_same_v) { + if (!std::isfinite(orderByValueAtIdx) || + std::isnan(orderByValueAtIdx)) { + rawValues[j] = orderByValueAtIdx; + continue; + } + } + + const T frameBoundValue = isColumnBound + ? frameBound->as>()->valueAt(j) + : frameBound->as>()->valueAt(0); + if (isOffsetColumn) { + rawValues[j] = genOffsetAtIdx( + orderByValueAtIdx, frameBoundValue, frameBoundType, sortOrder); + } else { + rawValues[j] = frameBoundValue; + } + } + } + + return std::make_shared>( + pool_.get(), type, nulls, size, values, std::vector{}); +} + +template +std::string WindowFuzzer::addKRangeOffsetColumnToInputImpl( + std::vector& input, + const core::WindowNode::BoundType& frameBoundType, + const SortingKeyAndOrder& orderByKey, + const std::string& columnName, + const std::string& offsetColumnName) { + // Use columns as frame bound 50% of time. + bool isColumnBound = vectorFuzzer_.coinToss(0.5); + const auto type = CppToType::create(); + VectorPtr constantFrameBound = + isColumnBound ? nullptr : vectorFuzzer_.fuzzConstant(type, 1); + VectorPtr columnFrameBound; + // Generate frame bound (constant/column) without nulls. + ScopedVarSetter nullRatioHolder( + &vectorFuzzer_.getMutableOptions().nullRatio, 0.0); + ScopedVarSetter dataSpecHolder( + &vectorFuzzer_.getMutableOptions().dataSpec, {false, false}); + velox::test::VectorMaker vectorMaker{pool_.get()}; + + for (auto i = 0; i < FLAGS_num_batches; i++) { + const auto orderByCol = input[i]->childAt(orderByKey.key_); + auto names = input[i]->type()->asRow().names(); + auto children = input[i]->children(); + if (isColumnBound) { + VectorPtr fuzzFrameBound = vectorFuzzer_.fuzzFlat(type); + names.push_back(columnName); + columnFrameBound = buildKRangeColumn( + fuzzFrameBound, + orderByCol, + frameBoundType, + orderByKey.sortOrder_, + isColumnBound, + false); + children.push_back(columnFrameBound); + } + + names.push_back(offsetColumnName); + VectorPtr frameBound = + isColumnBound ? columnFrameBound : constantFrameBound; + const VectorPtr offsetColumn = buildKRangeColumn( + frameBound, + orderByCol, + frameBoundType, + orderByKey.sortOrder_, + isColumnBound, + true); + children.push_back(offsetColumn); + input[i] = vectorMaker.rowVector(names, children); + } + + return isColumnBound + ? columnName + : constantFrameBound->as>()->toString(0); +} + +template +void WindowFuzzer::addKRangeOffsetColumnToInput( + std::vector& input, + FrameMetadata& frameMetadata, + SortingKeyAndOrder& orderByKey, + const std::vector& columnNames, + const std::vector& offsetColumnNames) { + using TCpp = typename TypeTraits::NativeType; + + if (isKBoundFrame(frameMetadata.startBoundType)) { + frameMetadata.startBoundString = addKRangeOffsetColumnToInputImpl( + input, + frameMetadata.startBoundType, + orderByKey, + columnNames[0], + offsetColumnNames[0]); + } + if (isKBoundFrame(frameMetadata.endBoundType)) { + frameMetadata.endBoundString = addKRangeOffsetColumnToInputImpl( + input, + frameMetadata.endBoundType, + orderByKey, + columnNames[1], + offsetColumnNames[1]); + } +} + void WindowFuzzer::go() { VELOX_CHECK( FLAGS_steps > 0 || FLAGS_duration_sec > 0, @@ -248,27 +437,82 @@ void WindowFuzzer::go() { const auto call = makeFunctionCall(signature.name, argNames, false, false, ignoreNulls); + // Columns used as k-PRECEDING/FOLLOWING frame bounds have fixed names: k0 + // and k1, for when a column is used as frame start and frame end bound + // respectively. + const std::vector kBoundColumns = {"k0", "k1"}; + auto frameMetadata = generateFrameClause(argNames, argTypes, kBoundColumns); + const auto windowType = frameMetadata.windowType; + const auto startBoundType = frameMetadata.startBoundType; + const auto endBoundType = frameMetadata.endBoundType; + bool isKRangeFrame = windowType == core::WindowNode::WindowType::kRange && + (isKBoundFrame(startBoundType) || isKBoundFrame(endBoundType)); + + auto useRowNumberKey = + requireSortedInput || windowType == core::WindowNode::WindowType::kRows; + + const auto partitionKeys = generateSortingKeys("p", argNames, argTypes); + std::vector sortingKeysAndOrders; - // 50% chance without order-by clause. - if (vectorFuzzer_.coinToss(0.5)) { + TypeKind orderByTypeKind; + if (useRowNumberKey) { + // If the function is order-dependent or uses "rows" frame, sort all + // input rows by row_number additionally. + sortingKeysAndOrders.emplace_back("row_number", core::kAscNullsLast); + orderByTypeKind = TypeKind::INTEGER; + ++stats_.numSortedInputs; + } else if (isKRangeFrame) { + // kRange frames need only one order by key. This would be row_number for + // functions that are order dependent. + sortingKeysAndOrders = + generateSortingKeysAndOrders("s", argNames, argTypes, true); + orderByTypeKind = argTypes.back()->kind(); + } else if (vectorFuzzer_.coinToss(0.5)) { + // 50% chance without order-by clause. sortingKeysAndOrders = generateSortingKeysAndOrders("s", argNames, argTypes); } - const auto partitionKeys = generateSortingKeys("p", argNames, argTypes); - bool isRowsFrame = false; - const auto frameClause = - generateFrameClause(argNames, argTypes, isRowsFrame); - const auto input = generateInputDataWithRowNumber( + + auto input = generateInputDataWithRowNumber( argNames, argTypes, partitionKeys, signature); - // If the function is order-dependent or uses "rows" frame, sort all input - // rows by row_number additionally. - if (requireSortedInput || isRowsFrame) { - sortingKeysAndOrders.emplace_back("row_number", core::kAscNullsLast); - ++stats_.numSortedInputs; + // Offset column names used for k-RANGE frame bounds have fixed names: off0 + // and off1, representing the precomputed offset columns used as frame start + // and frame end bound respectively. + const std::vector kRangeOffsetColumns = {"off0", "off1"}; + if (isKRangeFrame) { + // Catch possible type overflow errors when generating offset columns. + try { + VELOX_USER_CHECK( + sortingKeysAndOrders.size() == 1, + "Window with k PRECEDING/FOLLOWING frame bounds should have a single ORDER-BY key"); + auto orderByKey = sortingKeysAndOrders[0]; + VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH( + addKRangeOffsetColumnToInput, + orderByTypeKind, + input, + frameMetadata, + orderByKey, + kBoundColumns, + kRangeOffsetColumns); + } catch (VeloxUserError& e) { + VLOG(2) << fmt::format( + "This iteration is not valid due to exception from addKRangeOffsetColumnsToInput: {}", + e.message()); + continue; + } catch (VeloxRuntimeError& e) { + throw e; + } } logVectors(input); + // For kRange frames with constant k, velox expects the frame bounds to be + // columns containing precomputed offset values. Presto frame clause uses + // constant k values. + const auto prestoFrameClause = frameClauseString(frameMetadata); + const auto frameClause = isKRangeFrame + ? frameClauseString(frameMetadata, kRangeOffsetColumns) + : prestoFrameClause; bool failed = verifyWindow( partitionKeys, sortingKeysAndOrders, @@ -277,7 +521,8 @@ void WindowFuzzer::go() { input, customVerification, customVerifier, - FLAGS_enable_window_reference_verification); + FLAGS_enable_window_reference_verification, + prestoFrameClause); if (failed) { signatureWithStats.second.numFailed++; } @@ -404,17 +649,20 @@ bool WindowFuzzer::verifyWindow( const std::vector& input, bool customVerification, const std::shared_ptr& customVerifier, - bool enableWindowVerification) { + bool enableWindowVerification, + const std::string& prestoFrameClause) { SCOPE_EXIT { if (customVerifier) { customVerifier->reset(); } }; + core::PlanNodeId windowNodeId; auto frame = getFrame(partitionKeys, sortingKeysAndOrders, frameClause); auto plan = PlanBuilder() .values(input) .window({fmt::format("{} over ({})", functionCall, frame)}) + .capturePlanNodeId(windowNodeId) .planNode(); if (persistAndRunOnce_) { @@ -430,8 +678,19 @@ bool WindowFuzzer::verifyWindow( if (!customVerification) { if (resultOrError.result && enableWindowVerification) { + auto prestoQueryRunner = + dynamic_cast(referenceQueryRunner_.get()); + bool isPrestoQueryRunner = (prestoQueryRunner != nullptr); + if (isPrestoQueryRunner) { + prestoQueryRunner->queryRunnerContext() + ->windowFrames_[windowNodeId] + .push_back(prestoFrameClause); + } auto referenceResult = computeReferenceResults(plan, input, referenceQueryRunner_.get()); + if (isPrestoQueryRunner) { + prestoQueryRunner->queryRunnerContext()->windowFrames_.clear(); + } stats_.updateReferenceQueryStats(referenceResult.second); if (auto expectedResult = referenceResult.first) { ++stats_.numVerified; diff --git a/velox/exec/fuzzer/WindowFuzzer.h b/velox/exec/fuzzer/WindowFuzzer.h index 8d4b2f439b90..447eec87f646 100644 --- a/velox/exec/fuzzer/WindowFuzzer.h +++ b/velox/exec/fuzzer/WindowFuzzer.h @@ -81,14 +81,100 @@ class WindowFuzzer : public AggregationFuzzerBase { void go(const std::string& planPath); private: + struct FrameMetadata { + core::WindowNode::WindowType windowType = + core::WindowNode::WindowType::kRange; + core::WindowNode::BoundType startBoundType = + core::WindowNode::BoundType::kUnboundedPreceding; + core::WindowNode::BoundType endBoundType = + core::WindowNode::BoundType::kCurrentRow; + std::string startBoundString = ""; + std::string endBoundString = ""; + }; + void addWindowFunctionSignatures(const WindowFunctionMap& signatureMap); - // Return a randomly generated frame clause string together with a boolean - // flag indicating whether it is a ROWS frame. - std::string generateFrameClause( + // Generates the frame bound for k-ROWS frames, decides randomly if the frame + // bound is a constant or a column. Returns the constant integer value when + // K is a constant, otherwise returns the column name used as frame bound. + std::string generateKRowsFrameBound( + std::vector& argNames, + std::vector& argTypes, + const std::string& columnName); + + // Randomly generates the frame metadata, which includes the window type, the + // frame start and end bound type, and the constant/column offset for k-ROWS + // frames. For k-RANGE frames, the constant/column offset generation is + // deferred till the ORDER-BY key is generated. The FrameMetadata returned by + // this function is used in sorting key(s) generation, to ensure a single + // ORDER-BY key exists for k-RANGE frames. + FrameMetadata generateFrameClause( std::vector& argNames, std::vector& argTypes, - bool& isRowsFrame); + const std::vector& kBoundColumnNames); + + // Adds offset column(s) to input data for k-RANGE frames using the helper + // function addKRangeOffsetColumnToInputImpl. + template + void addKRangeOffsetColumnToInput( + std::vector& input, + FrameMetadata& frameMetadata, + SortingKeyAndOrder& orderByKey, + const std::vector& columnNames, + const std::vector& offsetColumnNames); + + // Adds offset column to input data for k-range frames. Randomly decides if + // the frame bound will be a constant or a column. + // 1. If the frame bound is a constant, the constant K value is randomly + // generated to be of the same type as the ORDER-BY column. The offset column + // value for a given row is then generated by helper function genOffsetAtIdx. + // The constant K value is returned as a string. + // 2. If the frame bound is a column, VectorFuzzer is used to generate a + // column of the same type as the ORDER-BY key. Returns offset column name. + template + std::string addKRangeOffsetColumnToInputImpl( + std::vector& input, + const core::WindowNode::BoundType& frameBoundType, + const SortingKeyAndOrder& orderByKey, + const std::string& columnName, + const std::string& offsetColumnName); + + // Utility function to build column for kRange frames. When isOffsetColumn is + // true, the offset column is generated. When isColumnBound is true, the + // column used as frame bound is generated. The offset column (and frame bound + // column, if used) will have nulls for rows where order by column is null. + template + VectorPtr buildKRangeColumn( + const VectorPtr& frameBound, + const VectorPtr& orderByCol, + const core::WindowNode::BoundType& frameBoundType, + const core::SortOrder& sortOrder, + bool isColumnBound, + bool isOffsetColumn); + + // For frames with k RANGE PRECEDING/FOLLOWING, Velox requires the application + // to add columns with the range frame boundary value computed according to + // the frame type. frame_boundary_value = NULL iff current_order_by = NULL. + // If the frame is k PRECEDING : + // frame_boundary_value = current_order_by - k (for ascending ORDER BY) + // frame_boundary_value = current_order_by + k (for descending ORDER BY) + // If the frame is k FOLLOWING : + // frame_boundary_value = current_order_by + k (for ascending ORDER BY) + // frame_boundary_value = current_order_by - k (for descending ORDER BY) + template + const T genOffsetAtIdx( + const T& orderByValue, + const T& frameBound, + const core::WindowNode::BoundType& frameBoundType, + const core::SortOrder& sortOrder); + + // Builds the frame clause string from frameMetadata. In case of k-RANGE frame + // bounds, offset column names from kRangeOffsetColumnNames are used to build + // the frame clause. If kRangeOffsetColumnNames is not specified, the frame + // bound is obtained from frameMetadata. + std::string frameClauseString( + const FrameMetadata& frameMetadata, + const std::vector& kRangeOffsetColumns = {}); std::string generateOrderByClause( const std::vector& sortingKeysAndOrders); @@ -101,7 +187,8 @@ class WindowFuzzer : public AggregationFuzzerBase { std::vector generateSortingKeysAndOrders( const std::string& prefix, std::vector& names, - std::vector& types); + std::vector& types, + const bool isKRangeFrame = false); // Return 'true' if query plans failed. bool verifyWindow( @@ -112,7 +199,8 @@ class WindowFuzzer : public AggregationFuzzerBase { const std::vector& input, bool customVerification, const std::shared_ptr& customVerifier, - bool enableWindowVerification); + bool enableWindowVerification, + const std::string& prestoFrameClause); void testAlternativePlans( const std::vector& partitionKeys, diff --git a/velox/exec/tests/PrestoQueryRunnerTest.cpp b/velox/exec/tests/PrestoQueryRunnerTest.cpp index cb17f6d773db..502da147605d 100644 --- a/velox/exec/tests/PrestoQueryRunnerTest.cpp +++ b/velox/exec/tests/PrestoQueryRunnerTest.cpp @@ -187,25 +187,42 @@ TEST_F(PrestoQueryRunnerTest, toSql) { // Test window queries. { + auto queryRunnerContext = queryRunner->queryRunnerContext(); + core::PlanNodeId id; + const auto frameClause = + "RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW"; auto plan = PlanBuilder() .tableScan("tmp", dataType) .window({"first_value(c0) over (partition by c1 order by c2)"}) + .capturePlanNodeId(id) .planNode(); + queryRunnerContext->windowFrames_[id] = {frameClause}; EXPECT_EQ( queryRunner->toSql(plan), - "SELECT c0, c1, c2, first_value(c0) OVER (PARTITION BY c1 ORDER BY c2 ASC NULLS LAST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM tmp"); - + fmt::format( + "SELECT c0, c1, c2, first_value(c0) OVER (PARTITION BY c1 ORDER BY c2 ASC NULLS LAST {}) FROM tmp", + frameClause)); + + const auto firstValueFrame = + "ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING"; + const auto lastValueFrame = + "RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW"; plan = PlanBuilder() .tableScan("tmp", dataType) .window( {"first_value(c0) over (partition by c1 order by c2 desc nulls first rows between 1 following and unbounded following)", "last_value(c0) over (partition by c1 order by c2 desc nulls first)"}) + .capturePlanNodeId(id) .planNode(); + queryRunnerContext->windowFrames_[id] = {firstValueFrame, lastValueFrame}; EXPECT_EQ( queryRunner->toSql(plan), - "SELECT c0, c1, c2, first_value(c0) OVER (PARTITION BY c1 ORDER BY c2 DESC NULLS FIRST ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING), last_value(c0) OVER (PARTITION BY c1 ORDER BY c2 DESC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM tmp"); + fmt::format( + "SELECT c0, c1, c2, first_value(c0) OVER (PARTITION BY c1 ORDER BY c2 DESC NULLS FIRST {}), last_value(c0) OVER (PARTITION BY c1 ORDER BY c2 DESC NULLS FIRST {}) FROM tmp", + firstValueFrame, + lastValueFrame)); } // Test aggregation queries. diff --git a/velox/functions/prestosql/fuzzer/ApproxPercentileResultVerifier.h b/velox/functions/prestosql/fuzzer/ApproxPercentileResultVerifier.h index bc6d13e4269c..63a3becf3095 100644 --- a/velox/functions/prestosql/fuzzer/ApproxPercentileResultVerifier.h +++ b/velox/functions/prestosql/fuzzer/ApproxPercentileResultVerifier.h @@ -70,7 +70,7 @@ class ApproxPercentileResultVerifier : public ResultVerifier { const std::vector& partitionByKeys, const std::vector& sortingKeysAndOrders, const core::WindowNode::Function& function, - const std::string& /*frame*/, + const std::string& frame, const std::string& windowName) override { VELOX_CHECK(!input.empty()); verifyWindow_ = true; @@ -88,7 +88,7 @@ class ApproxPercentileResultVerifier : public ResultVerifier { valueField, weightField, sortingKeysAndOrders, - function.frame, + frame, function.functionCall->type(), isArrayPercentile); } @@ -303,37 +303,6 @@ class ApproxPercentileResultVerifier : public ResultVerifier { return AssertQueryBuilder(plan).copyResults(input[0]->pool()); } - std::string getFrameClause(const core::WindowNode::Frame& frame) { - std::stringstream ss; - ss << core::WindowNode::windowTypeName(frame.type) << " between "; - if (frame.startValue) { - ss << frame.startValue->toString() << " "; - } - ss << core::WindowNode::boundTypeName(frame.startType) << " and "; - if (frame.endValue) { - ss << frame.endValue->toString() << " "; - } - ss << core::WindowNode::boundTypeName(frame.endType); - return ss.str(); - } - - std::string getOrderByClause( - const std::vector& sortingKeysAndOrders) { - if (sortingKeysAndOrders.empty()) { - return ""; - } - std::stringstream ss; - ss << "order by "; - for (auto i = 0; i < sortingKeysAndOrders.size(); ++i) { - if (i > 0) { - ss << ", "; - } - ss << sortingKeysAndOrders[i].key_ << " " - << sortingKeysAndOrders[i].sortOrder_.toString(); - } - return ss.str(); - } - std::string getPartitionByClause( const std::vector& partitionByKeys) { if (partitionByKeys.empty()) { @@ -456,7 +425,7 @@ class ApproxPercentileResultVerifier : public ResultVerifier { const std::string& valueField, const std::optional& weightField, const std::vector& sortingKeysAndOrders, - const core::WindowNode::Frame& frame, + const std::string& frame, const TypePtr& resultType, bool isArray) { VELOX_CHECK(!input.empty()); @@ -473,6 +442,15 @@ class ApproxPercentileResultVerifier : public ResultVerifier { projections.push_back(fmt::format("{} as x", valueField)); projections.push_back( fmt::format("{} as w", weighted ? weightField.value() : "1::bigint")); + // Names of columns, which could be offset columns in case of kRange frames, + // or frame bounds, should be projected. + static const std::unordered_set kKBoundColumnNames = { + "k0", "k1", "off0", "off1"}; + for (const auto& child : rowType->names()) { + if (kKBoundColumnNames.count(child) != 0) { + projections.push_back(child); + } + } PlanBuilder planBuilder; planBuilder.values(input).project(projections).filter("w > 0"); @@ -480,11 +458,7 @@ class ApproxPercentileResultVerifier : public ResultVerifier { auto partitionByKeysWithRowNumber = getPartitionByClause(append(groupingKeys_, {"row_number"})); planBuilder - .window({fmt::format( - "multimap_agg(x, w) over ({} {} {}) as bucket", - getPartitionByClause(groupingKeys_), - getOrderByClause(sortingKeysAndOrders), - getFrameClause(frame))}) + .window({fmt::format("multimap_agg(x, w) over ({}) as bucket", frame)}) .project(append( groupingKeys_, {"row_number", diff --git a/velox/vector/fuzzer/VectorFuzzer.h b/velox/vector/fuzzer/VectorFuzzer.h index 414ef4065b35..28604d1f823a 100644 --- a/velox/vector/fuzzer/VectorFuzzer.h +++ b/velox/vector/fuzzer/VectorFuzzer.h @@ -38,6 +38,8 @@ struct DataSpec { bool includeInfinity; }; +const std::vector& defaultScalarTypes(); + /// VectorFuzzer is a helper class that generates randomized vectors and their /// data for testing, with a high degree of entropy. /// @@ -170,6 +172,10 @@ class VectorFuzzer { return opts_; } + VectorFuzzer::Options& getMutableOptions() { + return opts_; + } + /// Returns a "fuzzed" vector, containing randomized data, nulls, and indices /// vector (dictionary). Returns a vector containing `opts_.vectorSize` or /// `size` elements.