From b4c6ed42f2d1ca69f4a27253c64bb96d54928b32 Mon Sep 17 00:00:00 2001 From: Wei He Date: Tue, 17 Dec 2024 14:49:13 -0800 Subject: [PATCH] fix(fuzzer): Reduce invalid or unsupported test cases in window fuzzer (#11902) Summary: WindowFuzzer currently generates many invalid or unsupported test cases. When a test case is invalid, the execution throws in Velox and is not verified against the reference DB. Test cases unsupported by the ReferenceQueryRunner are also not verified against the reference DB. As a result, test coverage is limited. This PR fixes WindowFuzzer to reduce the percentage of invalid and unsupported test cases. Specifically, this PR includes the following fixes and adjustments: 1. When generating partition-by and order-by keys, only use scalar types supported by the ReferenceQueryRunner. 2. Update the type of the row_number column to be INTEGER to match the type of the offset columns of K-Rows frames. 3. Avoid generating NULLs in the offset columns of K-Rows frames. 4. Fail the fuzzer test if fewer than 50% of iterations are verified, either against the reference DB or through custom verifiers. 
Differential Revision: D67360981 Pulled By: kagamiori --- velox/exec/fuzzer/AggregationFuzzerBase.cpp | 21 ++++++++++------ velox/exec/fuzzer/AggregationFuzzerBase.h | 12 ++++++--- velox/exec/fuzzer/WindowFuzzer.cpp | 28 ++++++++++++++++++--- 3 files changed, 45 insertions(+), 16 deletions(-) diff --git a/velox/exec/fuzzer/AggregationFuzzerBase.cpp b/velox/exec/fuzzer/AggregationFuzzerBase.cpp index b6fe99a14491..ef8a61f4a65a 100644 --- a/velox/exec/fuzzer/AggregationFuzzerBase.cpp +++ b/velox/exec/fuzzer/AggregationFuzzerBase.cpp @@ -243,10 +243,11 @@ std::vector AggregationFuzzerBase::generateSortingKeys( std::vector& names, std::vector& types, bool rangeFrame, + const std::vector& scalarTypes, std::optional numKeys) { std::vector keys; vector_size_t maxDepth; - std::vector sortingKeyTypes = defaultScalarTypes(); + std::vector sortingKeyTypes = scalarTypes; // If frame has k-RANGE bound, only one sorting key should be present, and it // should be a scalar type which supports '+', '-' arithmetic operations. 
@@ -328,9 +329,10 @@ std::vector AggregationFuzzerBase::generateInputDataWithRowNumber( std::vector names, std::vector types, const std::vector& partitionKeys, + const std::vector& windowFrameBounds, const CallableSignature& signature) { names.push_back("row_number"); - types.push_back(BIGINT()); + types.push_back(INTEGER()); auto generator = findInputGenerator(signature); @@ -339,11 +341,10 @@ std::vector AggregationFuzzerBase::generateInputDataWithRowNumber( velox::test::VectorMaker vectorMaker{pool_.get()}; int64_t rowNumber = 0; - std::unordered_set partitionKeySet; - partitionKeySet.reserve(partitionKeys.size()); - for (auto partitionKey : partitionKeys) { - partitionKeySet.insert(partitionKey); - } + std::unordered_set partitionKeySet{ + partitionKeys.begin(), partitionKeys.end()}; + std::unordered_set windowFrameBoundsSet{ + windowFrameBounds.begin(), windowFrameBounds.end()}; for (auto j = 0; j < FLAGS_num_batches; ++j) { std::vector children; @@ -365,11 +366,15 @@ std::vector AggregationFuzzerBase::generateInputDataWithRowNumber( auto baseVector = vectorFuzzer_.fuzz(types[i], numPartitions); children.push_back( BaseVector::wrapInDictionary(nulls, indices, size, baseVector)); + } else if ( + windowFrameBoundsSet.find(names[i]) != windowFrameBoundsSet.end()) { + // Frame bound columns cannot have NULLs. 
+ children.push_back(vectorFuzzer_.fuzzNotNull(types[i], size)); } else { children.push_back(vectorFuzzer_.fuzz(types[i], size)); } } - children.push_back(vectorMaker.flatVector( + children.push_back(vectorMaker.flatVector( size, [&](auto /*row*/) { return rowNumber++; })); input.push_back(vectorMaker.rowVector(names, children)); } diff --git a/velox/exec/fuzzer/AggregationFuzzerBase.h b/velox/exec/fuzzer/AggregationFuzzerBase.h index a2237f8a9d74..8e2b252fa650 100644 --- a/velox/exec/fuzzer/AggregationFuzzerBase.h +++ b/velox/exec/fuzzer/AggregationFuzzerBase.h @@ -188,6 +188,7 @@ class AggregationFuzzerBase { std::vector& names, std::vector& types, bool rangeFrame = false, + const std::vector& scalarTypes = defaultScalarTypes(), std::optional numKeys = std::nullopt); std::pair pickSignature(); @@ -197,14 +198,17 @@ class AggregationFuzzerBase { std::vector types, const std::optional& signature); - // Generate a RowVector of the given types of children with an additional - // child named "row_number" of BIGINT row numbers that differentiates every - // row. Row numbers start from 0. This additional input vector is needed for - // result verification of window aggregations. + /// Generate a RowVector of the given types of children with an additional + /// child named "row_number" of INTEGER row numbers that differentiates every + /// row. Row numbers start from 0. This additional input vector is needed for + /// result verification of window aggregations. + /// @param windowFrameBounds Names of frame bound columns of a window + /// operation. These columns are fuzzed without NULLs. 
std::vector generateInputDataWithRowNumber( std::vector names, std::vector types, const std::vector& partitionKeys, + const std::vector& windowFrameBounds, const CallableSignature& signature); velox::fuzzer::ResultOrError execute( diff --git a/velox/exec/fuzzer/WindowFuzzer.cpp b/velox/exec/fuzzer/WindowFuzzer.cpp index b4d8a578f3a8..eaaffea3df67 100644 --- a/velox/exec/fuzzer/WindowFuzzer.cpp +++ b/velox/exec/fuzzer/WindowFuzzer.cpp @@ -256,7 +256,14 @@ std::vector WindowFuzzer::generateSortingKeysAndOrders( std::vector& types, bool isKRangeFrame, std::optional numKeys) { - auto keys = generateSortingKeys(prefix, names, types, isKRangeFrame, numKeys); + VELOX_CHECK_NOT_NULL(referenceQueryRunner_); + auto keys = generateSortingKeys( + prefix, + names, + types, + isKRangeFrame, + referenceQueryRunner_->supportedScalarTypes(), + numKeys); std::vector results; for (auto i = 0; i < keys.size(); ++i) { auto asc = vectorFuzzer_.coinToss(0.5); @@ -455,8 +462,14 @@ void WindowFuzzer::go() { const uint32_t numKeys = boost::random::uniform_int_distribution(1, 15)(rng_); - const auto partitionKeys = - generateSortingKeys("p", argNames, argTypes, false, numKeys); + VELOX_CHECK_NOT_NULL(referenceQueryRunner_); + const auto partitionKeys = generateSortingKeys( + "p", + argNames, + argTypes, + false, + referenceQueryRunner_->supportedScalarTypes(), + numKeys); std::vector sortingKeysAndOrders; TypeKind orderByTypeKind; @@ -479,7 +492,7 @@ void WindowFuzzer::go() { } auto input = generateInputDataWithRowNumber( - argNames, argTypes, partitionKeys, signature); + argNames, argTypes, partitionKeys, kBoundColumns, signature); // Offset column names used for k-RANGE frame bounds have fixed names: off0 // and off1, representing the precomputed offset columns used as frame start // and frame end bound respectively. 
@@ -547,6 +560,13 @@ void WindowFuzzer::go() { } stats_.print(iteration); + // Check that at least half of the iterations were verified, either against + // the reference DB or through custom result verifiers. + // stats_.numVerificationSkipped tracks the number of iterations verified + // through custom result verifiers. + VELOX_CHECK_GE( + (stats_.numVerified + stats_.numVerificationSkipped) / (double)iteration, + 0.5); printSignatureStats(); }