From 204457b51c46ac57f55962cf264933c55aca793a Mon Sep 17 00:00:00 2001 From: Wei He Date: Tue, 17 Dec 2024 18:54:45 -0800 Subject: [PATCH] fix(fuzzer): Reduce invalid or unsupported test cases in window fuzzer (#11902) Summary: WindowFuzzer currently generates many invalid or unsupported test cases. When a test case is invalid, the execution throws in Velox and is not verified against the reference DB. Test cases unsupported by the ReferenceQueryRunner are also not verified against the reference DB. As a result, the test coverage becomes limited. This PR fixes WindowFuzzer to reduce the percentage of invalid and unsupported test cases. Specifically, this PR includes the following fixes and adjustments: 1. When generating partition-by and order-by keys, only use scalar types supported by the ReferenceQueryRunner. 2. Update the type of the row_number column to be INTEGER to match the type of the offset columns of K-Rows frames. 3. Avoid generating NULLs in the offset columns of K-Rows frames. 4. Fail the fuzzer test if fewer than 50% of iterations are verified, either against the reference DB or through custom verifiers. 
Before: ``` I20241217 15:10:26.897787 551721 WindowFuzzer.cpp:535] ==============================> Done with iteration 50 I20241217 15:10:26.942860 551721 AggregationFuzzerBase.cpp:615] Total functions tested: 24 I20241217 15:10:26.942891 551721 AggregationFuzzerBase.cpp:616] Total iterations requiring sorted inputs: 39 (76.47%) I20241217 15:10:26.942906 551721 AggregationFuzzerBase.cpp:618] Total iterations verified against reference DB: 1 (1.96%) I20241217 15:10:26.942916 551721 AggregationFuzzerBase.cpp:620] Total functions not verified (verification skipped / not supported by reference DB / reference DB failed): 22 (43.14%) / 15 (29.41%) / 0 (0.00%) I20241217 15:10:26.942926 551721 AggregationFuzzerBase.cpp:625] Total failed functions: 21 (41.18%) I20241217 15:10:26.942934 551721 WindowFuzzer.cpp:785] Total functions verified in reference DB: 1 ``` After: ``` I20241217 11:47:07.706315 227175 WindowFuzzer.cpp:537] ==============================> Done with iteration 20 I20241217 11:47:07.732537 227175 AggregationFuzzerBase.cpp:616] Total functions tested: 14 I20241217 11:47:07.732566 227175 AggregationFuzzerBase.cpp:617] Total iterations requiring sorted inputs: 16 (76.19%) I20241217 11:47:07.732582 227175 AggregationFuzzerBase.cpp:619] Total iterations verified against reference DB: 9 (42.86%) I20241217 11:47:07.732592 227175 AggregationFuzzerBase.cpp:621] Total functions not verified (verification skipped / not supported by reference DB / reference DB failed): 8 (38.10%) / 0 (0.00%) / 2 (9.52%) I20241217 11:47:07.732602 227175 AggregationFuzzerBase.cpp:626] Total failed functions: 3 (14.29%) I20241217 11:47:07.732610 227175 WindowFuzzer.cpp:788] Total functions verified in reference DB: 6 ``` Reviewed By: Yuhta Differential Revision: D67360981 Pulled By: kagamiori --- velox/exec/fuzzer/AggregationFuzzer.cpp | 2 +- velox/exec/fuzzer/AggregationFuzzerBase.cpp | 21 ++++++++++------ velox/exec/fuzzer/AggregationFuzzerBase.h | 12 ++++++--- 
velox/exec/fuzzer/WindowFuzzer.cpp | 28 ++++++++++++++++++--- 4 files changed, 46 insertions(+), 17 deletions(-) diff --git a/velox/exec/fuzzer/AggregationFuzzer.cpp b/velox/exec/fuzzer/AggregationFuzzer.cpp index d778f39df475..b21c8a7be1e9 100644 --- a/velox/exec/fuzzer/AggregationFuzzer.cpp +++ b/velox/exec/fuzzer/AggregationFuzzer.cpp @@ -385,7 +385,7 @@ void AggregationFuzzer::go() { auto partitionKeys = generateKeys("p", argNames, argTypes); auto sortingKeys = generateSortingKeys("s", argNames, argTypes); auto input = generateInputDataWithRowNumber( - argNames, argTypes, partitionKeys, signature); + argNames, argTypes, partitionKeys, {}, signature); logVectors(input); diff --git a/velox/exec/fuzzer/AggregationFuzzerBase.cpp b/velox/exec/fuzzer/AggregationFuzzerBase.cpp index b6fe99a14491..ef8a61f4a65a 100644 --- a/velox/exec/fuzzer/AggregationFuzzerBase.cpp +++ b/velox/exec/fuzzer/AggregationFuzzerBase.cpp @@ -243,10 +243,11 @@ std::vector AggregationFuzzerBase::generateSortingKeys( std::vector& names, std::vector& types, bool rangeFrame, + const std::vector& scalarTypes, std::optional numKeys) { std::vector keys; vector_size_t maxDepth; - std::vector sortingKeyTypes = defaultScalarTypes(); + std::vector sortingKeyTypes = scalarTypes; // If frame has k-RANGE bound, only one sorting key should be present, and it // should be a scalar type which supports '+', '-' arithmetic operations. 
@@ -328,9 +329,10 @@ std::vector AggregationFuzzerBase::generateInputDataWithRowNumber( std::vector names, std::vector types, const std::vector& partitionKeys, + const std::vector& windowFrameBounds, const CallableSignature& signature) { names.push_back("row_number"); - types.push_back(BIGINT()); + types.push_back(INTEGER()); auto generator = findInputGenerator(signature); @@ -339,11 +341,10 @@ std::vector AggregationFuzzerBase::generateInputDataWithRowNumber( velox::test::VectorMaker vectorMaker{pool_.get()}; int64_t rowNumber = 0; - std::unordered_set partitionKeySet; - partitionKeySet.reserve(partitionKeys.size()); - for (auto partitionKey : partitionKeys) { - partitionKeySet.insert(partitionKey); - } + std::unordered_set partitionKeySet{ + partitionKeys.begin(), partitionKeys.end()}; + std::unordered_set windowFrameBoundsSet{ + windowFrameBounds.begin(), windowFrameBounds.end()}; for (auto j = 0; j < FLAGS_num_batches; ++j) { std::vector children; @@ -365,11 +366,15 @@ std::vector AggregationFuzzerBase::generateInputDataWithRowNumber( auto baseVector = vectorFuzzer_.fuzz(types[i], numPartitions); children.push_back( BaseVector::wrapInDictionary(nulls, indices, size, baseVector)); + } else if ( + windowFrameBoundsSet.find(names[i]) != windowFrameBoundsSet.end()) { + // Frame bound columns cannot have NULLs. 
+ children.push_back(vectorFuzzer_.fuzzNotNull(types[i], size)); } else { children.push_back(vectorFuzzer_.fuzz(types[i], size)); } } - children.push_back(vectorMaker.flatVector( + children.push_back(vectorMaker.flatVector( size, [&](auto /*row*/) { return rowNumber++; })); input.push_back(vectorMaker.rowVector(names, children)); } diff --git a/velox/exec/fuzzer/AggregationFuzzerBase.h b/velox/exec/fuzzer/AggregationFuzzerBase.h index a2237f8a9d74..8e2b252fa650 100644 --- a/velox/exec/fuzzer/AggregationFuzzerBase.h +++ b/velox/exec/fuzzer/AggregationFuzzerBase.h @@ -188,6 +188,7 @@ class AggregationFuzzerBase { std::vector& names, std::vector& types, bool rangeFrame = false, + const std::vector& scalarTypes = defaultScalarTypes(), std::optional numKeys = std::nullopt); std::pair pickSignature(); @@ -197,14 +198,17 @@ class AggregationFuzzerBase { std::vector types, const std::optional& signature); - // Generate a RowVector of the given types of children with an additional - // child named "row_number" of BIGINT row numbers that differentiates every - // row. Row numbers start from 0. This additional input vector is needed for - // result verification of window aggregations. + /// Generate a RowVector of the given types of children with an additional + /// child named "row_number" of INTEGER row numbers that differentiates every + /// row. Row numbers start from 0. This additional input vector is needed for + /// result verification of window aggregations. + /// @param windowFrameBounds Names of frame bound columns of a window + /// operation. These columns are fuzzed without NULLs. 
std::vector generateInputDataWithRowNumber( std::vector names, std::vector types, const std::vector& partitionKeys, + const std::vector& windowFrameBounds, const CallableSignature& signature); velox::fuzzer::ResultOrError execute( diff --git a/velox/exec/fuzzer/WindowFuzzer.cpp b/velox/exec/fuzzer/WindowFuzzer.cpp index b4d8a578f3a8..eaaffea3df67 100644 --- a/velox/exec/fuzzer/WindowFuzzer.cpp +++ b/velox/exec/fuzzer/WindowFuzzer.cpp @@ -256,7 +256,14 @@ std::vector WindowFuzzer::generateSortingKeysAndOrders( std::vector& types, bool isKRangeFrame, std::optional numKeys) { - auto keys = generateSortingKeys(prefix, names, types, isKRangeFrame, numKeys); + VELOX_CHECK_NOT_NULL(referenceQueryRunner_); + auto keys = generateSortingKeys( + prefix, + names, + types, + isKRangeFrame, + referenceQueryRunner_->supportedScalarTypes(), + numKeys); std::vector results; for (auto i = 0; i < keys.size(); ++i) { auto asc = vectorFuzzer_.coinToss(0.5); @@ -455,8 +462,14 @@ void WindowFuzzer::go() { const uint32_t numKeys = boost::random::uniform_int_distribution(1, 15)(rng_); - const auto partitionKeys = - generateSortingKeys("p", argNames, argTypes, false, numKeys); + VELOX_CHECK_NOT_NULL(referenceQueryRunner_); + const auto partitionKeys = generateSortingKeys( + "p", + argNames, + argTypes, + false, + referenceQueryRunner_->supportedScalarTypes(), + numKeys); std::vector sortingKeysAndOrders; TypeKind orderByTypeKind; @@ -479,7 +492,7 @@ void WindowFuzzer::go() { } auto input = generateInputDataWithRowNumber( - argNames, argTypes, partitionKeys, signature); + argNames, argTypes, partitionKeys, kBoundColumns, signature); // Offset column names used for k-RANGE frame bounds have fixed names: off0 // and off1, representing the precomputed offset columns used as frame start // and frame end bound respectively. 
@@ -547,6 +560,13 @@ void WindowFuzzer::go() { } stats_.print(iteration); + // Check that at least half of the iterations were verified, either against + // the reference DB or through custom result verifiers. + // stats_.numVerificationSkipped tracks the number of iterations verified + // through custom result verifiers. + VELOX_CHECK_GE( + (stats_.numVerified + stats_.numVerificationSkipped) / (double)iteration, + 0.5); printSignatureStats(); }