From 730d5cc7aed78f216bfc72e1a7ef5b4331e0a037 Mon Sep 17 00:00:00 2001 From: Bikramjeet Vig Date: Mon, 28 Oct 2024 16:46:18 -0700 Subject: [PATCH] Add ability to verify expression fuzzer runs on a subset of rows (#11267) Summary: Currently, the expression fuzzer has a phase where it re-runs rows that did not throw an error to ensure evaluation is consistent for them. To achieve this, it currently wraps the inputs with a dictionary that only points to the subset of those rows. This results a change in the encoding of inputs which can cause differences in eval paths taken between phases. To address this and ensure the same paths are taken for each evaluation phase, this change introduces the ability for the expression verifier to only verify a subset of the input rows. The aforementioned fuzzer run phase can only specify the non error rows and maintain the original input row. Follow up: After this change, it would be relevant to also store the input selectivity vector. A subsequent change will be added that would add this ability and make corresponding changes to the ExpressionRunner Differential Revision: D64366745 --- .../fuzzer/ExpressionFuzzerVerifier.cpp | 40 ++++------ velox/expression/tests/ExpressionRunner.cpp | 2 + velox/expression/tests/ExpressionVerifier.cpp | 73 +++++++++++++++---- velox/expression/tests/ExpressionVerifier.h | 21 ++++-- .../tests/ExpressionVerifierUnitTest.cpp | 3 +- 5 files changed, 89 insertions(+), 50 deletions(-) diff --git a/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp b/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp index cad98cb0ee0e4..f37133ca6eda8 100644 --- a/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp +++ b/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp @@ -36,9 +36,8 @@ namespace { using exec::SignatureBinder; -/// Returns row numbers for non-null rows among all children in'data' or null -/// if all rows are null. -BufferPtr extractNonNullIndices(const RowVectorPtr& data) { +/// Returns all non-null rows among all children in 'data'. +SelectivityVector extractNonNullRows(const RowVectorPtr& data) { DecodedVector decoded; SelectivityVector nonNullRows(data->size()); @@ -48,19 +47,8 @@ BufferPtr extractNonNullIndices(const RowVectorPtr& data) { if (rawNulls) { nonNullRows.deselectNulls(rawNulls, 0, data->size()); } - if (!nonNullRows.hasSelections()) { - return nullptr; - } } - - BufferPtr indices = allocateIndices(nonNullRows.end(), data->pool()); - auto rawIndices = indices->asMutable(); - vector_size_t cnt = 0; - nonNullRows.applyToSelected( - [&](vector_size_t row) { rawIndices[cnt++] = row; }); - VELOX_CHECK_GT(cnt, 0); - indices->setSize(cnt * sizeof(vector_size_t)); - return indices; + return nonNullRows; } /// Wraps child vectors of the specified 'rowVector' in dictionary using @@ -261,6 +249,7 @@ void ExpressionFuzzerVerifier::retryWithTry( tryResult = verifier_.verify( tryPlans, rowVector, + std::nullopt, resultVector ? BaseVector::copy(*resultVector) : nullptr, false, // canThrow columnsToWrapInLazy); @@ -271,6 +260,7 @@ void ExpressionFuzzerVerifier::retryWithTry( *vectorFuzzer_, plans, rowVector, + std::nullopt, columnsToWrapInLazy); } throw; @@ -283,21 +273,18 @@ void ExpressionFuzzerVerifier::retryWithTry( // Re-evaluate the original expression on rows that didn't produce an // error (i.e. returned non-NULL results when evaluated with TRY). - BufferPtr noErrorIndices = extractNonNullIndices(tryResult.result); - - if (noErrorIndices != nullptr) { - auto noErrorRowVector = wrapChildren(noErrorIndices, rowVector); + SelectivityVector noErrorRows = extractNonNullRows(tryResult.result); - LOG(INFO) << "Retrying original expression on " << noErrorRowVector->size() + if (noErrorRows.hasSelections()) { + LOG(INFO) << "Retrying original expression on " << noErrorRows.end() << " rows without errors"; try { verifier_.verify( plans, - noErrorRowVector, - resultVector ? BaseVector::copy(*resultVector) - ->slice(0, noErrorRowVector->size()) - : nullptr, + rowVector, + noErrorRows, + resultVector ? BaseVector::copy(*resultVector) : nullptr, false, // canThrow columnsToWrapInLazy); } catch (const std::exception&) { @@ -306,7 +293,8 @@ void ExpressionFuzzerVerifier::retryWithTry( {&execCtx_, {false, ""}, referenceQueryRunner_}, *vectorFuzzer_, plans, - noErrorRowVector, + rowVector, + noErrorRows, columnsToWrapInLazy); } throw; @@ -379,6 +367,7 @@ void ExpressionFuzzerVerifier::go() { result = verifier_.verify( plans, rowVector, + std::nullopt, resultVectors ? BaseVector::copy(*resultVectors) : nullptr, true, // canThrow columnsToWrapInLazy); @@ -389,6 +378,7 @@ void ExpressionFuzzerVerifier::go() { *vectorFuzzer_, plans, rowVector, + std::nullopt, columnsToWrapInLazy); } throw; diff --git a/velox/expression/tests/ExpressionRunner.cpp b/velox/expression/tests/ExpressionRunner.cpp index d406de11f9ea7..85356e7cd1d61 100644 --- a/velox/expression/tests/ExpressionRunner.cpp +++ b/velox/expression/tests/ExpressionRunner.cpp @@ -197,6 +197,7 @@ void ExpressionRunner::run( verifier.verify( typedExprs, inputVector, + std::nullopt, std::move(resultVector), true, columnsToWrapInLazy); @@ -209,6 +210,7 @@ void ExpressionRunner::run( fuzzer, typedExprs, inputVector, + std::nullopt, columnsToWrapInLazy); } throw; diff --git a/velox/expression/tests/ExpressionVerifier.cpp b/velox/expression/tests/ExpressionVerifier.cpp index 02991c1c8114d..e6a99f0017f43 100644 --- a/velox/expression/tests/ExpressionVerifier.cpp +++ b/velox/expression/tests/ExpressionVerifier.cpp @@ -68,11 +68,33 @@ bool defaultNullRowsSkipped(const exec::ExprSet& exprSet) { } return false; } + +// Returns a RowVector with only the rows specified in 'rows'. Does not maintain +// encodings. +RowVectorPtr reduceToSelectedRows( + const RowVectorPtr& rowVector, + const SelectivityVector& rows) { + if (rows.isAllSelected()) { + return rowVector; + } + BufferPtr indices = allocateIndices(rows.end(), rowVector->pool()); + auto rawIndices = indices->asMutable(); + vector_size_t cnt = 0; + rows.applyToSelected([&](vector_size_t row) { rawIndices[cnt++] = row; }); + VELOX_CHECK_GT(cnt, 0); + indices->setSize(cnt * sizeof(vector_size_t)); + RowVectorPtr reducedVector = std::dynamic_pointer_cast( + BaseVector::create(rowVector->type(), cnt, rowVector->pool())); + SelectivityVector rowsToCopy(cnt); + reducedVector->copy(rowVector.get(), rowsToCopy, rawIndices); + return reducedVector; +} } // namespace fuzzer::ResultOrError ExpressionVerifier::verify( const std::vector& plans, const RowVectorPtr& rowVector, + const std::optional& rowsToVerify, VectorPtr&& resultVector, bool canThrow, std::vector columnsToWrapInLazy) { @@ -132,7 +154,12 @@ fuzzer::ResultOrError ExpressionVerifier::verify( std::exception_ptr exceptionSimplifiedPtr; VLOG(1) << "Starting common eval execution."; - SelectivityVector rows{rowVector ? rowVector->size() : 1}; + SelectivityVector rows; + if (rowsToVerify.has_value()) { + rows = *rowsToVerify; + } else { + rows = SelectivityVector{rowVector ? rowVector->size() : 1}; + } // Execute with common expression eval path. Some columns of the input row // vector will be wrapped in lazy as specified in 'columnsToWrapInLazy'. @@ -171,7 +198,8 @@ fuzzer::ResultOrError ExpressionVerifier::verify( copiedInput, BaseVector::copy(*inputRowVector), "Copy of original input", - "Input after common"); + "Input after common", + rows); } } catch (const VeloxException& e) { if (e.errorCode() == error_code::kUnsupportedInputUncatchable) { @@ -201,9 +229,11 @@ fuzzer::ResultOrError ExpressionVerifier::verify( if (referenceQueryRunner_ != nullptr) { VLOG(1) << "Execute with reference DB."; - auto projectionPlan = makeProjectionPlan(rowVector, plans); + auto inputRowVector = rowVector; + inputRowVector = reduceToSelectedRows(rowVector, rows); + auto projectionPlan = makeProjectionPlan(inputRowVector, plans); auto referenceResultOrError = computeReferenceResults( - projectionPlan, {rowVector}, referenceQueryRunner_.get()); + projectionPlan, {inputRowVector}, referenceQueryRunner_.get()); auto referenceEvalResult = referenceResultOrError.first; @@ -246,6 +276,7 @@ fuzzer::ResultOrError ExpressionVerifier::verify( nullptr, commonEvalResult[0]->size(), commonEvalResult); + commonEvalResultRow = reduceToSelectedRows(commonEvalResultRow, rows); VELOX_CHECK( exec::test::assertEqualResults( referenceEvalResult.value(), @@ -280,7 +311,8 @@ fuzzer::ResultOrError ExpressionVerifier::verify( copy, BaseVector::copy(*rowVector), "Copy of original input", - "Input after simplified"); + "Input after simplified", + rows); } catch (const VeloxException& e) { if (e.errorCode() == error_code::kUnsupportedInputUncatchable) { unsupportedInputUncatchableError = true; @@ -479,12 +511,13 @@ class MinimalSubExpressionFinder { void findMinimalExpression( core::TypedExprPtr plan, const RowVectorPtr& rowVector, + const std::optional& rowsToVerify, const std::vector& columnsToWrapInLazy) { - if (verifyWithResults(plan, rowVector, columnsToWrapInLazy)) { + if (verifyWithResults(plan, rowVector, rowsToVerify, columnsToWrapInLazy)) { errorExit("Retry should have failed"); } - bool minimalFound = - findMinimalRecursive(plan, rowVector, columnsToWrapInLazy); + bool minimalFound = findMinimalRecursive( + plan, rowVector, rowsToVerify, columnsToWrapInLazy); if (minimalFound) { errorExit("Found minimal failing expression."); } else { @@ -504,13 +537,15 @@ class MinimalSubExpressionFinder { bool findMinimalRecursive( core::TypedExprPtr plan, const RowVectorPtr& rowVector, + const std::optional& rowsToVerify, const std::vector& columnsToWrapInLazy) { bool anyFailed = false; for (auto& input : plan->inputs()) { - if (!verifyWithResults(input, rowVector, columnsToWrapInLazy)) { + if (!verifyWithResults( + input, rowVector, rowsToVerify, columnsToWrapInLazy)) { anyFailed = true; - bool minimalFound = - findMinimalRecursive(input, rowVector, columnsToWrapInLazy); + bool minimalFound = findMinimalRecursive( + input, rowVector, rowsToVerify, columnsToWrapInLazy); if (minimalFound) { return true; } @@ -519,10 +554,10 @@ class MinimalSubExpressionFinder { if (!anyFailed) { LOG(INFO) << "Failed with all children succeeding: " << plan->toString(); // Re-running the minimum failed. Put breakpoint here to debug. - verifyWithResults(plan, rowVector, columnsToWrapInLazy); + verifyWithResults(plan, rowVector, rowsToVerify, columnsToWrapInLazy); if (!columnsToWrapInLazy.empty()) { LOG(INFO) << "Trying without lazy:"; - if (verifyWithResults(plan, rowVector, {})) { + if (verifyWithResults(plan, rowVector, rowsToVerify, {})) { LOG(INFO) << "Minimal failure succeeded without lazy vectors"; } } @@ -539,14 +574,16 @@ class MinimalSubExpressionFinder { bool verifyWithResults( core::TypedExprPtr plan, const RowVectorPtr& rowVector, + const std::optional& rowsToVerify, const std::vector& columnsToWrapInLazy) { VectorPtr result; LOG(INFO) << "Running with empty results vector :" << plan->toString(); - bool emptyResult = verifyPlan(plan, rowVector, columnsToWrapInLazy, result); + bool emptyResult = + verifyPlan(plan, rowVector, rowsToVerify, columnsToWrapInLazy, result); LOG(INFO) << "Running with non empty vector :" << plan->toString(); result = vectorFuzzer_.fuzzFlat(plan->type()); bool filledResult = - verifyPlan(plan, rowVector, columnsToWrapInLazy, result); + verifyPlan(plan, rowVector, rowsToVerify, columnsToWrapInLazy, result); if (emptyResult != filledResult) { LOG(ERROR) << fmt::format( "Different results for empty vs populated ! Empty result = {} filledResult = {}", @@ -561,6 +598,7 @@ class MinimalSubExpressionFinder { bool verifyPlan( core::TypedExprPtr plan, const RowVectorPtr& rowVector, + const std::optional& rowsToVerify, const std::vector& columnsToWrapInLazy, VectorPtr results) { // Turn off unnecessary logging. @@ -571,6 +609,7 @@ class MinimalSubExpressionFinder { verifier_.verify( {plan}, rowVector, + rowsToVerify, results ? BaseVector::copy(*results) : nullptr, true, // canThrow columnsToWrapInLazy); @@ -591,6 +630,7 @@ void computeMinimumSubExpression( VectorFuzzer& fuzzer, const std::vector& plans, const RowVectorPtr& rowVector, + const std::optional& rowsToVerify, const std::vector& columnsToWrapInLazy) { auto finder = MinimalSubExpressionFinder(std::move(minimalVerifier), fuzzer); if (plans.size() > 1) { @@ -602,7 +642,8 @@ void computeMinimumSubExpression( for (auto plan : plans) { LOG(INFO) << "============================================"; LOG(INFO) << "Finding minimal subexpression for plan:" << plan->toString(); - finder.findMinimalExpression(plan, rowVector, columnsToWrapInLazy); + finder.findMinimalExpression( + plan, rowVector, rowsToVerify, columnsToWrapInLazy); LOG(INFO) << "============================================"; } } diff --git a/velox/expression/tests/ExpressionVerifier.h b/velox/expression/tests/ExpressionVerifier.h index 5b21826d29605..4aaa2f4baa290 100644 --- a/velox/expression/tests/ExpressionVerifier.h +++ b/velox/expression/tests/ExpressionVerifier.h @@ -57,14 +57,17 @@ class ExpressionVerifier { options_(options), referenceQueryRunner_{referenceQueryRunner} {} - // Executes expressions both using common path (all evaluation - // optimizations) and simplified path. Additionally, a list of column - // indices can be passed via 'columnsToWrapInLazy' which specify the - // columns/children in the input row vector that should be wrapped in a lazy - // layer before running it through the common evaluation path. The list can - // contain negative column indices that represent lazy vectors that should be - // preloaded before being fed to the evaluator. This list is sorted on the - // absolute value of the entries. + // Executes expressions using common path (all evaluation + // optimizations) and compares the result with either the simplified path or a + // reference query runner. An optional selectivity vector 'rowsToVerify' can + // be passed which specifies which rows to evaluate and verify. If its not + // provided (by passing std::nullopt) then all rows will be verified. + // Additionally, a list of column indices can be passed via + // 'columnsToWrapInLazy' which specify the columns/children in the input row + // vector that should be wrapped in a lazy layer before running it through the + // common evaluation path. The list can contain negative column indices that + // represent lazy vectors that should be preloaded before being fed to the + // evaluator. This list is sorted on the absolute value of the entries. // Returns: // - result of evaluating the expressions if both paths succeeded and // returned the exact same vectors. @@ -74,6 +77,7 @@ class ExpressionVerifier { fuzzer::ResultOrError verify( const std::vector& plans, const RowVectorPtr& rowVector, + const std::optional& rowsToVerify, VectorPtr&& resultVector, bool canThrow, std::vector columnsToWarpInLazy = {}); @@ -112,5 +116,6 @@ void computeMinimumSubExpression( VectorFuzzer& fuzzer, const std::vector& plans, const RowVectorPtr& rowVector, + const std::optional& rowsToVerify, const std::vector& columnsToWrapInLazy); } // namespace facebook::velox::test diff --git a/velox/expression/tests/ExpressionVerifierUnitTest.cpp b/velox/expression/tests/ExpressionVerifierUnitTest.cpp index 4857ecfa8dd2d..b5f3914360964 100644 --- a/velox/expression/tests/ExpressionVerifierUnitTest.cpp +++ b/velox/expression/tests/ExpressionVerifierUnitTest.cpp @@ -102,7 +102,8 @@ TEST_F(ExpressionVerifierUnitTest, persistReproInfo) { auto plan = parseExpression("always_throws(c0)", asRowType(data->type())); removeDirecrtoryIfExist(localFs, reproPath); - VELOX_ASSERT_THROW(verifier.verify({plan}, data, nullptr, false), ""); + VELOX_ASSERT_THROW( + verifier.verify({plan}, data, std::nullopt, nullptr, false), ""); EXPECT_TRUE(localFs->exists(reproPath)); EXPECT_FALSE(localFs->list(reproPath).empty()); removeDirecrtoryIfExist(localFs, reproPath);