From 975ca3ac942472d6312b54280d8543a7b870d6b0 Mon Sep 17 00:00:00 2001 From: Masha Basmanova Date: Tue, 5 Dec 2023 03:29:20 -0800 Subject: [PATCH] Add support for lambda functions to ExpressionFuzzer (#7834) Summary: The tricky part of supporting lambda functions is finding a way to generate lambdas given both input types and the result type, i.e. generate LambdaTypedExpr to match given FunctionType. This implementation is very basic. It looks for a matching function signature or signature template and if none found uses a lambda expression that returns constant value. For example, lambda for filter(array, function(T) -> boolean) can be x -> is_null(x) or x -> cast(x as boolean) or x -> true. This very simple implementation is quite effective. It helped discover a number of bugs: https://github.com/facebookincubator/velox/issues/7821 #7824 https://github.com/facebookincubator/velox/issues/7828 #7829 https://github.com/facebookincubator/velox/issues/7831 #7851 https://github.com/facebookincubator/velox/issues/7861 This new logic cannot generate valid lambda for 'comparator' argument of array_sort. Hence, added support for skipping individual function signatures and used it to exclude this particular signature for array_sort. Ran the Fuzzer multiple times for 10 minutes each and collected custom stats on the lambda functions tested and lambda expressions generated. 20 functions were tested using 154 different lambda expressions. More details in https://gist.github.com/mbasmanova/1530e3cdfe2f6f4fef35a03f07ee4a39 ``` velox_expression_fuzzer_test --logtostderr --velox_fuzzer_enable_complex_types --enable_variadic_signatures --lazy_vector_generation_ratio 0.2 --velox_fuzzer_enable_column_reuse --velox_fuzzer_enable_expression_reuse --max_expression_trees_per_step 2 --retry_with_try --enable_dereference --duration_sec 1200 ``` Pull Request resolved: https://github.com/facebookincubator/velox/pull/7834 Test Plan: Imported from GitHub, without a `Test Plan:` line. ``` $ buck2 run stylus/xstream/query/functions/test:fuzzer_test $ buck2 run stylus/xstream/query/functions/test:f3_fuzzer_test ``` Reviewed By: xiaoxmeng Differential Revision: D51756700 Pulled By: mbasmanova fbshipit-source-id: 4a60fd29e1e670f905e3dc8b5a2180e4440796e2 --- velox/expression/FunctionSignature.h | 5 + velox/expression/tests/ExpressionFuzzer.cpp | 169 +++++++++++++++++- velox/expression/tests/ExpressionFuzzer.h | 25 +++ .../expression/tests/ExpressionFuzzerTest.cpp | 5 + velox/expression/tests/FuzzerRunner.h | 41 ++++- velox/functions/lib/Re2Functions.cpp | 2 +- .../functions/lib/tests/Re2FunctionsTest.cpp | 18 +- 7 files changed, 252 insertions(+), 13 deletions(-) diff --git a/velox/expression/FunctionSignature.h b/velox/expression/FunctionSignature.h index 3266d74db2fe..8b3f063e4eb8 100644 --- a/velox/expression/FunctionSignature.h +++ b/velox/expression/FunctionSignature.h @@ -180,6 +180,11 @@ class FunctionSignature { return constantArguments_; } + bool hasConstantArgument() const { + return std::any_of( + constantArguments_.begin(), constantArguments_.end(), folly::identity); + } + bool variableArity() const { return variableArity_; } diff --git a/velox/expression/tests/ExpressionFuzzer.cpp b/velox/expression/tests/ExpressionFuzzer.cpp index 9134897a8f4a..f7372353b158 100644 --- a/velox/expression/tests/ExpressionFuzzer.cpp +++ b/velox/expression/tests/ExpressionFuzzer.cpp @@ -34,6 +34,42 @@ namespace facebook::velox::test { namespace { using exec::SignatureBinder; +using exec::SignatureBinderBase; + +class FullSignatureBinder : public SignatureBinderBase { + public: + FullSignatureBinder( + const exec::FunctionSignature& signature, + const std::vector& argTypes, + const TypePtr& returnType) + : SignatureBinderBase(signature) { + if (signature_.argumentTypes().size() != argTypes.size()) { + return; + } + + for (auto i = 0; i < argTypes.size(); ++i) { + if (!SignatureBinderBase::tryBind( + signature_.argumentTypes()[i], argTypes[i])) { + return; + } + } + + if (!SignatureBinderBase::tryBind(signature_.returnType(), returnType)) { + return; + } + + bound_ = true; + } + + // Returns true if argument types and return type specified in the constructor + // match specified signature. + bool tryBind() { + return bound_; + } + + private: + bool bound_{false}; +}; /// Returns if `functionName` with the given `argTypes` is deterministic. /// Returns true if the function was not found or determinism cannot be @@ -164,7 +200,7 @@ bool isSupportedSignature( // Not supporting lambda functions, or functions using decimal and // timestamp with time zone types. return !( - useTypeName(signature, "opaque") || useTypeName(signature, "function") || + useTypeName(signature, "opaque") || useTypeName(signature, "long_decimal") || useTypeName(signature, "short_decimal") || useTypeName(signature, "decimal") || @@ -578,9 +614,69 @@ std::vector ExpressionFuzzer::generateArgs( return generateArgs(input.args, input.constantArgs, numVarArgs); } +core::TypedExprPtr ExpressionFuzzer::generateArgFunction(const TypePtr& arg) { + const auto& functionType = arg->asFunction(); + + std::vector args; + std::vector names; + std::vector inputs; + args.reserve(arg->size() - 1); + names.reserve(arg->size() - 1); + inputs.reserve(arg->size() - 1); + + for (auto i = 0; i < arg->size() - 1; ++i) { + args.push_back(arg->childAt(i)); + names.push_back(fmt::format("__a{}", i)); + inputs.push_back(std::make_shared( + args.back(), names.back())); + } + + const auto& returnType = functionType.children().back(); + + const auto baseType = typeToBaseName(returnType); + const auto& baseList = typeToExpressionList_[baseType]; + const auto& templateList = typeToExpressionList_[kTypeParameterName]; + + std::vector eligible; + for (const auto& functionName : baseList) { + if (auto* signature = + findConcreteSignature(args, returnType, functionName)) { + eligible.push_back(functionName); + } else if ( + auto* signatureTemplate = + findSignatureTemplate(args, returnType, baseType, functionName)) { + eligible.push_back(functionName); + } + } + + for (const auto& functionName : templateList) { + if (auto* signatureTemplate = + findSignatureTemplate(args, returnType, baseType, functionName)) { + eligible.push_back(functionName); + } + } + + if (eligible.empty()) { + return std::make_shared( + ROW(std::move(names), std::move(args)), + generateArgConstant(returnType)); + } + + const auto idx = rand32(0, eligible.size() - 1); + const auto name = eligible[idx]; + + return std::make_shared( + ROW(std::move(names), std::move(args)), + std::make_shared(returnType, inputs, name)); +} + core::TypedExprPtr ExpressionFuzzer::generateArg( const TypePtr& arg, bool isConstant) { + if (arg->isFunction()) { + return generateArgFunction(arg); + } + if (isConstant) { return generateArgConstant(arg); } else { @@ -768,6 +864,46 @@ const CallableSignature* ExpressionFuzzer::chooseRandomConcreteSignature( return eligible[idx]; } +const CallableSignature* ExpressionFuzzer::findConcreteSignature( + const std::vector& argTypes, + const TypePtr& returnType, + const std::string& functionName) { + if (expressionToSignature_.find(functionName) == + expressionToSignature_.end()) { + return nullptr; + } + auto baseType = typeToBaseName(returnType); + auto it = expressionToSignature_[functionName].find(baseType); + if (it == expressionToSignature_[functionName].end()) { + return nullptr; + } + + for (auto signature : it->second) { + if (!signature->returnType->equivalent(*returnType)) { + continue; + } + + if (signature->args.size() != argTypes.size()) { + continue; + } + + bool argTypesMatch = true; + for (auto i = 0; i < argTypes.size(); ++i) { + if (signature->constantArgs[i] || + !signature->args[i]->equivalent(*argTypes[i])) { + argTypesMatch = false; + break; + } + } + + if (argTypesMatch) { + return signature; + } + } + + return nullptr; +} + core::TypedExprPtr ExpressionFuzzer::generateExpressionFromConcreteSignatures( const TypePtr& returnType, const std::string& functionName) { @@ -812,6 +948,37 @@ const SignatureTemplate* ExpressionFuzzer::chooseRandomSignatureTemplate( return eligible[idx]; } +const SignatureTemplate* ExpressionFuzzer::findSignatureTemplate( + const std::vector& argTypes, + const TypePtr& returnType, + const std::string& typeName, + const std::string& functionName) { + std::vector eligible; + if (expressionToTemplatedSignature_.find(functionName) == + expressionToTemplatedSignature_.end()) { + return nullptr; + } + auto it = expressionToTemplatedSignature_[functionName].find(typeName); + if (it == expressionToTemplatedSignature_[functionName].end()) { + return nullptr; + } + + for (auto signatureTemplate : it->second) { + // Skip signatures with constant arguments. + if (signatureTemplate->signature->hasConstantArgument()) { + continue; + } + + FullSignatureBinder binder{ + *signatureTemplate->signature, argTypes, returnType}; + if (binder.tryBind()) { + return signatureTemplate; + } + } + + return nullptr; +} + core::TypedExprPtr ExpressionFuzzer::generateExpressionFromSignatureTemplate( const TypePtr& returnType, const std::string& functionName) { diff --git a/velox/expression/tests/ExpressionFuzzer.h b/velox/expression/tests/ExpressionFuzzer.h index 4f2b7dce56ff..9a4d7e7b30aa 100644 --- a/velox/expression/tests/ExpressionFuzzer.h +++ b/velox/expression/tests/ExpressionFuzzer.h @@ -191,6 +191,16 @@ class ExpressionFuzzer { core::TypedExprPtr generateArg(const TypePtr& arg); + // Given lambda argument type, generate matching LambdaTypedExpr. + // + // The 'arg' specifies inputs types and result type for the lambda. This + // method finds all matching signatures and signature templates, picks one + // randomly and generates LambdaTypedExpr. If no matching signatures or + // signature templates found, this method returns LambdaTypedExpr that + // represents a constant lambda, i.e lambda that returns the same value for + // all input. The constant value is generated using 'generateArgConstant'. + core::TypedExprPtr generateArgFunction(const TypePtr& arg); + std::vector generateArgs(const CallableSignature& input); std::vector generateArgs( @@ -239,6 +249,13 @@ class ExpressionFuzzer { const TypePtr& returnType, const std::string& functionName); + /// Returns a signature with matching input types and return type. Returns + /// nullptr if matching signature doesn't exist. + const CallableSignature* findConcreteSignature( + const std::vector& argTypes, + const TypePtr& returnType, + const std::string& functionName); + /// Generate an expression by randomly selecting a concrete function /// signature that returns 'returnType' among all signatures that the /// function named 'functionName' supports. @@ -254,6 +271,14 @@ class ExpressionFuzzer { const std::string& typeName, const std::string& functionName); + /// Returns a signature template with matching input types and return type. + /// Returns nullptr if matching signature template doesn't exist. + const SignatureTemplate* findSignatureTemplate( + const std::vector& argTypes, + const TypePtr& returnType, + const std::string& typeName, + const std::string& functionName); + /// Generate an expression by randomly selecting a function signature /// template that returns 'returnType' among all signature templates that /// the function named 'functionName' supports. diff --git a/velox/expression/tests/ExpressionFuzzerTest.cpp b/velox/expression/tests/ExpressionFuzzerTest.cpp index 61df1dba943a..eee027676fc3 100644 --- a/velox/expression/tests/ExpressionFuzzerTest.cpp +++ b/velox/expression/tests/ExpressionFuzzerTest.cpp @@ -52,6 +52,9 @@ int main(int argc, char** argv) { // TODO: List of the functions that at some point crash or fail and need to // be fixed before we can enable. + // This list can include a mix of function names and function signatures. + // Use function name to exclude all signatures of a given function from + // testing. Use function signature to exclude only a specific signature. std::unordered_set skipFunctions = { // Fuzzer and the underlying engine are confused about cardinality(HLL) // (since HLL is a user defined type), and end up trying to use @@ -60,6 +63,8 @@ int main(int argc, char** argv) { "cardinality", "element_at", "width_bucket", + // Fuzzer cannot generate valid 'comparator' lambda. + "array_sort(array(T),constant function(T,T,bigint)) -> array(T)", }; size_t initialSeed = FLAGS_seed == 0 ? std::time(nullptr) : FLAGS_seed; return FuzzerRunner::run( diff --git a/velox/expression/tests/FuzzerRunner.h b/velox/expression/tests/FuzzerRunner.h index 65cedaf5498e..985d1035c24a 100644 --- a/velox/expression/tests/FuzzerRunner.h +++ b/velox/expression/tests/FuzzerRunner.h @@ -74,6 +74,17 @@ class FuzzerRunner { return nameSet; } + static std::pair splitSignature( + const std::string& signature) { + const auto parenPos = signature.find("("); + + if (parenPos != std::string::npos) { + return {signature.substr(0, parenPos), signature.substr(parenPos)}; + } + + return {signature, ""}; + } + // Parse the comma separated list of function names, and use it to filter the // input signatures. static facebook::velox::FunctionSignatureMap filterSignatures( @@ -85,10 +96,32 @@ class FuzzerRunner { return input; } facebook::velox::FunctionSignatureMap output(input); - for (auto s : skipFunctions) { - auto str = s; - folly::toLowerAscii(str); - output.erase(str); + for (auto skip : skipFunctions) { + // 'skip' can be function name or signature. + const auto [skipName, skipSignature] = splitSignature(skip); + + if (skipSignature.empty()) { + output.erase(skipName); + } else { + auto it = output.find(skipName); + if (it != output.end()) { + // Compiler refuses to reference 'skipSignature' from the lambda as + // is. + const auto& signatureToRemove = skipSignature; + + auto removeIt = std::find_if( + it->second.begin(), + it->second.end(), + [&](const auto& signature) { + return signature->toString() == signatureToRemove; + }); + VELOX_CHECK( + removeIt != it->second.end(), + "Skip signature not found: {}", + skip); + it->second.erase(removeIt); + } + } } return output; } diff --git a/velox/functions/lib/Re2Functions.cpp b/velox/functions/lib/Re2Functions.cpp index 59bfd8451105..7523ed6e7270 100644 --- a/velox/functions/lib/Re2Functions.cpp +++ b/velox/functions/lib/Re2Functions.cpp @@ -581,7 +581,7 @@ class LikeGeneric final : public VectorFunction { auto [it, inserted] = compiledRegularExpressions_.emplace( key, std::make_unique(toStringPiece(regex), opt)); - VELOX_CHECK_LE( + VELOX_USER_CHECK_LE( compiledRegularExpressions_.size(), kMaxCompiledRegexes, "Max number of regex reached"); diff --git a/velox/functions/lib/tests/Re2FunctionsTest.cpp b/velox/functions/lib/tests/Re2FunctionsTest.cpp index 57376d3f44e6..5285a7e3aaa7 100644 --- a/velox/functions/lib/tests/Re2FunctionsTest.cpp +++ b/velox/functions/lib/tests/Re2FunctionsTest.cpp @@ -1104,20 +1104,24 @@ TEST_F(Re2FunctionsTest, likeRegexLimit) { } VELOX_ASSERT_THROW( - evaluate("like(c0 , c1)", makeRowVector({input, pattern})), + evaluate("like(c0, c1)", makeRowVector({input, pattern})), "Max number of regex reached"); - // Make sure try does not suppress that faluire. - VELOX_ASSERT_THROW( - evaluate("try(like(c0 , c1))", makeRowVector({input, pattern})), - "Max number of regex reached"); + // First 20 rows should return false, the rest raise and error and become + // null. + result = evaluate("try(like(c0, c1))", makeRowVector({input, pattern})); + auto expected = makeFlatVector( + 26, + [](auto /*row*/) { return false; }, + [](auto row) { return row >= 20; }); + assertEqualVectors(expected, result); // All are complex but the same, should pass. for (int i = 0; i < 26; i++) { flatPattern->set(i, "b%[0-9]+.*{}.*{}.*[0-9]+"); } - assertEqualVectors( - BaseVector::createConstant(BOOLEAN(), false, 26, pool()), result); + result = evaluate("like(c0, c1)", makeRowVector({input, pattern})); + assertEqualVectors(makeConstant(false, 26), result); } TEST_F(Re2FunctionsTest, invalidEscapeChar) {