diff --git a/velox/exec/fuzzer/AggregationFuzzerOptions.h b/velox/exec/fuzzer/AggregationFuzzerOptions.h index 1d372d5743ab..71807617817f 100644 --- a/velox/exec/fuzzer/AggregationFuzzerOptions.h +++ b/velox/exec/fuzzer/AggregationFuzzerOptions.h @@ -50,8 +50,6 @@ struct AggregationFuzzerOptions { std::unordered_set orderDependentFunctions; - std::unordered_map functionDataSpec; - /// Timestamp precision to use when generating inputs of type TIMESTAMP. VectorFuzzer::Options::TimestampPrecision timestampPrecision{ VectorFuzzer::Options::TimestampPrecision::kMilliSeconds}; diff --git a/velox/exec/fuzzer/AggregationFuzzerRunner.h b/velox/exec/fuzzer/AggregationFuzzerRunner.h index cc62a52911a5..0d299bbae462 100644 --- a/velox/exec/fuzzer/AggregationFuzzerRunner.h +++ b/velox/exec/fuzzer/AggregationFuzzerRunner.h @@ -107,12 +107,15 @@ class AggregationFuzzerRunner { registerVectorSerde(); facebook::velox::filesystems::registerLocalFileSystem(); + auto& aggregationFunctionDataSpecs = + referenceQueryRunner->aggregationFunctionDataSpecs(); + facebook::velox::exec::test::aggregateFuzzer( filteredSignatures, seed, options.customVerificationFunctions, options.customInputGenerators, - options.functionDataSpec, + aggregationFunctionDataSpecs, options.timestampPrecision, options.queryConfigs, options.hiveConfigs, diff --git a/velox/exec/fuzzer/DuckQueryRunner.cpp b/velox/exec/fuzzer/DuckQueryRunner.cpp index 1491333c4948..3f4e3e6a6320 100644 --- a/velox/exec/fuzzer/DuckQueryRunner.cpp +++ b/velox/exec/fuzzer/DuckQueryRunner.cpp @@ -79,6 +79,29 @@ const std::vector& DuckQueryRunner::supportedScalarTypes() const { return kScalarTypes; } +const std::unordered_map& +DuckQueryRunner::aggregationFunctionDataSpecs() const { + // There are some functions for which DuckDB and Velox have inconsistent + // behavior with Nan and Infinity, so we exclude those. + static const std::unordered_map + kAggregationFunctionDataSpecs{ + {"covar_pop", DataSpec{true, false}}, + {"covar_samp", DataSpec{true, false}}, + {"histogram", DataSpec{false, false}}, + {"regr_avgx", DataSpec{true, false}}, + {"regr_avgy", DataSpec{true, false}}, + {"regr_intercept", DataSpec{false, false}}, + {"regr_r2", DataSpec{false, false}}, + {"regr_replacement", DataSpec{false, false}}, + {"regr_slope", DataSpec{false, false}}, + {"regr_sxx", DataSpec{false, false}}, + {"regr_sxy", DataSpec{false, false}}, + {"regr_syy", DataSpec{false, false}}, + {"var_pop", DataSpec{false, false}}}; + + return kAggregationFunctionDataSpecs; +} + std::multiset> DuckQueryRunner::execute( const std::string& sql, const std::vector& input, diff --git a/velox/exec/fuzzer/DuckQueryRunner.h b/velox/exec/fuzzer/DuckQueryRunner.h index 491cc14aa368..4fa826af0488 100644 --- a/velox/exec/fuzzer/DuckQueryRunner.h +++ b/velox/exec/fuzzer/DuckQueryRunner.h @@ -34,6 +34,9 @@ class DuckQueryRunner : public ReferenceQueryRunner { /// TODO Investigate mismatches reported when comparing Varbinary. const std::vector& supportedScalarTypes() const override; + const std::unordered_map& + aggregationFunctionDataSpecs() const override; + /// Specify names of aggregate function to exclude from the list of supported /// functions. Used to exclude functions that are non-determonistic, have bugs /// or whose semantics differ from Velox. diff --git a/velox/exec/fuzzer/PrestoQueryRunner.cpp b/velox/exec/fuzzer/PrestoQueryRunner.cpp index 956e7e235943..1172992c90a1 100644 --- a/velox/exec/fuzzer/PrestoQueryRunner.cpp +++ b/velox/exec/fuzzer/PrestoQueryRunner.cpp @@ -273,6 +273,29 @@ const std::vector& PrestoQueryRunner::supportedScalarTypes() const { return kScalarTypes; } +const std::unordered_map& +PrestoQueryRunner::aggregationFunctionDataSpecs() const { + // For some functions, velox supports NaN, Infinity better than presto query + // runner, which makes the comparison impossible. + // Add data constraint in vector fuzzer to enforce to not generate such data + // for those functions before they are fixed in presto query runner + static const std::unordered_map + kAggregationFunctionDataSpecs{ + {"regr_avgx", DataSpec{false, false}}, + {"regr_avgy", DataSpec{false, false}}, + {"regr_r2", DataSpec{false, false}}, + {"regr_sxx", DataSpec{false, false}}, + {"regr_syy", DataSpec{false, false}}, + {"regr_sxy", DataSpec{false, false}}, + {"regr_slope", DataSpec{false, false}}, + {"regr_replacement", DataSpec{false, false}}, + {"covar_pop", DataSpec{true, false}}, + {"covar_samp", DataSpec{true, false}}, + }; + + return kAggregationFunctionDataSpecs; +} + std::optional PrestoQueryRunner::toSql( const std::shared_ptr& aggregationNode) { // Assume plan is Aggregation over Values. diff --git a/velox/exec/fuzzer/PrestoQueryRunner.h b/velox/exec/fuzzer/PrestoQueryRunner.h index cc4ec047eef9..17b289366b2c 100644 --- a/velox/exec/fuzzer/PrestoQueryRunner.h +++ b/velox/exec/fuzzer/PrestoQueryRunner.h @@ -53,6 +53,9 @@ class PrestoQueryRunner : public velox::exec::test::ReferenceQueryRunner { const std::vector& supportedScalarTypes() const override; + const std::unordered_map& + aggregationFunctionDataSpecs() const override; + /// Converts Velox query plan to Presto SQL. Supports Values -> Aggregation or /// Window with an optional Project on top. /// diff --git a/velox/exec/fuzzer/ReferenceQueryRunner.h b/velox/exec/fuzzer/ReferenceQueryRunner.h index c666eeb6a789..75226a7b9ecc 100644 --- a/velox/exec/fuzzer/ReferenceQueryRunner.h +++ b/velox/exec/fuzzer/ReferenceQueryRunner.h @@ -45,6 +45,9 @@ class ReferenceQueryRunner { return defaultScalarTypes(); } + virtual const std::unordered_map& + aggregationFunctionDataSpecs() const = 0; + /// Converts Velox plan into SQL accepted by the reference database. /// @return std::nullopt if the plan uses features not supported by the /// reference database. diff --git a/velox/exec/fuzzer/WindowFuzzerRunner.h b/velox/exec/fuzzer/WindowFuzzerRunner.h index 4f66d2c4918c..d24536ff8175 100644 --- a/velox/exec/fuzzer/WindowFuzzerRunner.h +++ b/velox/exec/fuzzer/WindowFuzzerRunner.h @@ -79,6 +79,9 @@ class WindowFuzzerRunner { registerVectorSerde(); facebook::velox::filesystems::registerLocalFileSystem(); + auto& aggregationFunctionDataSpecs = + referenceQueryRunner->aggregationFunctionDataSpecs(); + facebook::velox::exec::test::windowFuzzer( filteredAggregationSignatures, filteredWindowSignatures, @@ -86,7 +89,7 @@ class WindowFuzzerRunner { options.customVerificationFunctions, options.customInputGenerators, options.orderDependentFunctions, - options.functionDataSpec, + aggregationFunctionDataSpecs, options.timestampPrecision, options.queryConfigs, options.hiveConfigs, diff --git a/velox/functions/prestosql/fuzzer/AggregationFuzzerTest.cpp b/velox/functions/prestosql/fuzzer/AggregationFuzzerTest.cpp index 64391a42f83b..12650de90df4 100644 --- a/velox/functions/prestosql/fuzzer/AggregationFuzzerTest.cpp +++ b/velox/functions/prestosql/fuzzer/AggregationFuzzerTest.cpp @@ -184,24 +184,6 @@ int main(int argc, char** argv) { {"sum_data_size_for_stats", nullptr}, }; - using facebook::velox::DataSpec; - // For some functions, velox supports NaN, Infinity better than presto query - // runner, which makes the comparison impossible. - // Add data constraint in vector fuzzer to enforce to not generate such data - // for those functions before they are fixed in presto query runner - static const std::unordered_map functionDataSpec = { - {"regr_avgx", DataSpec{false, false}}, - {"regr_avgy", DataSpec{false, false}}, - {"regr_r2", DataSpec{false, false}}, - {"regr_sxx", DataSpec{false, false}}, - {"regr_syy", DataSpec{false, false}}, - {"regr_sxy", DataSpec{false, false}}, - {"regr_slope", DataSpec{false, false}}, - {"regr_replacement", DataSpec{false, false}}, - {"covar_pop", DataSpec{true, false}}, - {"covar_samp", DataSpec{true, false}}, - }; - using Runner = facebook::velox::exec::test::AggregationFuzzerRunner; using Options = facebook::velox::exec::test::AggregationFuzzerOptions; @@ -213,9 +195,9 @@ int main(int argc, char** argv) { facebook::velox::exec::test::getCustomInputGenerators(); options.timestampPrecision = facebook::velox::VectorFuzzer::Options::TimestampPrecision::kMilliSeconds; - options.functionDataSpec = functionDataSpec; std::shared_ptr rootPool{ facebook::velox::memory::memoryManager()->addRootPool()}; + return Runner::run( initialSeed, setupReferenceQueryRunner( diff --git a/velox/functions/prestosql/fuzzer/WindowFuzzerTest.cpp b/velox/functions/prestosql/fuzzer/WindowFuzzerTest.cpp index 2cdd0ebceb6e..f7d863d7b809 100644 --- a/velox/functions/prestosql/fuzzer/WindowFuzzerTest.cpp +++ b/velox/functions/prestosql/fuzzer/WindowFuzzerTest.cpp @@ -141,24 +141,6 @@ int main(int argc, char** argv) { {"sum_data_size_for_stats", nullptr}, }; - using facebook::velox::DataSpec; - // For some functions, velox supports NaN, Infinity better than presto query - // runner, which makes the comparison impossible. - // Add data spec in vector fuzzer to enforce to not generate such data - // for those functions before they are fixed in presto query runner - static const std::unordered_map functionDataSpec = { - {"regr_avgx", DataSpec{false, false}}, - {"regr_avgy", DataSpec{false, false}}, - {"regr_r2", DataSpec{false, false}}, - {"regr_sxx", DataSpec{false, false}}, - {"regr_syy", DataSpec{false, false}}, - {"regr_sxy", DataSpec{false, false}}, - {"regr_slope", DataSpec{false, false}}, - {"regr_replacement", DataSpec{false, false}}, - {"covar_pop", DataSpec{true, false}}, - {"covar_samp", DataSpec{true, false}}, - }; - static const std::unordered_set orderDependentFunctions = { // Window functions. "first_value", @@ -199,7 +181,6 @@ int main(int argc, char** argv) { options.orderDependentFunctions = orderDependentFunctions; options.timestampPrecision = facebook::velox::VectorFuzzer::Options::TimestampPrecision::kMilliSeconds; - options.functionDataSpec = functionDataSpec; std::shared_ptr rootPool{ facebook::velox::memory::memoryManager()->addRootPool()}; return Runner::run( diff --git a/velox/functions/sparksql/fuzzer/SparkAggregationFuzzerTest.cpp b/velox/functions/sparksql/fuzzer/SparkAggregationFuzzerTest.cpp index 3ed339e94d23..e5c1c3e17981 100644 --- a/velox/functions/sparksql/fuzzer/SparkAggregationFuzzerTest.cpp +++ b/velox/functions/sparksql/fuzzer/SparkAggregationFuzzerTest.cpp @@ -111,24 +111,6 @@ int main(int argc, char** argv) { // formula. The results from the two methods are completely different. "kurtosis"}); - using facebook::velox::DataSpec; - // For some functions, velox supports NaN, Infinity better than presto query - // runner, which makes the comparison impossible. - // Add data spec in vector fuzzer to enforce to not generate such data - // for those functions before they are fixed in presto query runner - static const std::unordered_map functionDataSpec = { - {"regr_avgx", DataSpec{false, false}}, - {"regr_avgy", DataSpec{false, false}}, - {"regr_r2", DataSpec{false, false}}, - {"regr_sxx", DataSpec{false, false}}, - {"regr_syy", DataSpec{false, false}}, - {"regr_sxy", DataSpec{false, false}}, - {"regr_slope", DataSpec{false, false}}, - {"regr_replacement", DataSpec{false, false}}, - {"covar_pop", DataSpec{true, false}}, - {"covar_samp", DataSpec{true, false}}, - }; - using Runner = facebook::velox::exec::test::AggregationFuzzerRunner; using Options = facebook::velox::exec::test::AggregationFuzzerOptions; @@ -137,6 +119,5 @@ int main(int argc, char** argv) { options.skipFunctions = skipFunctions; options.customVerificationFunctions = customVerificationFunctions; options.orderableGroupKeys = true; - options.functionDataSpec = functionDataSpec; return Runner::run(initialSeed, std::move(duckQueryRunner), options); } diff --git a/velox/functions/sparksql/fuzzer/SparkQueryRunner.cpp b/velox/functions/sparksql/fuzzer/SparkQueryRunner.cpp index 89d05aeb604d..07cf0dd70392 100644 --- a/velox/functions/sparksql/fuzzer/SparkQueryRunner.cpp +++ b/velox/functions/sparksql/fuzzer/SparkQueryRunner.cpp @@ -90,6 +90,14 @@ const std::vector& SparkQueryRunner::supportedScalarTypes() const { return kScalarTypes; } +const std::unordered_map& +SparkQueryRunner::aggregationFunctionDataSpecs() const { + static const std::unordered_map + kAggregationFunctionDataSpecs{}; + + return kAggregationFunctionDataSpecs; +} + std::optional SparkQueryRunner::toSql( const velox::core::PlanNodePtr& plan) { if (const auto aggregationNode = diff --git a/velox/functions/sparksql/fuzzer/SparkQueryRunner.h b/velox/functions/sparksql/fuzzer/SparkQueryRunner.h index 038f3a82b04f..078deed96534 100644 --- a/velox/functions/sparksql/fuzzer/SparkQueryRunner.h +++ b/velox/functions/sparksql/fuzzer/SparkQueryRunner.h @@ -73,6 +73,9 @@ class SparkQueryRunner : public velox::exec::test::ReferenceQueryRunner { const std::vector& supportedScalarTypes() const override; + const std::unordered_map& + aggregationFunctionDataSpecs() const override; + bool supportsVeloxVectorResults() const override { return true; }