Skip to content

Commit

Permalink
Move Aggregation Function Data Specs into the Query Runners (#11192)
Browse files Browse the repository at this point in the history
Summary:

Aggregation Function Data Specs declare when we don't want to generate NaN/Infinity for the arguments to a particular aggregate.

Today these are hard coded in the AggregationFuzzer itself, but whether or not we
support NaN/Infinity in fuzzing an aggregate function is more tightly tied to what we
are comparing against (in what cases it's compatible with Velox) which Fuzzer we're
running (as evidenced by the fact the Presto and Spark Aggregation Fuzzers are using
the same lists currently).  We are seeing cases where fuzzing with the Presto Query
Runner is succeeding while fuzzing with the DuckDB Query Runner is failing.

To fix this I've moved the Aggregation Function Data Specs map into the Query
Runner, so that we can specify per Query Runner in what aggregation functions we
don't want to produce Nan/Infinity for.

Reviewed By: kewang1024

Differential Revision: D64011761
  • Loading branch information
Kevin Wilfong authored and facebook-github-bot committed Oct 8, 2024
1 parent 3fabdf5 commit d1bbe54
Show file tree
Hide file tree
Showing 13 changed files with 75 additions and 61 deletions.
2 changes: 0 additions & 2 deletions velox/exec/fuzzer/AggregationFuzzerOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@ struct AggregationFuzzerOptions {

std::unordered_set<std::string> orderDependentFunctions;

std::unordered_map<std::string, DataSpec> functionDataSpec;

/// Timestamp precision to use when generating inputs of type TIMESTAMP.
VectorFuzzer::Options::TimestampPrecision timestampPrecision{
VectorFuzzer::Options::TimestampPrecision::kMilliSeconds};
Expand Down
5 changes: 4 additions & 1 deletion velox/exec/fuzzer/AggregationFuzzerRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,15 @@ class AggregationFuzzerRunner {
registerVectorSerde();
facebook::velox::filesystems::registerLocalFileSystem();

auto& aggregationFunctionDataSpecs =
referenceQueryRunner->aggregationFunctionDataSpecs();

facebook::velox::exec::test::aggregateFuzzer(
filteredSignatures,
seed,
options.customVerificationFunctions,
options.customInputGenerators,
options.functionDataSpec,
aggregationFunctionDataSpecs,
options.timestampPrecision,
options.queryConfigs,
options.hiveConfigs,
Expand Down
23 changes: 23 additions & 0 deletions velox/exec/fuzzer/DuckQueryRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,29 @@ const std::vector<TypePtr>& DuckQueryRunner::supportedScalarTypes() const {
return kScalarTypes;
}

const std::unordered_map<std::string, DataSpec>&
DuckQueryRunner::aggregationFunctionDataSpecs() const {
// There are some functions for which DuckDB and Velox have inconsistent
// behavior with Nan and Infinity, so we exclude those.
static const std::unordered_map<std::string, DataSpec>
kAggregationFunctionDataSpecs{
{"covar_pop", DataSpec{true, false}},
{"covar_samp", DataSpec{true, false}},
{"histogram", DataSpec{false, false}},
{"regr_avgx", DataSpec{true, false}},
{"regr_avgy", DataSpec{true, false}},
{"regr_intercept", DataSpec{false, false}},
{"regr_r2", DataSpec{false, false}},
{"regr_replacement", DataSpec{false, false}},
{"regr_slope", DataSpec{false, false}},
{"regr_sxx", DataSpec{false, false}},
{"regr_sxy", DataSpec{false, false}},
{"regr_syy", DataSpec{false, false}},
{"var_pop", DataSpec{false, false}}};

return kAggregationFunctionDataSpecs;
}

std::multiset<std::vector<velox::variant>> DuckQueryRunner::execute(
const std::string& sql,
const std::vector<RowVectorPtr>& input,
Expand Down
3 changes: 3 additions & 0 deletions velox/exec/fuzzer/DuckQueryRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ class DuckQueryRunner : public ReferenceQueryRunner {
/// TODO Investigate mismatches reported when comparing Varbinary.
const std::vector<TypePtr>& supportedScalarTypes() const override;

const std::unordered_map<std::string, DataSpec>&
aggregationFunctionDataSpecs() const override;

/// Specify names of aggregate function to exclude from the list of supported
/// functions. Used to exclude functions that are non-determonistic, have bugs
/// or whose semantics differ from Velox.
Expand Down
23 changes: 23 additions & 0 deletions velox/exec/fuzzer/PrestoQueryRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,29 @@ const std::vector<TypePtr>& PrestoQueryRunner::supportedScalarTypes() const {
return kScalarTypes;
}

const std::unordered_map<std::string, DataSpec>&
PrestoQueryRunner::aggregationFunctionDataSpecs() const {
// For some functions, velox supports NaN, Infinity better than presto query
// runner, which makes the comparison impossible.
// Add data constraint in vector fuzzer to enforce to not generate such data
// for those functions before they are fixed in presto query runner
static const std::unordered_map<std::string, DataSpec>
kAggregationFunctionDataSpecs{
{"regr_avgx", DataSpec{false, false}},
{"regr_avgy", DataSpec{false, false}},
{"regr_r2", DataSpec{false, false}},
{"regr_sxx", DataSpec{false, false}},
{"regr_syy", DataSpec{false, false}},
{"regr_sxy", DataSpec{false, false}},
{"regr_slope", DataSpec{false, false}},
{"regr_replacement", DataSpec{false, false}},
{"covar_pop", DataSpec{true, false}},
{"covar_samp", DataSpec{true, false}},
};

return kAggregationFunctionDataSpecs;
}

std::optional<std::string> PrestoQueryRunner::toSql(
const std::shared_ptr<const core::AggregationNode>& aggregationNode) {
// Assume plan is Aggregation over Values.
Expand Down
3 changes: 3 additions & 0 deletions velox/exec/fuzzer/PrestoQueryRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ class PrestoQueryRunner : public velox::exec::test::ReferenceQueryRunner {

const std::vector<TypePtr>& supportedScalarTypes() const override;

const std::unordered_map<std::string, DataSpec>&
aggregationFunctionDataSpecs() const override;

/// Converts Velox query plan to Presto SQL. Supports Values -> Aggregation or
/// Window with an optional Project on top.
///
Expand Down
3 changes: 3 additions & 0 deletions velox/exec/fuzzer/ReferenceQueryRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ class ReferenceQueryRunner {
return defaultScalarTypes();
}

virtual const std::unordered_map<std::string, DataSpec>&
aggregationFunctionDataSpecs() const = 0;

/// Converts Velox plan into SQL accepted by the reference database.
/// @return std::nullopt if the plan uses features not supported by the
/// reference database.
Expand Down
5 changes: 4 additions & 1 deletion velox/exec/fuzzer/WindowFuzzerRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,14 +79,17 @@ class WindowFuzzerRunner {
registerVectorSerde();
facebook::velox::filesystems::registerLocalFileSystem();

auto& aggregationFunctionDataSpecs =
referenceQueryRunner->aggregationFunctionDataSpecs();

facebook::velox::exec::test::windowFuzzer(
filteredAggregationSignatures,
filteredWindowSignatures,
seed,
options.customVerificationFunctions,
options.customInputGenerators,
options.orderDependentFunctions,
options.functionDataSpec,
aggregationFunctionDataSpecs,
options.timestampPrecision,
options.queryConfigs,
options.hiveConfigs,
Expand Down
20 changes: 1 addition & 19 deletions velox/functions/prestosql/fuzzer/AggregationFuzzerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,24 +184,6 @@ int main(int argc, char** argv) {
{"sum_data_size_for_stats", nullptr},
};

using facebook::velox::DataSpec;
// For some functions, velox supports NaN, Infinity better than presto query
// runner, which makes the comparison impossible.
// Add data constraint in vector fuzzer to enforce to not generate such data
// for those functions before they are fixed in presto query runner
static const std::unordered_map<std::string, DataSpec> functionDataSpec = {
{"regr_avgx", DataSpec{false, false}},
{"regr_avgy", DataSpec{false, false}},
{"regr_r2", DataSpec{false, false}},
{"regr_sxx", DataSpec{false, false}},
{"regr_syy", DataSpec{false, false}},
{"regr_sxy", DataSpec{false, false}},
{"regr_slope", DataSpec{false, false}},
{"regr_replacement", DataSpec{false, false}},
{"covar_pop", DataSpec{true, false}},
{"covar_samp", DataSpec{true, false}},
};

using Runner = facebook::velox::exec::test::AggregationFuzzerRunner;
using Options = facebook::velox::exec::test::AggregationFuzzerOptions;

Expand All @@ -213,9 +195,9 @@ int main(int argc, char** argv) {
facebook::velox::exec::test::getCustomInputGenerators();
options.timestampPrecision =
facebook::velox::VectorFuzzer::Options::TimestampPrecision::kMilliSeconds;
options.functionDataSpec = functionDataSpec;
std::shared_ptr<facebook::velox::memory::MemoryPool> rootPool{
facebook::velox::memory::memoryManager()->addRootPool()};

return Runner::run(
initialSeed,
setupReferenceQueryRunner(
Expand Down
19 changes: 0 additions & 19 deletions velox/functions/prestosql/fuzzer/WindowFuzzerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,24 +141,6 @@ int main(int argc, char** argv) {
{"sum_data_size_for_stats", nullptr},
};

using facebook::velox::DataSpec;
// For some functions, velox supports NaN, Infinity better than presto query
// runner, which makes the comparison impossible.
// Add data spec in vector fuzzer to enforce to not generate such data
// for those functions before they are fixed in presto query runner
static const std::unordered_map<std::string, DataSpec> functionDataSpec = {
{"regr_avgx", DataSpec{false, false}},
{"regr_avgy", DataSpec{false, false}},
{"regr_r2", DataSpec{false, false}},
{"regr_sxx", DataSpec{false, false}},
{"regr_syy", DataSpec{false, false}},
{"regr_sxy", DataSpec{false, false}},
{"regr_slope", DataSpec{false, false}},
{"regr_replacement", DataSpec{false, false}},
{"covar_pop", DataSpec{true, false}},
{"covar_samp", DataSpec{true, false}},
};

static const std::unordered_set<std::string> orderDependentFunctions = {
// Window functions.
"first_value",
Expand Down Expand Up @@ -199,7 +181,6 @@ int main(int argc, char** argv) {
options.orderDependentFunctions = orderDependentFunctions;
options.timestampPrecision =
facebook::velox::VectorFuzzer::Options::TimestampPrecision::kMilliSeconds;
options.functionDataSpec = functionDataSpec;
std::shared_ptr<facebook::velox::memory::MemoryPool> rootPool{
facebook::velox::memory::memoryManager()->addRootPool()};
return Runner::run(
Expand Down
19 changes: 0 additions & 19 deletions velox/functions/sparksql/fuzzer/SparkAggregationFuzzerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,24 +111,6 @@ int main(int argc, char** argv) {
// formula. The results from the two methods are completely different.
"kurtosis"});

using facebook::velox::DataSpec;
// For some functions, velox supports NaN, Infinity better than presto query
// runner, which makes the comparison impossible.
// Add data spec in vector fuzzer to enforce to not generate such data
// for those functions before they are fixed in presto query runner
static const std::unordered_map<std::string, DataSpec> functionDataSpec = {
{"regr_avgx", DataSpec{false, false}},
{"regr_avgy", DataSpec{false, false}},
{"regr_r2", DataSpec{false, false}},
{"regr_sxx", DataSpec{false, false}},
{"regr_syy", DataSpec{false, false}},
{"regr_sxy", DataSpec{false, false}},
{"regr_slope", DataSpec{false, false}},
{"regr_replacement", DataSpec{false, false}},
{"covar_pop", DataSpec{true, false}},
{"covar_samp", DataSpec{true, false}},
};

using Runner = facebook::velox::exec::test::AggregationFuzzerRunner;
using Options = facebook::velox::exec::test::AggregationFuzzerOptions;

Expand All @@ -137,6 +119,5 @@ int main(int argc, char** argv) {
options.skipFunctions = skipFunctions;
options.customVerificationFunctions = customVerificationFunctions;
options.orderableGroupKeys = true;
options.functionDataSpec = functionDataSpec;
return Runner::run(initialSeed, std::move(duckQueryRunner), options);
}
8 changes: 8 additions & 0 deletions velox/functions/sparksql/fuzzer/SparkQueryRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,14 @@ const std::vector<TypePtr>& SparkQueryRunner::supportedScalarTypes() const {
return kScalarTypes;
}

const std::unordered_map<std::string, DataSpec>&
SparkQueryRunner::aggregationFunctionDataSpecs() const {
static const std::unordered_map<std::string, DataSpec>
kAggregationFunctionDataSpecs{};

return kAggregationFunctionDataSpecs;
}

std::optional<std::string> SparkQueryRunner::toSql(
const velox::core::PlanNodePtr& plan) {
if (const auto aggregationNode =
Expand Down
3 changes: 3 additions & 0 deletions velox/functions/sparksql/fuzzer/SparkQueryRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ class SparkQueryRunner : public velox::exec::test::ReferenceQueryRunner {

const std::vector<TypePtr>& supportedScalarTypes() const override;

const std::unordered_map<std::string, DataSpec>&
aggregationFunctionDataSpecs() const override;

bool supportsVeloxVectorResults() const override {
return true;
}
Expand Down

0 comments on commit d1bbe54

Please sign in to comment.