Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move Aggregation Function Data Specs into the Query Runners #11192

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions velox/exec/fuzzer/AggregationFuzzerOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@ struct AggregationFuzzerOptions {

std::unordered_set<std::string> orderDependentFunctions;

std::unordered_map<std::string, DataSpec> functionDataSpec;

/// Timestamp precision to use when generating inputs of type TIMESTAMP.
VectorFuzzer::Options::TimestampPrecision timestampPrecision{
VectorFuzzer::Options::TimestampPrecision::kMilliSeconds};
Expand Down
5 changes: 4 additions & 1 deletion velox/exec/fuzzer/AggregationFuzzerRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,15 @@ class AggregationFuzzerRunner {
registerVectorSerde();
facebook::velox::filesystems::registerLocalFileSystem();

auto& aggregationFunctionDataSpecs =
referenceQueryRunner->aggregationFunctionDataSpecs();

facebook::velox::exec::test::aggregateFuzzer(
filteredSignatures,
seed,
options.customVerificationFunctions,
options.customInputGenerators,
options.functionDataSpec,
aggregationFunctionDataSpecs,
options.timestampPrecision,
options.queryConfigs,
options.hiveConfigs,
Expand Down
23 changes: 23 additions & 0 deletions velox/exec/fuzzer/DuckQueryRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,29 @@ const std::vector<TypePtr>& DuckQueryRunner::supportedScalarTypes() const {
return kScalarTypes;
}

const std::unordered_map<std::string, DataSpec>&
DuckQueryRunner::aggregationFunctionDataSpecs() const {
// There are some functions for which DuckDB and Velox have inconsistent
// behavior with Nan and Infinity, so we exclude those.
static const std::unordered_map<std::string, DataSpec>
kAggregationFunctionDataSpecs{
{"covar_pop", DataSpec{true, false}},
{"covar_samp", DataSpec{true, false}},
{"histogram", DataSpec{false, false}},
{"regr_avgx", DataSpec{true, false}},
{"regr_avgy", DataSpec{true, false}},
{"regr_intercept", DataSpec{false, false}},
{"regr_r2", DataSpec{false, false}},
{"regr_replacement", DataSpec{false, false}},
{"regr_slope", DataSpec{false, false}},
{"regr_sxx", DataSpec{false, false}},
{"regr_sxy", DataSpec{false, false}},
{"regr_syy", DataSpec{false, false}},
{"var_pop", DataSpec{false, false}}};

return kAggregationFunctionDataSpecs;
}

std::multiset<std::vector<velox::variant>> DuckQueryRunner::execute(
const std::string& sql,
const std::vector<RowVectorPtr>& input,
Expand Down
3 changes: 3 additions & 0 deletions velox/exec/fuzzer/DuckQueryRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ class DuckQueryRunner : public ReferenceQueryRunner {
/// TODO Investigate mismatches reported when comparing Varbinary.
const std::vector<TypePtr>& supportedScalarTypes() const override;

const std::unordered_map<std::string, DataSpec>&
aggregationFunctionDataSpecs() const override;

/// Specify names of aggregate function to exclude from the list of supported
/// functions. Used to exclude functions that are non-determonistic, have bugs
/// or whose semantics differ from Velox.
Expand Down
23 changes: 23 additions & 0 deletions velox/exec/fuzzer/PrestoQueryRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,29 @@ const std::vector<TypePtr>& PrestoQueryRunner::supportedScalarTypes() const {
return kScalarTypes;
}

const std::unordered_map<std::string, DataSpec>&
PrestoQueryRunner::aggregationFunctionDataSpecs() const {
// For some functions, velox supports NaN, Infinity better than presto query
// runner, which makes the comparison impossible.
// Add data constraint in vector fuzzer to enforce to not generate such data
// for those functions before they are fixed in presto query runner
static const std::unordered_map<std::string, DataSpec>
kAggregationFunctionDataSpecs{
{"regr_avgx", DataSpec{false, false}},
{"regr_avgy", DataSpec{false, false}},
{"regr_r2", DataSpec{false, false}},
{"regr_sxx", DataSpec{false, false}},
{"regr_syy", DataSpec{false, false}},
{"regr_sxy", DataSpec{false, false}},
{"regr_slope", DataSpec{false, false}},
{"regr_replacement", DataSpec{false, false}},
{"covar_pop", DataSpec{true, false}},
{"covar_samp", DataSpec{true, false}},
};

return kAggregationFunctionDataSpecs;
}

std::optional<std::string> PrestoQueryRunner::toSql(
const std::shared_ptr<const core::AggregationNode>& aggregationNode) {
// Assume plan is Aggregation over Values.
Expand Down
3 changes: 3 additions & 0 deletions velox/exec/fuzzer/PrestoQueryRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ class PrestoQueryRunner : public velox::exec::test::ReferenceQueryRunner {

const std::vector<TypePtr>& supportedScalarTypes() const override;

const std::unordered_map<std::string, DataSpec>&
aggregationFunctionDataSpecs() const override;

/// Converts Velox query plan to Presto SQL. Supports Values -> Aggregation or
/// Window with an optional Project on top.
///
Expand Down
3 changes: 3 additions & 0 deletions velox/exec/fuzzer/ReferenceQueryRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ class ReferenceQueryRunner {
return defaultScalarTypes();
}

virtual const std::unordered_map<std::string, DataSpec>&
aggregationFunctionDataSpecs() const = 0;

/// Converts Velox plan into SQL accepted by the reference database.
/// @return std::nullopt if the plan uses features not supported by the
/// reference database.
Expand Down
5 changes: 4 additions & 1 deletion velox/exec/fuzzer/WindowFuzzerRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,14 +79,17 @@ class WindowFuzzerRunner {
registerVectorSerde();
facebook::velox::filesystems::registerLocalFileSystem();

auto& aggregationFunctionDataSpecs =
referenceQueryRunner->aggregationFunctionDataSpecs();

facebook::velox::exec::test::windowFuzzer(
filteredAggregationSignatures,
filteredWindowSignatures,
seed,
options.customVerificationFunctions,
options.customInputGenerators,
options.orderDependentFunctions,
options.functionDataSpec,
aggregationFunctionDataSpecs,
options.timestampPrecision,
options.queryConfigs,
options.hiveConfigs,
Expand Down
20 changes: 1 addition & 19 deletions velox/functions/prestosql/fuzzer/AggregationFuzzerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,24 +184,6 @@ int main(int argc, char** argv) {
{"sum_data_size_for_stats", nullptr},
};

using facebook::velox::DataSpec;
// For some functions, velox supports NaN, Infinity better than presto query
// runner, which makes the comparison impossible.
// Add data constraint in vector fuzzer to enforce to not generate such data
// for those functions before they are fixed in presto query runner
static const std::unordered_map<std::string, DataSpec> functionDataSpec = {
{"regr_avgx", DataSpec{false, false}},
{"regr_avgy", DataSpec{false, false}},
{"regr_r2", DataSpec{false, false}},
{"regr_sxx", DataSpec{false, false}},
{"regr_syy", DataSpec{false, false}},
{"regr_sxy", DataSpec{false, false}},
{"regr_slope", DataSpec{false, false}},
{"regr_replacement", DataSpec{false, false}},
{"covar_pop", DataSpec{true, false}},
{"covar_samp", DataSpec{true, false}},
};

using Runner = facebook::velox::exec::test::AggregationFuzzerRunner;
using Options = facebook::velox::exec::test::AggregationFuzzerOptions;

Expand All @@ -213,9 +195,9 @@ int main(int argc, char** argv) {
facebook::velox::exec::test::getCustomInputGenerators();
options.timestampPrecision =
facebook::velox::VectorFuzzer::Options::TimestampPrecision::kMilliSeconds;
options.functionDataSpec = functionDataSpec;
std::shared_ptr<facebook::velox::memory::MemoryPool> rootPool{
facebook::velox::memory::memoryManager()->addRootPool()};

return Runner::run(
initialSeed,
setupReferenceQueryRunner(
Expand Down
19 changes: 0 additions & 19 deletions velox/functions/prestosql/fuzzer/WindowFuzzerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,24 +141,6 @@ int main(int argc, char** argv) {
{"sum_data_size_for_stats", nullptr},
};

using facebook::velox::DataSpec;
// For some functions, velox supports NaN, Infinity better than presto query
// runner, which makes the comparison impossible.
// Add data spec in vector fuzzer to enforce to not generate such data
// for those functions before they are fixed in presto query runner
static const std::unordered_map<std::string, DataSpec> functionDataSpec = {
{"regr_avgx", DataSpec{false, false}},
{"regr_avgy", DataSpec{false, false}},
{"regr_r2", DataSpec{false, false}},
{"regr_sxx", DataSpec{false, false}},
{"regr_syy", DataSpec{false, false}},
{"regr_sxy", DataSpec{false, false}},
{"regr_slope", DataSpec{false, false}},
{"regr_replacement", DataSpec{false, false}},
{"covar_pop", DataSpec{true, false}},
{"covar_samp", DataSpec{true, false}},
};

static const std::unordered_set<std::string> orderDependentFunctions = {
// Window functions.
"first_value",
Expand Down Expand Up @@ -199,7 +181,6 @@ int main(int argc, char** argv) {
options.orderDependentFunctions = orderDependentFunctions;
options.timestampPrecision =
facebook::velox::VectorFuzzer::Options::TimestampPrecision::kMilliSeconds;
options.functionDataSpec = functionDataSpec;
std::shared_ptr<facebook::velox::memory::MemoryPool> rootPool{
facebook::velox::memory::memoryManager()->addRootPool()};
return Runner::run(
Expand Down
19 changes: 0 additions & 19 deletions velox/functions/sparksql/fuzzer/SparkAggregationFuzzerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,24 +111,6 @@ int main(int argc, char** argv) {
// formula. The results from the two methods are completely different.
"kurtosis"});

using facebook::velox::DataSpec;
// For some functions, velox supports NaN, Infinity better than presto query
// runner, which makes the comparison impossible.
// Add data spec in vector fuzzer to enforce to not generate such data
// for those functions before they are fixed in presto query runner
static const std::unordered_map<std::string, DataSpec> functionDataSpec = {
{"regr_avgx", DataSpec{false, false}},
{"regr_avgy", DataSpec{false, false}},
{"regr_r2", DataSpec{false, false}},
{"regr_sxx", DataSpec{false, false}},
{"regr_syy", DataSpec{false, false}},
{"regr_sxy", DataSpec{false, false}},
{"regr_slope", DataSpec{false, false}},
{"regr_replacement", DataSpec{false, false}},
{"covar_pop", DataSpec{true, false}},
{"covar_samp", DataSpec{true, false}},
};

using Runner = facebook::velox::exec::test::AggregationFuzzerRunner;
using Options = facebook::velox::exec::test::AggregationFuzzerOptions;

Expand All @@ -137,6 +119,5 @@ int main(int argc, char** argv) {
options.skipFunctions = skipFunctions;
options.customVerificationFunctions = customVerificationFunctions;
options.orderableGroupKeys = true;
options.functionDataSpec = functionDataSpec;
return Runner::run(initialSeed, std::move(duckQueryRunner), options);
}
8 changes: 8 additions & 0 deletions velox/functions/sparksql/fuzzer/SparkQueryRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,14 @@ const std::vector<TypePtr>& SparkQueryRunner::supportedScalarTypes() const {
return kScalarTypes;
}

const std::unordered_map<std::string, DataSpec>&
SparkQueryRunner::aggregationFunctionDataSpecs() const {
static const std::unordered_map<std::string, DataSpec>
kAggregationFunctionDataSpecs{};

return kAggregationFunctionDataSpecs;
}

std::optional<std::string> SparkQueryRunner::toSql(
const velox::core::PlanNodePtr& plan) {
if (const auto aggregationNode =
Expand Down
3 changes: 3 additions & 0 deletions velox/functions/sparksql/fuzzer/SparkQueryRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ class SparkQueryRunner : public velox::exec::test::ReferenceQueryRunner {

const std::vector<TypePtr>& supportedScalarTypes() const override;

const std::unordered_map<std::string, DataSpec>&
aggregationFunctionDataSpecs() const override;

bool supportsVeloxVectorResults() const override {
return true;
}
Expand Down
Loading