From beb39f1a4be6f569578720c04d429d174e40dd34 Mon Sep 17 00:00:00 2001 From: Bikramjeet Vig Date: Tue, 12 Nov 2024 14:10:47 -0800 Subject: [PATCH] [Draft] Add utility to read json files and test against Presto --- velox/exec/tests/utils/QueryAssertions.cpp | 22 +- .../expression/tests/ExpressionRunnerTest.cpp | 305 +++++++----------- 2 files changed, 131 insertions(+), 196 deletions(-) diff --git a/velox/exec/tests/utils/QueryAssertions.cpp b/velox/exec/tests/utils/QueryAssertions.cpp index de2541de6ce4..656a3b4c0507 100644 --- a/velox/exec/tests/utils/QueryAssertions.cpp +++ b/velox/exec/tests/utils/QueryAssertions.cpp @@ -685,8 +685,7 @@ std::string makeErrorMessage( message << extraRows.size() << " extra rows, " << missingRows.size() << " missing rows" << std::endl; - auto extraRowsToPrint = - std::min((size_t)FLAGS_max_error_rows, extraRows.size()); + auto extraRowsToPrint = extraRows.size(); message << extraRowsToPrint << " of extra rows:" << std::endl; for (int32_t i = 0; i < extraRowsToPrint; i++) { @@ -696,8 +695,7 @@ std::string makeErrorMessage( } message << std::endl; - auto missingRowsToPrint = - std::min((size_t)FLAGS_max_error_rows, missingRows.size()); + auto missingRowsToPrint = missingRows.size(); message << missingRowsToPrint << " of missing rows:" << std::endl; for (int32_t i = 0; i < missingRowsToPrint; i++) { message << "\t"; @@ -852,20 +850,28 @@ std::string generateUserFriendlyDiff( const MaterializedRowMultiset& actualRows, const TypePtr& type) { std::vector extraRows; - std::set_difference( +/* std::set_difference( actualRows.begin(), actualRows.end(), expectedRows.begin(), expectedRows.end(), - std::inserter(extraRows, extraRows.end())); + std::inserter(extraRows, extraRows.end()));*/ + + for(auto& item : actualRows) { + extraRows.push_back(item); + } std::vector missingRows; - std::set_difference( +/* std::set_difference( expectedRows.begin(), expectedRows.end(), actualRows.begin(), actualRows.end(), - std::inserter(missingRows, missingRows.end())); + std::inserter(missingRows, missingRows.end()));*/ + + for(auto& item : expectedRows) { + missingRows.push_back(item); + } return makeErrorMessage( missingRows, extraRows, expectedRows.size(), actualRows.size(), type); diff --git a/velox/expression/tests/ExpressionRunnerTest.cpp b/velox/expression/tests/ExpressionRunnerTest.cpp index 34b2f3be21be..b5371724fe7a 100644 --- a/velox/expression/tests/ExpressionRunnerTest.cpp +++ b/velox/expression/tests/ExpressionRunnerTest.cpp @@ -29,83 +29,19 @@ #include "velox/functions/prestosql/registration/RegistrationFunctions.h" #include "velox/functions/sparksql/Register.h" #include "velox/vector/VectorSaver.h" +#include "velox/vector/tests/utils/VectorTestBase.h" +#include "velox/parse/ExpressionsParser.h" +#include "velox/parse/TypeResolver.h" +#include "velox/expression/Expr.h" using namespace facebook::velox; using facebook::velox::exec::test::PrestoQueryRunner; using facebook::velox::test::ReferenceQueryRunner; -DEFINE_string( - input_path, - "", - "Path for vector to be restored from disk. This will enable single run " - "of the fuzzer with the on-disk persisted repro information. This has to " - "be set with sql_path and optionally result_path."); - -DEFINE_string( - sql_path, - "", - "Path for expression SQL to be restored from disk. This will enable " - "single run of the fuzzer with the on-disk persisted repro information. " - "This has to be set with input_path and optionally result_path."); - -DEFINE_string( - complex_constant_path, - "", - "Path for complex constants that aren't expressible in SQL."); - -DEFINE_string( - sql, - "", - "Comma separated SQL expressions to evaluate. This flag and --sql_path " - "flag are mutually exclusive. If both are specified, --sql is used and " - "--sql_path is ignored."); - -DEFINE_string( - registry, - "presto", - "Funciton registry to use for expression evaluation. Currently supported values are " - "presto and spark. Default is presto."); - -DEFINE_string( - result_path, - "", - "Path for result vector to restore from disk. This is optional for " - "on-disk reproduction. Don't set if the initial repro result vector is " - "nullptr"); - -DEFINE_string( - mode, - "common", - "Mode for expression runner: \n" - "verify: evaluate the expression and compare results between common and " - "simplified paths.\n" - "common: evaluate the expression using common path and print out results.\n" - "simplified: evaluate the expression using simplified path and print out " - "results.\n" - "query: evaluate SQL query specified in --sql or --sql_path and print out " - "results. If --input_path is specified, the query may reference it as " - "table 't'."); - -DEFINE_string( - lazy_column_list_path, - "", - "Path for the file stored on-disk which contains a vector of column " - "indices that specify which columns of the input row vector should " - "be wrapped in lazy."); - -DEFINE_bool( - use_seperate_memory_pool_for_input_vector, - true, - "If true, expression evaluator and input vectors use different memory pools." - " This helps trigger code-paths that can depend on vectors having different" - " pools. For eg, when copying a flat string vector copies of the strings" - " stored in the string buffers need to be created. If however, the pools" - " were the same between the vectors then the buffers can simply be shared" - " between them instead."); DEFINE_string( reference_db_url, - "", + "http://127.0.0.1:8080", "ReferenceDB URI along with port. If set, we use the reference DB as the " "source of truth. Otherwise, use Velox simplified eval path. Example: " "--reference_db_url=http://127.0.0.1:8080"); @@ -116,105 +52,117 @@ DEFINE_uint32( "Timeout in milliseconds for HTTP requests made to reference DB, " "such as Presto. Example: --req_timeout_ms=2000"); -static bool validateMode(const char* flagName, const std::string& value) { - static const std::unordered_set kModes = { - "common", "simplified", "verify", "query"}; - if (kModes.count(value) != 1) { - std::cerr << "Invalid value for --" << flagName << ": " << value << ". "; - std::cerr << "Valid values are: " << folly::join(", ", kModes) << "." - << std::endl; - return false; - } +DEFINE_string(json_queries_path, "", ""); - return true; -} +class JsonExtractTestSuite : public facebook::velox::test::VectorTestBase { -static bool validateRegistry(const char* flagName, const std::string& value) { - static const std::unordered_set kRegistries = { - "presto", "spark"}; - if (kRegistries.count(value) != 1) { - std::cerr << "Invalid value for --" << flagName << ": " << value << ". "; - std::cerr << "Valid values are: " << folly::join(", ", kRegistries) << "." - << std::endl; - return false; - } - if (value == "spark") { - functions::sparksql::registerFunctions(""); - } else if (value == "presto") { - functions::prestosql::registerAllScalarFunctions(); - } + std::vector parseSql( + const std::string& sql, + const TypePtr& inputType, + memory::MemoryPool* pool, + const VectorPtr& complexConstants) { + auto exprs = parse::parseMultipleExpressions(sql, {}); - return true; -} - -DEFINE_validator(mode, &validateMode); -DEFINE_validator(registry, &validateRegistry); - -DEFINE_int32( - num_rows, - 10, - "Maximum number of rows to process. Zero means 'all rows'. Applies to " - "'common' and 'simplified' modes only. Ignored for 'verify' mode."); - -DEFINE_string( - store_result_path, - "", - "Directory path for storing the results of evaluating SQL expression or " - "query in common, simplified or query modes."); - -DEFINE_string( - fuzzer_repro_path, - "", - "Directory path where all input files generated by ExpressionVerifier are " - "expected to reside. For more details on which files and their names are " - "expected, please checkout the ExpressionVerifier class. Any file paths " - "already specified via a startup flag will take precedence."); - -DEFINE_bool( - find_minimal_subexpression, - false, - "Automatically seeks minimum failed subexpression on result mismatch"); + std::vector typedExprs; + typedExprs.reserve(exprs.size()); + for (const auto& expr : exprs) { + typedExprs.push_back( + core::Expressions::inferTypes(expr, inputType, pool, complexConstants)); + } + return typedExprs; + } -static std::string checkAndReturnFilePath( - const std::string_view& fileName, - const std::string& flagName) { - auto path = fmt::format("{}/{}", FLAGS_fuzzer_repro_path, fileName); - if (fs::exists(path)) { - LOG(INFO) << "Using " << flagName << " = " << path; - return path; - } else { - LOG(INFO) << "File for " << flagName << " not found."; + RowVectorPtr createRowVector( + const std::vector& vectors, + vector_size_t size, + memory::MemoryPool* pool) { + auto n = vectors.size(); + + std::vector names; + names.reserve(n); + std::vector types; + types.reserve(n); + for (auto i = 0; i < n; ++i) { + names.push_back(fmt::format("_col{}", i)); + types.push_back(vectors[i]->type()); + } + + return std::make_shared( + pool, ROW(std::move(names), std::move(types)), nullptr, size, vectors); } - return ""; -} -static void checkDirForExpectedFiles() { - LOG(INFO) << "Searching input directory for expected files at " - << FLAGS_fuzzer_repro_path; + RowVectorPtr evaluateAndPrintResults( + exec::ExprSet& exprSet, + const RowVectorPtr& data, + const SelectivityVector& rows, + core::ExecCtx& execCtx) { + exec::EvalCtx evalCtx(&execCtx, &exprSet, data.get()); + + std::vector results(1); + exprSet.eval(rows, evalCtx, results); + + // Print the results. + auto rowResult = createRowVector(results, rows.size(), execCtx.pool()); + std::cout << "Result: " << rowResult->type()->toString() << std::endl; + exec::test::printResults(rowResult, std::cout); + return rowResult; + } - FLAGS_input_path = FLAGS_input_path.empty() - ? checkAndReturnFilePath( - test::ExpressionVerifier::kInputVectorFileName, "input_path") - : FLAGS_input_path; - FLAGS_result_path = FLAGS_result_path.empty() - ? checkAndReturnFilePath( - test::ExpressionVerifier::kResultVectorFileName, "result_path") - : FLAGS_result_path; - FLAGS_sql_path = FLAGS_sql_path.empty() - ? checkAndReturnFilePath( - test::ExpressionVerifier::kExpressionSqlFileName, "sql_path") - : FLAGS_sql_path; - FLAGS_lazy_column_list_path = FLAGS_lazy_column_list_path.empty() - ? checkAndReturnFilePath( - test::ExpressionVerifier::kIndicesOfLazyColumnsFileName, - "lazy_column_list_path") - : FLAGS_lazy_column_list_path; - FLAGS_complex_constant_path = FLAGS_complex_constant_path.empty() - ? checkAndReturnFilePath( - test::ExpressionVerifier::kComplexConstantsFileName, - "complex_constant_path") - : FLAGS_complex_constant_path; -} + public: + void execute( + std::string rootFolderPath, + std::shared_ptr& referenceQueryRunner) { + // Specify the file name to read from each subfolder + std::string jsonfileName = "document.json"; + std::string selectorFileName = "selector"; + // Iterate through all subfolders in the root folder + std::vector testNames; + std::vector jsons; + std::vector jsonPaths; + for (const auto& entry : + std::filesystem::directory_iterator(rootFolderPath)) { + if (entry.is_directory()) { + testNames.push_back(entry.path().stem().string()); + + std::string filePath = entry.path().string() + "/" + jsonfileName; + jsons.push_back(restoreStringFromFile(filePath.c_str())); + + filePath = entry.path().string() + "/" + selectorFileName; + auto str = restoreStringFromFile(filePath.c_str()); + if(!str.empty() && str.back() == '\n') str.pop_back(); + jsonPaths.push_back(str); + } + } + + auto inputVector = makeRowVector( + {"test_name", "json_col", "path_col"}, + {makeFlatVector(testNames), + makeFlatVector(jsons), + makeFlatVector(jsonPaths)}); + SelectivityVector rows(inputVector->size()); + std::string sql = "test_name,path_col, try(json_extract(json_parse(json_col), path_col))"; + VectorPtr complexConstants{nullptr}; + parse::registerTypeResolver(); + auto typedExprs = + parseSql(sql, inputVector->type(), pool(), complexConstants); + VectorPtr resultVector = BaseVector::create( + ROW({"c0", "c1"}, {VARCHAR(), VARCHAR()}), inputVector->size(), pool()); + auto queryCtx = core::QueryCtx::create(); + core::ExecCtx execCtx{pool(), queryCtx.get()}; + auto verifier = + test::ExpressionVerifier(&execCtx, {false, ""}, referenceQueryRunner); + verifier.verify( + typedExprs, + inputVector, + std::nullopt, + std::move(resultVector), + true, + {}); + + //exec::ExprSet exprSet(typedExprs, &execCtx); + //auto results = evaluateAndPrintResults(exprSet, inputVector, rows, execCtx); + } +}; int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); @@ -223,20 +171,7 @@ int main(int argc, char** argv) { // experience, and initialize glog and gflags. folly::Init init(&argc, &argv); - if (!FLAGS_fuzzer_repro_path.empty()) { - checkDirForExpectedFiles(); - } - - if (FLAGS_sql.empty() && FLAGS_sql_path.empty()) { - std::cerr << "One of --sql or --sql_path flags must be set." << std::endl; - exit(1); - } - - auto sql = FLAGS_sql; - if (sql.empty()) { - sql = restoreStringFromFile(FLAGS_sql_path.c_str()); - VELOX_CHECK(!sql.empty()); - } + VELOX_CHECK(!FLAGS_json_queries_path.empty()); memory::initializeMemoryManager({}); @@ -246,10 +181,12 @@ int main(int argc, char** argv) { exec::test::registerHiveConnector({}); dwrf::registerDwrfWriterFactory(); + functions::prestosql::registerAllScalarFunctions(); + std::shared_ptr rootPool{ facebook::velox::memory::memoryManager()->addRootPool()}; std::shared_ptr referenceQueryRunner{nullptr}; - if (FLAGS_registry == "presto" && !FLAGS_reference_db_url.empty()) { + if (!FLAGS_reference_db_url.empty()) { referenceQueryRunner = std::make_shared( rootPool.get(), FLAGS_reference_db_url, @@ -258,16 +195,8 @@ int main(int argc, char** argv) { LOG(INFO) << "Using Presto as the reference DB."; } - test::ExpressionRunner::run( - FLAGS_input_path, - sql, - FLAGS_complex_constant_path, - FLAGS_result_path, - FLAGS_mode, - FLAGS_num_rows, - FLAGS_store_result_path, - FLAGS_lazy_column_list_path, - referenceQueryRunner, - FLAGS_find_minimal_subexpression, - FLAGS_use_seperate_memory_pool_for_input_vector); + // Specify the root folder path + std::string rootFolderPath = FLAGS_json_queries_path; + JsonExtractTestSuite testSuite; + testSuite.execute(rootFolderPath, referenceQueryRunner); }