From aa5ea0d5925d7784a4c4019faef0678762d4daf3 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Mon, 30 Oct 2023 23:35:14 -0700 Subject: [PATCH 1/2] Kusto-phase3: Optimizing the 'summarize' operator --- src/Parsers/Kusto/ParserKQLQuery.cpp | 12 +++++---- src/Parsers/Kusto/ParserKQLSummarize.cpp | 26 +++++++++++++++++-- .../KQL/gtest_KQL_AggregateFunctions.cpp | 12 +++++++++ .../0_stateless/02366_kql_summarize.reference | 8 ++++++ .../0_stateless/02366_kql_summarize.sql | 5 ++++ 5 files changed, 56 insertions(+), 7 deletions(-) diff --git a/src/Parsers/Kusto/ParserKQLQuery.cpp b/src/Parsers/Kusto/ParserKQLQuery.cpp index cf462da4a879..964deaa1a133 100644 --- a/src/Parsers/Kusto/ParserKQLQuery.cpp +++ b/src/Parsers/Kusto/ParserKQLQuery.cpp @@ -60,7 +60,7 @@ const std::unordered_map {"order by", {"order by", false, false, false, 4}}, {"table", {"table", false, false, false, 3}}, {"print", {"print", false, true, false, 3}}, - {"summarize", {"summarize", true, true, false, 3}}, + {"summarize", {"summarize", false, true, false, 3}}, {"make-series", {"make-series", true, true, false, 5}}, {"mv-expand", {"mv-expand", true, true, false, 5}}, {"count", {"count", true, true, false, 3}}, @@ -629,9 +629,10 @@ bool ParserKQLQuery::executeImpl(Pos & pos, ASTPtr & node, Expected & expected) limit_clause = op_str; else if (op == "order by" || op == "sort by") order_clause = order_clause.empty() ? op_str : order_clause + "," + op_str; + return op == "project" || op == "where" || op == "filter" || op == "limit" || op == "take" ||op == "order by" || op == "sort by"; }; - set_main_query_clause(last_op, last_pos); + bool last_op_processed = set_main_query_clause(last_op, last_pos); operation_pos.pop_back(); @@ -672,9 +673,6 @@ bool ParserKQLQuery::executeImpl(Pos & pos, ASTPtr & node, Expected & expected) return false; } - if (!kql_operator_p->parse(npos, node, expected)) - return false; - auto set_query_clause = [&](const String & op_str, const String & op_clause) { auto parser = getOperator(op_str); @@ -695,6 +693,10 @@ bool ParserKQLQuery::executeImpl(Pos & pos, ASTPtr & node, Expected & expected) || (!where_clause.empty() && !set_query_clause("where", where_clause)) || (!limit_clause.empty() && !set_query_clause("limit", limit_clause))) return false; + + if (!last_op_processed) + if (!kql_operator_p->parse(npos, node, expected)) + return false; } if (auto * select_query = node->as(); !select_query->select()) diff --git a/src/Parsers/Kusto/ParserKQLSummarize.cpp b/src/Parsers/Kusto/ParserKQLSummarize.cpp index c2a23af55453..ffcfd97fe57d 100644 --- a/src/Parsers/Kusto/ParserKQLSummarize.cpp +++ b/src/Parsers/Kusto/ParserKQLSummarize.cpp @@ -21,6 +21,8 @@ #include #include #include +#include +#include namespace DB { @@ -201,7 +203,27 @@ bool ParserKQLSummarize::parseImpl(Pos & pos, ASTPtr & node, Expected & expected expr_columns = expr_columns + "," + expr_aggregation; } - String converted_columns = getExprFromToken(expr_columns, pos.max_depth); + std::unordered_map alias_map; + auto select_expr = node->as()->select(); + if (select_expr) + { + std::ranges::for_each( + node->as()->select()->children, + [&alias_map](const ASTPtr & expression) + { + if (const auto alias = expression->tryGetAlias(); !alias.empty()) + { + alias_map[alias] = expression->getColumnNameWithoutAlias(); + } + }); + } + String converted_columns_raw = getExprFromToken(expr_columns, pos.max_depth); + std::istringstream column_stream(converted_columns_raw); + String converted_columns; + for (String value; std::getline(column_stream, value, ',');) { + converted_columns += alias_map.contains(value) ? alias_map[value] + " as " + value : value; + converted_columns +=","; + } Tokens token_converted_columns(converted_columns.c_str(), converted_columns.c_str() + converted_columns.size()); IParser::Pos pos_converted_columns(token_converted_columns, pos.max_depth); @@ -210,7 +232,7 @@ bool ParserKQLSummarize::parseImpl(Pos & pos, ASTPtr & node, Expected & expected return false; node->as()->setExpression(ASTSelectQuery::Expression::SELECT, std::move(select_expression_list)); - + node->as()->setExpression(ASTSelectQuery::Expression::ORDER_BY, nullptr); if (groupby) { String converted_groupby = getExprFromToken(expr_groupby, pos.max_depth); diff --git a/src/Parsers/tests/KQL/gtest_KQL_AggregateFunctions.cpp b/src/Parsers/tests/KQL/gtest_KQL_AggregateFunctions.cpp index 173b8ea789aa..08708722ce26 100644 --- a/src/Parsers/tests/KQL/gtest_KQL_AggregateFunctions.cpp +++ b/src/Parsers/tests/KQL/gtest_KQL_AggregateFunctions.cpp @@ -121,6 +121,18 @@ INSTANTIATE_TEST_SUITE_P(ParserKQLQuery_Aggregate, ParserTest, { "Customers | summarize x = hll(Education), y = hll(Occupation) | project xy = hll_merge(x, y) | project dcount_hll(xy);", "SELECT uniqCombined64Merge(18)(xy) AS Column1\nFROM\n(\n SELECT uniqCombined64MergeState(18)(arrayJoin([x, y])) AS xy\n FROM\n (\n SELECT\n uniqCombined64State(18)(Education) AS x,\n uniqCombined64State(18)(Occupation) AS y\n FROM Customers\n )\n)" + }, + { + "events_dist | project original_time, ip | where unixtime_milliseconds_todatetime(original_time) > ago(1h) | summarize ip_count = count(*) by ip;", + "SELECT\n ip,\n count(*) AS ip_count\nFROM events_dist\nWHERE kql_todatetime(fromUnixTimestamp64Milli(original_time, 'UTC')) > (now64(9, 'UTC') + (-1 * toIntervalNanosecond(3600000000000)))\nGROUP BY ip" + }, + { + "events_dist | project abc = ip ,original_time | where unixtime_milliseconds_todatetime(original_time) > ago(1h) | summarize ip_count = count(*) by abc;", + "SELECT\n ip AS abc,\n count(*) AS ip_count\nFROM events_dist\nWHERE kql_todatetime(fromUnixTimestamp64Milli(original_time, 'UTC')) > (now64(9, 'UTC') + (-1 * toIntervalNanosecond(3600000000000)))\nGROUP BY abc" + }, + { + "events_dist | where unixtime_milliseconds_todatetime(original_time) > ago(1h) | summarize ip_count = count(*) by ip | summarize avg(ip_count);", + "SELECT avg(ip_count) AS avg_ip_count\nFROM\n(\n SELECT\n ip,\n count(*) AS ip_count\n FROM events_dist\n WHERE kql_todatetime(fromUnixTimestamp64Milli(original_time, 'UTC')) > (now64(9, 'UTC') + (-1 * toIntervalNanosecond(3600000000000)))\n GROUP BY ip\n)" } }))); diff --git a/tests/queries/0_stateless/02366_kql_summarize.reference b/tests/queries/0_stateless/02366_kql_summarize.reference index 25c8673ba394..9f014df1dc10 100644 --- a/tests/queries/0_stateless/02366_kql_summarize.reference +++ b/tests/queries/0_stateless/02366_kql_summarize.reference @@ -139,3 +139,11 @@ Management abcd defg Stephanie Cox 33 3 7 7 +-- summarize optimization -- +Skilled Manual 2 +Professional 3 +Management abcd defg 1 +Skilled Manual 2 +Professional 3 +Management abcd defg 1 +2 diff --git a/tests/queries/0_stateless/02366_kql_summarize.sql b/tests/queries/0_stateless/02366_kql_summarize.sql index e35035d574b6..4b6fcc1a8841 100644 --- a/tests/queries/0_stateless/02366_kql_summarize.sql +++ b/tests/queries/0_stateless/02366_kql_summarize.sql @@ -151,3 +151,8 @@ Customers | summarize x = hll(Education) | project dcount_hll(x); Customers | summarize y = hll(Occupation) | project dcount_hll(y); Customers | summarize x = hll(Education), y = hll(Occupation) | project xy = hll_merge(x, y) | project dcount_hll(xy); Customers | summarize x = hll(Education), y = hll(Occupation) | summarize xy = hll_merge(x, y) | project dcount_hll(xy); + +print '-- summarize optimization --'; +Customers|project Occupation, Age |where Age > 30 | summarize occ_count = count(*) by Occupation; +Customers|project prefession = Occupation, Age | where Age > 30 | summarize occ_count = count(*) by prefession; +Customers|project prefession = Occupation, Age |where Age > 30 | summarize occ_count = count(*) by prefession |summarize avg(occ_count); \ No newline at end of file From ee4bd0f6454ebcd0b3759cf0f854c3126e176d4a Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Wed, 1 Nov 2023 09:35:12 -0700 Subject: [PATCH 2/2] Kusto-phase3: update release note for summarize optimization --- src/Parsers/Kusto/KQL_ReleaseNote.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/Parsers/Kusto/KQL_ReleaseNote.md b/src/Parsers/Kusto/KQL_ReleaseNote.md index 94cafc0b92df..09637a43dc27 100644 --- a/src/Parsers/Kusto/KQL_ReleaseNote.md +++ b/src/Parsers/Kusto/KQL_ReleaseNote.md @@ -5,6 +5,25 @@ ``` print lookup_contains('test_rocksDB',1); ``` +- Optimizing the 'summarize' operator. + + KQL: + ``` + events_dist + | project original_time, ip + | where unixtime_milliseconds_todatetime(original_time) > ago(1h) + | summarize ip_count = count(*) by ip + ``` + Optimal SQL: + ``` + SELECT + ip, + count(*) AS ip_count + FROM events_dist + WHERE kql_todatetime(fromUnixTimestamp64Milli(original_time, 'UTC')) > (now64(9, 'UTC') + (-1 * toIntervalNanosecond(3600000000000))) + GROUP BY ip + ``` + # July, 2023 ## Features