From 9c3be169d0d29a27289101fd28d413ba2ee1c612 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Tue, 25 Oct 2022 09:24:37 -0700 Subject: [PATCH 1/2] Kusto-phase3: Add top and top-hitters operator --- src/Parsers/Kusto/ParserKQLQuery.cpp | 100 +++++++++++------- src/Parsers/Kusto/ParserKQLQuery.h | 16 ++- src/Parsers/Kusto/ParserKQLTop.cpp | 57 ++++++++++ src/Parsers/Kusto/ParserKQLTop.h | 17 +++ src/Parsers/Kusto/ParserKQLTopHitter.cpp | 76 +++++++++++++ src/Parsers/Kusto/ParserKQLTopHitter.h | 17 +++ src/Parsers/tests/KQL/gtest_KQL_TopHitter.cpp | 57 ++++++++++ .../02366_kql_top_hitters.reference | 38 +++++++ .../0_stateless/02366_kql_top_hitters.sql | 39 +++++++ 9 files changed, 376 insertions(+), 41 deletions(-) create mode 100644 src/Parsers/Kusto/ParserKQLTop.cpp create mode 100644 src/Parsers/Kusto/ParserKQLTop.h create mode 100644 src/Parsers/Kusto/ParserKQLTopHitter.cpp create mode 100644 src/Parsers/Kusto/ParserKQLTopHitter.h create mode 100644 src/Parsers/tests/KQL/gtest_KQL_TopHitter.cpp create mode 100644 tests/queries/0_stateless/02366_kql_top_hitters.reference create mode 100644 tests/queries/0_stateless/02366_kql_top_hitters.sql diff --git a/src/Parsers/Kusto/ParserKQLQuery.cpp b/src/Parsers/Kusto/ParserKQLQuery.cpp index 0f73a64a8af..1d4ec56557b 100644 --- a/src/Parsers/Kusto/ParserKQLQuery.cpp +++ b/src/Parsers/Kusto/ParserKQLQuery.cpp @@ -24,6 +24,8 @@ #include #include #include +#include +#include namespace DB { @@ -34,6 +36,27 @@ namespace ErrorCodes extern const int SYNTAX_ERROR; } +std::unordered_map kql_parser = +{ + {"filter", {"filter", false, false, false, 3}}, + {"where", {"filter", false, false, false, 3}}, + {"limit", {"limit", false, true, false, 3}}, + {"take", {"limit", false, true, false, 3}}, + {"project", {"project", false, false, false, 3}}, + {"distinct", {"distinct", false, true, false, 3}}, + {"extend", {"extend", true, true, false, 3}}, + {"sort by", {"order by", false, false, false, 4}}, + {"order by", {"order by", false, false, false, 4}}, + {"table", {"table", false, false, false, 3}}, + {"print", {"print", false, true, false, 3}}, + {"summarize", {"summarize", true, true, false, 3}}, + {"make-series", {"make-series", true, true, false, 5}}, + {"mv-expand", {"mv-expand", true, true, false, 5}}, + {"count", {"count", false, true, false, 3}}, + {"top", {"top", false, true, true, 3}}, + {"top-hitters", {"top-hitters", true, true, true, 5}}, +}; + bool ParserKQLBase::parseByString(const String expr, ASTPtr & node, const uint32_t max_depth) { Expected expected; @@ -117,7 +140,7 @@ String ParserKQLBase::getExprFromPipe(Pos & pos) ++end; } --end; - return String(begin->begin, end->end); + return (begin <= end) ? String(begin->begin, end->end) : ""; } String ParserKQLBase::getExprFromToken(Pos & pos) @@ -175,7 +198,7 @@ String ParserKQLBase::getExprFromToken(Pos & pos) return res; } -std::unique_ptr ParserKQLQuery::getOperator(String & op_name) +std::unique_ptr ParserKQLQuery::getOperator(String & op_name) { if (op_name == "filter" || op_name == "where") return std::make_unique(); @@ -201,45 +224,16 @@ std::unique_ptr ParserKQLQuery::getOperator(String & op_name) return std::make_unique(); else if (op_name == "count") return std::make_unique(); + else if (op_name == "top") + return std::make_unique(); + else if (op_name == "top-hitters") + return std::make_unique(); else return nullptr; } -bool ParserKQLQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +bool ParserKQLQuery::getOperations(Pos & pos, Expected & expected, OperationsPos & operation_pos) { - struct KQLOperatorDataFlowState - { - String operator_name; - bool need_input; - bool gen_output; - int8_t backspace_steps; // how many steps to last token of previous pipe - }; - - auto select_query = std::make_shared(); - node = select_query; - ASTPtr tables; - - std::unordered_map kql_parser = - { - {"filter", {"filter", false, false, 3}}, - {"where", {"filter", false, false, 3}}, - {"limit", {"limit", false, true, 3}}, - {"take", {"limit", false, true, 3}}, - {"project", {"project", false, false, 3}}, - {"distinct", {"distinct", false, true, 3}}, - {"extend", {"extend", true, true, 3}}, - {"sort by", {"order by", false, false, 4}}, - {"order by", {"order by", false, false, 4}}, - {"table", {"table", false, false, 3}}, - {"print", {"print", false, true, 3}}, - {"summarize", {"summarize", true, true, 3}}, - {"make-series", {"make-series", true, true, 5}}, - {"mv-expand", {"mv-expand", true, true, 5}}, - {"count", {"count", false, true, 3}}, - }; - - std::vector> operation_pos; - String table_name(pos->begin, pos->end); if (table_name == "print") @@ -300,17 +294,45 @@ bool ParserKQLQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) else ++pos; } + return true; +} + +bool ParserKQLQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +{ + auto select_query = std::make_shared(); + node = select_query; + ASTPtr tables; + + OperationsPos operation_pos; + + if (!getOperations(pos, expected, operation_pos)) + return false; auto kql_operator_str = operation_pos.back().first; - auto npos = operation_pos.back().second; - // if (!npos.isValid()) - // return false; auto kql_operator_p = getOperator(kql_operator_str); + if (!kql_operator_p) + return false; + + String updated_query; + kql_operator_p->updatePipeLine(operation_pos, updated_query); + + Tokens token_query(updated_query.c_str(), updated_query.c_str() + updated_query.size()); + IParser::Pos pos_query(token_query, pos.max_depth); + if (!updated_query.empty()) + { + operation_pos.clear(); + if(!ParserKQLQuery::getOperations(pos_query, expected, operation_pos)) + return false; + } + kql_operator_str = operation_pos.back().first; + kql_operator_p = getOperator(kql_operator_str); if (!kql_operator_p) return false; + auto npos = operation_pos.back().second; + if (operation_pos.size() == 1) { if (kql_operator_str == "print") diff --git a/src/Parsers/Kusto/ParserKQLQuery.h b/src/Parsers/Kusto/ParserKQLQuery.h index 381462a5382..9ffe1996547 100644 --- a/src/Parsers/Kusto/ParserKQLQuery.h +++ b/src/Parsers/Kusto/ParserKQLQuery.h @@ -5,6 +5,8 @@ namespace DB { +using OperationsPos = std::vector>; + class ParserKQLBase : public IParserBase { public: @@ -14,13 +16,23 @@ class ParserKQLBase : public IParserBase static bool setSubQuerySource(ASTPtr & select_query, ASTPtr & source, bool dest_is_subquery, bool src_is_subquery); static bool parseSQLQueryByString(ParserPtr && parser, String & query, ASTPtr & select_node, int32_t max_depth); bool parseByString(const String expr, ASTPtr & node, const uint32_t max_depth); + virtual bool updatePipeLine (OperationsPos & /*operations*/, String & /*query*/) {return false;} }; class ParserKQLQuery : public IParserBase { - +public: + struct KQLOperatorDataFlowState + { + String operator_name; + bool need_input; + bool gen_output; + bool need_reinterpret; + int8_t backspace_steps; // how many steps to last token of previous pipe + }; + static bool getOperations(Pos & pos, Expected & expected, OperationsPos & operation_pos); protected: - static std::unique_ptr getOperator(String &op_name); + static std::unique_ptr getOperator(String &op_name); const char * getName() const override { return "KQL query"; } bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; diff --git a/src/Parsers/Kusto/ParserKQLTop.cpp b/src/Parsers/Kusto/ParserKQLTop.cpp new file mode 100644 index 00000000000..327adc45cac --- /dev/null +++ b/src/Parsers/Kusto/ParserKQLTop.cpp @@ -0,0 +1,57 @@ +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int SYNTAX_ERROR; +} + +bool ParserKQLTop::parseImpl(Pos & /*pos*/, ASTPtr & /*node*/, Expected & /*expected*/) +{ + return true; +} + +bool ParserKQLTop::updatePipeLine (OperationsPos & operations, String & query) +{ + Pos pos = operations.back().second; + + if (pos->isEnd() || pos->type == TokenType::PipeMark || pos->type == TokenType::Semicolon) + throw Exception("Syntax error near top operator", ErrorCodes::SYNTAX_ERROR); + + Pos start_pos = operations.front().second; + Pos end_pos = pos; + --end_pos; + --end_pos; + + String prev_query(start_pos->begin, end_pos->end); + + String limit_expr, sort_expr; + start_pos = pos; + end_pos = pos; + while (!pos->isEnd() && pos->type != TokenType::PipeMark && pos->type != TokenType::Semicolon) + { + if (String(pos->begin, pos->end) == "by") + { + auto limt_end_pos = pos; + --limt_end_pos; + limit_expr = String(start_pos->begin, limt_end_pos->end); + start_pos = pos; + ++start_pos; + } + end_pos = pos; + ++pos; + } + sort_expr = (start_pos <= end_pos) ? String(start_pos->begin, end_pos->end) : ""; + if (limit_expr.empty() || sort_expr.empty()) + throw Exception("top operator need a by clause", ErrorCodes::SYNTAX_ERROR); + + query = std::format("{} sort by {} | take {}", prev_query, sort_expr, limit_expr); + + return true; +} + +} diff --git a/src/Parsers/Kusto/ParserKQLTop.h b/src/Parsers/Kusto/ParserKQLTop.h new file mode 100644 index 00000000000..8672204f402 --- /dev/null +++ b/src/Parsers/Kusto/ParserKQLTop.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class ParserKQLTop : public ParserKQLBase +{ +protected: + const char * getName() const override { return "KQL top"; } + bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; + bool updatePipeLine (OperationsPos & operations, String & query) override; +}; + +} diff --git a/src/Parsers/Kusto/ParserKQLTopHitter.cpp b/src/Parsers/Kusto/ParserKQLTopHitter.cpp new file mode 100644 index 00000000000..9d9205edf12 --- /dev/null +++ b/src/Parsers/Kusto/ParserKQLTopHitter.cpp @@ -0,0 +1,76 @@ +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int SYNTAX_ERROR; +} + +bool ParserKQLTopHitters::parseImpl(Pos & /*pos*/, ASTPtr & /*node*/, Expected & /*expected*/) +{ + return true; +} + +bool ParserKQLTopHitters::updatePipeLine (OperationsPos & operations, String & query) +{ + Pos pos = operations.back().second; + + if (pos->isEnd() || pos->type == TokenType::PipeMark || pos->type == TokenType::Semicolon) + throw Exception("Syntax error near top-hitters operator", ErrorCodes::SYNTAX_ERROR); + + Pos start_pos = operations.front().second; + Pos end_pos = pos; + --end_pos; + --end_pos; + --end_pos; + --end_pos; + + String prev_query(start_pos->begin, end_pos->end); + + String number_of_values, value_expression, summing_expression; + start_pos = pos; + end_pos = pos; + while (!pos->isEnd() && pos->type != TokenType::PipeMark && pos->type != TokenType::Semicolon) + { + if (String(pos->begin, pos->end) == "of") + { + auto number_end_pos = pos; + --number_end_pos; + number_of_values = String(start_pos->begin, number_end_pos->end); + start_pos = pos; + ++start_pos; + } + + if (String(pos->begin, pos->end) == "by") + { + auto expr_end_pos = pos; + --expr_end_pos; + value_expression = String(start_pos->begin, expr_end_pos->end); + start_pos = pos; + ++start_pos; + } + end_pos = pos; + ++pos; + } + + if (value_expression.empty()) + value_expression = (start_pos <= end_pos) ? String(start_pos->begin, end_pos->end) : ""; + else + summing_expression = (start_pos <= end_pos) ? String(start_pos->begin, end_pos->end) : ""; + + if (number_of_values.empty() || value_expression.empty()) + throw Exception("top-hitter operator need a ValueExpression", ErrorCodes::SYNTAX_ERROR); + + if (summing_expression.empty()) + query = std::format("{0} summarize approximate_count_{1} = count() by {1} | sort by approximate_count_{1} desc | take {2} ", prev_query, value_expression, number_of_values); + else + query = std::format("{0} summarize approximate_sum_{1} = sum({1}) by {2} | sort by approximate_sum_{1} desc | take {3}", prev_query, summing_expression, value_expression, number_of_values); + + return true; +} + +} diff --git a/src/Parsers/Kusto/ParserKQLTopHitter.h b/src/Parsers/Kusto/ParserKQLTopHitter.h new file mode 100644 index 00000000000..2fa6a9b6203 --- /dev/null +++ b/src/Parsers/Kusto/ParserKQLTopHitter.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class ParserKQLTopHitters : public ParserKQLBase +{ +protected: + const char * getName() const override { return "KQL top-hitters"; } + bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; + bool updatePipeLine (OperationsPos & operations, String & query) override; +}; + +} diff --git a/src/Parsers/tests/KQL/gtest_KQL_TopHitter.cpp b/src/Parsers/tests/KQL/gtest_KQL_TopHitter.cpp new file mode 100644 index 00000000000..4162461e1dc --- /dev/null +++ b/src/Parsers/tests/KQL/gtest_KQL_TopHitter.cpp @@ -0,0 +1,57 @@ +#include + +#include + +INSTANTIATE_TEST_SUITE_P(ParserKQLQuery_TopHitters, ParserTest, + ::testing::Combine( + ::testing::Values(std::make_shared()), + ::testing::ValuesIn(std::initializer_list{ + { + "Customers | top 5 by Age", + "SELECT *\nFROM Customers\nORDER BY Age DESC\nLIMIT 5" + }, + { + "Customers | top 5 by Age desc", + "SELECT *\nFROM Customers\nORDER BY Age DESC\nLIMIT 5" + }, + { + "Customers | top 5 by Age asc", + "SELECT *\nFROM Customers\nORDER BY Age ASC\nLIMIT 5" + }, + { + "Customers | top 5 by FirstName desc nulls first", + "SELECT *\nFROM Customers\nORDER BY FirstName DESC NULLS FIRST\nLIMIT 5" + }, + { + "Customers | top 5 by FirstName desc nulls last", + "SELECT *\nFROM Customers\nORDER BY FirstName DESC NULLS LAST\nLIMIT 5" + }, + { + "Customers | top 5 by Age | top 2 by FirstName", + "SELECT *\nFROM\n(\n SELECT *\n FROM Customers\n ORDER BY Age DESC\n LIMIT 5\n)\nORDER BY FirstName DESC\nLIMIT 2" + }, + { + "Customers| top-hitters a = 3 of Age by extra", + "SELECT *\nFROM\n(\n SELECT\n Age,\n sum(extra) AS approximate_sum_extra\n FROM Customers\n GROUP BY Age\n)\nORDER BY approximate_sum_extra DESC\nLIMIT 3 AS a" + }, + { + "Customers| top-hitters 3 of Age", + "SELECT *\nFROM\n(\n SELECT\n Age,\n count() AS approximate_count_Age\n FROM Customers\n GROUP BY Age\n)\nORDER BY approximate_count_Age DESC\nLIMIT 3" + }, + { + "Customers| top-hitters 3 of Age by extra | top-hitters 2 of Age", + "SELECT *\nFROM\n(\n SELECT\n Age,\n count() AS approximate_count_Age\n FROM\n (\n SELECT *\n FROM\n (\n SELECT\n Age,\n sum(extra) AS approximate_sum_extra\n FROM Customers\n GROUP BY Age\n )\n ORDER BY approximate_sum_extra DESC\n LIMIT 3\n )\n GROUP BY Age\n)\nORDER BY approximate_count_Age DESC\nLIMIT 2" + }, + { + "Customers| top-hitters 3 of Age by extra | where Age > 30", + "SELECT *\nFROM\n(\n SELECT *\n FROM\n (\n SELECT\n Age,\n sum(extra) AS approximate_sum_extra\n FROM Customers\n GROUP BY Age\n )\n ORDER BY approximate_sum_extra DESC\n LIMIT 3\n)\nWHERE Age > 30" + }, + { + "Customers| top-hitters 3 of Age by extra | where approximate_sum_extra < 200", + "SELECT *\nFROM\n(\n SELECT *\n FROM\n (\n SELECT\n Age,\n sum(extra) AS approximate_sum_extra\n FROM Customers\n GROUP BY Age\n )\n ORDER BY approximate_sum_extra DESC\n LIMIT 3\n)\nWHERE approximate_sum_extra < 200" + }, + { + "Customers| top-hitters 3 of Age | where approximate_count_Age > 2", + "SELECT *\nFROM\n(\n SELECT *\n FROM\n (\n SELECT\n Age,\n count() AS approximate_count_Age\n FROM Customers\n GROUP BY Age\n )\n ORDER BY approximate_count_Age DESC\n LIMIT 3\n)\nWHERE approximate_count_Age > 2" + } +}))); diff --git a/tests/queries/0_stateless/02366_kql_top_hitters.reference b/tests/queries/0_stateless/02366_kql_top_hitters.reference new file mode 100644 index 00000000000..6ce63c6f8e2 --- /dev/null +++ b/tests/queries/0_stateless/02366_kql_top_hitters.reference @@ -0,0 +1,38 @@ +--top 1-- +Angel Stewart Professional Partial College 46 100 +Dalton Wood Professional Partial College 42 70 +\N why Professional Partial College 38 120 +--top 2-- +Angel Stewart Professional Partial College 46 100 +Dalton Wood Professional Partial College 42 70 +\N why Professional Partial College 38 120 +--top 3-- +Peter Nara Skilled Manual Graduate Degree 26 30 +Latoya Shen Professional Graduate Degree 25 40 +Joshua Lee Professional Partial College 26 50 +--top 4-- +\N why Professional Partial College 38 120 +Theodore Diaz Skilled Manual Bachelors 28 10 +Stephanie Cox Management Bachelors 33 20 +--top 5-- +Theodore Diaz Skilled Manual Bachelors 28 10 +Stephanie Cox Management Bachelors 33 20 +Peter Nara Skilled Manual Graduate Degree 26 30 +--top 6-- +Dalton Wood Professional Partial College 42 70 +Angel Stewart Professional Partial College 46 100 +--top hitters 1-- +28 210 +38 120 +--top hitters 2-- +28 3 +26 2 +--top hitters 3-- +38 1 +28 1 +--top hitters 4-- +38 120 +--top hitters 5-- +38 120 +--top hitters 6-- +28 3 diff --git a/tests/queries/0_stateless/02366_kql_top_hitters.sql b/tests/queries/0_stateless/02366_kql_top_hitters.sql new file mode 100644 index 00000000000..2f27a583ea7 --- /dev/null +++ b/tests/queries/0_stateless/02366_kql_top_hitters.sql @@ -0,0 +1,39 @@ +DROP TABLE IF EXISTS Customers; +CREATE TABLE Customers +( + FirstName Nullable(String), + LastName String, + Occupation String, + Education String, + Age Nullable(UInt8), + extra Int16 +) ENGINE = Memory; + +INSERT INTO Customers VALUES ('Theodore','Diaz','Skilled Manual','Bachelors',28,10),('Stephanie','Cox','Management','Bachelors',33,20),('Peter','Nara','Skilled Manual','Graduate Degree',26,30),('Latoya','Shen','Professional','Graduate Degree',25,40),('Joshua','Lee','Professional','Partial College',26,50),('Edward','Hernandez','Skilled Manual','High School',36,60),('Dalton','Wood','Professional','Partial College',42,70),('Christine','Nara','Skilled Manual','Partial College',33,80),('Cameron','Rodriguez','Professional','Partial College',28,90),('Angel','Stewart','Professional','Partial College',46,100),('Apple','B','Skilled Manual','Bachelors',28,110),(NULL,'why','Professional','Partial College',38,120); + +set dialect = 'kusto'; +print '--top 1--'; +Customers | top 3 by Age; +print '--top 2--'; +Customers | top 3 by Age desc; +print '--top 3--'; +Customers | top 3 by Age asc | order by FirstName; +print '--top 4--'; +Customers | top 3 by FirstName desc nulls first; +print '--top 5--'; +Customers | top 3 by FirstName desc nulls last; +print '--top 6--'; +Customers | top 3 by Age | top 2 by FirstName; +print '--top hitters 1--'; +Customers | top-hitters a = 2 of Age by extra; +print '--top hitters 2--'; +Customers | top-hitters 2 of Age; +print '--top hitters 3--'; +Customers | top-hitters 2 of Age by extra | top-hitters 2 of Age | order by Age; +print '--top hitters 4--'; +Customers | top-hitters 2 of Age by extra | where Age > 30; +print '--top hitters 5--'; +Customers | top-hitters 2 of Age by extra | where approximate_sum_extra < 200; +print '--top hitters 6--'; +Customers | top-hitters 2 of Age | where approximate_count_Age > 2; + From c2d9078df8d1a50d2f97ac4a095abf2857049130 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Tue, 25 Oct 2022 09:33:55 -0700 Subject: [PATCH 2/2] Kusto-phase2: Update release for top-hitters operator --- src/Parsers/Kusto/KQL_ReleaseNote.md | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/Parsers/Kusto/KQL_ReleaseNote.md b/src/Parsers/Kusto/KQL_ReleaseNote.md index 6e7459d2aab..3e966bfa8c8 100644 --- a/src/Parsers/Kusto/KQL_ReleaseNote.md +++ b/src/Parsers/Kusto/KQL_ReleaseNote.md @@ -2,12 +2,27 @@ # October 25, 2022 ## operator - [count](https://learn.microsoft.com/en-us/azure/data-explorer/kusto/query/countoperator) - `Customers | count;` `Customers | where Age< 30 | count;` `Customers | where Age< 30 | limit 2 | count;` `Customers | where Age< 30 | limit 2 | count | project Count;` +- [top](https://learn.microsoft.com/en-us/azure/data-explorer/kusto/query/topoperator) +`Customers | top 3 by Age;` +`Customers | top 3 by Age desc;` +`Customers | top 3 by Age asc | order by FirstName;` +`Customers | top 3 by FirstName desc nulls first;` +`Customers | top 3 by FirstName desc nulls last;` +`Customers | top 3 by Age | top 2 by FirstName;` + +- [top-hitters](https://learn.microsoft.com/en-us/azure/data-explorer/kusto/query/tophittersoperator) +`Customers | top-hitters a = 2 of Age by extra;` +`Customers | top-hitters 2 of Age;` +`Customers | top-hitters 2 of Age by extra | top-hitters 2 of Age | order by Age;` +`Customers | top-hitters 2 of Age by extra | where Age > 30;` +`Customers | top-hitters 2 of Age by extra | where approximate_sum_extra < 200;` +`Customers | top-hitters 2 of Age | where approximate_count_Age > 2;` + # October 9, 2022