From eedb3a9456f3b5b1f0a9119d45b682a7180a22d3 Mon Sep 17 00:00:00 2001 From: ltrk2 <107155950+ltrk2@users.noreply.github.com> Date: Thu, 8 Sep 2022 13:19:56 -0700 Subject: [PATCH] Implement KQL set functions --- src/Parsers/Kusto/KQL_ReleaseNote.md | 52 ++++++++++++++ .../KustoFunctions/IParserKQLFunction.cpp | 8 ++- .../Kusto/KustoFunctions/IParserKQLFunction.h | 5 +- .../KustoFunctions/KQLDynamicFunctions.cpp | 67 ++++++++++++++----- .../KustoFunctions/KQLFunctionFactory.cpp | 5 ++ .../Kusto/KustoFunctions/KQLFunctionFactory.h | 1 + .../KustoFunctions/KQLMathematicalFunctions.cpp | 9 +++ .../KustoFunctions/KQLMathematicalFunctions.h | 11 +++ src/Parsers/tests/KQL/gtest_KQL_Dynamic.cpp | 40 +++++++++-- 9 files changed, 174 insertions(+), 24 deletions(-) create mode 100644 src/Parsers/Kusto/KustoFunctions/KQLMathematicalFunctions.cpp create mode 100644 src/Parsers/Kusto/KustoFunctions/KQLMathematicalFunctions.h diff --git a/src/Parsers/Kusto/KQL_ReleaseNote.md b/src/Parsers/Kusto/KQL_ReleaseNote.md index 03abc7aa7fdd..b4e18faa27af 100644 --- a/src/Parsers/Kusto/KQL_ReleaseNote.md +++ b/src/Parsers/Kusto/KQL_ReleaseNote.md @@ -56,6 +56,58 @@ Please note that our current implementation supports only scalars and arrays made up of elements of the same type. Support for mixed types and property bags is deferred for now, based on our understanding of the required effort and discussion with representatives of the QRadar team. +## Mathematical functions + - [isnan](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/isnanfunction) + `print isnan(double(nan)) == true` + `print isnan(4.2) == false` + `print isnan(4) == false` + `print isnan(real(+inf)) == false` + +## Set functions +Please note that functions returning arrays with set semantics may return their elements in any order, and that order may change in the future. 
+ + - [jaccard_index](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/jaccard-index-function) + `print jaccard_index(dynamic([1, 1, 2, 2, 3, 3]), dynamic([1, 2, 3, 4, 4, 4])) == 0.75` + `print jaccard_index(dynamic([1, 2, 3]), dynamic([])) == 0` + `print jaccard_index(dynamic([]), dynamic([1, 2, 3, 4])) == 0` + `print isnan(jaccard_index(dynamic([]), dynamic([])))` + `print jaccard_index(dynamic([1, 2, 3]), dynamic([4, 5, 6, 7])) == 0` + `print jaccard_index(dynamic(['a', 's', 'd']), dynamic(['f', 'd', 's', 'a'])) == 0.75` + `print jaccard_index(dynamic(['Chewbacca', 'Darth Vader', 'Han Solo']), dynamic(['Darth Sidious', 'Darth Vader'])) == 0.25` + + - [set_difference](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/setdifferencefunction) + `print set_difference(dynamic([1, 1, 2, 2, 3, 3]), dynamic([1, 2, 3])) == dynamic([])` + `print array_sort_asc(set_difference(dynamic([1, 4, 2, 3, 5, 4, 6]), dynamic([1, 2, 3])))[1] == dynamic([4, 5, 6])` + `print set_difference(dynamic([4]), dynamic([1, 2, 3])) == dynamic([4])` + `print array_sort_asc(set_difference(dynamic([1, 2, 3, 4, 5]), dynamic([5]), dynamic([2, 4])))[1] == dynamic([1, 3])` + `print array_sort_asc(set_difference(dynamic([1, 2, 3]), dynamic([])))[1] == dynamic([1, 2, 3])` + `print array_sort_asc(set_difference(dynamic(['a', 's', 'd']), dynamic(['a', 'f'])))[1] == dynamic(['d', 's'])` + `print array_sort_asc(set_difference(dynamic(['Chewbacca', 'Darth Vader', 'Han Solo']), dynamic(['Darth Sidious', 'Darth Vader'])))[1] == dynamic(['Chewbacca', 'Han Solo'])` + + - [set_has_element](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/sethaselementfunction) + `print set_has_element(dynamic(["this", "is", "an", "example"]), "example") == true` + `print set_has_element(dynamic(["this", "is", "an", "example"]), "examplee") == false` + `print set_has_element(dynamic([1, 2, 3]), 2) == true` + `print set_has_element(dynamic([1, 2, 3, 4.2]), 4) == false` + + - 
[set_intersect](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/setintersectfunction) + `print array_sort_asc(set_intersect(dynamic([1, 1, 2, 2, 3, 3]), dynamic([1, 2, 3])))[1] == dynamic([1, 2, 3])` + `print array_sort_asc(set_intersect(dynamic([1, 4, 2, 3, 5, 4, 6]), dynamic([1, 2, 3])))[1] == dynamic([1, 2, 3])` + `print set_intersect(dynamic([4]), dynamic([1, 2, 3])) == dynamic([])` + `print set_intersect(dynamic([1, 2, 3, 4, 5]), dynamic([1, 3, 5]), dynamic([2, 5])) == dynamic([5])` + `print set_intersect(dynamic([1, 2, 3]), dynamic([])) == dynamic([])` + `print set_intersect(dynamic(['a', 's', 'd']), dynamic(['a', 'f'])) == dynamic(['a'])` + `print set_intersect(dynamic(['Chewbacca', 'Darth Vader', 'Han Solo']), dynamic(['Darth Sidious', 'Darth Vader'])) == dynamic(['Darth Vader'])` + + - [set_union](https://docs.microsoft.com/en-us/azure/data-explorer/kusto/query/setunionfunction) + `print array_sort_asc(set_union(dynamic([1, 1, 2, 2, 3, 3]), dynamic([1, 2, 3])))[1] == dynamic([1, 2, 3])` + `print array_sort_asc(set_union(dynamic([1, 4, 2, 3, 5, 4, 6]), dynamic([1, 2, 3])))[1] == dynamic([1, 2, 3, 4, 5, 6])` + `print array_sort_asc(set_union(dynamic([4]), dynamic([1, 2, 3])))[1] == dynamic([1, 2, 3, 4])` + `print array_sort_asc(set_union(dynamic([1, 3, 4]), dynamic([5]), dynamic([2, 4])))[1] == dynamic([1, 2, 3, 4, 5])` + `print array_sort_asc(set_union(dynamic([1, 2, 3]), dynamic([])))[1] == dynamic([1, 2, 3])` + `print array_sort_asc(set_union(dynamic(['a', 's', 'd']), dynamic(['a', 'f'])))[1] == dynamic(['a', 'd', 'f', 's'])` + `print array_sort_asc(set_union(dynamic(['Chewbacca', 'Darth Vader', 'Han Solo']), dynamic(['Darth Sidious', 'Darth Vader'])))[1] == dynamic(['Chewbacca', 'Darth Sidious', 'Darth Vader', 'Han Solo'])` + # August 29, 2022 ## **mv-expand operator** diff --git a/src/Parsers/Kusto/KustoFunctions/IParserKQLFunction.cpp b/src/Parsers/Kusto/KustoFunctions/IParserKQLFunction.cpp index 7fb24b574932..d44add5cce6b 100644 --- 
a/src/Parsers/Kusto/KustoFunctions/IParserKQLFunction.cpp +++ b/src/Parsers/Kusto/KustoFunctions/IParserKQLFunction.cpp @@ -164,7 +164,13 @@ String IParserKQLFunction::getKQLFunctionName(IParser::Pos & pos) } String IParserKQLFunction::kqlCallToExpression( - const String & function_name, std::initializer_list params, const uint32_t max_depth) + const std::string_view function_name, const std::initializer_list params, const uint32_t max_depth) +{ + return kqlCallToExpression(function_name, std::span(params), max_depth); +} + +String IParserKQLFunction::kqlCallToExpression( + const std::string_view function_name, const std::span params, const uint32_t max_depth) { const auto params_str = std::accumulate( std::cbegin(params), diff --git a/src/Parsers/Kusto/KustoFunctions/IParserKQLFunction.h b/src/Parsers/Kusto/KustoFunctions/IParserKQLFunction.h index ffa551ea36f0..245b196c8e37 100644 --- a/src/Parsers/Kusto/KustoFunctions/IParserKQLFunction.h +++ b/src/Parsers/Kusto/KustoFunctions/IParserKQLFunction.h @@ -3,6 +3,8 @@ #include #include +#include + namespace DB { class IParserKQLFunction @@ -48,7 +50,8 @@ class IParserKQLFunction static String getArgument(const String & function_name, DB::IParser::Pos & pos); static String getConvertedArgument(const String & fn_name, IParser::Pos & pos); static std::optional getOptionalArgument(const String & function_name, DB::IParser::Pos & pos); - static String kqlCallToExpression(const String & function_name, std::initializer_list params, uint32_t max_depth); + static String kqlCallToExpression(std::string_view function_name, std::initializer_list params, uint32_t max_depth); + static String kqlCallToExpression(std::string_view function_name, std::span params, uint32_t max_depth); static void validateEndOfFunction(const String & fn_name, IParser::Pos & pos); static String getKQLFunctionName(IParser::Pos & pos); static String ArraySortHelper(String & out, IParser::Pos & pos, bool ascending); diff --git 
a/src/Parsers/Kusto/KustoFunctions/KQLDynamicFunctions.cpp b/src/Parsers/Kusto/KustoFunctions/KQLDynamicFunctions.cpp index 8e0fc847ddfd..e4ac4dd0ee6d 100644 --- a/src/Parsers/Kusto/KustoFunctions/KQLDynamicFunctions.cpp +++ b/src/Parsers/Kusto/KustoFunctions/KQLDynamicFunctions.cpp @@ -11,6 +11,14 @@ #include +namespace +{ +String wrapInDynamic(const String & parameter) +{ + return "dynamic(" + parameter + ")"; +} +} + namespace DB { @@ -91,7 +99,7 @@ bool ArrayRotateRight::convertImpl(String & out, IParser::Pos & pos) const auto array = getArgument(function_name, pos); const auto count = getArgument(function_name, pos); - out = kqlCallToExpression("array_rotate_left", {"dynamic(" + array + ")", "-1 * " + count}, pos.max_depth); + out = kqlCallToExpression("array_rotate_left", {wrapInDynamic(array), "-1 * " + count}, pos.max_depth); return true; } @@ -106,7 +114,7 @@ bool ArrayShiftLeft::convertImpl(String & out, IParser::Pos & pos) const auto count = getArgument(function_name, pos); const auto fill = getOptionalArgument(function_name, pos); out = std::format( - "arrayResize(multiIf({1} > 0, arraySlice({0}, {1} + 1), {1} < 0, arrayConcat(arrayWithConstant(abs({1}), fill_value_{3}), {0}), {0}), " + "arrayResize(if({1} > 0, arraySlice({0}, {1} + 1), arrayConcat(arrayWithConstant(abs({1}), fill_value_{3}), {0})), " "length({0}), ifNull({2}, if(toTypeName({0}) = 'Array(String)', defaultValueOfArgumentType({0}[1]), null)) as fill_value_{3})", array, count, @@ -126,7 +134,7 @@ bool ArrayShiftRight::convertImpl(String & out, IParser::Pos & pos) const auto count = getArgument(function_name, pos); const auto fill = getOptionalArgument(function_name, pos); - const auto arg1 = "dynamic(" + array + ")"; + const auto arg1 = wrapInDynamic(array); const auto arg2 = "-1 * " + count; out = kqlCallToExpression( "array_shift_left", @@ -222,9 +230,18 @@ bool BagRemoveKeys::convertImpl(String & out, IParser::Pos & pos) bool JaccardIndex::convertImpl(String & out, IParser::Pos & pos) { 
- String res = String(pos->begin, pos->end); - out = res; - return false; + const auto function_name = getKQLFunctionName(pos); + if (function_name.empty()) + return false; + + const auto lhs = wrapInDynamic(getArgument(function_name, pos)); + const auto rhs = wrapInDynamic(getArgument(function_name, pos)); + out = std::format( + "divide(length({0}), length({1}))", + kqlCallToExpression("set_intersect", {lhs, rhs}, pos.max_depth), + kqlCallToExpression("set_union", {lhs, rhs}, pos.max_depth)); + + return true; } bool Pack::convertImpl(String & out, IParser::Pos & pos) @@ -261,30 +278,44 @@ bool Repeat::convertImpl(String & out, IParser::Pos & pos) bool SetDifference::convertImpl(String & out, IParser::Pos & pos) { - String res = String(pos->begin, pos->end); - out = res; - return false; + const auto function_name = getKQLFunctionName(pos); + if (function_name.empty()) + return false; + + const auto lhs = getArgument(function_name, pos); + const auto rhs = std::invoke( + [&function_name, &pos] + { + std::vector arrays{wrapInDynamic(getArgument(function_name, pos))}; + while (auto next_array = getOptionalArgument(function_name, pos)) + arrays.push_back(wrapInDynamic(*next_array)); + + return kqlCallToExpression("set_union", std::vector(arrays.cbegin(), arrays.cend()), pos.max_depth); + }); + + out = std::format("arrayFilter(x -> not has({1}, x), arrayDistinct({0}))", lhs, rhs); + + return true; } bool SetHasElement::convertImpl(String & out, IParser::Pos & pos) { - String res = String(pos->begin, pos->end); - out = res; - return false; + return directMapping(out, pos, "has"); } bool SetIntersect::convertImpl(String & out, IParser::Pos & pos) { - String res = String(pos->begin, pos->end); - out = res; - return false; + return directMapping(out, pos, "arrayIntersect"); } bool SetUnion::convertImpl(String & out, IParser::Pos & pos) { - String res = String(pos->begin, pos->end); - out = res; - return false; + if (!directMapping(out, pos, "arrayConcat")) + return false; + 
+ out = std::format("arrayDistinct({0})", out); + + return true; } bool TreePath::convertImpl(String & out, IParser::Pos & pos) diff --git a/src/Parsers/Kusto/KustoFunctions/KQLFunctionFactory.cpp b/src/Parsers/Kusto/KustoFunctions/KQLFunctionFactory.cpp index 6eef8ae19f52..869b0808f78f 100644 --- a/src/Parsers/Kusto/KustoFunctions/KQLFunctionFactory.cpp +++ b/src/Parsers/Kusto/KustoFunctions/KQLFunctionFactory.cpp @@ -16,6 +16,7 @@ #include #include #include +#include namespace DB { @@ -65,6 +66,7 @@ namespace DB {"has_any_index", KQLFunctionValue::has_any_index}, {"indexof", KQLFunctionValue::indexof}, {"isempty", KQLFunctionValue::isempty}, + {"isnan", KQLFunctionValue::isnan}, {"isnotempty", KQLFunctionValue::isnotempty}, {"notempty", KQLFunctionValue::isnotempty}, {"isnotnull", KQLFunctionValue::isnotnull}, @@ -370,6 +372,9 @@ std::unique_ptr KQLFunctionFactory::get(String &kql_function case KQLFunctionValue::isempty: return std::make_unique(); + case KQLFunctionValue::isnan: + return std::make_unique(); + case KQLFunctionValue::isnotempty: return std::make_unique(); diff --git a/src/Parsers/Kusto/KustoFunctions/KQLFunctionFactory.h b/src/Parsers/Kusto/KustoFunctions/KQLFunctionFactory.h index 114f29fb3158..41042df09890 100644 --- a/src/Parsers/Kusto/KustoFunctions/KQLFunctionFactory.h +++ b/src/Parsers/Kusto/KustoFunctions/KQLFunctionFactory.h @@ -53,6 +53,7 @@ namespace DB has_any_index, indexof, isempty, + isnan, isnotempty, isnotnull, isnull, diff --git a/src/Parsers/Kusto/KustoFunctions/KQLMathematicalFunctions.cpp b/src/Parsers/Kusto/KustoFunctions/KQLMathematicalFunctions.cpp new file mode 100644 index 000000000000..77aacd169e5e --- /dev/null +++ b/src/Parsers/Kusto/KustoFunctions/KQLMathematicalFunctions.cpp @@ -0,0 +1,9 @@ +#include "KQLMathematicalFunctions.h" + +namespace DB +{ +bool IsNan::convertImpl(String & out, IParser::Pos & pos) +{ + return directMapping(out, pos, "isNaN"); +} +} diff --git 
a/src/Parsers/Kusto/KustoFunctions/KQLMathematicalFunctions.h b/src/Parsers/Kusto/KustoFunctions/KQLMathematicalFunctions.h new file mode 100644 index 000000000000..76cae66cae4d --- /dev/null +++ b/src/Parsers/Kusto/KustoFunctions/KQLMathematicalFunctions.h @@ -0,0 +1,11 @@ +#include "IParserKQLFunction.h" + +namespace DB +{ +class IsNan : public IParserKQLFunction +{ +protected: + const char * getName() const override { return "isnan()"; } + bool convertImpl(String & out, IParser::Pos & pos) override; +}; +} diff --git a/src/Parsers/tests/KQL/gtest_KQL_Dynamic.cpp b/src/Parsers/tests/KQL/gtest_KQL_Dynamic.cpp index 035f7b954fce..4ba35361db0c 100644 --- a/src/Parsers/tests/KQL/gtest_KQL_Dynamic.cpp +++ b/src/Parsers/tests/KQL/gtest_KQL_Dynamic.cpp @@ -63,6 +63,10 @@ INSTANTIATE_TEST_SUITE_P(ParserKQLQuery_DynamicExactMatch, ParserTest, "print output = array_sum(dynamic([2.5, 5.5, 3]))", "SELECT arraySum([2.5, 5.5, 3]) AS output" }, + { + "print jaccard_index(A, B)", + "SELECT length(arrayIntersect(A, B)) / length(arrayDistinct(arrayConcat(A, B)))" + }, { "print pack_array(A, B, C, D)", "SELECT [A, B, C, D]" @@ -71,6 +75,34 @@ INSTANTIATE_TEST_SUITE_P(ParserKQLQuery_DynamicExactMatch, ParserTest, "print repeat(A, B)", "SELECT arrayWithConstant(B, A)" }, + { + "print set_difference(A, B)", + "SELECT arrayFilter(x -> (NOT has(arrayDistinct(arrayConcat(B)), x)), arrayDistinct(A))" + }, + { + "print set_difference(A, B, C)", + "SELECT arrayFilter(x -> (NOT has(arrayDistinct(arrayConcat(B, C)), x)), arrayDistinct(A))" + }, + { + "print set_has_element(A, B)", + "SELECT has(A, B)" + }, + { + "print set_intersect(A, B)", + "SELECT arrayIntersect(A, B)" + }, + { + "print set_intersect(A, B, C)", + "SELECT arrayIntersect(A, B, C)" + }, + { + "print set_union(A, B)", + "SELECT arrayDistinct(arrayConcat(A, B))" + }, + { + "print set_union(A, B, C)", + "SELECT arrayDistinct(arrayConcat(A, B, C))" + }, { "print zip(A, B)", "SELECT arrayMap(t -> [untuple(t)], arrayZip(A, B))" @@ 
-87,19 +119,19 @@ INSTANTIATE_TEST_SUITE_P(ParserKQLQuery_DynamicRegex, ParserRegexTest, ::testing::ValuesIn(std::initializer_list{ { "print array_shift_left(A, B)", - "SELECT arrayResize\\(multiIf\\(B > 0, arraySlice\\(A, B \\+ 1\\), B < 0, arrayConcat\\(arrayWithConstant\\(abs\\(B\\), fill_value_\\d+\\), A\\), A\\), length\\(A\\), ifNull\\(NULL, if\\(toTypeName\\(A\\) = 'Array\\(String\\)', defaultValueOfArgumentType\\(A\\[1\\]\\), NULL\\)\\) AS fill_value_\\d+\\)" + "SELECT arrayResize\\(if\\(B > 0, arraySlice\\(A, B \\+ 1\\), arrayConcat\\(arrayWithConstant\\(abs\\(B\\), fill_value_\\d+\\), A\\)\\), length\\(A\\), ifNull\\(NULL, if\\(toTypeName\\(A\\) = 'Array\\(String\\)', defaultValueOfArgumentType\\(A\\[1\\]\\), NULL\\)\\) AS fill_value_\\d+\\)" }, { "print array_shift_left(A, B, C)", - "SELECT arrayResize\\(multiIf\\(B > 0, arraySlice\\(A, B \\+ 1\\), B < 0, arrayConcat\\(arrayWithConstant\\(abs\\(B\\), fill_value_\\d+\\), A\\), A\\), length\\(A\\), ifNull\\(C, if\\(toTypeName\\(A\\) = 'Array\\(String\\)', defaultValueOfArgumentType\\(A\\[1\\]\\), NULL\\)\\) AS fill_value_\\d+\\)" + "SELECT arrayResize\\(if\\(B > 0, arraySlice\\(A, B \\+ 1\\), arrayConcat\\(arrayWithConstant\\(abs\\(B\\), fill_value_\\d+\\), A\\)\\), length\\(A\\), ifNull\\(C, if\\(toTypeName\\(A\\) = 'Array\\(String\\)', defaultValueOfArgumentType\\(A\\[1\\]\\), NULL\\)\\) AS fill_value_\\d+\\)" }, { "print array_shift_right(A, B)", - "SELECT arrayResize\\(multiIf\\(\\(-1 \\* B\\) > 0, arraySlice\\(A, \\(-1 \\* B\\) \\+ 1\\), \\(-1 \\* B\\) < 0, arrayConcat\\(arrayWithConstant\\(abs\\(-1 \\* B\\), fill_value_\\d+\\), A\\), A\\), length\\(A\\), ifNull\\(NULL, if\\(toTypeName\\(A\\) = 'Array\\(String\\)', defaultValueOfArgumentType\\(A\\[1\\]\\), NULL\\)\\) AS fill_value_\\d+\\)" + "SELECT arrayResize\\(if\\(\\(-1 \\* B\\) > 0, arraySlice\\(A, \\(-1 \\* B\\) \\+ 1\\), arrayConcat\\(arrayWithConstant\\(abs\\(-1 \\* B\\), fill_value_\\d+\\), A\\)\\), length\\(A\\), ifNull\\(NULL, 
if\\(toTypeName\\(A\\) = 'Array\\(String\\)', defaultValueOfArgumentType\\(A\\[1\\]\\), NULL\\)\\) AS fill_value_\\d+\\)" }, { "print array_shift_right(A, B, C)", - "SELECT arrayResize\\(multiIf\\(\\(-1 \\* B\\) > 0, arraySlice\\(A, \\(-1 \\* B\\) \\+ 1\\), \\(-1 \\* B\\) < 0, arrayConcat\\(arrayWithConstant\\(abs\\(-1 \\* B\\), fill_value_\\d+\\), A\\), A\\), length\\(A\\), ifNull\\(C, if\\(toTypeName\\(A\\) = 'Array\\(String\\)', defaultValueOfArgumentType\\(A\\[1\\]\\), NULL\\)\\) AS fill_value_\\d+\\)" + "SELECT arrayResize\\(if\\(\\(-1 \\* B\\) > 0, arraySlice\\(A, \\(-1 \\* B\\) \\+ 1\\), arrayConcat\\(arrayWithConstant\\(abs\\(-1 \\* B\\), fill_value_\\d+\\), A\\)\\), length\\(A\\), ifNull\\(C, if\\(toTypeName\\(A\\) = 'Array\\(String\\)', defaultValueOfArgumentType\\(A\\[1\\]\\), NULL\\)\\) AS fill_value_\\d+\\)" }, { "print array_slice(A, B, C)",