Skip to content

Commit

Permalink
Merge branch 'Kusto-phase3' into KQL_X3_URL_Parse
Browse files Browse the repository at this point in the history
  • Loading branch information
mcmajam authored Apr 3, 2023
2 parents a92a0fb + 764876c commit c3ef4c1
Show file tree
Hide file tree
Showing 14 changed files with 556 additions and 76 deletions.
135 changes: 135 additions & 0 deletions src/Functions/Kusto/kqlIndexOfRegex.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>

#include <re2/re2.h>

/// Error codes referenced by this translation unit.
namespace DB::ErrorCodes
{
extern const int CANNOT_COMPILE_REGEXP;
extern const int ILLEGAL_TYPE_OF_ARGUMENT; /// thrown when an optional argument is not an integer
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}

namespace DB
{
/// Function object that identifies itself as `kql_indexof_regex`.
/// It is variadic; the accepted argument count is validated in getReturnTypeImpl.
/// Arguments at zero-based positions 1 and 4 are declared as always constant.
class FunctionKqlIndexOfRegex : public IFunction
{
public:
    static constexpr auto name = "kql_indexof_regex";

    explicit FunctionKqlIndexOfRegex(ContextPtr context_) : context(std::move(context_)) { }
    ~FunctionKqlIndexOfRegex() override = default;

    static FunctionPtr create(ContextPtr context) { return std::make_shared<FunctionKqlIndexOfRegex>(std::move(context)); }

    String getName() const override { return name; }

    bool isVariadic() const override { return true; }
    size_t getNumberOfArguments() const override { return 0; }
    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 4}; }
    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }

    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override;
    ColumnPtr
    executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override;

private:
    /// Returns the argument's column unchanged when it is already of String type;
    /// otherwise returns the column produced by calling `kql_tostring` on the argument.
    ColumnPtr extractArgumentColumnAsString(const ColumnWithTypeAndName & argument, const size_t input_rows_count) const
    {
        if (!isString(argument.type))
        {
            const ColumnsWithTypeAndName conversion_arguments{argument};
            return executeFunctionCall(context, "kql_tostring", conversion_arguments, input_rows_count).first;
        }

        return argument.column;
    }

    /// Returns the column of the optional integer argument at position `index`.
    /// When the caller passed fewer arguments, a one-row constant Int32 column holding `default_value` is returned.
    /// Throws ILLEGAL_TYPE_OF_ARGUMENT when the argument is present but not of an integer type.
    ColumnPtr extractIntegerArgumentColumn(const ColumnsWithTypeAndName & arguments, const int index, const int default_value) const
    {
        const bool argument_is_omitted = index >= std::ssize(arguments);
        if (argument_is_omitted)
            return DataTypeInt32().createColumnConst(1, toField(default_value));

        const auto & supplied = arguments[index];
        if (isInteger(supplied.type))
            return supplied.column;

        throw Exception(
            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
            "Illegal type {} of argument #{} of function {}, expected integral type",
            supplied.type->getName(),
            index,
            getName());
    }

    ContextPtr context;
};

/// For every row, finds the zero-based byte offset at which the requested occurrence of a regular
/// expression starts inside the haystack.
///
/// Arguments: (haystack, pattern[, start = 0[, length = -1[, occurrence = 1]]]).
///  - haystack and pattern are converted to strings when they are not strings already;
///  - the pattern is taken from row 0 only and compiled once;
///  - `length == -1` means "up to the end of the haystack".
///
/// Returns a nullable Int64 column:
///  - NULL when `start < 0`, `length < -1` or `occurrence <= 0`;
///  - -1 when fewer than `occurrence` non-overlapping matches exist in the searched range;
///  - otherwise the offset of the match measured from the beginning of the whole haystack.
///
/// Throws CANNOT_COMPILE_REGEXP when RE2 rejects the pattern.
ColumnPtr
FunctionKqlIndexOfRegex::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, const size_t input_rows_count) const
{
    const auto in_column_haystack = extractArgumentColumnAsString(arguments[0], input_rows_count);
    const auto in_column_pattern = extractArgumentColumnAsString(arguments[1], input_rows_count);
    const auto in_column_start = extractIntegerArgumentColumn(arguments, 2, 0);
    const auto in_column_length = extractIntegerArgumentColumn(arguments, 3, -1);
    const auto in_column_occurrence = extractIntegerArgumentColumn(arguments, 4, 1);

    /// Only row 0 of the pattern column is consulted, so the expression is compiled a single time.
    const auto pattern = in_column_pattern->getDataAt(0).toView();
    const RE2 precompiled_pattern(pattern, RE2::Quiet);
    if (!precompiled_pattern.ok())
        throw Exception(ErrorCodes::CANNOT_COMPILE_REGEXP, "{}: {}", getName(), precompiled_pattern.error());

    auto out_column = ColumnInt64::create(input_rows_count);
    auto out_null_map = ColumnUInt8::create(input_rows_count);

    auto & out_column_data = out_column->getData();
    auto & out_null_map_data = out_null_map->getData();
    for (size_t i = 0; i < input_rows_count; ++i)
    {
        const auto start = in_column_start->getInt(i);
        const auto length = in_column_length->getInt(i);
        const auto occurrence = in_column_occurrence->getInt(i);

        const auto is_invalid = start < 0 || length < -1 || occurrence <= 0;
        out_null_map_data[i] = is_invalid;

        if (is_invalid)
        {
            /// Give the slot hidden behind NULL a defined value rather than whatever create() left there.
            out_column_data[i] = 0;
            continue;
        }

        const auto haystack = in_column_haystack->getDataAt(i).toView();

        /// A start past the end is pulled back to the last byte (or to 0 for an empty haystack).
        const auto bounded_start = std::min(start, std::max(std::ssize(haystack) - 1, Int64(0)));
        const auto shortened_haystack = haystack.substr(bounded_start, length == -1 ? std::string_view::npos : length);

        size_t offset = 0;
        re2::StringPiece partial_match;
        Int64 pass = 0;
        while (pass < occurrence && offset <= shortened_haystack.length()
               && precompiled_pattern.Match(shortened_haystack, offset, shortened_haystack.length(), RE2::UNANCHORED, &partial_match, 1))
        {
            /// Resume right after the match so that occurrences never overlap.
            offset = std::distance(shortened_haystack.data(), partial_match.data()) + partial_match.length();

            /// A zero-width match does not consume input; step one byte forward, otherwise the very
            /// same empty match would be found again and counted as the next occurrence.
            if (partial_match.empty())
                ++offset;

            ++pass;
        }

        out_column_data[i] = pass == occurrence ? std::distance(haystack.data(), partial_match.data()) : -1;
    }

    return ColumnNullable::create(std::move(out_column), std::move(out_null_map));
}

/// Validates that between two and five arguments were passed and reports the result type,
/// which is a nullable Int64. Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH otherwise.
DataTypePtr FunctionKqlIndexOfRegex::getReturnTypeImpl(const DataTypes & arguments) const
{
    const auto argument_count = std::ssize(arguments);
    const bool count_is_acceptable = 2 <= argument_count && argument_count <= 5;

    if (!count_is_acceptable)
        throw Exception(
            ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
            "Number of arguments for function {} doesn't match: passed {}, should be between 2 and 5.",
            getName(),
            argument_count);

    auto value_type = std::make_shared<DataTypeInt64>();
    return makeNullable(std::move(value_type));
}

/// Hooks FunctionKqlIndexOfRegex into the function factory so that it can be looked up at query time.
REGISTER_FUNCTION(KqlIndexOfRegex)
{
factory.registerFunction<FunctionKqlIndexOfRegex>();
}
}
126 changes: 126 additions & 0 deletions src/Functions/Kusto/kqlMakeString.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Common/UTF8Helpers.h>

#include <codecvt>
#include <format>
#include <locale>

namespace DB
{
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}

/// Function object that identifies itself as `kql_make_string`.
/// It is variadic; argument count and types are validated in getReturnTypeImpl.
class FunctionKqlMakeString : public IFunction
{
public:
    static constexpr auto name = "kql_make_string";

    explicit FunctionKqlMakeString(ContextPtr context_) : context(std::move(context_)) { }
    ~FunctionKqlMakeString() override = default;

    static FunctionPtr create(ContextPtr context) { return std::make_shared<FunctionKqlMakeString>(std::move(context)); }

    String getName() const override { return name; }

    bool isVariadic() const override { return true; }
    size_t getNumberOfArguments() const override { return 0; }
    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }

    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override;
    ColumnPtr
    executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override;

private:
    /// Range-checks `code_point` and appends its UTF-8 encoding to `row_str`.
    void convertAndAppendCodePoint(int code_point, String & row_str) const;

    ContextPtr context;
};

namespace ErrorCodes
{
    /// Declared next to its only user: a bad *value* is not an argument-count problem.
    extern const int ARGUMENT_OUT_OF_BOUND;
}

/// Appends the UTF-8 encoding of `code_point` to `row_str`.
///
/// @param code_point  Value to encode; must lie in [0, 0x10FFFF] (0x10FFFF == 1114111).
/// @param row_str     String that receives the encoded bytes.
/// @throws Exception with code ARGUMENT_OUT_OF_BOUND when `code_point` is outside that range.
void FunctionKqlMakeString::convertAndAppendCodePoint(const int code_point, String & row_str) const
{
    constexpr int max_code_point = 0x10FFFF;

    if (code_point < 0 || code_point > max_code_point)
        throw DB::Exception(
            DB::ErrorCodes::ARGUMENT_OUT_OF_BOUND,
            "Argument in function {} is out of range, should be between 0 and 1114111",
            getName());

    /// Four bytes is the buffer size handed to the encoder; it reports how many bytes it produced.
    std::array<char, 4> buff;
    const auto num_chars = UTF8::convertCodePointToUTF8(code_point, buff.data(), buff.size());
    row_str.append(buff.data(), num_chars);
}

/// Validates the argument list and reports String as the result type.
///
/// Accepts 1 to 64 arguments. Each argument must be a signed or unsigned integer, or an array whose
/// element type is a signed/unsigned integer or Nothing.
/// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH for a bad count and ILLEGAL_TYPE_OF_ARGUMENT for a bad type;
/// the latter message carries the zero-based position of the first offending argument.
DataTypePtr FunctionKqlMakeString::getReturnTypeImpl(const DataTypes & arguments) const
{
    const auto argument_count = std::ssize(arguments);
    if (argument_count < 1 || argument_count > 64)
        throw DB::Exception(
            DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
            "Number of arguments for function {} doesn't match: passed {}, should be between 1 and 64",
            getName(),
            argument_count);

    const auto is_unsupported = [](const auto & argument)
    {
        const auto * array_type = typeid_cast<const DataTypeArray *>(argument.get());
        if (!array_type)
        {
            WhichDataType scalar(argument);
            return !(scalar.isUInt() || scalar.isInt());
        }

        WhichDataType element(array_type->getNestedType()->getPtr());
        return !(element.isUInt() || element.isInt() || element.isNothing());
    };

    const auto first_unsupported = std::ranges::find_if(arguments, is_unsupported);
    if (first_unsupported == arguments.cend())
        return std::make_shared<DataTypeString>();

    throw DB::Exception(
        DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
        "Arguments type argument # {} for function {} doesn't match: arguments should be integers int,long or a dynamic value "
        "holding an array of integral numbers",
        std::distance(arguments.cbegin(), first_unsupported),
        getName());
}

/// Builds one output string per row by concatenating, in argument order, the UTF-8 encoding of every
/// code point supplied. A scalar argument contributes one code point; an array argument contributes
/// one code point per element.
///
/// Returns a String column with `input_rows_count` rows.
/// Out-of-range code points are rejected by convertAndAppendCodePoint, which throws.
ColumnPtr
FunctionKqlMakeString::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, const size_t input_rows_count) const
{
    /// Narrow a 64-bit value to int without wrap-around: a plain static_cast would turn e.g.
    /// 4294967361 into 65 and let it slip through the range check. Anything outside
    /// [0, 0x10FFFF] is mapped to -1, which convertAndAppendCodePoint rejects.
    const auto to_code_point = [](const Int64 value)
    {
        constexpr Int64 max_code_point = 0x10FFFF;
        return (0 <= value && value <= max_code_point) ? static_cast<int>(value) : -1;
    };

    auto out_col = ColumnString::create();

    for (size_t row = 0; row < input_rows_count; ++row)
    {
        String row_str;
        for (const auto & argument : arguments)
        {
            if (!WhichDataType(argument.type).isArray())
            {
                convertAndAppendCodePoint(to_code_point(argument.column->getInt(row)), row_str);
                continue;
            }

            Field arr_field;
            argument.column->get(row, arr_field);
            for (const auto & element : arr_field.get<Array>())
                convertAndAppendCodePoint(to_code_point(element.get<Int64>()), row_str);
        }
        out_col->insertData(row_str.c_str(), row_str.size());
    }
    return out_col;
}

/// Hooks FunctionKqlMakeString into the function factory so that it can be looked up at query time.
REGISTER_FUNCTION(KqlMakeString)
{
factory.registerFunction<FunctionKqlMakeString>();
}
}
10 changes: 10 additions & 0 deletions src/Parsers/Kusto/KQL_ReleaseNote.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,16 @@
ADX will incorrectly consume part of encapsulated IPv6 Host as Port from last colon to '/'.
print parse_url("http://[2001:db8:3333:4444:5555:6666:7777:8888]/filepath/index.htm")
```
- [arg_max()](https://learn.microsoft.com/en-us/azure/data-explorer/kusto/query/arg-max-aggfunction) and [arg_min()](https://learn.microsoft.com/en-us/azure/data-explorer/kusto/query/arg-min-aggfunction)
now support multiple arguments.
`Customers | summarize arg_max(Age, FirstName, LastName)`
Note: The wildcard parameter (`*`) is not supported yet and will be implemented in a future build. Additionally, the expression being maximized or minimized is always the last column in the output.
## Functions
- [indexof_regex](https://learn.microsoft.com/en-us/azure/data-explorer/kusto/query/indexofregexfunction)
`print idx1 = indexof_regex("abcabc", "a.c");`
- [make_string()](https://github.com/microsoft/Kusto-Query-Language/blob/master/doc/makestringfunction.md)
`print str = make_string(75, 117, 115, 116, 111)`

# March XX, 2023
## Functions
- [hash()](https://learn.microsoft.com/en-us/azure/data-explorer/kusto/query/hashfunction)
Expand Down
38 changes: 36 additions & 2 deletions src/Parsers/Kusto/KustoFunctions/KQLAggregationFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,46 @@ namespace DB

/// Translates a KQL `arg_max(ExprToMaximize, ExprToReturn, ...)` call into `argMax` expressions.
///
/// For every additional argument that differs textually from the first one,
/// `argMax(<expr>, <first>) as <expr>,` is appended to `out`; finally `argMax(<first>, <first>)` is
/// appended, so the maximized expression always comes last.
///
/// @param out  Receives the translated text (appended to, not overwritten).
/// @param pos  Token position; advanced past the consumed arguments.
/// @return false when no function name can be read at `pos`, true otherwise.
bool ArgMax::convertImpl(String & out, IParser::Pos & pos)
{
    const String fn_name = getKQLFunctionName(pos);
    if (fn_name.empty())
        return false;

    ++pos;
    const String expr_to_maximize = getConvertedArgument(fn_name, pos);
    while (pos->type == TokenType::Comma)
    {
        ++pos;
        const auto expr_to_return = getConvertedArgument(fn_name, pos);

        /// The maximized expression is emitted once, after the loop.
        if (expr_to_return == expr_to_maximize)
            continue;

        out += std::format("argMax({}, {}) as {},", expr_to_return, expr_to_maximize, expr_to_return);
    }
    out += std::format("argMax({}, {})", expr_to_maximize, expr_to_maximize);
    return true;
}

/// Translates a KQL `arg_min(ExprToMinimize, ExprToReturn, ...)` call into `argMin` expressions.
///
/// For every additional argument that differs textually from the first one,
/// `argMin(<expr>, <first>) as <expr>,` is appended to `out`; finally `argMin(<first>, <first>)` is
/// appended, so the minimized expression always comes last.
///
/// @param out  Receives the translated text (appended to, not overwritten).
/// @param pos  Token position; advanced past the consumed arguments.
/// @return false when no function name can be read at `pos`, true otherwise.
bool ArgMin::convertImpl(String & out, IParser::Pos & pos)
{
    const String fn_name = getKQLFunctionName(pos);
    if (fn_name.empty())
        return false;

    ++pos;
    const String expr_to_minimize = getConvertedArgument(fn_name, pos);
    while (pos->type == TokenType::Comma)
    {
        ++pos;
        const auto expr_to_return = getConvertedArgument(fn_name, pos);

        /// The minimized expression is emitted once, after the loop.
        if (expr_to_return == expr_to_minimize)
            continue;

        out += std::format("argMin({}, {}) as {},", expr_to_return, expr_to_minimize, expr_to_return);
    }
    out += std::format("argMin({}, {})", expr_to_minimize, expr_to_minimize);
    return true;
}

bool Avg::convertImpl(String & out, IParser::Pos & pos)
Expand Down
10 changes: 10 additions & 0 deletions src/Parsers/Kusto/KustoFunctions/KQLFunctionFactory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,13 @@ enum class KQLFunction : uint16_t
extract_json,
has_any_index,
indexof,
indexof_regex,
isempty,
isnan,
isnotempty,
isnotnull,
isnull,
make_string,
new_guid,
parse_command_line,
parse_csv,
Expand Down Expand Up @@ -318,13 +320,15 @@ const std::unordered_map<String, KQLFunction> KQL_FUNCTIONS{
{"extractjson", KQLFunction::extract_json},
{"has_any_index", KQLFunction::has_any_index},
{"indexof", KQLFunction::indexof},
{"indexof_regex", KQLFunction::indexof_regex},
{"isempty", KQLFunction::isempty},
{"isnan", KQLFunction::isnan},
{"isnotempty", KQLFunction::isnotempty},
{"notempty", KQLFunction::isnotempty},
{"isnotnull", KQLFunction::isnotnull},
{"notnull", KQLFunction::isnotnull},
{"isnull", KQLFunction::isnull},
{"make_string", KQLFunction::make_string},
{"new_guid", KQLFunction::new_guid},
{"parse_command_line", KQLFunction::parse_command_line},
{"parse_csv", KQLFunction::parse_csv},
Expand Down Expand Up @@ -674,6 +678,9 @@ std::unique_ptr<IParserKQLFunction> KQLFunctionFactory::get(const String & kql_f
case KQLFunction::indexof:
return std::make_unique<IndexOf>();

case KQLFunction::indexof_regex:
return std::make_unique<IndexOfRegex>();

case KQLFunction::isempty:
return std::make_unique<IsEmpty>();

Expand All @@ -689,6 +696,9 @@ std::unique_ptr<IParserKQLFunction> KQLFunctionFactory::get(const String & kql_f
case KQLFunction::isnull:
return std::make_unique<IsNull>();

case KQLFunction::make_string:
return std::make_unique<MakeString>();

case KQLFunction::new_guid:
return std::make_unique<NewGuid>();

Expand Down
Loading

0 comments on commit c3ef4c1

Please sign in to comment.