forked from ClickHouse/ClickHouse
-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'Kusto-phase3' into KQL_X3_URL_Parse
- Loading branch information
Showing
14 changed files
with
556 additions
and
76 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
#include <Columns/ColumnNullable.h> | ||
#include <Columns/ColumnString.h> | ||
#include <Columns/ColumnsNumber.h> | ||
#include <DataTypes/DataTypeString.h> | ||
#include <DataTypes/DataTypesNumber.h> | ||
#include <Functions/FunctionFactory.h> | ||
#include <Functions/FunctionHelpers.h> | ||
#include <Functions/IFunction.h> | ||
|
||
#include <re2/re2.h> | ||
|
||
/// Error codes referenced in this translation unit; they are only declared here.
namespace DB::ErrorCodes
{
extern const int CANNOT_COMPILE_REGEXP;
/// Thrown by FunctionKqlIndexOfRegex::extractIntegerArgumentColumn; declared explicitly so this file
/// does not depend on another header happening to declare it.
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
|
||
namespace DB | ||
{ | ||
class FunctionKqlIndexOfRegex : public IFunction | ||
{ | ||
public: | ||
static constexpr auto name = "kql_indexof_regex"; | ||
static FunctionPtr create(ContextPtr context) { return std::make_shared<FunctionKqlIndexOfRegex>(std::move(context)); } | ||
|
||
explicit FunctionKqlIndexOfRegex(ContextPtr context_) : context(std::move(context_)) { } | ||
~FunctionKqlIndexOfRegex() override = default; | ||
|
||
ColumnPtr | ||
executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override; | ||
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 4}; } | ||
String getName() const override { return name; } | ||
size_t getNumberOfArguments() const override { return 0; } | ||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override; | ||
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } | ||
bool isVariadic() const override { return true; } | ||
|
||
private: | ||
ColumnPtr extractArgumentColumnAsString(const ColumnWithTypeAndName & argument, const size_t input_rows_count) const | ||
{ | ||
if (isString(argument.type)) | ||
return argument.column; | ||
|
||
const ColumnsWithTypeAndName kql_to_string_args{argument}; | ||
return executeFunctionCall(context, "kql_tostring", kql_to_string_args, input_rows_count).first; | ||
} | ||
|
||
ColumnPtr extractIntegerArgumentColumn(const ColumnsWithTypeAndName & arguments, const int index, const int default_value) const | ||
{ | ||
if (index >= std::ssize(arguments)) | ||
return DataTypeInt32().createColumnConst(1, toField(default_value)); | ||
|
||
const auto & argument = arguments[index]; | ||
if (!isInteger(argument.type)) | ||
throw Exception( | ||
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, | ||
"Illegal type {} of argument #{} of function {}, expected integral type", | ||
argument.type->getName(), | ||
index, | ||
getName()); | ||
|
||
return argument.column; | ||
} | ||
|
||
ContextPtr context; | ||
}; | ||
|
||
/// Evaluates `kql_indexof_regex` for every row.
///
/// Arguments (by position): haystack, pattern, start (default 0), length (default -1, meaning
/// "to the end"), occurrence (default 1). The pattern is read from row 0 only and compiled once.
///
/// Result per row (Nullable(Int64)):
///  - NULL when start < 0, length < -1 or occurrence <= 0;
///  - the byte offset, counted from the beginning of the full haystack, of the `occurrence`-th
///    non-overlapping match inside the searched slice;
///  - -1 when the slice contains fewer than `occurrence` matches.
///
/// Throws CANNOT_COMPILE_REGEXP when the pattern is not a valid RE2 expression.
ColumnPtr
FunctionKqlIndexOfRegex::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, const size_t input_rows_count) const
{
    const auto in_column_haystack = extractArgumentColumnAsString(arguments[0], input_rows_count);
    const auto in_column_pattern = extractArgumentColumnAsString(arguments[1], input_rows_count);
    const auto in_column_start = extractIntegerArgumentColumn(arguments, 2, 0);
    const auto in_column_length = extractIntegerArgumentColumn(arguments, 3, -1);
    const auto in_column_occurrence = extractIntegerArgumentColumn(arguments, 4, 1);

    const auto pattern = in_column_pattern->getDataAt(0).toView();
    const RE2 precompiled_pattern(pattern, RE2::Quiet);
    if (!precompiled_pattern.ok())
        throw Exception(ErrorCodes::CANNOT_COMPILE_REGEXP, "{}: {}", getName(), precompiled_pattern.error());

    auto out_column = ColumnInt64::create(input_rows_count);
    auto out_null_map = ColumnUInt8::create(input_rows_count);

    auto & out_column_data = out_column->getData();
    auto & out_null_map_data = out_null_map->getData();
    for (size_t i = 0; i < input_rows_count; ++i)
    {
        const auto start = in_column_start->getInt(i);
        const auto length = in_column_length->getInt(i);
        const auto occurrence = in_column_occurrence->getInt(i);

        const auto is_invalid = start < 0 || length < -1 || occurrence <= 0;
        out_null_map_data[i] = is_invalid;

        if (is_invalid)
        {
            /// Give the nested value of a NULL row a defined content instead of leaving it unwritten.
            out_column_data[i] = 0;
            continue;
        }

        const auto haystack = in_column_haystack->getDataAt(i).toView();

        /// Clamp the start to the last byte of the haystack (or to 0 for an empty haystack) so that
        /// substr() never receives a position past the end.
        const auto bounded_start = std::min(start, std::max(std::ssize(haystack) - 1, Int64(0)));
        const auto shortened_haystack = haystack.substr(bounded_start, length == -1 ? std::string_view::npos : length);

        size_t offset = 0;
        re2::StringPiece partial_match;
        Int64 pass = 0;
        while (pass < occurrence && offset <= shortened_haystack.length()
               && precompiled_pattern.Match(shortened_haystack, offset, shortened_haystack.length(), RE2::UNANCHORED, &partial_match, 1))
        {
            const size_t match_begin = std::distance(shortened_haystack.data(), partial_match.data());
            offset = match_begin + partial_match.length();

            if (partial_match.length() == 0)
            {
                /// A zero-length match would otherwise be found again at the same offset and be
                /// counted as the next occurrence. Step over one character: one byte plus any
                /// UTF-8 continuation bytes (10xxxxxx) that follow it.
                ++offset;
                while (offset < shortened_haystack.length()
                       && (static_cast<unsigned char>(shortened_haystack[offset]) & 0xC0) == 0x80)
                    ++offset;
            }

            ++pass;
        }

        /// The reported position is relative to the full haystack, not to the searched slice.
        out_column_data[i] = pass == occurrence ? std::distance(haystack.data(), partial_match.data()) : -1;
    }

    return ColumnNullable::create(std::move(out_column), std::move(out_null_map));
}
|
||
/// Validates the argument count (2 to 5 inclusive) and reports the result type Nullable(Int64).
/// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH when the count is outside that range.
DataTypePtr FunctionKqlIndexOfRegex::getReturnTypeImpl(const DataTypes & arguments) const
{
    const auto argument_count = std::ssize(arguments);
    const bool count_is_acceptable = 2 <= argument_count && argument_count <= 5;
    if (!count_is_acceptable)
        throw Exception(
            ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
            "Number of arguments for function {} doesn't match: passed {}, should be between 2 and 5.",
            getName(),
            argument_count);

    const DataTypePtr nested_type = std::make_shared<DataTypeInt64>();
    return makeNullable(nested_type);
}
|
||
/// Registers FunctionKqlIndexOfRegex with the function factory.
REGISTER_FUNCTION(KqlIndexOfRegex)
{
    factory.registerFunction<FunctionKqlIndexOfRegex>();
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
#include <Columns/ColumnString.h> | ||
#include <DataTypes/DataTypeArray.h> | ||
#include <DataTypes/DataTypeString.h> | ||
#include <Functions/FunctionFactory.h> | ||
#include <Functions/FunctionHelpers.h> | ||
#include <Functions/IFunction.h> | ||
#include <Common/UTF8Helpers.h> | ||
|
||
#include <codecvt> | ||
#include <format> | ||
#include <locale> | ||
|
||
namespace DB | ||
{ | ||
/// Error codes thrown by FunctionKqlMakeString; they are only declared here.
namespace ErrorCodes
{
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
|
||
class FunctionKqlMakeString : public IFunction | ||
{ | ||
public: | ||
static constexpr auto name = "kql_make_string"; | ||
static FunctionPtr create(ContextPtr context) { return std::make_shared<FunctionKqlMakeString>(std::move(context)); } | ||
|
||
explicit FunctionKqlMakeString(ContextPtr context_) : context(std::move(context_)) { } | ||
~FunctionKqlMakeString() override = default; | ||
|
||
ColumnPtr | ||
executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override; | ||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override; | ||
String getName() const override { return name; } | ||
bool isVariadic() const override { return true; } | ||
size_t getNumberOfArguments() const override { return 0; } | ||
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } | ||
|
||
|
||
private: | ||
void convertAndAppendCodePoint(int code_point, String & row_str) const; | ||
ContextPtr context; | ||
}; | ||
|
||
void FunctionKqlMakeString::convertAndAppendCodePoint(const int code_point, String & row_str) const | ||
{ | ||
if (code_point < 0 || code_point > 1114111) | ||
throw DB::Exception( | ||
DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, | ||
"Argument in function {} is out of range, should be between 0 and 1114111", | ||
getName()); | ||
|
||
std::array<char, 4> buff; | ||
const auto num_chars = UTF8::convertCodePointToUTF8(code_point, buff.data(), buff.size()); | ||
row_str.append(buff.data(), num_chars); | ||
} | ||
|
||
/// Validates the arguments and reports the result type String.
///
/// Accepts between 1 and 64 arguments. Each must be a signed or unsigned integer, or an array
/// whose nested type is such an integer or Nothing.
/// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH for a bad count and ILLEGAL_TYPE_OF_ARGUMENT for the
/// first argument of an unsupported type.
DataTypePtr FunctionKqlMakeString::getReturnTypeImpl(const DataTypes & arguments) const
{
    const auto argument_count = std::ssize(arguments);
    if (argument_count < 1 || 64 < argument_count)
        throw DB::Exception(
            DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
            "Number of arguments for function {} doesn't match: passed {}, should be between 1 and 64",
            getName(),
            argument_count);

    const auto is_integral = [](const WhichDataType & which) { return which.isUInt() || which.isInt(); };

    const auto is_unsupported = [&is_integral](const auto & argument)
    {
        if (const auto * array_type = typeid_cast<const DataTypeArray *>(argument.get()))
        {
            const WhichDataType nested(array_type->getNestedType()->getPtr());
            return !(is_integral(nested) || nested.isNothing());
        }

        return !is_integral(WhichDataType(argument));
    };

    const auto offender = std::ranges::find_if(arguments, is_unsupported);
    if (offender == arguments.cend())
        return std::make_shared<DataTypeString>();

    throw DB::Exception(
        DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
        "Arguments type argument # {} for function {} doesn't match: arguments should be integers int,long or a dynamic value holding "
        "an array of "
        "integral numbers",
        std::distance(arguments.cbegin(), offender),
        getName());
}
|
||
/// Builds one string per row by concatenating the UTF-8 encodings of all code points supplied by
/// the arguments, in argument order. An array argument contributes each of its elements in order;
/// any other argument contributes its integer value for the row.
///
/// Throws (via convertAndAppendCodePoint) when a code point is outside [0, 1114111].
ColumnPtr
FunctionKqlMakeString::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, const size_t input_rows_count) const
{
    /// Narrows a 64-bit input to `int` without wrap-around. A plain static_cast would turn e.g.
    /// 2^32 + 65 into 65 and let it pass the range check. Every value outside the code point range
    /// is mapped to -1, which convertAndAppendCodePoint rejects with its usual range error.
    /// The bound mirrors the one checked in convertAndAppendCodePoint.
    const auto to_code_point = [](const Int64 value) -> int
    {
        constexpr Int64 max_code_point = 1114111;
        return (value < 0 || value > max_code_point) ? -1 : static_cast<int>(value);
    };

    auto out_col = ColumnString::create();

    /// Reused across rows so its capacity is kept.
    String row_str;
    for (size_t j = 0; j < input_rows_count; ++j)
    {
        row_str.clear();
        for (const auto & argument : arguments)
        {
            if (WhichDataType(argument.type).isArray())
            {
                Field arr_field;
                argument.column->get(j, arr_field);
                for (const auto & val : arr_field.get<Array>())
                    convertAndAppendCodePoint(to_code_point(val.get<Int64>()), row_str);
            }
            else
            {
                convertAndAppendCodePoint(to_code_point(argument.column->getInt(j)), row_str);
            }
        }
        out_col->insertData(row_str.data(), row_str.size());
    }
    return out_col;
}
|
||
/// Registers FunctionKqlMakeString with the function factory.
REGISTER_FUNCTION(KqlMakeString)
{
    factory.registerFunction<FunctionKqlMakeString>();
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.