Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add 'expression.max_compiled_regexes' Query Config #11850

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions velox/core/QueryConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,11 @@ class QueryConfig {
static constexpr const char* kExprMaxArraySizeInReduce =
"expression.max_array_size_in_reduce";

/// Controls the maximum number of compiled regular expression patterns per
/// function instance per thread of execution.
static constexpr const char* kExprMaxCompiledRegexes =
"expression.max_compiled_regexes";

/// Used for backpressure to block local exchange producers when the local
/// exchange buffer reaches or exceeds this size.
static constexpr const char* kMaxLocalExchangeBufferSize =
Expand Down Expand Up @@ -617,6 +622,10 @@ class QueryConfig {
return get<uint64_t>(kExprMaxArraySizeInReduce, 100'000);
}

/// Returns the value of 'expression.max_compiled_regexes' (default 100): the
/// maximum number of compiled regular expression patterns per function
/// instance per thread of execution.
uint64_t exprMaxCompiledRegexes() const {
return get<uint64_t>(kExprMaxCompiledRegexes, 100);
}

/// Returns the value of the property named by kAdjustTimestampToTimezone
/// (default false).
bool adjustTimestampToTimezone() const {
return get<bool>(kAdjustTimestampToTimezone, false);
}
Expand Down
4 changes: 4 additions & 0 deletions velox/docs/configs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,10 @@ Expression Evaluation Configuration
- integer
- 100000
- The ``Reduce`` function throws an error if it encounters an array of size greater than this.
* - expression.max_compiled_regexes
- integer
- 100
- Controls the maximum number of compiled regular expression patterns per function instance per thread of execution.
* - debug_disable_expression_with_peeling
- bool
- false
Expand Down
6 changes: 3 additions & 3 deletions velox/docs/functions/spark/regexp.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ See https://github.com/google/re2/wiki/Syntax for more information.
Note: The wildcard '%' represents 0, 1 or multiple characters and the
wildcard '_' represents exactly one character.

Note: Each function instance allow for a maximum of 20 regular expressions to
be compiled per thread of execution. Not all patterns require
compilation of regular expressions. Patterns 'hello', 'hello%', '_hello__%',
Note: Each function instance allows a maximum of ``expression.max_compiled_regexes``
(default 100) regular expressions to be compiled per thread of execution. Not all patterns
require compilation of regular expressions. Patterns 'hello', 'hello%', '_hello__%',
'%hello', '%__hello_', '%hello%', where 'hello', 'velox'
contains only regular characters and '_' wildcards are evaluated without
using regular expressions. Only those patterns that require the compilation of
Expand Down
67 changes: 43 additions & 24 deletions velox/functions/lib/Re2Functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ Expected<RE2*> ReCache::tryFindOrCompile(const StringView& pattern) {
return reIt->second.get();
}

if (cache_.size() >= kMaxCompiledRegexes) {
if (cache_.size() >= maxCompiledRegexes_) {
return folly::makeUnexpected(
Status::UserError("Max number of regex reached"));
}
Expand Down Expand Up @@ -239,6 +239,8 @@ class Re2MatchConstantPattern final : public exec::VectorFunction {
template <bool (*Fn)(StringView, const RE2&)>
class Re2Match final : public exec::VectorFunction {
public:
explicit Re2Match(int64_t maxCompiledRegexes) : cache_(maxCompiledRegexes) {}

void apply(
const SelectivityVector& rows,
std::vector<VectorPtr>& args,
Expand Down Expand Up @@ -359,8 +361,8 @@ class Re2SearchAndExtractConstantPattern final : public exec::VectorFunction {
template <typename T>
class Re2SearchAndExtract final : public exec::VectorFunction {
public:
explicit Re2SearchAndExtract(bool emptyNoMatch)
: emptyNoMatch_(emptyNoMatch) {}
explicit Re2SearchAndExtract(bool emptyNoMatch, int64_t maxCompiledRegexes)
: emptyNoMatch_(emptyNoMatch), cache_(maxCompiledRegexes) {}
void apply(
const SelectivityVector& rows,
std::vector<VectorPtr>& args,
Expand Down Expand Up @@ -886,11 +888,15 @@ class LikeWithRe2 final : public exec::VectorFunction {
};

// This function is constructed when pattern or escape are not constants.
// It allows up to kMaxCompiledRegexes different regular expressions to be
// compiled throughout the query lifetime per expression and thread of
// execution, note that optimized regular expressions that are not compiled are
// not counted.
// It allows up to 'expression.max_compiled_regexes' different regular
// expressions to be compiled throughout the query lifetime per expression and
// thread of execution. Optimized patterns that are evaluated without compiling
// a regular expression are not counted.
class LikeGeneric final : public exec::VectorFunction {
public:
explicit LikeGeneric(int64_t maxCompiledRegexes)
: maxCompiledRegexes_(maxCompiledRegexes) {}

void apply(
const SelectivityVector& rows,
std::vector<VectorPtr>& args,
Expand Down Expand Up @@ -1008,7 +1014,7 @@ class LikeGeneric final : public exec::VectorFunction {

VELOX_USER_CHECK_LT(
compiledRegularExpressions_.size(),
kMaxCompiledRegexes,
maxCompiledRegexes_,
"Max number of regex reached");

bool validEscapeUsage;
Expand All @@ -1033,6 +1039,7 @@ class LikeGeneric final : public exec::VectorFunction {
std::pair<std::string, std::optional<char>>,
std::unique_ptr<RE2>>
compiledRegularExpressions_;
int64_t maxCompiledRegexes_;
};

void re2ExtractAll(
Expand Down Expand Up @@ -1145,6 +1152,9 @@ class Re2ExtractAllConstantPattern final : public exec::VectorFunction {
template <typename T>
class Re2ExtractAll final : public exec::VectorFunction {
public:
explicit Re2ExtractAll(int64_t maxCompiledRegexes)
: cache_(maxCompiledRegexes) {}

void apply(
const SelectivityVector& rows,
std::vector<VectorPtr>& args,
Expand Down Expand Up @@ -1204,7 +1214,8 @@ class Re2ExtractAll final : public exec::VectorFunction {
template <bool (*Fn)(StringView, const RE2&)>
std::shared_ptr<exec::VectorFunction> makeRe2MatchImpl(
const std::string& name,
const std::vector<exec::VectorFunctionArg>& inputArgs) {
const std::vector<exec::VectorFunctionArg>& inputArgs,
const core::QueryConfig& config) {
if (inputArgs.size() != 2 || !inputArgs[0].type->isVarchar() ||
!inputArgs[1].type->isVarchar()) {
VELOX_UNSUPPORTED(
Expand All @@ -1220,11 +1231,14 @@ std::shared_ptr<exec::VectorFunction> makeRe2MatchImpl(
constantPattern->as<ConstantVector<StringView>>()->valueAt(0));
}

return std::make_shared<Re2Match<Fn>>();
return std::make_shared<Re2Match<Fn>>(config.exprMaxCompiledRegexes());
}

class RegexpReplaceWithLambdaFunction : public exec::VectorFunction {
public:
explicit RegexpReplaceWithLambdaFunction(int64_t maxCompiledRegexes)
: cache_(maxCompiledRegexes) {}

void apply(
const SelectivityVector& rows,
std::vector<VectorPtr>& args,
Expand Down Expand Up @@ -1592,8 +1606,8 @@ class RegexpReplaceWithLambdaFunction : public exec::VectorFunction {
std::shared_ptr<exec::VectorFunction> makeRe2Match(
const std::string& name,
const std::vector<exec::VectorFunctionArg>& inputArgs,
const core::QueryConfig& /*config*/) {
return makeRe2MatchImpl<re2FullMatch>(name, inputArgs);
const core::QueryConfig& config) {
return makeRe2MatchImpl<re2FullMatch>(name, inputArgs, config);
}

std::vector<std::shared_ptr<exec::FunctionSignature>> re2MatchSignatures() {
Expand All @@ -1608,8 +1622,8 @@ std::vector<std::shared_ptr<exec::FunctionSignature>> re2MatchSignatures() {
std::shared_ptr<exec::VectorFunction> makeRe2Search(
const std::string& name,
const std::vector<exec::VectorFunctionArg>& inputArgs,
const core::QueryConfig& /*config*/) {
return makeRe2MatchImpl<re2PartialMatch>(name, inputArgs);
const core::QueryConfig& config) {
return makeRe2MatchImpl<re2PartialMatch>(name, inputArgs, config);
}

std::vector<std::shared_ptr<exec::FunctionSignature>> re2SearchSignatures() {
Expand All @@ -1624,7 +1638,7 @@ std::vector<std::shared_ptr<exec::FunctionSignature>> re2SearchSignatures() {
std::shared_ptr<exec::VectorFunction> makeRe2Extract(
const std::string& name,
const std::vector<exec::VectorFunctionArg>& inputArgs,
const core::QueryConfig& /*config*/,
const core::QueryConfig& config,
const bool emptyNoMatch) {
auto numArgs = inputArgs.size();
VELOX_USER_CHECK(
Expand Down Expand Up @@ -1673,11 +1687,14 @@ std::shared_ptr<exec::VectorFunction> makeRe2Extract(
}
}

const auto maxCompiledRegexes = config.exprMaxCompiledRegexes();
switch (groupIdTypeKind) {
case TypeKind::INTEGER:
return std::make_shared<Re2SearchAndExtract<int32_t>>(emptyNoMatch);
return std::make_shared<Re2SearchAndExtract<int32_t>>(
emptyNoMatch, maxCompiledRegexes);
case TypeKind::BIGINT:
return std::make_shared<Re2SearchAndExtract<int64_t>>(emptyNoMatch);
return std::make_shared<Re2SearchAndExtract<int64_t>>(
emptyNoMatch, maxCompiledRegexes);
default:
VELOX_UNREACHABLE();
}
Expand Down Expand Up @@ -2158,14 +2175,14 @@ PatternMetadata determinePatternKind(
std::shared_ptr<exec::VectorFunction> makeLike(
const std::string& name,
const std::vector<exec::VectorFunctionArg>& inputArgs,
const core::QueryConfig& /*config*/) {
const core::QueryConfig& config) {
auto numArgs = inputArgs.size();

std::optional<char> escapeChar;
if (numArgs == 3) {
BaseVector* escape = inputArgs[2].constantValue.get();
if (!escape) {
return std::make_shared<LikeGeneric>();
return std::make_shared<LikeGeneric>(config.exprMaxCompiledRegexes());
}

auto constantEscape = escape->as<ConstantVector<StringView>>();
Expand All @@ -2191,7 +2208,7 @@ std::shared_ptr<exec::VectorFunction> makeLike(

BaseVector* constantPattern = inputArgs[1].constantValue.get();
if (!constantPattern) {
return std::make_shared<LikeGeneric>();
return std::make_shared<LikeGeneric>(config.exprMaxCompiledRegexes());
}

if (constantPattern->isNullAt(0)) {
Expand Down Expand Up @@ -2273,7 +2290,7 @@ std::vector<std::shared_ptr<exec::FunctionSignature>> likeSignatures() {
std::shared_ptr<exec::VectorFunction> makeRe2ExtractAll(
const std::string& name,
const std::vector<exec::VectorFunctionArg>& inputArgs,
const core::QueryConfig& /*config*/) {
const core::QueryConfig& config) {
auto numArgs = inputArgs.size();
VELOX_USER_CHECK(
numArgs == 2 || numArgs == 3,
Expand Down Expand Up @@ -2318,11 +2335,12 @@ std::shared_ptr<exec::VectorFunction> makeRe2ExtractAll(
}
}

const auto maxCompiledRegexes = config.exprMaxCompiledRegexes();
switch (groupIdTypeKind) {
case TypeKind::INTEGER:
return std::make_shared<Re2ExtractAll<int32_t>>();
return std::make_shared<Re2ExtractAll<int32_t>>(maxCompiledRegexes);
case TypeKind::BIGINT:
return std::make_shared<Re2ExtractAll<int64_t>>();
return std::make_shared<Re2ExtractAll<int64_t>>(maxCompiledRegexes);
default:
VELOX_UNREACHABLE();
}
Expand Down Expand Up @@ -2357,7 +2375,8 @@ std::shared_ptr<exec::VectorFunction> makeRegexpReplaceWithLambda(
const std::string& name,
const std::vector<exec::VectorFunctionArg>& inputArgs,
const core::QueryConfig& config) {
return std::make_shared<RegexpReplaceWithLambdaFunction>();
return std::make_shared<RegexpReplaceWithLambdaFunction>(
config.exprMaxCompiledRegexes());
}

std::vector<std::shared_ptr<exec::FunctionSignature>>
Expand Down
25 changes: 22 additions & 3 deletions velox/functions/lib/Re2Functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,6 @@ class PatternMetadata {
std::vector<std::string> substrings_;
};

inline const int kMaxCompiledRegexes = 20;

/// The functions in this file use RE2 as the regex engine. RE2 is fast, but
/// supports only a subset of PCRE syntax and in particular does not support
/// backtracking and associated features (e.g. backreferences).
Expand Down Expand Up @@ -255,18 +253,26 @@ std::vector<std::shared_ptr<exec::FunctionSignature>> re2ExtractAllSignatures();
namespace detail {

// A cache of compiled regular expressions (RE2 instances). Allows up to
// 'kMaxCompiledRegexes' different expressions.
// 'expression.max_compiled_regexes' different expressions.
//
// Compiling regular expressions is expensive. It can take up to 200 times
// more CPU time to compile a regex vs. evaluate it.
class ReCache {
public:
/// Creates an empty cache that accepts at most 'maxCompiledRegexes' distinct
/// patterns. A limit of 0 makes every lookup of an uncached pattern fail
/// until the limit is raised via setMaxCompiledRegexes().
explicit ReCache(uint64_t maxCompiledRegexes)
: maxCompiledRegexes_(maxCompiledRegexes) {}

/// Replaces the limit given at construction. Only the stored limit changes;
/// this call does not touch entries already in the cache.
void setMaxCompiledRegexes(uint64_t maxCompiledRegexes) {
maxCompiledRegexes_ = maxCompiledRegexes;
}

/// Returns the compiled RE2 for 'pattern'. Defined out of line.
/// NOTE(review): presumably reports failures by throwing where
/// tryFindOrCompile() returns an error status - confirm against the
/// definition.
RE2* findOrCompile(const StringView& pattern);

/// Returns the cached RE2 for 'pattern' when one exists. When the pattern is
/// not cached and the cache already holds 'maxCompiledRegexes_' entries,
/// returns a user error ("Max number of regex reached") instead of throwing.
/// Defined out of line.
Expected<RE2*> tryFindOrCompile(const StringView& pattern);

private:
// Compiled expressions owned by the cache, keyed by pattern string.
folly::F14FastMap<std::string, std::unique_ptr<RE2>> cache_;
// Upper bound on cache_.size() enforced when a new pattern is requested.
uint64_t maxCompiledRegexes_;
};

} // namespace detail
Expand All @@ -287,6 +293,8 @@ template <
std::string (*prepareRegexpPattern)(const StringView&),
std::string (*prepareRegexpReplacement)(const RE2&, const StringView&)>
struct Re2RegexpReplace {
Re2RegexpReplace() : cache_(0) {}

VELOX_DEFINE_FUNCTION_TYPES(T);

FOLLY_ALWAYS_INLINE void initialize(
Expand All @@ -304,6 +312,7 @@ struct Re2RegexpReplace {
processedPattern,
re_->error());
}
cache_.setMaxCompiledRegexes(config.exprMaxCompiledRegexes());

if (replacement != nullptr) {
// Constant 'replacement' with non-constant 'pattern' needs to be
Expand Down Expand Up @@ -377,8 +386,18 @@ struct Re2RegexpReplace {

template <typename TExec>
struct Re2RegexpSplit {
Re2RegexpSplit() : cache_(0) {}

VELOX_DEFINE_FUNCTION_TYPES(TExec);

FOLLY_ALWAYS_INLINE void initialize(
const std::vector<TypePtr>& /*inputTypes*/,
const core::QueryConfig& config,
const arg_type<Varchar>* /*string*/,
const arg_type<Varchar>* /*pattern*/) {
cache_.setMaxCompiledRegexes(config.exprMaxCompiledRegexes());
}

static constexpr int32_t reuse_strings_from_arg = 0;

void call(
Expand Down
Loading
Loading