Skip to content

Commit

Permalink
chore(storage/bloom): support simplifiable regexp matchers (#14622)
Browse files Browse the repository at this point in the history
This adds support for basic regexps which can be simplified into a sequence of
OR matchers, such as:

* `key=~"value"` becomes `key="value"`.
* `key=~"value1|value2"` becomes `key="value1"` or `key="value2"`.
* `key=~".+"` checks for the presence of `key`. This is currently the only way to
   check if a key exists.

Only the cases above are "officially" supported. However, we technically
support basic concatenations and character classes due to how regexp/syntax
parses and simplifies expressions such as `value1|value2` into `value[12]`.

To prevent unbounded cardinality, we limit regexp expansion to 25 matchers;
otherwise a regexp like `value[0-9][0-9][0-9][0-9]` would expand into 10,000
matchers (too many!).

Closes grafana/loki-private#1106.

Co-authored-by: J Stickler <[email protected]>
  • Loading branch information
rfratto and JStickler authored Nov 4, 2024
1 parent 7b53f20 commit 8eca826
Show file tree
Hide file tree
Showing 6 changed files with 425 additions and 38 deletions.
5 changes: 5 additions & 0 deletions docs/sources/query/query_accceleration.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ If [bloom filters][] are enabled, you can write LogQL queries using [structured
Queries will be accelerated for any [label filter expression][] that satisfies _all_ of the following criteria:

* The label filter expression uses **string equality**, such as `| key="value"`.
* `or` and `and` operators can be used to match multiple values, such as `| detected_level="error" or detected_level="warn"`.
* _Basic_ regular expressions are automatically simplified into a supported expression:
* `| key=~"value"` is converted to `| key="value"`.
* `| key=~"value1|value2"` is converted to `| key="value1" or key="value2"`.
* `| key=~".+"` checks for existence of `key`. `.*` is not supported.
* The label filter expression is querying for structured metadata and not a stream label.
* The label filter expression is placed before any [parser expression][], [labels format expression][], [drop labels expression][], or [keep labels expression][].

Expand Down
6 changes: 3 additions & 3 deletions pkg/bloomgateway/processor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ func TestProcessor(t *testing.T) {
}

matchers := []v1.LabelMatcher{
v1.PlainLabelMatcher{
v1.KeyValueMatcher{
Key: "trace_id",
Value: "nomatch",
},
Expand Down Expand Up @@ -191,7 +191,7 @@ func TestProcessor(t *testing.T) {
day: config.NewDayTime(truncateDay(now)),
}
matchers := []v1.LabelMatcher{
v1.PlainLabelMatcher{
v1.KeyValueMatcher{
Key: "trace_id",
Value: "nomatch",
},
Expand Down Expand Up @@ -238,7 +238,7 @@ func TestProcessor(t *testing.T) {
day: config.NewDayTime(truncateDay(now)),
}
matchers := []v1.LabelMatcher{
v1.PlainLabelMatcher{
v1.KeyValueMatcher{
Key: "trace_id",
Value: "nomatch",
},
Expand Down
202 changes: 191 additions & 11 deletions pkg/storage/bloom/v1/ast_extractor.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,24 @@
package v1

import (
regexsyn "github.com/grafana/regexp/syntax"

"github.com/prometheus/prometheus/model/labels"

"github.com/grafana/loki/v3/pkg/logql/log"
"github.com/grafana/loki/v3/pkg/logql/syntax"
"github.com/grafana/loki/v3/pkg/util"
)

// maxRegexMatchers caps how many literal values a single simplifiable regexp
// may be expanded into.
//
// Simplifiable regexp expressions can quickly expand into very high
// cardinality; we limit the number of matchers to prevent this. However,
// since bloom tests are relatively cheap, we can afford to be a little
// generous while still preventing excessive cardinality.
//
// For example, the regex `[0-9]` expands to 10 matchers (0, 1, .. 9), while
// `[0-9][0-9][0-9]` expands to 1000 matchers (000, 001, .., 999) and is
// therefore rejected.
//
// The cap is enforced by expandSubexpr while it expands character classes,
// alternations, and concatenations.
//
// NOTE(review): the description of the change that introduced this constant
// mentions a limit of 25 matchers — confirm that 200 is the intended value.
const maxRegexMatchers = 200

// LabelMatcher represents bloom tests for key-value pairs, mapped from
// LabelFilterExprs from the AST.
type LabelMatcher interface{ isLabelMatcher() }
Expand All @@ -15,9 +27,13 @@ type LabelMatcher interface{ isLabelMatcher() }
// mapped. Bloom tests for UnsupportedLabelMatchers must always pass.
type UnsupportedLabelMatcher struct{}

// PlainLabelMatcher represents a direct key-value matcher. Bloom tests
// must only pass if the key-value pair exists in the bloom.
type PlainLabelMatcher struct{ Key, Value string }
// KeyValueMatcher represents a direct key-value matcher. Bloom tests must only
// pass if the key-value pair exists in the bloom.
//
// It is produced for equality filters (key="value") and for regexp filters
// that can be simplified into one or more literal values.
type KeyValueMatcher struct{ Key, Value string }

// KeyMatcher represents a key matcher. Bloom tests must only pass if the key
// exists in the bloom.
//
// It is produced for regexp filters of the form key=~".+", which only assert
// that the key is present.
type KeyMatcher struct{ Key string }

// OrLabelMatcher represents a logical OR test. Bloom tests must only pass if
// one of the Left or Right label matcher bloom tests pass.
Expand Down Expand Up @@ -54,21 +70,27 @@ func buildLabelMatcher(filter log.LabelFilterer) LabelMatcher {
switch filter := filter.(type) {

case *log.LineFilterLabelFilter:
if filter.Type != labels.MatchEqual {
return UnsupportedLabelMatcher{}
if filter.Type == labels.MatchEqual {
return KeyValueMatcher{
Key: filter.Name,
Value: filter.Value,
}
} else if filter.Type == labels.MatchRegexp {
reg, err := regexsyn.Parse(filter.Value, regexsyn.Perl)
if err != nil {
return UnsupportedLabelMatcher{}
}
return buildSimplifiedRegexMatcher(filter.Name, reg.Simplify())
}

return PlainLabelMatcher{
Key: filter.Name,
Value: filter.Value,
}
return UnsupportedLabelMatcher{}

case *log.StringLabelFilter:
if filter.Type != labels.MatchEqual {
return UnsupportedLabelMatcher{}
}

return PlainLabelMatcher{
return KeyValueMatcher{
Key: filter.Name,
Value: filter.Value,
}
Expand All @@ -89,11 +111,169 @@ func buildLabelMatcher(filter log.LabelFilterer) LabelMatcher {
}
}

// buildSimplifiedRegexMatcher builds a label matcher for key from reg, a
// parsed (and ideally simplified) regular expression. reg may be mutated.
//
// The following shapes are translated; everything else yields an
// UnsupportedLabelMatcher:
//
//   - A case-sensitive literal becomes a KeyValueMatcher.
//   - An alternation becomes a chain of OrLabelMatchers, one per branch.
//   - A concatenation is expanded by expandSubexpr into every literal value it
//     can match, which are then ORed together as KeyValueMatchers.
//   - `.+` becomes a KeyMatcher, which only tests for the presence of key.
//
// Case-insensitive literals such as `(?i)error` are reported as unsupported:
// the bloom can only be tested for exact values, so turning them into an exact
// match would wrongly reject data containing other casings.
func buildSimplifiedRegexMatcher(key string, reg *regexsyn.Regexp) LabelMatcher {
	switch reg.Op {
	case regexsyn.OpAlternate:
		util.ClearCapture(reg)

		// An alternation without subexpressions shouldn't be possible (even
		// `warn|` has two subexpressions, where the latter matches an empty
		// string), but check before indexing to avoid a potential panic.
		if len(reg.Sub) == 0 {
			return UnsupportedLabelMatcher{}
		}

		left := buildSimplifiedRegexMatcher(key, reg.Sub[0])
		for _, sub := range reg.Sub[1:] {
			right := buildSimplifiedRegexMatcher(key, sub)
			left = OrLabelMatcher{Left: left, Right: right}
		}
		return left

	case regexsyn.OpConcat:
		// OpConcat checks for the concatenation of two or more subexpressions. For
		// example, value1|value2 simplifies to value[12], with the two
		// subexpressions value and [12].
		//
		// We expand subexpressions back out into full matchers where possible, so
		// value[12] becomes value1 OR value2, and value[1-9] becomes value1 OR
		// value2 .. OR value9.
		util.ClearCapture(reg)

		values, ok := expandSubexpr(reg)
		if !ok || len(values) == 0 {
			return UnsupportedLabelMatcher{}
		}

		var left LabelMatcher = KeyValueMatcher{Key: key, Value: values[0]}
		for _, value := range values[1:] {
			right := KeyValueMatcher{Key: key, Value: value}
			left = OrLabelMatcher{Left: left, Right: right}
		}
		return left

	case regexsyn.OpCapture:
		// The recursion below is on reg itself, so this relies on
		// util.ClearCapture (defined elsewhere) rewriting reg in place so that it
		// is no longer an OpCapture node.
		util.ClearCapture(reg)
		return buildSimplifiedRegexMatcher(key, reg)

	case regexsyn.OpLiteral:
		// A case-folded literal matches several distinct values; an exact
		// key-value test would produce false negatives.
		if reg.Flags&regexsyn.FoldCase != 0 {
			return UnsupportedLabelMatcher{}
		}
		return KeyValueMatcher{
			Key:   key,
			Value: string(reg.Rune),
		}

	case regexsyn.OpPlus:
		if len(reg.Sub) != 1 {
			return UnsupportedLabelMatcher{}
		}

		switch reg.Sub[0].Op {
		case regexsyn.OpAnyChar, regexsyn.OpAnyCharNotNL: // .+
			return KeyMatcher{Key: key}
		default:
			return UnsupportedLabelMatcher{}
		}

	default:
		return UnsupportedLabelMatcher{}
	}
}

// expandSubexpr expands reg into the full list of literal strings it can
// match. reg may be mutated.
//
// It returns ok=false when reg cannot be expressed as a finite list of exact
// values: when it contains an unsupported operator, when it contains a
// case-insensitive literal, or when the expansion would exceed
// maxRegexMatchers entries. On ok=false the returned slice is always nil.
func expandSubexpr(reg *regexsyn.Regexp) (prefixes []string, ok bool) {
	switch reg.Op {
	case regexsyn.OpAlternate:
		util.ClearCapture(reg)

		// The expansion of an alternation is the union of its branches.
		for _, sub := range reg.Sub {
			subPrefixes, subOK := expandSubexpr(sub)
			if !subOK || len(prefixes)+len(subPrefixes) > maxRegexMatchers {
				return nil, false
			}
			prefixes = append(prefixes, subPrefixes...)
		}
		return prefixes, true

	case regexsyn.OpCharClass:
		// OpCharClass stores inclusive (start, end) pairs of runes, so [12] is
		// the single range []rune{'1', '2'}, while [15] is represented as the
		// two ranges []rune{'1', '1', '5', '5'}.
		//
		// To expand OpCharClass, we iterate over each pair of runes.
		if len(reg.Rune)%2 != 0 {
			// Invalid regexp; sequences should be even.
			return nil, false
		}

		for i := 0; i < len(reg.Rune); i += 2 {
			start, end := reg.Rune[i], reg.Rune[i+1]
			for r := start; r <= end; r++ {
				prefixes = append(prefixes, string(r))
				// Bail out as soon as the cap is exceeded so that wide ranges
				// (for example negated classes) are not fully enumerated.
				if len(prefixes) > maxRegexMatchers {
					return nil, false
				}
			}
		}
		return prefixes, true

	case regexsyn.OpConcat:
		if len(reg.Sub) == 0 {
			return nil, false
		}

		// We get the prefixes for each subexpression and then iteratively combine
		// them together.
		//
		// For the regexp [12][34]value (which concatenates [12], [34], and value):
		//
		// 1. We get the prefixes for [12], which are 1 and 2.
		// 2. We get the prefixes for [34], which are 3 and 4.
		// 3. We add the prefixes together to get 13, 14, 23, and 24.
		// 4. We get the prefixes for value, which is value.
		// 5. Finally, we add the prefixes together to get 13value, 14value,
		//    23value, and 24value.
		curPrefixes, curOK := expandSubexpr(reg.Sub[0])
		if !curOK {
			return nil, false
		}

		for _, sub := range reg.Sub[1:] {
			subPrefixes, subOK := expandSubexpr(sub)
			if !subOK {
				return nil, false
			}

			// Check the size of the cross product before materializing it.
			combined := len(curPrefixes) * len(subPrefixes)
			if combined > maxRegexMatchers {
				return nil, false
			}

			newPrefixes := make([]string, 0, combined)
			for _, curPrefix := range curPrefixes {
				for _, subPrefix := range subPrefixes {
					newPrefixes = append(newPrefixes, curPrefix+subPrefix)
				}
			}
			curPrefixes = newPrefixes
		}
		return curPrefixes, true

	case regexsyn.OpCapture:
		// The recursion below is on reg itself, so this relies on
		// util.ClearCapture (defined elsewhere) rewriting reg in place so that it
		// is no longer an OpCapture node.
		util.ClearCapture(reg)
		return expandSubexpr(reg)

	case regexsyn.OpLiteral:
		// A case-folded literal (for example from `(?i)value`) matches several
		// distinct strings and cannot be represented by a single exact value.
		if reg.Flags&regexsyn.FoldCase != 0 {
			return nil, false
		}
		return []string{string(reg.Rune)}, true

	default:
		return nil, false
	}
}

//
// Implement marker types:
//

func (UnsupportedLabelMatcher) isLabelMatcher() {}
func (PlainLabelMatcher) isLabelMatcher() {}
func (KeyValueMatcher) isLabelMatcher() {}
func (KeyMatcher) isLabelMatcher() {}
func (OrLabelMatcher) isLabelMatcher() {}
func (AndLabelMatcher) isLabelMatcher() {}
Loading

0 comments on commit 8eca826

Please sign in to comment.