Skip to content

Commit

Permalink
Added bulkmatch - much, much faster matching of logs to rules
Browse files Browse the repository at this point in the history
  • Loading branch information
fingon committed Dec 5, 2024
1 parent 184dd84 commit e1d164c
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 12 deletions.
10 changes: 0 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,16 +59,6 @@ local Lixie instance which probably does not exist.

## Big features

- Optimize rule evaluation (it could be also used when generating rules for
Vector); identify the stuff with most cardinality, match by that first
(and order the rules based on that too; it doesn't hurt to make it user
visible)

- idea: # of field matchers / # of regexp field matchers = score (and
then do iteratively by fields)

- As an option, could also parallelize it?

- Rethink how rules are stored; just big json file can get bit unwieldy?

## Robustness
Expand Down
128 changes: 128 additions & 0 deletions data/bulkmatch.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
/*
* Author: Markus Stenberg <[email protected]>
*
* Copyright (c) 2024 Markus Stenberg
*
*/

/*
Bulk rule matcher.
It first analyzes the rules and finds the field with the most exact
matches. Then, using that field, it builds a sequence of matcher objects
which either do a single match (slow), or a map-based lookup + list-based
match (fast) for the given key.
*/

package data

import "log/slog"

// logMatcher is one step in the matcher sequence of a BulkRuleMatcher.
//
// ToRule is handed the value the bulk matcher extracted from the log for
// its chosen field, plus the log itself, and returns the rule the log
// matches or nil when this step matches nothing.
type logMatcher interface {
	ToRule(value string, log *Log) *LogRule
}

// singleLogMatcher is the slow-path logMatcher: it wraps exactly one rule
// and evaluates it directly against every log it is given.
type singleLogMatcher struct {
	// rule is the only rule this matcher can return.
	rule *LogRule
}

// ToRule returns the wrapped rule when LogMatchesRule accepts log for it,
// and nil otherwise. The pre-extracted field value is ignored; the rule is
// always evaluated in full.
func (self *singleLogMatcher) ToRule(_ string, log *Log) *LogRule {
	if !LogMatchesRule(log, self.rule) {
		return nil
	}
	return self.rule
}

// mapLogMatcher is the fast-path logMatcher: candidate rules are looked up
// by the log's value for the bulk matcher's chosen field, so only rules
// registered under that value are evaluated.
type mapLogMatcher struct {
	// value2Rules maps a field value to the rules registered under it, in
	// the order they were added.
	value2Rules map[string][]*LogRule
}

// ToRule returns the first rule registered under value that log matches
// according to LogMatchesRule, or nil if there is no such rule.
func (self *mapLogMatcher) ToRule(value string, log *Log) *LogRule {
	// A missing key yields a nil slice, so the loop body simply never runs.
	for _, candidate := range self.value2Rules[value] {
		if LogMatchesRule(log, candidate) {
			return candidate
		}
	}
	return nil
}

// BulkRuleMatcher matches a log against a prepared sequence of matchers
// built by NewBulkRuleMatcher.
type BulkRuleMatcher struct {
	// field is the log field whose value is handed to every matcher; it is
	// empty when no field was chosen.
	field string
	// matchers are consulted in order; the first non-nil result wins.
	matchers []logMatcher
}

// ToRule returns the first rule, in matcher order, that log matches, or nil
// if no matcher produces one.
//
// The lookup key is the log's value for the chosen field: taken from
// log.Stream when present there, otherwise from log.Fields when that entry
// is a string, otherwise the empty string.
func (self *BulkRuleMatcher) ToRule(log *Log) *LogRule {
	key, inStream := log.Stream[self.field]
	if !inStream {
		// A failed assertion leaves key as "", which is the fallback for
		// fields that are absent or not strings.
		key, _ = log.Fields[self.field].(string)
	}
	for _, m := range self.matchers {
		if hit := m.ToRule(key, log); hit != nil {
			return hit
		}
	}
	return nil
}

// NewBulkRuleMatcher builds a BulkRuleMatcher for rules.
//
// It counts, per field, how many exact ("=") matchers the rules contain and
// picks the field with the highest count; ties are broken by field name so
// the same rule set always produces the same matcher. It then walks the
// rules in order: each run of consecutive rules that have an exact matcher
// on the chosen field is indexed by that matcher's value in a single
// map-based matcher (fast path), and every other rule gets its own
// single-rule matcher (slow path). The resulting matchers are in the same
// relative order as the rules they were built from.
//
// With no rules, or no exact matchers at all, every rule takes the slow
// path.
func NewBulkRuleMatcher(rules []*LogRule) *BulkRuleMatcher {
	exactCounts := map[string]int{}
	for _, rule := range rules {
		for _, matcher := range rule.Matchers {
			if matcher.Op == "=" {
				exactCounts[matcher.Field]++
			}
		}
	}
	// TODO: While this is semi ok, what if all values are same?
	// it isn't as useful for exact matching..
	bestField := ""
	bestCount := 0
	for field, count := range exactCounts {
		// Map iteration order is unspecified, so without the name-based
		// tie-break the chosen field could differ between runs.
		if count > bestCount || (count == bestCount && field < bestField) {
			bestField = field
			bestCount = count
		}
	}
	slog.Debug("NewBulkRuleMatcher chose", "field", bestField, "count", bestCount)

	brm := BulkRuleMatcher{field: bestField}
	// currentMatcher is the map matcher collecting the current run of fast
	// rules; it is reset whenever a slow rule interrupts the run.
	var currentMatcher *mapLogMatcher
	var slowRules, fastRules int
	for _, rule := range rules {
		indexed := false
		if bestField != "" {
			for _, matcher := range rule.Matchers {
				if matcher.Op != "=" || matcher.Field != bestField {
					continue
				}
				if currentMatcher == nil {
					currentMatcher = &mapLogMatcher{value2Rules: make(map[string][]*LogRule)}
					brm.matchers = append(brm.matchers, currentMatcher)
				}
				currentMatcher.value2Rules[matcher.Value] = append(currentMatcher.value2Rules[matcher.Value], rule)
				indexed = true
				fastRules++
				// Only the first exact matcher on the field is used as key.
				break
			}
		}
		if indexed {
			continue
		}
		// Use fallback code here; unfortunate, but it is what it is
		currentMatcher = nil
		brm.matchers = append(brm.matchers, &singleLogMatcher{rule: rule})
		slowRules++
	}
	slog.Debug("Produced matcher", "fast", fastRules, "slow", slowRules)
	return &brm
}
4 changes: 3 additions & 1 deletion data/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ type LogRules struct {
// Reversed rules - these are always available if Rules are
Reversed []*LogRule `json:"-"`

brm *BulkRuleMatcher

// Internal tracking of rule matches to log lines
rid2Count map[int]int
}
Expand Down Expand Up @@ -67,7 +69,7 @@ func NewLogRules(rules []*LogRule, version int) LogRules {
for k, v := range rules {
reversed[count-k-1] = v
}
return LogRules{Rules: rules, Reversed: reversed, Version: version}
return LogRules{Rules: rules, Reversed: reversed, brm: NewBulkRuleMatcher(reversed), Version: version}
}

func (self *Database) add(r LogRule) error {
Expand Down
2 changes: 1 addition & 1 deletion data/log.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ func (self *Log) MatchesFTS(search string) bool {

func (self *Log) ToRule(rules *LogRules) *LogRule {
if self.rulesVersion != rules.Version {
self.rule = LogToRule(self, rules.Reversed)
self.rule = rules.brm.ToRule(self)
self.rulesVersion = rules.Version
}
return self.rule
Expand Down

0 comments on commit e1d164c

Please sign in to comment.