diff --git a/README.md b/README.md
index fbe8818..928aafe 100644
--- a/README.md
+++ b/README.md
@@ -59,16 +59,6 @@ local Lixie instance which probably does not exist.
 
 ## Big features
 
-- Optimize rule evaluation (it could be also used when generating rules for
-  Vector); identify the stuff with most cardinality, match by that first
-  (and order the rules based on that too; it doesn't hurt to make it user
-  visible)
-
-    - idea: # of field matchers / # of regexp field matchers = score (and
-      then do iteratively by fields)
-
-    - As an option, could also parallelize it?
-
 - Rethink how rules are stored; just big json file can get bit unwieldy?
 
 ## Robustness
diff --git a/data/bulkmatch.go b/data/bulkmatch.go
new file mode 100644
index 0000000..c98f389
--- /dev/null
+++ b/data/bulkmatch.go
@@ -0,0 +1,128 @@
+/*
+ * Author: Markus Stenberg
+ *
+ * Copyright (c) 2024 Markus Stenberg
+ *
+ */
+
+/*
+ Bulk rule matcher.
+
+ It first analyzes the rules and finds the field with the most exact
+matches. Then, using that field, it builds a sequence of match objects
+which either do a single match (slow) or a map-based lookup + list-based
+match (fast) for the given key.
+*/
+
+package data
+
+import "log/slog"
+
+type logMatcher interface {
+	ToRule(_ string, log *Log) *LogRule
+}
+
+type singleLogMatcher struct {
+	rule *LogRule
+}
+
+func (self *singleLogMatcher) ToRule(_ string, log *Log) *LogRule {
+	if LogMatchesRule(log, self.rule) {
+		return self.rule
+	}
+	return nil
+}
+
+type mapLogMatcher struct {
+	value2Rules map[string][]*LogRule
+}
+
+func (self *mapLogMatcher) ToRule(value string, log *Log) *LogRule {
+	rules, ok := self.value2Rules[value]
+	if !ok {
+		return nil
+	}
+	for _, rule := range rules {
+		if LogMatchesRule(log, rule) {
+			return rule
+		}
+	}
+	return nil
+}
+
+type BulkRuleMatcher struct {
+	field    string
+	matchers []logMatcher
+}
+
+func (self *BulkRuleMatcher) ToRule(log *Log) *LogRule {
+	value, ok := log.Stream[self.field]
+	if !ok {
+		value, ok = log.Fields[self.field].(string)
+		if !ok {
+			value = ""
+		}
+	}
+	for _, matcher := range self.matchers {
+		rule := matcher.ToRule(value, log)
+		if rule != nil {
+			return rule
+		}
+	}
+	return nil
+}
+
+func NewBulkRuleMatcher(rules []*LogRule) *BulkRuleMatcher {
+	fieldToEMcount := map[string]int{}
+	for _, rule := range rules {
+		for _, matcher := range rule.Matchers {
+			if matcher.Op != "=" {
+				continue
+			}
+			cnt := fieldToEMcount[matcher.Field]
+			cnt++
+			fieldToEMcount[matcher.Field] = cnt
+		}
+	}
+	// TODO: While this is semi ok, what if all values are the same?
+	// Then it isn't as useful for exact matching.
+	bestField := ""
+	bestCount := 0
+	for field, count := range fieldToEMcount {
+		if count > bestCount {
+			bestField = field
+			bestCount = count
+		}
+	}
+	slog.Debug("NewBulkRuleMatcher chose", "field", bestField, "count", bestCount)
+	brm := BulkRuleMatcher{field: bestField}
+	var currentMatcher *mapLogMatcher
+	var slowRules, fastRules int
+	for _, rule := range rules {
+		found := false
+		if bestField != "" {
+			for _, matcher := range rule.Matchers {
+				if matcher.Op == "=" && matcher.Field == bestField {
+					if currentMatcher == nil {
+						currentMatcher = &mapLogMatcher{make(map[string][]*LogRule)}
+						brm.matchers = append(brm.matchers, currentMatcher)
+					}
+					rules := currentMatcher.value2Rules[matcher.Value]
+					rules = append(rules, rule)
+					currentMatcher.value2Rules[matcher.Value] = rules
+					found = true
+					fastRules++
+					break
+				}
+			}
+		}
+		if !found {
+			currentMatcher = nil
+			// Use fallback code here; unfortunate, but it is what it is
+			brm.matchers = append(brm.matchers, &singleLogMatcher{rule})
+			slowRules++
+		}
+	}
+	slog.Debug("Produced matcher", "fast", fastRules, "slow", slowRules)
+	return &brm
+}
diff --git a/data/db.go b/data/db.go
index 0f8e965..f73d34f 100644
--- a/data/db.go
+++ b/data/db.go
@@ -34,6 +34,8 @@ type LogRules struct {
 	// Reversed rules - these are always available if Rules are
 	Reversed []*LogRule `json:"-"`
 
+	brm *BulkRuleMatcher
+
 	// Internal tracking of rule matches to log lines
 	rid2Count map[int]int
 }
@@ -67,7 +69,7 @@ func NewLogRules(rules []*LogRule, version int) LogRules {
 	for k, v := range rules {
 		reversed[count-k-1] = v
 	}
-	return LogRules{Rules: rules, Reversed: reversed, Version: version}
+	return LogRules{Rules: rules, Reversed: reversed, brm: NewBulkRuleMatcher(reversed), Version: version}
 }
 
 func (self *Database) add(r LogRule) error {
diff --git a/data/log.go b/data/log.go
index 003b037..6d5ecff 100644
--- a/data/log.go
+++ b/data/log.go
@@ -52,7 +52,7 @@ func (self *Log) MatchesFTS(search string) bool {
 
 func (self *Log) ToRule(rules *LogRules) *LogRule {
 	if self.rulesVersion != rules.Version {
-		self.rule = LogToRule(self, rules.Reversed)
+		self.rule = rules.brm.ToRule(self)
 		self.rulesVersion = rules.Version
 	}
 	return self.rule
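
Usage sketch (not part of the diff above): a minimal example of how the new matcher behaves when driven directly. Only NewBulkRuleMatcher, BulkRuleMatcher.ToRule, LogRule.Matchers and Log.Stream/Fields appear in the diff; LogFieldMatcher as the matcher type name, the exact field layouts, and the "=~" operator for a non-exact match are assumptions about the surrounding package. In the real code path the matcher is built once per rule set in NewLogRules and consulted through Log.ToRule.

// Hypothetical driver in package data; names not visible in the diff
// (LogFieldMatcher, Stream's map type, the "=~" op) are assumptions.
func exampleBulkMatch() *LogRule {
	rules := []*LogRule{
		{Matchers: []LogFieldMatcher{{Field: "app", Op: "=", Value: "nginx"}}},
		{Matchers: []LogFieldMatcher{{Field: "app", Op: "=", Value: "postgres"}}},
		{Matchers: []LogFieldMatcher{{Field: "message", Op: "=~", Value: "timeout"}}},
	}
	// "app" has the most exact-match ("=") matchers, so it becomes the lookup
	// field: the first two rules land in one map-based matcher, while the
	// third rule remains a slow single-rule fallback evaluated after it.
	brm := NewBulkRuleMatcher(rules)
	entry := &Log{Stream: map[string]string{"app": "nginx"}}
	return brm.ToRule(entry) // one map lookup plus one LogMatchesRule check
}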