From 17059e8f83fa0ad4a8a9d67c1da05ebb85fb7419 Mon Sep 17 00:00:00 2001 From: Calum Murray Date: Wed, 1 May 2024 14:53:50 -0400 Subject: [PATCH] feat: improved performance of LIKE matching Signed-off-by: Calum Murray --- sql/v2/expression/like_expression.go | 94 ++++++++++++---------------- 1 file changed, 41 insertions(+), 53 deletions(-) diff --git a/sql/v2/expression/like_expression.go b/sql/v2/expression/like_expression.go index 5f557fa5a..5ddaed52d 100644 --- a/sql/v2/expression/like_expression.go +++ b/sql/v2/expression/like_expression.go @@ -6,9 +6,6 @@ package expression import ( - "regexp" - "strings" - cesql "github.com/cloudevents/sdk-go/sql/v2" "github.com/cloudevents/sdk-go/sql/v2/utils" cloudevents "github.com/cloudevents/sdk-go/v2" @@ -16,7 +13,7 @@ import ( type likeExpression struct { baseUnaryExpression - pattern *regexp.Regexp + pattern string } func (l likeExpression) Evaluate(event cloudevents.Event) (interface{}, error) { @@ -30,70 +27,61 @@ func (l likeExpression) Evaluate(event cloudevents.Event) (interface{}, error) { return nil, err } - return l.pattern.MatchString(val.(string)), nil + return matchString(val.(string), l.pattern), nil + } func NewLikeExpression(child cesql.Expression, pattern string) (cesql.Expression, error) { - // Converting to regex is not the most performant impl, but it works - p, err := convertLikePatternToRegex(pattern) - if err != nil { - return nil, err - } - return likeExpression{ baseUnaryExpression: baseUnaryExpression{ child: child, }, - pattern: p, + pattern: pattern, }, nil } -func convertLikePatternToRegex(pattern string) (*regexp.Regexp, error) { - var chunks []string - chunks = append(chunks, "^") - - var chunk strings.Builder +func matchString(text, pattern string) bool { + textLen := len(text) + patternLen := len(pattern) + textIdx := 0 + patternIdx := 0 + lastWildcardIdx := -1 + lastMatchIdx := 0 - for i := 0; i < len(pattern); i++ { - if pattern[i] == '\\' && i < len(pattern)-1 { - if pattern[i+1] == '%' { - // \% case - chunk.WriteRune('%') - chunks = append(chunks, "\\Q"+chunk.String()+"\\E") - chunk.Reset() - i++ - continue - } else if pattern[i+1] == '_' { - // \_ case - chunk.WriteRune('_') - chunks = append(chunks, "\\Q"+chunk.String()+"\\E") - chunk.Reset() - i++ - continue - } else { - // if there is an actual literal \ character, we need to include that in the string - chunk.WriteRune('\\') - } - } else if pattern[i] == '_' { - // replace with . - chunks = append(chunks, "\\Q"+chunk.String()+"\\E") - chunk.Reset() - chunks = append(chunks, ".") - } else if pattern[i] == '%' { - // replace with .* - chunks = append(chunks, "\\Q"+chunk.String()+"\\E") - chunk.Reset() - chunks = append(chunks, ".*") + for textIdx < textLen { + // handle escaped characters -> pattern needs to increment two places here + if patternIdx < patternLen-1 && pattern[patternIdx] == '\\' && + ((pattern[patternIdx+1] == '_' || pattern[patternIdx+1] == '%') && + pattern[patternIdx+1] == text[textIdx]) { + patternIdx += 2 + textIdx += 1 + // handle non escaped characters + } else if patternIdx < patternLen && (pattern[patternIdx] == '_' || pattern[patternIdx] == text[textIdx]) { + textIdx += 1 + patternIdx += 1 + // handle wildcard characters + } else if patternIdx < patternLen && pattern[patternIdx] == '%' { + lastWildcardIdx = patternIdx + lastMatchIdx = textIdx + patternIdx += 1 + // greedy match didn't work, try again from the last known match + } else if lastWildcardIdx != -1 { + patternIdx = lastWildcardIdx + 1 + lastMatchIdx += 1 + textIdx = lastMatchIdx } else { - chunk.WriteByte(pattern[i]) + return false } } - if chunk.Len() != 0 { - chunks = append(chunks, "\\Q"+chunk.String()+"\\E") - } + // consume remaining pattern characters as long as they are wildcards + for patternIdx < patternLen { + if pattern[patternIdx] != '%' { + return false + } - chunks = append(chunks, "$") + patternIdx += 1 + } - return regexp.Compile(strings.Join(chunks, "")) + return true }