-
Notifications
You must be signed in to change notification settings - Fork 94
Add trie for patterns #3076
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Add trie for patterns #3076
Changes from all commits
2a06295
8cf2221
1b274ba
5ceffe6
8349ac8
d824c32
df5fbd1
a949317
ff856a0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,200 @@ | ||
package dyn | ||
|
||
import "fmt" | ||
|
||
// PatternTrie is a trie data structure for storing and querying patterns. | ||
// It supports both exact matches and wildcard matches. You can insert [Pattern]s | ||
// into the trie and then query it to see if a given [Path] matches any of the | ||
// patterns. | ||
type PatternTrie struct { | ||
root *trieNode | ||
} | ||
|
||
// trieNode represents a node in the pattern trie. | ||
// Each node in the array represents one or more of: | ||
// 1. An [AnyKey] component. This is the "*" wildcard which matches any map key. | ||
// 2. An [AnyIndex] component. This is the "[*]" wildcard which matches any array index. | ||
// 3. Multiple [Key] components. These are multiple static path keys for this this node would match. | ||
// 4. Multiple [Index] components. These are multiple static path indices for this this node would match. | ||
// | ||
// Note: It's valid for both anyKey and pathKey to be set at the same time. | ||
// For example, adding both "foo.*.bar" and "foo.bar" to a trie is valid. | ||
// Similarly, it's valid for both anyIndex and pathIndex to be set at the same time. | ||
// For example, adding both "foo[*].bar" and "foo[0]" to a trie is valid. | ||
// | ||
// Note: Setting both key (one of pathKey or anyKey) and index (one of pathIndex or anyIndex) | ||
// is not supported by the [PatternTrie.SearchPath] method. We don't perform validation for this | ||
// case because it's not expected to arise in practice where a field is either a map or an array, | ||
// but not both. | ||
type trieNode struct { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. don't we need to attach value to nodes? e.g. list of available enum values? |
||
// If set this indicates the trie node is an anyKey node. | ||
shreyas-goenka marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Maps to the [AnyKey] component. | ||
anyKey *trieNode | ||
|
||
// Indicates the trie node is an anyIndex node. | ||
// Maps to the [AnyIndex] component. | ||
anyIndex *trieNode | ||
|
||
// Set of strings which this trie node matches. | ||
// Maps to the [Key] component. | ||
pathKey map[string]*trieNode | ||
|
||
// Set of indices which this trie node matches. | ||
// Maps to the [Index] component. | ||
pathIndex map[int]*trieNode | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I cannot think of any case where we would need to match exact index array. Can be dropped to simplify? |
||
|
||
// Indicates if this node is the end of a pattern. Encountering a node | ||
// with isEnd set to true in a trie means the pattern from the root to this | ||
// node is a complete pattern. | ||
isEnd bool | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are isEnd nodes expected to have any of the other fields set? |
||
} | ||
|
||
// NewPatternTrie creates a new empty pattern trie. | ||
func NewPatternTrie() *PatternTrie { | ||
return &PatternTrie{ | ||
root: &trieNode{}, | ||
} | ||
} | ||
|
||
// Insert adds a pattern to the trie. | ||
func (t *PatternTrie) Insert(pattern Pattern) error { | ||
// Empty pattern represents the root. | ||
if len(pattern) == 0 { | ||
t.root.isEnd = true | ||
return nil | ||
} | ||
|
||
current := t.root | ||
for i, component := range pattern { | ||
// Create next node based on component type | ||
var next *trieNode | ||
switch c := component.(type) { | ||
case anyKeyComponent: | ||
if current.anyKey == nil { | ||
current.anyKey = &trieNode{} | ||
} | ||
next = current.anyKey | ||
|
||
case anyIndexComponent: | ||
if current.anyIndex == nil { | ||
current.anyIndex = &trieNode{} | ||
} | ||
next = current.anyIndex | ||
|
||
case pathComponent: | ||
if key := c.Key(); key != "" { | ||
if current.pathKey == nil { | ||
current.pathKey = make(map[string]*trieNode) | ||
} | ||
if _, exists := current.pathKey[key]; !exists { | ||
current.pathKey[key] = &trieNode{} | ||
} | ||
next = current.pathKey[key] | ||
} else { | ||
idx := c.Index() | ||
if current.pathIndex == nil { | ||
current.pathIndex = make(map[int]*trieNode) | ||
} | ||
if _, exists := current.pathIndex[idx]; !exists { | ||
current.pathIndex[idx] = &trieNode{} | ||
} | ||
next = current.pathIndex[idx] | ||
} | ||
} | ||
|
||
if next == nil { | ||
return fmt.Errorf("invalid component type: %T", component) | ||
} | ||
|
||
// Mark as end of pattern if this is the last component. | ||
if !next.isEnd && i == len(pattern)-1 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
this check is redundant? |
||
next.isEnd = true | ||
} | ||
|
||
// Move to next node | ||
current = next | ||
} | ||
|
||
return nil | ||
} | ||
|
||
// SearchPath checks if the given path matches any pattern in the trie. | ||
// A path matches if it exactly matches a pattern or if it matches a pattern with wildcards. | ||
func (t *PatternTrie) SearchPath(path Path) (Pattern, bool) { | ||
// We statically allocate the prefix array that is used to track the current | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: I'd call this pre-allocate, static allocation means something else https://en.wikipedia.org/wiki/Static_variable |
||
// prefix accumulated while walking the prefix tree. Having the static allocation | ||
// ensures that we do not allocate memory on every recursive call. | ||
prefix := make(Pattern, len(path)) | ||
shreyas-goenka marked this conversation as resolved.
Show resolved
Hide resolved
|
||
pattern, ok := t.searchPathRecursive(t.root, path, prefix, 0) | ||
return pattern, ok | ||
} | ||
|
||
// searchPathRecursive is a helper function that recursively checks if a path matches any pattern. | ||
// Arguments: | ||
// - node: the current node in the trie. | ||
// - path: the path to check. | ||
// - prefix: the prefix accumulated while walking the prefix tree. | ||
// - index: the current index in the path / prefix | ||
// | ||
// Note we always expect the path and prefix to be the same length because wildcards like * and [*] | ||
// only match a single path component. | ||
func (t *PatternTrie) searchPathRecursive(node *trieNode, path Path, prefix Pattern, index int) (Pattern, bool) { | ||
if node == nil { | ||
return nil, false | ||
} | ||
|
||
// Zero case, when the query path is the root node. We return nil here to match | ||
// the semantics of [MustPatternFromString] which returns nil for the empty string. | ||
if len(path) == 0 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. path is always the same in all recursive calls, right? (This code somewhat implies that path is shrinking with depth). We should not need this condition - we already check below that index == len(path). |
||
return nil, node.isEnd | ||
} | ||
|
||
// If we've reached the end of the path, check if this node is a valid end of a pattern. | ||
isLast := index == len(path) | ||
if isLast { | ||
return prefix, node.isEnd | ||
} | ||
|
||
currentComponent := path[index] | ||
shreyas-goenka marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
// First check if the key wildcard is set for the current index. | ||
if currentComponent.isKey() && node.anyKey != nil { | ||
prefix[index] = AnyKey() | ||
pattern, ok := t.searchPathRecursive(node.anyKey, path, prefix, index+1) | ||
if ok { | ||
return pattern, true | ||
} | ||
} | ||
|
||
// If no key wildcard is set, check if the key is an exact match. | ||
if currentComponent.isKey() { | ||
child, exists := node.pathKey[currentComponent.Key()] | ||
if !exists { | ||
return prefix, false | ||
} | ||
prefix[index] = currentComponent | ||
return t.searchPathRecursive(child, path, prefix, index+1) | ||
} | ||
|
||
// First check if the index wildcard is set for the current index. | ||
if currentComponent.isIndex() && node.anyIndex != nil { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. q - currentComponent.IsIndex() must be true at this point, right? |
||
prefix[index] = AnyIndex() | ||
pattern, ok := t.searchPathRecursive(node.anyIndex, path, prefix, index+1) | ||
if ok { | ||
return pattern, true | ||
} | ||
} | ||
|
||
// If no index wildcard is set, check if the index is an exact match. | ||
if currentComponent.isIndex() { | ||
child, exists := node.pathIndex[currentComponent.Index()] | ||
if !exists { | ||
return prefix, false | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are we supposed to return partial pattern? Is there a use case for that? |
||
} | ||
prefix[index] = currentComponent | ||
return t.searchPathRecursive(child, path, prefix, index+1) | ||
} | ||
|
||
// If we've reached this point, the path does not match any patterns in the trie. | ||
return prefix, false | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,191 @@ | ||
package dyn_test | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/databricks/cli/libs/dyn" | ||
assert "github.com/databricks/cli/libs/dyn/dynassert" | ||
) | ||
|
||
func TestPatternTrie_SearchPath(t *testing.T) { | ||
tests := []struct { | ||
name string | ||
pattern string | ||
expected []string | ||
notExpected []string | ||
}{ | ||
{ | ||
name: "empty pattern", | ||
pattern: "", | ||
expected: []string{""}, | ||
notExpected: []string{"foo"}, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: expected / notExpected make it sound like we return all matching patterns, whereas we just check if they match. mustMatch / mustNotMatch would better explain that. |
||
}, | ||
{ | ||
name: "simple key pattern", | ||
pattern: "foo", | ||
expected: []string{"foo"}, | ||
notExpected: []string{"foo.bar", "foo[0]", "bar"}, | ||
}, | ||
{ | ||
name: "simple index pattern", | ||
pattern: "[0]", | ||
expected: []string{"[0]"}, | ||
notExpected: []string{"foo[0]", "foo.bar", "bar"}, | ||
}, | ||
{ | ||
name: "nested key pattern", | ||
pattern: "foo.bar", | ||
expected: []string{"foo.bar"}, | ||
notExpected: []string{"foo", "foo[0]", "bar.foo", "foo.baz"}, | ||
}, | ||
{ | ||
name: "root wildcard", | ||
pattern: "*", | ||
expected: []string{"foo", "bar"}, | ||
notExpected: []string{"", "bar.foo", "foo.baz"}, | ||
}, | ||
{ | ||
name: "wildcard * after foo", | ||
pattern: "foo.*", | ||
expected: []string{"foo.bar", "foo.baz"}, | ||
notExpected: []string{"foo", "bar", "foo.bar.baz"}, | ||
}, | ||
{ | ||
name: "wildcard [*] after foo", | ||
pattern: "foo[*]", | ||
expected: []string{"foo[0]", "foo[1]", "foo[2025]"}, | ||
notExpected: []string{"foo", "bar", "foo[0].bar"}, | ||
}, | ||
{ | ||
name: "key after * wildcard", | ||
pattern: "foo.*.bar", | ||
expected: []string{"foo.abc.bar", "foo.def.bar"}, | ||
notExpected: []string{"foo", "bar", "foo.bar.baz"}, | ||
}, | ||
{ | ||
name: "key after [*] wildcard", | ||
pattern: "foo[*].bar", | ||
expected: []string{"foo[0].bar", "foo[1].bar", "foo[2025].bar"}, | ||
notExpected: []string{"foo", "bar", "foo[0].baz"}, | ||
}, | ||
{ | ||
name: "multiple * wildcards", | ||
pattern: "*.*.*", | ||
expected: []string{"foo.bar.baz", "foo.bar.qux"}, | ||
notExpected: []string{"foo", "bar", "foo.bar", "foo.bar.baz.qux"}, | ||
}, | ||
{ | ||
name: "multiple [*] wildcards", | ||
pattern: "foo[*][*]", | ||
expected: []string{"foo[0][0]", "foo[1][1]", "foo[2025][2025]"}, | ||
notExpected: []string{"foo", "bar", "foo[0][0][0]"}, | ||
}, | ||
{ | ||
name: "[*] after * wildcard", | ||
pattern: "*[*]", | ||
expected: []string{"foo[0]", "foo[1]", "foo[2025]"}, | ||
notExpected: []string{"foo", "bar", "foo[0].bar", "[0].foo"}, | ||
}, | ||
} | ||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
trie := dyn.NewPatternTrie() | ||
pattern := dyn.MustPatternFromString(tt.pattern) | ||
|
||
// None of the expected paths should match yet. | ||
for _, path := range tt.expected { | ||
_, ok := trie.SearchPath(dyn.MustPathFromString(path)) | ||
assert.False(t, ok) | ||
} | ||
for _, path := range tt.notExpected { | ||
_, ok := trie.SearchPath(dyn.MustPathFromString(path)) | ||
assert.False(t, ok) | ||
} | ||
|
||
err := trie.Insert(pattern) | ||
assert.NoError(t, err) | ||
|
||
// Now all the expected paths should match. | ||
for _, path := range tt.expected { | ||
pattern, ok := trie.SearchPath(dyn.MustPathFromString(path)) | ||
assert.True(t, ok) | ||
assert.Equal(t, dyn.MustPatternFromString(tt.pattern), pattern) | ||
} | ||
for _, path := range tt.notExpected { | ||
_, ok := trie.SearchPath(dyn.MustPathFromString(path)) | ||
assert.False(t, ok) | ||
} | ||
}) | ||
} | ||
} | ||
|
||
func TestPatternTrie_MultiplePatterns(t *testing.T) { | ||
trie := dyn.NewPatternTrie() | ||
|
||
patterns := []string{ | ||
"foo.bar", | ||
"foo.*.baz", | ||
"abc[0]", | ||
"def[*]", | ||
} | ||
|
||
expected := map[string]string{ | ||
"foo.bar": "foo.bar", | ||
"foo.abc.baz": "foo.*.baz", | ||
"foo.def.baz": "foo.*.baz", | ||
"abc[0]": "abc[0]", | ||
"def[0]": "def[*]", | ||
"def[1]": "def[*]", | ||
} | ||
|
||
notExpected := []string{ | ||
"foo", | ||
"abc[1]", | ||
"def[2].x", | ||
"foo.y", | ||
"foo.bar.baz.qux", | ||
} | ||
|
||
for _, pattern := range patterns { | ||
err := trie.Insert(dyn.MustPatternFromString(pattern)) | ||
assert.NoError(t, err) | ||
} | ||
|
||
for path, expectedPattern := range expected { | ||
pattern, ok := trie.SearchPath(dyn.MustPathFromString(path)) | ||
assert.True(t, ok) | ||
assert.Equal(t, dyn.MustPatternFromString(expectedPattern), pattern) | ||
} | ||
|
||
for _, path := range notExpected { | ||
_, ok := trie.SearchPath(dyn.MustPathFromString(path)) | ||
assert.False(t, ok) | ||
} | ||
} | ||
|
||
func TestPatternTrie_OverlappingPatterns(t *testing.T) { | ||
trie := dyn.NewPatternTrie() | ||
|
||
// Insert overlapping patterns | ||
patterns := []string{ | ||
"foo.bar", | ||
"foo.*", | ||
"*.bar", | ||
"*.*", | ||
} | ||
|
||
for _, pattern := range patterns { | ||
err := trie.Insert(dyn.MustPatternFromString(pattern)) | ||
assert.NoError(t, err) | ||
} | ||
|
||
for _, path := range []string{ | ||
"foo.bar", | ||
"foo.baz", | ||
"baz.bar", | ||
"baz.qux", | ||
} { | ||
_, ok := trie.SearchPath(dyn.MustPathFromString(path)) | ||
assert.True(t, ok) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
q - why have this wrapper struct, why not work on *trieNode directly?