Skip to content

Add trie for patterns #3076

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
200 changes: 200 additions & 0 deletions libs/dyn/pattern_trie.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
package dyn

import "fmt"

// PatternTrie is a trie data structure for storing and querying patterns.
// It supports both exact matches and wildcard matches. You can insert [Pattern]s
// into the trie and then query it to see if a given [Path] matches any of the
// patterns.
type PatternTrie struct {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

q - why have this wrapper struct, why not work on *trieNode directly?

root *trieNode
}

// trieNode represents a node in the pattern trie.
// Each node in the array represents one or more of:
// 1. An [AnyKey] component. This is the "*" wildcard which matches any map key.
// 2. An [AnyIndex] component. This is the "[*]" wildcard which matches any array index.
// 3. Multiple [Key] components. These are multiple static path keys for this this node would match.
// 4. Multiple [Index] components. These are multiple static path indices for this this node would match.
//
// Note: It's valid for both anyKey and pathKey to be set at the same time.
// For example, adding both "foo.*.bar" and "foo.bar" to a trie is valid.
// Similarly, it's valid for both anyIndex and pathIndex to be set at the same time.
// For example, adding both "foo[*].bar" and "foo[0]" to a trie is valid.
//
// Note: Setting both key (one of pathKey or anyKey) and index (one of pathIndex or anyIndex)
// is not supported by the [PatternTrie.SearchPath] method. We don't perform validation for this
// case because it's not expected to arise in practice where a field is either a map or an array,
// but not both.
type trieNode struct {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

don't we need to attach value to nodes? e.g. list of available enum values?

// If set this indicates the trie node is an anyKey node.
// Maps to the [AnyKey] component.
anyKey *trieNode

// Indicates the trie node is an anyIndex node.
// Maps to the [AnyIndex] component.
anyIndex *trieNode

// Set of strings which this trie node matches.
// Maps to the [Key] component.
pathKey map[string]*trieNode

// Set of indices which this trie node matches.
// Maps to the [Index] component.
pathIndex map[int]*trieNode
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I cannot think of any case where we would need to match exact index array. Can be dropped to simplify?


// Indicates if this node is the end of a pattern. Encountering a node
// with isEnd set to true in a trie means the pattern from the root to this
// node is a complete pattern.
isEnd bool
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are isEnd nodes expected to have any of the other fields set?

}

// NewPatternTrie creates a new empty pattern trie.
func NewPatternTrie() *PatternTrie {
return &PatternTrie{
root: &trieNode{},
}
}

// Insert adds a pattern to the trie.
func (t *PatternTrie) Insert(pattern Pattern) error {
// Empty pattern represents the root.
if len(pattern) == 0 {
t.root.isEnd = true
return nil
}

current := t.root
for i, component := range pattern {
// Create next node based on component type
var next *trieNode
switch c := component.(type) {
case anyKeyComponent:
if current.anyKey == nil {
current.anyKey = &trieNode{}
}
next = current.anyKey

case anyIndexComponent:
if current.anyIndex == nil {
current.anyIndex = &trieNode{}
}
next = current.anyIndex

case pathComponent:
if key := c.Key(); key != "" {
if current.pathKey == nil {
current.pathKey = make(map[string]*trieNode)
}
if _, exists := current.pathKey[key]; !exists {
current.pathKey[key] = &trieNode{}
}
next = current.pathKey[key]
} else {
idx := c.Index()
if current.pathIndex == nil {
current.pathIndex = make(map[int]*trieNode)
}
if _, exists := current.pathIndex[idx]; !exists {
current.pathIndex[idx] = &trieNode{}
}
next = current.pathIndex[idx]
}
}

if next == nil {
return fmt.Errorf("invalid component type: %T", component)
}

// Mark as end of pattern if this is the last component.
if !next.isEnd && i == len(pattern)-1 {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

!next.isEnd

this check is redundant?

next.isEnd = true
}

// Move to next node
current = next
}

return nil
}

// SearchPath checks if the given path matches any pattern in the trie.
// A path matches if it exactly matches a pattern or if it matches a pattern with wildcards.
func (t *PatternTrie) SearchPath(path Path) (Pattern, bool) {
// We statically allocate the prefix array that is used to track the current
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: I'd call this pre-allocate, static allocation means something else https://en.wikipedia.org/wiki/Static_variable

// prefix accumulated while walking the prefix tree. Having the static allocation
// ensures that we do not allocate memory on every recursive call.
prefix := make(Pattern, len(path))
pattern, ok := t.searchPathRecursive(t.root, path, prefix, 0)
return pattern, ok
}

// searchPathRecursive is a helper function that recursively checks if a path matches any pattern.
// Arguments:
// - node: the current node in the trie.
// - path: the path to check.
// - prefix: the prefix accumulated while walking the prefix tree.
// - index: the current index in the path / prefix
//
// Note we always expect the path and prefix to be the same length because wildcards like * and [*]
// only match a single path component.
func (t *PatternTrie) searchPathRecursive(node *trieNode, path Path, prefix Pattern, index int) (Pattern, bool) {
if node == nil {
return nil, false
}

// Zero case, when the query path is the root node. We return nil here to match
// the semantics of [MustPatternFromString] which returns nil for the empty string.
if len(path) == 0 {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

path is always the same in all recursive calls, right? (This code somewhat implies that path is shrinking with depth).

We should not need this condition - we already check below that index == len(path).

return nil, node.isEnd
}

// If we've reached the end of the path, check if this node is a valid end of a pattern.
isLast := index == len(path)
if isLast {
return prefix, node.isEnd
}

currentComponent := path[index]

// First check if the key wildcard is set for the current index.
if currentComponent.isKey() && node.anyKey != nil {
prefix[index] = AnyKey()
pattern, ok := t.searchPathRecursive(node.anyKey, path, prefix, index+1)
if ok {
return pattern, true
}
}

// If no key wildcard is set, check if the key is an exact match.
if currentComponent.isKey() {
child, exists := node.pathKey[currentComponent.Key()]
if !exists {
return prefix, false
}
prefix[index] = currentComponent
return t.searchPathRecursive(child, path, prefix, index+1)
}

// First check if the index wildcard is set for the current index.
if currentComponent.isIndex() && node.anyIndex != nil {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

q - currentComponent.IsIndex() must be true at this point, right?

prefix[index] = AnyIndex()
pattern, ok := t.searchPathRecursive(node.anyIndex, path, prefix, index+1)
if ok {
return pattern, true
}
}

// If no index wildcard is set, check if the index is an exact match.
if currentComponent.isIndex() {
child, exists := node.pathIndex[currentComponent.Index()]
if !exists {
return prefix, false
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are we supposed to return partial pattern? Is there a use case for that?

}
prefix[index] = currentComponent
return t.searchPathRecursive(child, path, prefix, index+1)
}

// If we've reached this point, the path does not match any patterns in the trie.
return prefix, false
}
191 changes: 191 additions & 0 deletions libs/dyn/pattern_trie_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
package dyn_test

import (
"testing"

"github.com/databricks/cli/libs/dyn"
assert "github.com/databricks/cli/libs/dyn/dynassert"
)

func TestPatternTrie_SearchPath(t *testing.T) {
tests := []struct {
name string
pattern string
expected []string
notExpected []string
}{
{
name: "empty pattern",
pattern: "",
expected: []string{""},
notExpected: []string{"foo"},
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: expected / notExpected make it sound like we return all matching patterns, whereas we just check if they match. mustMatch / mustNotMatch would better explain that.

},
{
name: "simple key pattern",
pattern: "foo",
expected: []string{"foo"},
notExpected: []string{"foo.bar", "foo[0]", "bar"},
},
{
name: "simple index pattern",
pattern: "[0]",
expected: []string{"[0]"},
notExpected: []string{"foo[0]", "foo.bar", "bar"},
},
{
name: "nested key pattern",
pattern: "foo.bar",
expected: []string{"foo.bar"},
notExpected: []string{"foo", "foo[0]", "bar.foo", "foo.baz"},
},
{
name: "root wildcard",
pattern: "*",
expected: []string{"foo", "bar"},
notExpected: []string{"", "bar.foo", "foo.baz"},
},
{
name: "wildcard * after foo",
pattern: "foo.*",
expected: []string{"foo.bar", "foo.baz"},
notExpected: []string{"foo", "bar", "foo.bar.baz"},
},
{
name: "wildcard [*] after foo",
pattern: "foo[*]",
expected: []string{"foo[0]", "foo[1]", "foo[2025]"},
notExpected: []string{"foo", "bar", "foo[0].bar"},
},
{
name: "key after * wildcard",
pattern: "foo.*.bar",
expected: []string{"foo.abc.bar", "foo.def.bar"},
notExpected: []string{"foo", "bar", "foo.bar.baz"},
},
{
name: "key after [*] wildcard",
pattern: "foo[*].bar",
expected: []string{"foo[0].bar", "foo[1].bar", "foo[2025].bar"},
notExpected: []string{"foo", "bar", "foo[0].baz"},
},
{
name: "multiple * wildcards",
pattern: "*.*.*",
expected: []string{"foo.bar.baz", "foo.bar.qux"},
notExpected: []string{"foo", "bar", "foo.bar", "foo.bar.baz.qux"},
},
{
name: "multiple [*] wildcards",
pattern: "foo[*][*]",
expected: []string{"foo[0][0]", "foo[1][1]", "foo[2025][2025]"},
notExpected: []string{"foo", "bar", "foo[0][0][0]"},
},
{
name: "[*] after * wildcard",
pattern: "*[*]",
expected: []string{"foo[0]", "foo[1]", "foo[2025]"},
notExpected: []string{"foo", "bar", "foo[0].bar", "[0].foo"},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
trie := dyn.NewPatternTrie()
pattern := dyn.MustPatternFromString(tt.pattern)

// None of the expected paths should match yet.
for _, path := range tt.expected {
_, ok := trie.SearchPath(dyn.MustPathFromString(path))
assert.False(t, ok)
}
for _, path := range tt.notExpected {
_, ok := trie.SearchPath(dyn.MustPathFromString(path))
assert.False(t, ok)
}

err := trie.Insert(pattern)
assert.NoError(t, err)

// Now all the expected paths should match.
for _, path := range tt.expected {
pattern, ok := trie.SearchPath(dyn.MustPathFromString(path))
assert.True(t, ok)
assert.Equal(t, dyn.MustPatternFromString(tt.pattern), pattern)
}
for _, path := range tt.notExpected {
_, ok := trie.SearchPath(dyn.MustPathFromString(path))
assert.False(t, ok)
}
})
}
}

func TestPatternTrie_MultiplePatterns(t *testing.T) {
trie := dyn.NewPatternTrie()

patterns := []string{
"foo.bar",
"foo.*.baz",
"abc[0]",
"def[*]",
}

expected := map[string]string{
"foo.bar": "foo.bar",
"foo.abc.baz": "foo.*.baz",
"foo.def.baz": "foo.*.baz",
"abc[0]": "abc[0]",
"def[0]": "def[*]",
"def[1]": "def[*]",
}

notExpected := []string{
"foo",
"abc[1]",
"def[2].x",
"foo.y",
"foo.bar.baz.qux",
}

for _, pattern := range patterns {
err := trie.Insert(dyn.MustPatternFromString(pattern))
assert.NoError(t, err)
}

for path, expectedPattern := range expected {
pattern, ok := trie.SearchPath(dyn.MustPathFromString(path))
assert.True(t, ok)
assert.Equal(t, dyn.MustPatternFromString(expectedPattern), pattern)
}

for _, path := range notExpected {
_, ok := trie.SearchPath(dyn.MustPathFromString(path))
assert.False(t, ok)
}
}

func TestPatternTrie_OverlappingPatterns(t *testing.T) {
trie := dyn.NewPatternTrie()

// Insert overlapping patterns
patterns := []string{
"foo.bar",
"foo.*",
"*.bar",
"*.*",
}

for _, pattern := range patterns {
err := trie.Insert(dyn.MustPatternFromString(pattern))
assert.NoError(t, err)
}

for _, path := range []string{
"foo.bar",
"foo.baz",
"baz.bar",
"baz.qux",
} {
_, ok := trie.SearchPath(dyn.MustPathFromString(path))
assert.True(t, ok)
}
}
Loading