
Commit fe04b93

Updates scanner to support Lua extension
Fixed up the Scanner logic to mirror the changes made to support the Lua extension in Lex. Added a compat layer so that the existing Lua type can be used with `Scanner` rather than refactoring the implementation to remove the channel; I think that refactor would yield further gains.

Benchmarks:

```
❯ go test -benchmem -run=^$ -bench "^(BenchmarkLex|BenchmarkLexWithLua|BenchmarkScanner|BenchmarkScannerWithLua)$" github.com/nginxinc/nginx-go-crossplane -count=1
goos: darwin
goarch: arm64
pkg: github.com/nginxinc/nginx-go-crossplane
BenchmarkLex/simple-10                          57963    17756 ns/op   103049 B/op    39 allocs/op
BenchmarkLex/with-comments-10                   60025    20067 ns/op   103112 B/op    45 allocs/op
BenchmarkLex/messy-10                           26170    47822 ns/op   104400 B/op   168 allocs/op
BenchmarkLex/quote-behavior-10                  74510    17693 ns/op   102961 B/op    26 allocs/op
BenchmarkLex/quoted-right-brace-10              43134    27752 ns/op   103560 B/op    54 allocs/op
BenchmarkLex/comments-between-args-10           78271    14866 ns/op   102937 B/op    27 allocs/op
BenchmarkLexWithLua/lua-basic-10                46273    26012 ns/op   105499 B/op    53 allocs/op
BenchmarkLexWithLua/lua-block-simple-10         22514    54149 ns/op   108556 B/op   143 allocs/op
BenchmarkLexWithLua/lua-block-larger-10         25983    46605 ns/op   108403 B/op    59 allocs/op
BenchmarkLexWithLua/lua-block-tricky-10         33756    35067 ns/op   106684 B/op    66 allocs/op
BenchmarkScanner/simple-10                     163138     7084 ns/op     4648 B/op    36 allocs/op
BenchmarkScanner/with-comments-10              144558     8100 ns/op     4712 B/op    42 allocs/op
BenchmarkScanner/messy-10                       47570    25026 ns/op     6000 B/op   165 allocs/op
BenchmarkScanner/quote-behavior-10             222280     5083 ns/op     4560 B/op    23 allocs/op
BenchmarkScanner/quoted-right-brace-10          82656    14281 ns/op     5160 B/op    51 allocs/op
BenchmarkScanner/comments-between-args-10      225475     4872 ns/op     4536 B/op    24 allocs/op
BenchmarkScannerWithLua/lua-basic-10            93081    12833 ns/op     7866 B/op    66 allocs/op
BenchmarkScannerWithLua/lua-block-simple-10     31426    37989 ns/op    10924 B/op   156 allocs/op
BenchmarkScannerWithLua/lua-block-larger-10     37148    30723 ns/op    10770 B/op    72 allocs/op
BenchmarkScannerWithLua/lua-block-tricky-10     54890    22383 ns/op     9050 B/op    79 allocs/op
PASS
ok      github.com/nginxinc/nginx-go-crossplane    29.969s
```
1 parent cc657b1 commit fe04b93
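
For orientation, here is a minimal sketch of how the same registration value is meant to drive the new `Scanner` path. Only `NewScanner`, `Scan`, `RegisterLexer`, and the io.EOF end-of-input behavior come from this change; the helper name `scanAll`, the `example` package, and the idea of passing the extension in as a parameter are illustrative assumptions.

```go
package example

import (
	"errors"
	"fmt"
	"io"
	"strings"

	crossplane "github.com/nginxinc/nginx-go-crossplane"
)

// scanAll drains the new Scanner entry point. ext can be any RegisterLexer,
// for example the value the Lua extension hands back, because RegisterLexer
// now also satisfies ScannerOption via the added applyScannerOptions method.
func scanAll(conf string, ext crossplane.RegisterLexer) error {
	s := crossplane.NewScanner(strings.NewReader(conf), ext)
	for {
		tok, err := s.Scan()
		if err != nil {
			if errors.Is(err, io.EOF) {
				return nil // end of input
			}
			return err
		}
		fmt.Println(tok) // Token.String() is added in scanner.go below
	}
}
```

The same ext value still works with the channel-based entry point through `LexOptions.Lexers`, which is what keeps the two code paths in sync.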

File tree

4 files changed: +258 additions, -64 deletions


lex.go

Lines changed: 57 additions & 3 deletions
```diff
@@ -65,6 +65,7 @@ type LexOptions struct {
 // RegisterLexer is an option that cna be used to add a lexer to tokenize external NGINX tokens.
 type RegisterLexer interface {
 	applyLexOptions(options *LexOptions)
+	applyScannerOptions(options *scannerOptions)
 }
 
 type registerLexer struct {
@@ -82,6 +83,16 @@ func (rl registerLexer) applyLexOptions(o *LexOptions) {
 	}
 }
 
+func (rl registerLexer) applyScannerOptions(o *scannerOptions) {
+	if o.extensions == nil {
+		o.extensions = make(map[string]ScannerExt)
+	}
+
+	for _, s := range rl.stringTokens {
+		o.extensions[s] = &LexerScanner{lexer: rl.l}
+	}
+}
+
 // LexWithLexer registers a Lexer that implements tokenization of an NGINX configuration after one of the given
 // stringTokens is encountered by Lex.
 func LexWithLexer(l Lexer, stringTokens ...string) RegisterLexer { //nolint:ireturn
@@ -106,12 +117,38 @@ func Lex(reader io.Reader) chan NgxToken {
 // SubScanner provides an interface for scanning alternative grammars within NGINX configuration data.
 type SubScanner struct {
 	scanner   *bufio.Scanner
+	parent    *Scanner
 	tokenLine int
 }
 
 // Scan advances the scanner to the next token which will be available though the Text method. It returns false
 // when the scan stops by reaching the end of input.
 func (e *SubScanner) Scan() bool {
+	if e.scanner != nil {
+		return e.lexScan()
+	}
+
+	if e.parent.err != nil {
+		return false
+	}
+
+	if !e.parent.scanner.Scan() {
+		if err := e.parent.scanner.Err(); err != nil {
+			e.parent.setErr(err)
+		}
+		return false
+	}
+
+	// e.parent.prev = e.parent.scanner.Text()
+	// if isEOL(e.parent.prev) {
+	if t := e.parent.scanner.Text(); isEOL(t) {
+		e.parent.lineno++
+	}
+
+	return true
+}
+
+func (e *SubScanner) lexScan() bool {
 	if !e.scanner.Scan() {
 		return false
 	}
@@ -122,13 +159,30 @@ func (e *SubScanner) Scan() bool {
 }
 
 // Err returns the fist non-EOF error encountered by the Scanner.
-func (e *SubScanner) Err() error { return e.scanner.Err() }
+func (e *SubScanner) Err() error {
+	if e.scanner != nil {
+		return e.scanner.Err()
+	}
+	return e.parent.Err()
+}
 
 // Text returns the most recent token generated by a call to Scan.
-func (e *SubScanner) Text() string { return e.scanner.Text() }
+func (e *SubScanner) Text() string {
+	if e.scanner != nil {
+		return e.scanner.Text()
+	}
+	// return e.parent.prev
+	return e.parent.scanner.Text()
+}
 
 // Line returns the line number of the most recent token generated by a call to Scan.
-func (e *SubScanner) Line() int { return e.tokenLine }
+func (e *SubScanner) Line() int {
+	if e.scanner != nil {
+		return e.tokenLine
+	}
+
+	return e.parent.lineno
+}
 
 //nolint:gocyclo,funlen,gocognit,maintidx
 func tokenize(reader io.Reader, tokenCh chan NgxToken, options LexOptions) {
```
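
Since extension lexers only see the SubScanner methods above (Scan, Text, Line, Err), the same Lexer implementation now runs unchanged whether it is driven by Lex's bufio.Scanner or by a parent Scanner. Below is a hedged sketch of such a Lexer; it assumes the Lexer interface is `Lex(*SubScanner, string) <-chan NgxToken`, as its use in scanner.go further down suggests, and the `echoLexer` type and `my_directive` token are invented for illustration.

```go
package example

import crossplane "github.com/nginxinc/nginx-go-crossplane"

// echoLexer is a toy Lexer that re-emits every token it reads from the
// SubScanner until it sees the closing brace of the matched block. It never
// touches bufio or the parent Scanner directly, so the dual-mode SubScanner
// is transparent to it.
type echoLexer struct{}

func (echoLexer) Lex(s *crossplane.SubScanner, matchedToken string) <-chan crossplane.NgxToken {
	ch := make(chan crossplane.NgxToken)
	go func() {
		defer close(ch)
		for s.Scan() {
			text := s.Text()
			ch <- crossplane.NgxToken{Value: text, Line: s.Line()}
			if text == "}" {
				return // hand control back to the main tokenizer
			}
		}
	}()
	return ch
}
```

Registration would then be the same for both entry points, for example `opt := crossplane.LexWithLexer(echoLexer{}, "my_directive")`, passed either via `LexOptions.Lexers` or directly to `NewScanner`.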

lex_test.go

Lines changed: 38 additions & 19 deletions
```diff
@@ -460,34 +460,53 @@ func TestLex(t *testing.T) {
 	}
 }
 
-var lexToken NgxToken //nolint: gochecknoglobals // trying to avoid return value being optimzed away
-
-func BenchmarkLex(b *testing.B) {
+func benchmarkLex(b *testing.B, path string, options LexOptions) {
 	var t NgxToken
 
+	file, err := os.Open(path)
+	if err != nil {
+		b.Fatal(err)
+	}
+	defer file.Close()
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		if _, err := file.Seek(0, 0); err != nil {
+			b.Fatal(err)
+		}
+
+		for tok := range LexWithOptions(file, options) {
+			t = tok
+		}
+	}
+
+	_ = t
+}
+
+func BenchmarkLex(b *testing.B) {
 	for _, bm := range lexFixtures {
+		if strings.HasPrefix(bm.name, "lua") {
+			continue
+		}
+
 		b.Run(bm.name, func(b *testing.B) {
 			path := getTestConfigPath(bm.name, "nginx.conf")
-			file, err := os.Open(path)
-			if err != nil {
-				b.Fatal(err)
-			}
-			defer file.Close()
-			b.ResetTimer()
+			benchmarkLex(b, path, LexOptions{})
+		})
+	}
+}
 
-			for i := 0; i < b.N; i++ {
-				if _, err := file.Seek(0, 0); err != nil {
-					b.Fatal(err)
-				}
+func BenchmarkLexWithLua(b *testing.B) {
+	for _, bm := range lexFixtures {
+		if !strings.HasPrefix(bm.name, "lua") {
+			continue
+		}
 
-				for tok := range Lex(file) {
-					t = tok
-				}
-			}
+		b.Run(bm.name, func(b *testing.B) {
+			path := getTestConfigPath(bm.name, "nginx.conf")
+			benchmarkLex(b, path, LexOptions{Lexers: []RegisterLexer{lua.RegisterLexer()}})
 		})
 	}
-
-	lexToken = t
 }
 
 //nolint:gochecknoglobals
```

scanner.go

Lines changed: 111 additions & 14 deletions
```diff
@@ -8,6 +8,14 @@ import (
 	"strings"
 )
 
+type scannerOptions struct {
+	extensions map[string]ScannerExt
+}
+
+type ScannerOption interface {
+	applyScannerOptions(options *scannerOptions)
+}
+
 // Token is a lexical token of the NGINX configuration syntax.
 type Token struct {
 	// Text is the string corresponding to the token. It could be a directive or symbol. The value is the actual token
@@ -20,6 +28,8 @@ type Token struct {
 	IsQuoted bool
 }
 
+func (t Token) String() string { return fmt.Sprintf("{%d, %s, %t}", t.Line, t.Text, t.IsQuoted) }
+
 type scannerError struct {
 	msg  string
 	line int
@@ -52,23 +62,33 @@ func LineNumber(err error) (int, bool) {
 //
 // Use NewScanner to construct a Scanner.
 type Scanner struct {
-	scanner            *bufio.Scanner
-	lineno             int
-	tokenStartLine     int
-	tokenDepth         int
-	repeateSpecialChar bool // only '}' can be repeated
-	prev               string
-	err                error
+	scanner              *bufio.Scanner
+	lineno               int
+	tokenStartLine       int
+	tokenDepth           int
+	repeateSpecialChar   bool // only '}' can be repeated
+	nextTokenIsDirective bool
+	prev                 string
+	err                  error
+	options              *scannerOptions
+	ext                  Tokenizer
 }
 
 // NewScanner returns a new Scanner to read from r.
-func NewScanner(r io.Reader) *Scanner {
+func NewScanner(r io.Reader, options ...ScannerOption) *Scanner {
+	opts := &scannerOptions{}
+	for _, opt := range options {
+		opt.applyScannerOptions(opts)
+	}
+
 	s := &Scanner{
-		scanner:            bufio.NewScanner(r),
-		lineno:             1,
-		tokenStartLine:     1,
-		tokenDepth:         0,
-		repeateSpecialChar: false,
+		scanner:              bufio.NewScanner(r),
+		lineno:               1,
+		tokenStartLine:       1,
+		tokenDepth:           0,
+		repeateSpecialChar:   false,
+		nextTokenIsDirective: true,
+		options:              opts,
 	}
 
 	s.scanner.Split(bufio.ScanRunes)
@@ -92,7 +112,21 @@ func (s *Scanner) setErr(err error) {
 
 // Scan reads the next token from source and returns it.. It returns io.EOF at the end of the source. Scanner errors are
 // returned when encountered.
-func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
+func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo, maintidx // sorry
+	if s.ext != nil {
+		t, err := s.ext.Next()
+		if err != nil {
+			if !errors.Is(err, ErrTokenizerDone) {
+				s.setErr(err)
+				return Token{}, s.err
+			}
+
+			s.ext = nil
+		} else {
+			return t, nil
+		}
+	}
+
 	var tok strings.Builder
 
 	lexState := skipSpace
@@ -129,6 +163,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
 			r = nextRune
 			if isEOL(r) {
 				s.lineno++
+				s.nextTokenIsDirective = true
 			}
 		default:
 			readNext = true
@@ -149,6 +184,16 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
 			r = "\\" + r
 		}
 
+		if tok.Len() > 0 {
+			t := tok.String()
+			if s.nextTokenIsDirective {
+				if ext, ok := s.options.extensions[t]; ok {
+					s.ext = ext.Tokenizer(&SubScanner{parent: s, tokenLine: s.tokenStartLine}, t)
+					return Token{Text: t, Line: s.tokenStartLine}, nil
+				}
+			}
+		}
+
 		switch lexState {
 		case skipSpace:
 			if !isSpace(r) {
@@ -166,11 +211,13 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
 				tok.WriteString(r)
 				lexState = inComment
 				s.tokenStartLine = s.lineno
+				s.nextTokenIsDirective = false
 				continue
 			}
 		}
 
 		if isSpace(r) {
+			s.nextTokenIsDirective = false
 			return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
 		}
 
@@ -179,6 +226,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
 			tok.WriteString(r)
 			lexState = inVar
 			s.repeateSpecialChar = false
+			s.nextTokenIsDirective = false
 			continue
 		}
 
@@ -223,6 +271,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
 			}
 
 			tok.WriteString(r)
+			s.nextTokenIsDirective = true
 			return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
 		}
 
@@ -250,3 +299,51 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
 		}
 	}
 }
+
+// ScannerExt is the interface that describes an extension for the [Scanner]. Scanner extensions enable scanning of
+// configurations that contain syntaxes that do not follow the usual grammar.
+type ScannerExt interface {
+	Tokenizer(s *SubScanner, matchedToken string) Tokenizer
+}
+
+// ErrTokenizerDone is returned by [Tokenizer] when tokenization is complete.
+var ErrTokenizerDone = errors.New("done")
+
+// Tokenizer is the interface that wraps the Next method.
+//
+// Next returns the next token scanned from the NGINX configuration or an error if the configuration cannot be
+// tokenized. Return the special error, [ErrTokenizerDone] when finished tokenizing.
+type Tokenizer interface {
+	Next() (Token, error)
+}
+
+// LexerScanner is a compatibility layer between Lexers and Scanner.
+type LexerScanner struct {
+	lexer        Lexer
+	scanner      *SubScanner
+	matchedToken string
+	ch           <-chan NgxToken
+}
+
+func (s *LexerScanner) Tokenizer(scanner *SubScanner, matchedtoken string) Tokenizer {
+	s.scanner = scanner
+	s.matchedToken = matchedtoken
+	return s
+}
+
+func (s *LexerScanner) Next() (Token, error) {
+	if s.ch == nil {
+		s.ch = s.lexer.Lex(s.scanner, s.matchedToken)
+	}
+
+	ngxTok, ok := <-s.ch
+	if !ok {
+		return Token{}, ErrTokenizerDone
+	}
+
+	if ngxTok.Error != nil {
+		return Token{}, newScannerErrf(ngxTok.Line, ngxTok.Error.Error())
+	}
+
+	return Token{Text: ngxTok.Value, Line: ngxTok.Line, IsQuoted: ngxTok.IsQuoted}, nil
+}
```
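
The LexerScanner above keeps the goroutine-and-channel shape only so existing Lexer implementations keep working, which is the refactor the commit message defers. For comparison, here is a hedged sketch of what a channel-free extension written directly against ScannerExt and Tokenizer could look like, written as if it lived alongside scanner.go in this package; the type names and block-reading behavior are invented, and registering such an extension still goes through a ScannerOption, which today only LexWithLexer produces.

```go
// blockExt is a hypothetical ScannerExt that returns one token per Next call
// until the matched directive's block closes, with no goroutine or channel.
type blockExt struct{}

func (blockExt) Tokenizer(s *SubScanner, matchedToken string) Tokenizer {
	return &blockTokenizer{scanner: s}
}

type blockTokenizer struct {
	scanner *SubScanner
	done    bool
}

// Next hands back tokens until the closing brace has been emitted, then
// reports ErrTokenizerDone so Scanner.Scan resumes normal tokenization.
func (t *blockTokenizer) Next() (Token, error) {
	if t.done || !t.scanner.Scan() {
		if err := t.scanner.Err(); err != nil {
			return Token{}, err
		}
		return Token{}, ErrTokenizerDone
	}

	text := t.scanner.Text()
	if text == "}" {
		t.done = true // emit the brace itself, then finish on the next call
	}

	return Token{Text: text, Line: t.scanner.Line()}, nil
}
```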
