Updates scanner to support Lua extension
Fixed up the Scanner logic to mirror the changes made to support the Lua
extension in Lex. Added a compat layer so that the existing Lua lexer type can
be used with `Scanner` rather than refactoring that implementation to remove
the channel; doing that refactor would, I think, yield further gains.

Benchmarks:

```
❯ go test -benchmem -run=^$ -bench "^(BenchmarkLex|BenchmarkLexWithLua|BenchmarkScanner|BenchmarkScannerWithLua)$" github.com/nginxinc/nginx-go-crossplane -count=1
goos: darwin
goarch: arm64
pkg: github.com/nginxinc/nginx-go-crossplane
BenchmarkLex/simple-10             57963             17756 ns/op          103049 B/op         39 allocs/op
BenchmarkLex/with-comments-10      60025             20067 ns/op          103112 B/op         45 allocs/op
BenchmarkLex/messy-10              26170             47822 ns/op          104400 B/op        168 allocs/op
BenchmarkLex/quote-behavior-10             74510             17693 ns/op          102961 B/op         26 allocs/op
BenchmarkLex/quoted-right-brace-10         43134             27752 ns/op          103560 B/op         54 allocs/op
BenchmarkLex/comments-between-args-10      78271             14866 ns/op          102937 B/op         27 allocs/op
BenchmarkLexWithLua/lua-basic-10           46273             26012 ns/op          105499 B/op         53 allocs/op
BenchmarkLexWithLua/lua-block-simple-10                    22514             54149 ns/op          108556 B/op        143 allocs/op
BenchmarkLexWithLua/lua-block-larger-10                    25983             46605 ns/op          108403 B/op         59 allocs/op
BenchmarkLexWithLua/lua-block-tricky-10                    33756             35067 ns/op          106684 B/op         66 allocs/op
BenchmarkScanner/simple-10                                163138              7084 ns/op            4648 B/op         36 allocs/op
BenchmarkScanner/with-comments-10                         144558              8100 ns/op            4712 B/op         42 allocs/op
BenchmarkScanner/messy-10                                  47570             25026 ns/op            6000 B/op        165 allocs/op
BenchmarkScanner/quote-behavior-10                        222280              5083 ns/op            4560 B/op         23 allocs/op
BenchmarkScanner/quoted-right-brace-10                     82656             14281 ns/op            5160 B/op         51 allocs/op
BenchmarkScanner/comments-between-args-10                 225475              4872 ns/op            4536 B/op         24 allocs/op
BenchmarkScannerWithLua/lua-basic-10                       93081             12833 ns/op            7866 B/op         66 allocs/op
BenchmarkScannerWithLua/lua-block-simple-10                31426             37989 ns/op           10924 B/op        156 allocs/op
BenchmarkScannerWithLua/lua-block-larger-10                37148             30723 ns/op           10770 B/op         72 allocs/op
BenchmarkScannerWithLua/lua-block-tricky-10                54890             22383 ns/op            9050 B/op         79 allocs/op
PASS
ok      github.com/nginxinc/nginx-go-crossplane 29.969s
```
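
For illustration only, a rough sketch of how the new option plumbing could be driven by a caller. `NewScanner`, `Scan`, `Token`, and the `RegisterLexer`/`ScannerOption` wiring come from this change; the Lua lexer value itself (the `lua` variable the updated benchmarks use) is constructed outside this diff, so the sketch leaves its creation to the caller.

```go
package main

import (
	"errors"
	"fmt"
	"io"
	"os"

	crossplane "github.com/nginxinc/nginx-go-crossplane"
)

// scanConfig tokenizes an NGINX config, optionally with extra lexers registered.
// The caller supplies the options (e.g. the Lua lexer's RegisterLexer(), as the
// updated benchmarks do); constructing that lexer is outside this diff.
func scanConfig(path string, opts ...crossplane.ScannerOption) error {
	f, err := os.Open(path)
	if err != nil {
		return err
	}
	defer f.Close()

	// A RegisterLexer now also implements applyScannerOptions, so the same value
	// that extends Lex can be passed to NewScanner as a ScannerOption.
	s := crossplane.NewScanner(f, opts...)

	for {
		tok, err := s.Scan()
		if errors.Is(err, io.EOF) {
			return nil
		}
		if err != nil {
			return err
		}
		fmt.Println(tok) // Token.String() renders {line, text, quoted}
	}
}

func main() {
	// With no extra options this behaves like the plain Scanner; pass the Lua
	// lexer's RegisterLexer() (as the benchmarks do) to enable the extension.
	if err := scanConfig("nginx.conf"); err != nil {
		panic(err)
	}
}
```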
ornj committed Jul 5, 2024
1 parent cc657b1 commit fe04b93
Showing 4 changed files with 258 additions and 64 deletions.
60 changes: 57 additions & 3 deletions lex.go
@@ -65,6 +65,7 @@ type LexOptions struct {
// RegisterLexer is an option that can be used to add a lexer to tokenize external NGINX tokens.
type RegisterLexer interface {
applyLexOptions(options *LexOptions)
applyScannerOptions(options *scannerOptions)
}

type registerLexer struct {
@@ -82,6 +83,16 @@ func (rl registerLexer) applyLexOptions(o *LexOptions) {
}
}

func (rl registerLexer) applyScannerOptions(o *scannerOptions) {
if o.extensions == nil {
o.extensions = make(map[string]ScannerExt)
}

for _, s := range rl.stringTokens {
o.extensions[s] = &LexerScanner{lexer: rl.l}
}
}

// LexWithLexer registers a Lexer that implements tokenization of an NGINX configuration after one of the given
// stringTokens is encountered by Lex.
func LexWithLexer(l Lexer, stringTokens ...string) RegisterLexer { //nolint:ireturn
@@ -106,12 +117,38 @@ func Lex(reader io.Reader) chan NgxToken {
// SubScanner provides an interface for scanning alternative grammars within NGINX configuration data.
type SubScanner struct {
scanner *bufio.Scanner
parent *Scanner
tokenLine int
}

// Scan advances the scanner to the next token which will be available through the Text method. It returns false
// when the scan stops by reaching the end of input.
func (e *SubScanner) Scan() bool {
if e.scanner != nil {
return e.lexScan()
}

if e.parent.err != nil {
return false
}

if !e.parent.scanner.Scan() {
if err := e.parent.scanner.Err(); err != nil {
e.parent.setErr(err)
}
return false
}

// e.parent.prev = e.parent.scanner.Text()
// if isEOL(e.parent.prev) {
if t := e.parent.scanner.Text(); isEOL(t) {
e.parent.lineno++
}

return true
}

func (e *SubScanner) lexScan() bool {
if !e.scanner.Scan() {
return false
}
@@ -122,13 +159,30 @@ func (e *SubScanner) Scan() bool {
}

// Err returns the first non-EOF error encountered by the Scanner.
func (e *SubScanner) Err() error { return e.scanner.Err() }
func (e *SubScanner) Err() error {
if e.scanner != nil {
return e.scanner.Err()
}
return e.parent.Err()
}

// Text returns the most recent token generated by a call to Scan.
func (e *SubScanner) Text() string { return e.scanner.Text() }
func (e *SubScanner) Text() string {
if e.scanner != nil {
return e.scanner.Text()
}
// return e.parent.prev
return e.parent.scanner.Text()
}

// Line returns the line number of the most recent token generated by a call to Scan.
func (e *SubScanner) Line() int { return e.tokenLine }
func (e *SubScanner) Line() int {
if e.scanner != nil {
return e.tokenLine
}

return e.parent.lineno
}

//nolint:gocyclo,funlen,gocognit,maintidx
func tokenize(reader io.Reader, tokenCh chan NgxToken, options LexOptions) {
57 changes: 38 additions & 19 deletions lex_test.go
@@ -460,34 +460,53 @@ func TestLex(t *testing.T) {
}
}

var lexToken NgxToken //nolint: gochecknoglobals // trying to avoid return value being optimized away

func BenchmarkLex(b *testing.B) {
func benchmarkLex(b *testing.B, path string, options LexOptions) {
var t NgxToken

file, err := os.Open(path)
if err != nil {
b.Fatal(err)
}
defer file.Close()
b.ResetTimer()

for i := 0; i < b.N; i++ {
if _, err := file.Seek(0, 0); err != nil {
b.Fatal(err)
}

for tok := range LexWithOptions(file, options) {
t = tok
}
}

_ = t
}

func BenchmarkLex(b *testing.B) {
for _, bm := range lexFixtures {
if strings.HasPrefix(bm.name, "lua") {
continue
}

b.Run(bm.name, func(b *testing.B) {
path := getTestConfigPath(bm.name, "nginx.conf")
file, err := os.Open(path)
if err != nil {
b.Fatal(err)
}
defer file.Close()
b.ResetTimer()
benchmarkLex(b, path, LexOptions{})
})
}
}

for i := 0; i < b.N; i++ {
if _, err := file.Seek(0, 0); err != nil {
b.Fatal(err)
}
func BenchmarkLexWithLua(b *testing.B) {
for _, bm := range lexFixtures {
if !strings.HasPrefix(bm.name, "lua") {
continue
}

for tok := range Lex(file) {
t = tok
}
}
b.Run(bm.name, func(b *testing.B) {
path := getTestConfigPath(bm.name, "nginx.conf")
benchmarkLex(b, path, LexOptions{Lexers: []RegisterLexer{lua.RegisterLexer()}})
})
}

lexToken = t
}

//nolint:gochecknoglobals
125 changes: 111 additions & 14 deletions scanner.go
@@ -8,6 +8,14 @@ import (
"strings"
)

type scannerOptions struct {
extensions map[string]ScannerExt
}

type ScannerOption interface {
applyScannerOptions(options *scannerOptions)
}

// Token is a lexical token of the NGINX configuration syntax.
type Token struct {
// Text is the string corresponding to the token. It could be a directive or symbol. The value is the actual token
@@ -20,6 +28,8 @@ type Token struct {
IsQuoted bool
}

func (t Token) String() string { return fmt.Sprintf("{%d, %s, %t}", t.Line, t.Text, t.IsQuoted) }

type scannerError struct {
msg string
line int
@@ -52,23 +62,33 @@ func LineNumber(err error) (int, bool) {
//
// Use NewScanner to construct a Scanner.
type Scanner struct {
scanner *bufio.Scanner
lineno int
tokenStartLine int
tokenDepth int
repeateSpecialChar bool // only '}' can be repeated
prev string
err error
scanner *bufio.Scanner
lineno int
tokenStartLine int
tokenDepth int
repeateSpecialChar bool // only '}' can be repeated
nextTokenIsDirective bool
prev string
err error
options *scannerOptions
ext Tokenizer
}

// NewScanner returns a new Scanner to read from r.
func NewScanner(r io.Reader) *Scanner {
func NewScanner(r io.Reader, options ...ScannerOption) *Scanner {
opts := &scannerOptions{}
for _, opt := range options {
opt.applyScannerOptions(opts)
}

s := &Scanner{
scanner: bufio.NewScanner(r),
lineno: 1,
tokenStartLine: 1,
tokenDepth: 0,
repeateSpecialChar: false,
scanner: bufio.NewScanner(r),
lineno: 1,
tokenStartLine: 1,
tokenDepth: 0,
repeateSpecialChar: false,
nextTokenIsDirective: true,
options: opts,
}

s.scanner.Split(bufio.ScanRunes)
Expand All @@ -92,7 +112,21 @@ func (s *Scanner) setErr(err error) {

// Scan reads the next token from source and returns it. It returns io.EOF at the end of the source. Scanner errors are
// returned when encountered.
func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo, maintidx // sorry
if s.ext != nil {
t, err := s.ext.Next()
if err != nil {
if !errors.Is(err, ErrTokenizerDone) {
s.setErr(err)
return Token{}, s.err
}

s.ext = nil
} else {
return t, nil
}
}

var tok strings.Builder

lexState := skipSpace
@@ -129,6 +163,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
r = nextRune
if isEOL(r) {
s.lineno++
s.nextTokenIsDirective = true
}
default:
readNext = true
@@ -149,6 +184,16 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
r = "\\" + r
}

if tok.Len() > 0 {
t := tok.String()
if s.nextTokenIsDirective {
if ext, ok := s.options.extensions[t]; ok {
s.ext = ext.Tokenizer(&SubScanner{parent: s, tokenLine: s.tokenStartLine}, t)
return Token{Text: t, Line: s.tokenStartLine}, nil
}
}
}

switch lexState {
case skipSpace:
if !isSpace(r) {
@@ -166,11 +211,13 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
tok.WriteString(r)
lexState = inComment
s.tokenStartLine = s.lineno
s.nextTokenIsDirective = false
continue
}
}

if isSpace(r) {
s.nextTokenIsDirective = false
return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
}

@@ -179,6 +226,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
tok.WriteString(r)
lexState = inVar
s.repeateSpecialChar = false
s.nextTokenIsDirective = false
continue
}

@@ -223,6 +271,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
}

tok.WriteString(r)
s.nextTokenIsDirective = true
return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
}

@@ -250,3 +299,51 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
}
}
}

// ScannerExt is the interface that describes an extension for the [Scanner]. Scanner extensions enable scanning of
// configurations that contain syntaxes that do not follow the usual grammar.
type ScannerExt interface {
Tokenizer(s *SubScanner, matchedToken string) Tokenizer
}

// ErrTokenizerDone is returned by [Tokenizer] when tokenization is complete.
var ErrTokenizerDone = errors.New("done")

// Tokenizer is the interface that wraps the Next method.
//
// Next returns the next token scanned from the NGINX configuration or an error if the configuration cannot be
// tokenized. Return the special error, [ErrTokenizerDone] when finished tokenizing.
type Tokenizer interface {
Next() (Token, error)
}

// LexerScanner is a compatibility layer between Lexers and Scanner.
type LexerScanner struct {
lexer Lexer
scanner *SubScanner
matchedToken string
ch <-chan NgxToken
}

func (s *LexerScanner) Tokenizer(scanner *SubScanner, matchedtoken string) Tokenizer {

[GitHub Actions / Linting] Check failure on line 328 in scanner.go: Tokenizer returns interface (github.com/nginxinc/nginx-go-crossplane.Tokenizer) (ireturn)
s.scanner = scanner
s.matchedToken = matchedtoken
return s
}

func (s *LexerScanner) Next() (Token, error) {
if s.ch == nil {
s.ch = s.lexer.Lex(s.scanner, s.matchedToken)
}

ngxTok, ok := <-s.ch
if !ok {
return Token{}, ErrTokenizerDone
}

if ngxTok.Error != nil {
return Token{}, newScannerErrf(ngxTok.Line, ngxTok.Error.Error())
}

return Token{Text: ngxTok.Value, Line: ngxTok.Line, IsQuoted: ngxTok.IsQuoted}, nil
}
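
As a further illustration (not part of this commit), a minimal in-package Tokenizer satisfying the contract above might look like the following sketch. It naively captures everything up to the next `}` as a single token; a real extension, like the Lua one, would handle nesting, quoting, and comments. Hooking it up would still require an option that adds it to scannerOptions.extensions, which this commit only does through RegisterLexer.

```go
package crossplane // package name assumed from the repository layout

import "strings"

// blockCaptureExt is a hypothetical ScannerExt that grabs everything up to the
// next '}' as one token. It exists only to illustrate the contract; it does not
// handle nested braces, quotes, or comments.
type blockCaptureExt struct{}

func (blockCaptureExt) Tokenizer(s *SubScanner, matchedToken string) Tokenizer { //nolint:ireturn
	return &blockCaptureTokenizer{s: s}
}

type blockCaptureTokenizer struct {
	s    *SubScanner
	done bool
}

// Next accumulates runes from the SubScanner (the parent Scanner splits its
// input with bufio.ScanRunes) until it sees '}', returns the captured text as
// a single Token, and signals completion with ErrTokenizerDone afterwards.
func (t *blockCaptureTokenizer) Next() (Token, error) {
	if t.done {
		return Token{}, ErrTokenizerDone
	}

	var body strings.Builder
	line := t.s.Line()
	for t.s.Scan() {
		if t.s.Text() == "}" {
			break
		}
		body.WriteString(t.s.Text())
	}
	if err := t.s.Err(); err != nil {
		return Token{}, err
	}

	t.done = true
	return Token{Text: body.String(), Line: line}, nil
}
```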
