Updates scanner to support Lua extension
Fixed up the Scanner logic to mirror the changes made to support the Lua
extension in Lex. Added a compat layer so that the existing Lua lexer type can
be used with `Scanner` rather than refactoring that implementation to remove
the channel; doing that refactor would, I think, yield further gains.

Benchmarks:

```
❯ go test -benchmem -run=^$ -bench "^(BenchmarkLex|BenchmarkLexWithLua|BenchmarkScanner|BenchmarkScannerWithLua)$" github.com/nginxinc/nginx-go-crossplane -count=1
goos: darwin
goarch: arm64
pkg: github.com/nginxinc/nginx-go-crossplane
BenchmarkLex/simple-10             57963             17756 ns/op          103049 B/op         39 allocs/op
BenchmarkLex/with-comments-10      60025             20067 ns/op          103112 B/op         45 allocs/op
BenchmarkLex/messy-10              26170             47822 ns/op          104400 B/op        168 allocs/op
BenchmarkLex/quote-behavior-10             74510             17693 ns/op          102961 B/op         26 allocs/op
BenchmarkLex/quoted-right-brace-10         43134             27752 ns/op          103560 B/op         54 allocs/op
BenchmarkLex/comments-between-args-10      78271             14866 ns/op          102937 B/op         27 allocs/op
BenchmarkLexWithLua/lua-basic-10           46273             26012 ns/op          105499 B/op         53 allocs/op
BenchmarkLexWithLua/lua-block-simple-10                    22514             54149 ns/op          108556 B/op        143 allocs/op
BenchmarkLexWithLua/lua-block-larger-10                    25983             46605 ns/op          108403 B/op         59 allocs/op
BenchmarkLexWithLua/lua-block-tricky-10                    33756             35067 ns/op          106684 B/op         66 allocs/op
BenchmarkScanner/simple-10                                163138              7084 ns/op            4648 B/op         36 allocs/op
BenchmarkScanner/with-comments-10                         144558              8100 ns/op            4712 B/op         42 allocs/op
BenchmarkScanner/messy-10                                  47570             25026 ns/op            6000 B/op        165 allocs/op
BenchmarkScanner/quote-behavior-10                        222280              5083 ns/op            4560 B/op         23 allocs/op
BenchmarkScanner/quoted-right-brace-10                     82656             14281 ns/op            5160 B/op         51 allocs/op
BenchmarkScanner/comments-between-args-10                 225475              4872 ns/op            4536 B/op         24 allocs/op
BenchmarkScannerWithLua/lua-basic-10                       93081             12833 ns/op            7866 B/op         66 allocs/op
BenchmarkScannerWithLua/lua-block-simple-10                31426             37989 ns/op           10924 B/op        156 allocs/op
BenchmarkScannerWithLua/lua-block-larger-10                37148             30723 ns/op           10770 B/op         72 allocs/op
BenchmarkScannerWithLua/lua-block-tricky-10                54890             22383 ns/op            9050 B/op         79 allocs/op
PASS
ok      github.com/nginxinc/nginx-go-crossplane 29.969s
```
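
For illustration only, a rough sketch of how the new option plumbing could be driven by a caller. `NewScanner`, `Scan`, `Token`, and the `RegisterLexer`/`ScannerOption` wiring come from this change; the Lua lexer value itself (the `lua` variable the updated benchmarks use) is constructed outside this diff, so the sketch leaves its creation to the caller.

```go
package main

import (
	"errors"
	"fmt"
	"io"
	"os"

	crossplane "github.com/nginxinc/nginx-go-crossplane"
)

// scanConfig tokenizes an NGINX config, optionally with extra lexers registered.
// The caller supplies the options (e.g. the Lua lexer's RegisterLexer(), as the
// updated benchmarks do); constructing that lexer is outside this diff.
func scanConfig(path string, opts ...crossplane.ScannerOption) error {
	f, err := os.Open(path)
	if err != nil {
		return err
	}
	defer f.Close()

	// A RegisterLexer now also implements applyScannerOptions, so the same value
	// that extends Lex can be passed to NewScanner as a ScannerOption.
	s := crossplane.NewScanner(f, opts...)

	for {
		tok, err := s.Scan()
		if errors.Is(err, io.EOF) {
			return nil
		}
		if err != nil {
			return err
		}
		fmt.Println(tok) // Token.String() renders {line, text, quoted}
	}
}

func main() {
	// With no extra options this behaves like the plain Scanner; pass the Lua
	// lexer's RegisterLexer() (as the benchmarks do) to enable the extension.
	if err := scanConfig("nginx.conf"); err != nil {
		panic(err)
	}
}
```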
ornj committed Jul 5, 2024
1 parent cc657b1 commit fe04b93
Showing 4 changed files with 258 additions and 64 deletions.
60 changes: 57 additions & 3 deletions lex.go
@@ -65,6 +65,7 @@ type LexOptions struct {
// RegisterLexer is an option that can be used to add a lexer to tokenize external NGINX tokens.
type RegisterLexer interface {
applyLexOptions(options *LexOptions)
applyScannerOptions(options *scannerOptions)
}

type registerLexer struct {
@@ -82,6 +83,16 @@ func (rl registerLexer) applyLexOptions(o *LexOptions) {
}
}

func (rl registerLexer) applyScannerOptions(o *scannerOptions) {
if o.extensions == nil {
o.extensions = make(map[string]ScannerExt)
}

for _, s := range rl.stringTokens {
o.extensions[s] = &LexerScanner{lexer: rl.l}
}
}

// LexWithLexer registers a Lexer that implements tokenization of an NGINX configuration after one of the given
// stringTokens is encountered by Lex.
func LexWithLexer(l Lexer, stringTokens ...string) RegisterLexer { //nolint:ireturn
@@ -106,12 +117,38 @@ func Lex(reader io.Reader) chan NgxToken {
// SubScanner provides an interface for scanning alternative grammars within NGINX configuration data.
type SubScanner struct {
scanner *bufio.Scanner
parent *Scanner
tokenLine int
}

// Scan advances the scanner to the next token which will be available through the Text method. It returns false
// when the scan stops by reaching the end of input.
func (e *SubScanner) Scan() bool {
if e.scanner != nil {
return e.lexScan()
}

if e.parent.err != nil {
return false
}

if !e.parent.scanner.Scan() {
if err := e.parent.scanner.Err(); err != nil {
e.parent.setErr(err)
}
return false
}

// e.parent.prev = e.parent.scanner.Text()
// if isEOL(e.parent.prev) {
if t := e.parent.scanner.Text(); isEOL(t) {
e.parent.lineno++
}

return true
}

func (e *SubScanner) lexScan() bool {
if !e.scanner.Scan() {
return false
}
@@ -122,13 +159,30 @@ func (e *SubScanner) Scan() bool {
}

// Err returns the first non-EOF error encountered by the Scanner.
func (e *SubScanner) Err() error { return e.scanner.Err() }
func (e *SubScanner) Err() error {
if e.scanner != nil {
return e.scanner.Err()
}
return e.parent.Err()
}

// Text returns the most recent token generated by a call to Scan.
func (e *SubScanner) Text() string { return e.scanner.Text() }
func (e *SubScanner) Text() string {
if e.scanner != nil {
return e.scanner.Text()
}
// return e.parent.prev
return e.parent.scanner.Text()
}

// Line returns the line number of the most recent token generated by a call to Scan.
func (e *SubScanner) Line() int { return e.tokenLine }
func (e *SubScanner) Line() int {
if e.scanner != nil {
return e.tokenLine
}

return e.parent.lineno
}

//nolint:gocyclo,funlen,gocognit,maintidx
func tokenize(reader io.Reader, tokenCh chan NgxToken, options LexOptions) {
57 changes: 38 additions & 19 deletions lex_test.go
@@ -460,34 +460,53 @@ func TestLex(t *testing.T) {
}
}

var lexToken NgxToken //nolint: gochecknoglobals // trying to avoid return value being optimized away

func BenchmarkLex(b *testing.B) {
func benchmarkLex(b *testing.B, path string, options LexOptions) {
var t NgxToken

file, err := os.Open(path)
if err != nil {
b.Fatal(err)
}
defer file.Close()
b.ResetTimer()

for i := 0; i < b.N; i++ {
if _, err := file.Seek(0, 0); err != nil {
b.Fatal(err)
}

for tok := range LexWithOptions(file, options) {
t = tok
}
}

_ = t
}

func BenchmarkLex(b *testing.B) {
for _, bm := range lexFixtures {
if strings.HasPrefix(bm.name, "lua") {
continue
}

b.Run(bm.name, func(b *testing.B) {
path := getTestConfigPath(bm.name, "nginx.conf")
file, err := os.Open(path)
if err != nil {
b.Fatal(err)
}
defer file.Close()
b.ResetTimer()
benchmarkLex(b, path, LexOptions{})
})
}
}

for i := 0; i < b.N; i++ {
if _, err := file.Seek(0, 0); err != nil {
b.Fatal(err)
}
func BenchmarkLexWithLua(b *testing.B) {
for _, bm := range lexFixtures {
if !strings.HasPrefix(bm.name, "lua") {
continue
}

for tok := range Lex(file) {
t = tok
}
}
b.Run(bm.name, func(b *testing.B) {
path := getTestConfigPath(bm.name, "nginx.conf")
benchmarkLex(b, path, LexOptions{Lexers: []RegisterLexer{lua.RegisterLexer()}})
})
}

lexToken = t
}

//nolint:gochecknoglobals
125 changes: 111 additions & 14 deletions scanner.go
@@ -8,6 +8,14 @@ import (
"strings"
)

type scannerOptions struct {
extensions map[string]ScannerExt
}

type ScannerOption interface {
applyScannerOptions(options *scannerOptions)
}

// Token is a lexical token of the NGINX configuration syntax.
type Token struct {
// Text is the string corresponding to the token. It could be a directive or symbol. The value is the actual token
@@ -20,6 +28,8 @@ type Token struct {
IsQuoted bool
}

func (t Token) String() string { return fmt.Sprintf("{%d, %s, %t}", t.Line, t.Text, t.IsQuoted) }

type scannerError struct {
msg string
line int
@@ -52,23 +62,33 @@ func LineNumber(err error) (int, bool) {
//
// Use NewScanner to construct a Scanner.
type Scanner struct {
scanner *bufio.Scanner
lineno int
tokenStartLine int
tokenDepth int
repeateSpecialChar bool // only '}' can be repeated
prev string
err error
scanner *bufio.Scanner
lineno int
tokenStartLine int
tokenDepth int
repeateSpecialChar bool // only '}' can be repeated
nextTokenIsDirective bool
prev string
err error
options *scannerOptions
ext Tokenizer
}

// NewScanner returns a new Scanner to read from r.
func NewScanner(r io.Reader) *Scanner {
func NewScanner(r io.Reader, options ...ScannerOption) *Scanner {
opts := &scannerOptions{}
for _, opt := range options {
opt.applyScannerOptions(opts)
}

s := &Scanner{
scanner: bufio.NewScanner(r),
lineno: 1,
tokenStartLine: 1,
tokenDepth: 0,
repeateSpecialChar: false,
scanner: bufio.NewScanner(r),
lineno: 1,
tokenStartLine: 1,
tokenDepth: 0,
repeateSpecialChar: false,
nextTokenIsDirective: true,
options: opts,
}

s.scanner.Split(bufio.ScanRunes)
Expand All @@ -92,7 +112,21 @@ func (s *Scanner) setErr(err error) {

// Scan reads the next token from source and returns it. It returns io.EOF at the end of the source. Scanner errors are
// returned when encountered.
func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo, maintidx // sorry
if s.ext != nil {
t, err := s.ext.Next()
if err != nil {
if !errors.Is(err, ErrTokenizerDone) {
s.setErr(err)
return Token{}, s.err
}

s.ext = nil
} else {
return t, nil
}
}

var tok strings.Builder

lexState := skipSpace
@@ -129,6 +163,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
r = nextRune
if isEOL(r) {
s.lineno++
s.nextTokenIsDirective = true
}
default:
readNext = true
@@ -149,6 +184,16 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
r = "\\" + r
}

if tok.Len() > 0 {
t := tok.String()
if s.nextTokenIsDirective {
if ext, ok := s.options.extensions[t]; ok {
s.ext = ext.Tokenizer(&SubScanner{parent: s, tokenLine: s.tokenStartLine}, t)
return Token{Text: t, Line: s.tokenStartLine}, nil
}
}
}

switch lexState {
case skipSpace:
if !isSpace(r) {
@@ -166,11 +211,13 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
tok.WriteString(r)
lexState = inComment
s.tokenStartLine = s.lineno
s.nextTokenIsDirective = false
continue
}
}

if isSpace(r) {
s.nextTokenIsDirective = false
return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
}

@@ -179,6 +226,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
tok.WriteString(r)
lexState = inVar
s.repeateSpecialChar = false
s.nextTokenIsDirective = false
continue
}

@@ -223,6 +271,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
}

tok.WriteString(r)
s.nextTokenIsDirective = true
return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
}

@@ -250,3 +299,51 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
}
}
}

// ScannerExt is the interface that describes an extension for the [Scanner]. Scanner extensions enable scanning of
// configurations that contain syntaxes that do not follow the usual grammar.
type ScannerExt interface {
Tokenizer(s *SubScanner, matchedToken string) Tokenizer
}

// ErrTokenizerDone is returned by [Tokenizer] when tokenization is complete.
var ErrTokenizerDone = errors.New("done")

// Tokenizer is the interface that wraps the Next method.
//
// Next returns the next token scanned from the NGINX configuration or an error if the configuration cannot be
// tokenized. Return the special error, [ErrTokenizerDone] when finished tokenizing.
type Tokenizer interface {
Next() (Token, error)
}

// LexerScanner is a compatibility layer between Lexers and Scanner.
type LexerScanner struct {
lexer Lexer
scanner *SubScanner
matchedToken string
ch <-chan NgxToken
}

func (s *LexerScanner) Tokenizer(scanner *SubScanner, matchedtoken string) Tokenizer {

[GitHub Actions / Linting] Check failure on line 328 in scanner.go: Tokenizer returns interface (github.com/nginxinc/nginx-go-crossplane.Tokenizer) (ireturn)
s.scanner = scanner
s.matchedToken = matchedtoken
return s
}

func (s *LexerScanner) Next() (Token, error) {
if s.ch == nil {
s.ch = s.lexer.Lex(s.scanner, s.matchedToken)
}

ngxTok, ok := <-s.ch
if !ok {
return Token{}, ErrTokenizerDone
}

if ngxTok.Error != nil {
return Token{}, newScannerErrf(ngxTok.Line, ngxTok.Error.Error())
}

return Token{Text: ngxTok.Value, Line: ngxTok.Line, IsQuoted: ngxTok.IsQuoted}, nil
}
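
As a further illustration (not part of this commit), a minimal in-package Tokenizer satisfying the contract above might look like the following sketch. It naively captures everything up to the next `}` as a single token; a real extension, like the Lua one, would handle nesting, quoting, and comments. Hooking it up would still require an option that adds it to scannerOptions.extensions, which this commit only does through RegisterLexer.

```go
package crossplane // package name assumed from the repository layout

import "strings"

// blockCaptureExt is a hypothetical ScannerExt that grabs everything up to the
// next '}' as one token. It exists only to illustrate the contract; it does not
// handle nested braces, quotes, or comments.
type blockCaptureExt struct{}

func (blockCaptureExt) Tokenizer(s *SubScanner, matchedToken string) Tokenizer { //nolint:ireturn
	return &blockCaptureTokenizer{s: s}
}

type blockCaptureTokenizer struct {
	s    *SubScanner
	done bool
}

// Next accumulates runes from the SubScanner (the parent Scanner splits its
// input with bufio.ScanRunes) until it sees '}', returns the captured text as
// a single Token, and signals completion with ErrTokenizerDone afterwards.
func (t *blockCaptureTokenizer) Next() (Token, error) {
	if t.done {
		return Token{}, ErrTokenizerDone
	}

	var body strings.Builder
	line := t.s.Line()
	for t.s.Scan() {
		if t.s.Text() == "}" {
			break
		}
		body.WriteString(t.s.Text())
	}
	if err := t.s.Err(); err != nil {
		return Token{}, err
	}

	t.done = true
	return Token{Text: body.String(), Line: line}, nil
}
```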
