Updates scanner to support Lua extension
Fixed up the Scanner logic to mirror the changes made to support the Lua
extension in Lex. Added a compat layer so that the existing Lua type can
be used with `Scanner` rather than refactoring the implementation to
remove the channel; that refactor would, I think, yield further gains.
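
A rough usage sketch (not part of this diff): `LexWithLexer` returns a
`RegisterLexer`, which after this change also satisfies `ScannerOption`, so the
same registration can be handed to `NewScanner`. The inline config, the
`content_by_lua_block` directive name, and constructing `Lua` as a zero value
are illustrative assumptions.

```go
package main

import (
	"errors"
	"fmt"
	"io"
	"strings"

	crossplane "github.com/nginxinc/nginx-go-crossplane"
)

func main() {
	// Illustrative config containing a Lua block directive.
	conf := `http { server { location / { content_by_lua_block { ngx.say("hi") } } } }`

	// Register the Lua lexer for a block directive; the returned RegisterLexer
	// also configures the Scanner through applyScannerOptions.
	lua := &crossplane.Lua{}
	opt := crossplane.LexWithLexer(lua, "content_by_lua_block")

	s := crossplane.NewScanner(strings.NewReader(conf), opt)
	for {
		tok, err := s.Scan()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			panic(err)
		}
		fmt.Println(tok) // {line, text, quoted} via the new Token.String
	}
}
```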

Benchmarks:

```
goarch: arm64
pkg: github.com/nginxinc/nginx-go-crossplane
BenchmarkLex/simple-10             61224             18869 ns/op          103049 B/op         39 allocs/op
BenchmarkLex/with-comments-10      56320             19776 ns/op          103113 B/op         45 allocs/op
BenchmarkLex/messy-10              25918             47312 ns/op          104400 B/op        168 allocs/op
BenchmarkLex/quote-behavior-10             72890             15389 ns/op          102960 B/op         26 allocs/op
BenchmarkLex/quoted-right-brace-10         44002             27143 ns/op          103561 B/op         54 allocs/op
BenchmarkLex/comments-between-args-10      79369             15303 ns/op          102937 B/op         27 allocs/op
BenchmarkLexWithLua/lua-basic-10           51590             23743 ns/op          103385 B/op         49 allocs/op
BenchmarkLexWithLua/lua-block-simple-10                    24564             48282 ns/op          104488 B/op        157 allocs/op
BenchmarkLexWithLua/lua-block-larger-10                    23427             48567 ns/op          104376 B/op        144 allocs/op
BenchmarkLexWithLua/lua-block-tricky-10                    33526             36308 ns/op          103896 B/op        117 allocs/op
BenchmarkScanner/simple-10                                170299              7450 ns/op            4648 B/op         36 allocs/op
BenchmarkScanner/with-comments-10                         120178              9462 ns/op            4712 B/op         42 allocs/op
BenchmarkScanner/messy-10                                  43105             27796 ns/op            6000 B/op        165 allocs/op
BenchmarkScanner/quote-behavior-10                        207045              5587 ns/op            4560 B/op         23 allocs/op
BenchmarkScanner/quoted-right-brace-10                     79261             15685 ns/op            5160 B/op         51 allocs/op
BenchmarkScanner/comments-between-args-10                 216628              5411 ns/op            4536 B/op         24 allocs/op
BenchmarkScannerWithLua/lua-basic-10                       80594             15127 ns/op            7867 B/op         66 allocs/op
BenchmarkScannerWithLua/lua-block-simple-10                28033             42450 ns/op           10922 B/op        156 allocs/op
BenchmarkScannerWithLua/lua-block-larger-10                33932             33655 ns/op           10771 B/op         72 allocs/op
BenchmarkScannerWithLua/lua-block-tricky-10                51888             23334 ns/op            9050 B/op         79 allocs/op
PASS
ok      github.com/nginxinc/nginx-go-crossplane 30.055s
```
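
The B/op and allocs/op columns indicate these were collected with Go's
benchmark runner and memory stats enabled; presumably something along the
lines of (the exact selection flags are a guess):

```
go test -run '^$' -bench 'BenchmarkLex|BenchmarkScanner' -benchmem
```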
ornj committed Jul 5, 2024
1 parent 9956b1b commit 00fcf80
Showing 5 changed files with 272 additions and 63 deletions.
60 changes: 57 additions & 3 deletions lex.go
@@ -65,6 +65,7 @@ type LexOptions struct {
// RegisterLexer is an option that can be used to add a lexer to tokenize external NGINX tokens.
type RegisterLexer interface {
applyLexOptions(options *LexOptions)
applyScannerOptions(options *scannerOptions)
}

type registerLexer struct {
@@ -82,6 +83,16 @@ func (rl registerLexer) applyLexOptions(o *LexOptions) {
}
}

func (rl registerLexer) applyScannerOptions(o *scannerOptions) {
if o.extensions == nil {
o.extensions = make(map[string]ScannerExt)
}

for _, s := range rl.stringTokens {
o.extensions[s] = &LexerScanner{lexer: rl.l}
}
}

// LexWithLexer registers a Lexer that implements tokenization of an NGINX configuration after one of the given
// stringTokens is encountered by Lex.
func LexWithLexer(l Lexer, stringTokens ...string) RegisterLexer { //nolint:ireturn
@@ -106,12 +117,38 @@ func Lex(reader io.Reader) chan NgxToken {
// SubScanner provides an interface for scanning alternative grammars within NGINX configuration data.
type SubScanner struct {
scanner *bufio.Scanner
parent *Scanner
tokenLine int
}

// Scan advances the scanner to the next token, which will be available through the Text method. It returns false
// when the scan stops by reaching the end of input.
func (e *SubScanner) Scan() bool {
if e.scanner != nil {
return e.lexScan()
}

if e.parent.err != nil {
return false
}

if !e.parent.scanner.Scan() {
if err := e.parent.scanner.Err(); err != nil {
e.parent.setErr(err)
}
return false
}

// e.parent.prev = e.parent.scanner.Text()
// if isEOL(e.parent.prev) {
if t := e.parent.scanner.Text(); isEOL(t) {
e.parent.lineno++
}

return true
}

func (e *SubScanner) lexScan() bool {
if !e.scanner.Scan() {
return false
}
@@ -122,13 +159,30 @@ func (e *SubScanner) Scan() bool {
}

// Err returns the first non-EOF error encountered by the Scanner.
func (e *SubScanner) Err() error { return e.scanner.Err() }
func (e *SubScanner) Err() error {
if e.scanner != nil {
return e.scanner.Err()
}
return e.parent.Err()
}

// Text returns the most recent token generated by a call to Scan.
func (e *SubScanner) Text() string { return e.scanner.Text() }
func (e *SubScanner) Text() string {
if e.scanner != nil {
return e.scanner.Text()
}
// return e.parent.prev
return e.parent.scanner.Text()
}

// Line returns the line number of the most recent token generated by a call to Scan.
func (e *SubScanner) Line() int { return e.tokenLine }
func (e *SubScanner) Line() int {
if e.scanner != nil {
return e.tokenLine
}

return e.parent.lineno
}

//nolint:gocyclo,funlen,gocognit,maintidx
func tokenize(reader io.Reader, tokenCh chan NgxToken, options LexOptions) {
57 changes: 38 additions & 19 deletions lex_test.go
@@ -460,34 +460,53 @@ func TestLex(t *testing.T) {
}
}

var lexToken NgxToken //nolint: gochecknoglobals // trying to avoid return value being optimized away

func BenchmarkLex(b *testing.B) {
func benchmarkLex(b *testing.B, path string, options LexOptions) {

Check warning (GitHub Actions / Linting) on line 463 in lex_test.go: unused-parameter: parameter 'options' seems to be unused, consider removing or renaming it as _ (revive)
var t NgxToken

file, err := os.Open(path)
if err != nil {
b.Fatal(err)
}
defer file.Close()
b.ResetTimer()

for i := 0; i < b.N; i++ {
if _, err := file.Seek(0, 0); err != nil {
b.Fatal(err)
}

for tok := range Lex(file) {
t = tok
}
}

_ = t
}

func BenchmarkLex(b *testing.B) {
for _, bm := range lexFixtures {
if strings.HasPrefix(bm.name, "lua") {
continue
}

b.Run(bm.name, func(b *testing.B) {
path := getTestConfigPath(bm.name, "nginx.conf")
file, err := os.Open(path)
if err != nil {
b.Fatal(err)
}
defer file.Close()
b.ResetTimer()
benchmarkLex(b, path, LexOptions{})
})
}
}

for i := 0; i < b.N; i++ {
if _, err := file.Seek(0, 0); err != nil {
b.Fatal(err)
}
func BenchmarkLexWithLua(b *testing.B) {
for _, bm := range lexFixtures {
if !strings.HasPrefix(bm.name, "lua") {
continue
}

for tok := range Lex(file) {
t = tok
}
}
b.Run(bm.name, func(b *testing.B) {
path := getTestConfigPath(bm.name, "nginx.conf")
benchmarkLex(b, path, LexOptions{})
})
}

lexToken = t
}

//nolint:gochecknoglobals
11 changes: 11 additions & 0 deletions lua.go
@@ -175,6 +175,17 @@ func (l *Lua) Lex(s *SubScanner, matchedToken string) <-chan NgxToken {
return tokenCh
}

type LuaScanner struct {
scanner *SubScanner
matchedToken string
lua *Lua
ch <-chan NgxToken
}

Check failure (GitHub Actions / Linting) on lines 179-182 in lua.go: fields `scanner`, `matchedToken`, `lua`, and `ch` are unused (unused)

func (l *Lua) Init(s *SubScanner, matchedToken string) Tokenizer {

Check warning (GitHub Actions / Linting) on line 185 in lua.go: unused-parameter: parameter 's' seems to be unused, consider removing or renaming it as _ (revive)
return &LexerScanner{lexer: l, matchedToken: matchedToken}
}

// RegisterBuilder registers a builder for generating Lua NGINX configuration.
func (l *Lua) RegisterBuilder() RegisterBuilder { //nolint:ireturn
return BuildWithBuilder(l, l.directiveNames()...)
127 changes: 114 additions & 13 deletions scanner.go
@@ -8,6 +8,18 @@ import (
"strings"
)

type scannerOptions struct {
extensions map[string]ScannerExt
}

type ScannerOption interface {
applyScannerOptions(options *scannerOptions)
}

type scannerOptionFunc func(*scannerOptions)

Check failure (GitHub Actions / Linting) on line 19 in scanner.go: type `scannerOptionFunc` is unused (unused)

func (opt scannerOptionFunc) applyScanner(opts *scannerOptions) { opt(opts) }

Check failure (GitHub Actions / Linting) on line 21 in scanner.go: func `scannerOptionFunc.applyScanner` is unused (unused)

// Token is a lexical token of the NGINX configuration syntax.
type Token struct {
// Text is the string corresponding to the token. It could be a directive or symbol. The value is the actual token
@@ -20,6 +32,8 @@ type Token struct {
IsQuoted bool
}

func (t Token) String() string { return fmt.Sprintf("{%d, %s, %t}", t.Line, t.Text, t.IsQuoted) }

type scannerError struct {
msg string
line int
@@ -52,23 +66,33 @@ func LineNumber(err error) (int, bool) {
//
// Use NewScanner to construct a Scanner.
type Scanner struct {
scanner *bufio.Scanner
lineno int
tokenStartLine int
tokenDepth int
repeateSpecialChar bool // only '}' can be repeated
prev string
err error
scanner *bufio.Scanner
lineno int
tokenStartLine int
tokenDepth int
repeateSpecialChar bool // only '}' can be repeated
nextTokenIsDirective bool
prev string
err error
options *scannerOptions
ext Tokenizer
}

// NewScanner returns a new Scanner to read from r.
func NewScanner(r io.Reader) *Scanner {
func NewScanner(r io.Reader, options ...ScannerOption) *Scanner {
opts := &scannerOptions{}
for _, opt := range options {
opt.applyScannerOptions(opts)
}

s := &Scanner{
scanner: bufio.NewScanner(r),
lineno: 1,
tokenStartLine: 1,
tokenDepth: 0,
repeateSpecialChar: false,
scanner: bufio.NewScanner(r),
lineno: 1,
tokenStartLine: 1,
tokenDepth: 0,
repeateSpecialChar: false,
nextTokenIsDirective: true,
options: opts,
}

s.scanner.Split(bufio.ScanRunes)
@@ -93,6 +117,20 @@ func (s *Scanner) setErr(err error) {
// Scan reads the next token from source and returns it. It returns io.EOF at the end of the source. Scanner errors are
// returned when encountered.
func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo

Check failure (GitHub Actions / Linting) on line 119 in scanner.go: Function name: Scan, Cyclomatic Complexity: 47, Halstead Volume: 4265.99, Maintainability Index: 18 (maintidx)
if s.ext != nil {
t, err := s.ext.Next()
if err != nil {
if !errors.Is(err, TokenizerDone) {
s.setErr(err)
return Token{}, s.err
}

s.ext = nil
} else {
return t, nil
}
}

var tok strings.Builder

lexState := skipSpace
@@ -129,6 +167,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
r = nextRune
if isEOL(r) {
s.lineno++
s.nextTokenIsDirective = true
}
default:
readNext = true
@@ -149,6 +188,16 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
r = "\\" + r
}

if tok.Len() > 0 {
t := tok.String()
if s.nextTokenIsDirective {
if ext, ok := s.options.extensions[t]; ok {
s.ext = ext.Tokenizer(&SubScanner{parent: s, tokenLine: s.tokenStartLine}, t)
return Token{Text: t, Line: s.tokenStartLine}, nil
}
}
}

switch lexState {
case skipSpace:
if !isSpace(r) {
@@ -166,11 +215,13 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
tok.WriteString(r)
lexState = inComment
s.tokenStartLine = s.lineno
s.nextTokenIsDirective = false
continue
}
}

if isSpace(r) {
s.nextTokenIsDirective = false
return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
}

@@ -179,6 +230,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
tok.WriteString(r)
lexState = inVar
s.repeateSpecialChar = false
s.nextTokenIsDirective = false
continue
}

@@ -223,6 +275,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
}

tok.WriteString(r)
s.nextTokenIsDirective = true
return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
}

@@ -250,3 +303,51 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
}
}
}

// ScannerExt is the interface that describes an extension for the [Scanner]. Scanner extensions enable scanning of
// configurations that contain syntaxes that do not follow the usual grammar.
type ScannerExt interface {
Tokenizer(s *SubScanner, matchedToken string) Tokenizer
}

// TokenizerDone is returned by [Tokenizer] when tokenization is complete.
var TokenizerDone = errors.New("done")

Check warning (GitHub Actions / Linting) on line 314 in scanner.go: error-naming: error var TokenizerDone should have name of the form ErrFoo (revive)

// Tokenizer is the interface that wraps the Next method.
//
// Next returns the next token scanned from the NGINX configuration or an error if the configuration cannot be
// tokenized. Return the special error [TokenizerDone] when finished tokenizing.
type Tokenizer interface {
Next() (Token, error)
}

// LexerScanner is a compatibility layer between Lexers and Scanner.
type LexerScanner struct {
lexer Lexer
scanner *SubScanner
matchedToken string
ch <-chan NgxToken
}

func (s *LexerScanner) Tokenizer(scanner *SubScanner, matchedtoken string) Tokenizer {

Check failure (GitHub Actions / Linting) on line 332 in scanner.go: Tokenizer returns interface (github.com/nginxinc/nginx-go-crossplane.Tokenizer) (ireturn)
s.scanner = scanner
s.matchedToken = matchedtoken
return s
}

func (s *LexerScanner) Next() (Token, error) {
if s.ch == nil {
s.ch = s.lexer.Lex(s.scanner, s.matchedToken)
}

ngxTok, ok := <-s.ch
if !ok {
return Token{}, TokenizerDone
}

if ngxTok.Error != nil {
return Token{}, newScannerErrf(ngxTok.Line, ngxTok.Error.Error())
}

return Token{Text: ngxTok.Value, Line: ngxTok.Line, IsQuoted: ngxTok.IsQuoted}, nil
}