diff --git a/lex.go b/lex.go
index 4b69cbea..a39b8a08 100644
--- a/lex.go
+++ b/lex.go
@@ -65,6 +65,7 @@ type LexOptions struct {
 // RegisterLexer is an option that cna be used to add a lexer to tokenize external NGINX tokens.
 type RegisterLexer interface {
     applyLexOptions(options *LexOptions)
+    applyScannerOptions(options *scannerOptions)
 }

 type registerLexer struct {
@@ -82,6 +83,16 @@ func (rl registerLexer) applyLexOptions(o *LexOptions) {
     }
 }

+func (rl registerLexer) applyScannerOptions(o *scannerOptions) {
+    if o.extensions == nil {
+        o.extensions = make(map[string]ScannerExt)
+    }
+
+    for _, s := range rl.stringTokens {
+        o.extensions[s] = &LexerScanner{lexer: rl.l}
+    }
+}
+
 // LexWithLexer registers a Lexer that implements tokenization of an NGINX configuration after one of the given
 // stringTokens is encountered by Lex.
 func LexWithLexer(l Lexer, stringTokens ...string) RegisterLexer { //nolint:ireturn
@@ -106,12 +117,38 @@ func Lex(reader io.Reader) chan NgxToken {
 // SubScanner provides an interface for scanning alternative grammars within NGINX configuration data.
 type SubScanner struct {
     scanner   *bufio.Scanner
+    parent    *Scanner
     tokenLine int
 }

 // Scan advances the scanner to the next token which will be available though the Text method. It returns false
 // when the scan stops by reaching the end of input.
 func (e *SubScanner) Scan() bool {
+    if e.scanner != nil {
+        return e.lexScan()
+    }
+
+    if e.parent.err != nil {
+        return false
+    }
+
+    if !e.parent.scanner.Scan() {
+        if err := e.parent.scanner.Err(); err != nil {
+            e.parent.setErr(err)
+        }
+        return false
+    }
+
+    // e.parent.prev = e.parent.scanner.Text()
+    // if isEOL(e.parent.prev) {
+    if t := e.parent.scanner.Text(); isEOL(t) {
+        e.parent.lineno++
+    }
+
+    return true
+}
+
+func (e *SubScanner) lexScan() bool {
     if !e.scanner.Scan() {
         return false
     }
@@ -122,13 +159,30 @@ func (e *SubScanner) Scan() bool {
 }

 // Err returns the fist non-EOF error encountered by the Scanner.
-func (e *SubScanner) Err() error { return e.scanner.Err() }
+func (e *SubScanner) Err() error {
+    if e.scanner != nil {
+        return e.scanner.Err()
+    }
+    return e.parent.Err()
+}

 // Text returns the most recent token generated by a call to Scan.
-func (e *SubScanner) Text() string { return e.scanner.Text() }
+func (e *SubScanner) Text() string {
+    if e.scanner != nil {
+        return e.scanner.Text()
+    }
+    // return e.parent.prev
+    return e.parent.scanner.Text()
+}

 // Line returns the line number of the most recent token generated by a call to Scan.
-func (e *SubScanner) Line() int { return e.tokenLine }
+func (e *SubScanner) Line() int {
+    if e.scanner != nil {
+        return e.tokenLine
+    }
+
+    return e.parent.lineno
+}

 //nolint:gocyclo,funlen,gocognit,maintidx
 func tokenize(reader io.Reader, tokenCh chan NgxToken, options LexOptions) {
diff --git a/lex_test.go b/lex_test.go
index e022ad8c..985d9841 100644
--- a/lex_test.go
+++ b/lex_test.go
@@ -460,34 +460,53 @@ func TestLex(t *testing.T) {
     }
 }

-var lexToken NgxToken //nolint: gochecknoglobals // trying to avoid return value being optimzed away
-
-func BenchmarkLex(b *testing.B) {
+func benchmarkLex(b *testing.B, path string, options LexOptions) {
     var t NgxToken

+    file, err := os.Open(path)
+    if err != nil {
+        b.Fatal(err)
+    }
+    defer file.Close()
+    b.ResetTimer()
+
+    for i := 0; i < b.N; i++ {
+        if _, err := file.Seek(0, 0); err != nil {
+            b.Fatal(err)
+        }
+
+        for tok := range Lex(file) {
+            t = tok
+        }
+    }
+
+    _ = t
+}
+
+func BenchmarkLex(b *testing.B) {
     for _, bm := range lexFixtures {
+        if strings.HasPrefix(bm.name, "lua") {
+            continue
+        }
+
         b.Run(bm.name, func(b *testing.B) {
             path := getTestConfigPath(bm.name, "nginx.conf")
-            file, err := os.Open(path)
-            if err != nil {
-                b.Fatal(err)
-            }
-            defer file.Close()
-            b.ResetTimer()
+            benchmarkLex(b, path, LexOptions{})
+        })
+    }
+}

-            for i := 0; i < b.N; i++ {
-                if _, err := file.Seek(0, 0); err != nil {
-                    b.Fatal(err)
-                }
+func BenchmarkLexWithLua(b *testing.B) {
+    for _, bm := range lexFixtures {
+        if !strings.HasPrefix(bm.name, "lua") {
+            continue
+        }

-                for tok := range Lex(file) {
-                    t = tok
-                }
-            }
+        b.Run(bm.name, func(b *testing.B) {
+            path := getTestConfigPath(bm.name, "nginx.conf")
+            benchmarkLex(b, path, LexOptions{})
         })
     }
-
-    lexToken = t
 }

 //nolint:gochecknoglobals
diff --git a/lua.go b/lua.go
index dd4a45ba..3c5cb8ae 100644
--- a/lua.go
+++ b/lua.go
@@ -175,6 +175,17 @@ func (l *Lua) Lex(s *SubScanner, matchedToken string) <-chan NgxToken {
     return tokenCh
 }

+type LuaScanner struct {
+    scanner      *SubScanner
+    matchedToken string
+    lua          *Lua
+    ch           <-chan NgxToken
+}
+
+func (l *Lua) Init(s *SubScanner, matchedToken string) Tokenizer {
+    return &LexerScanner{lexer: l, matchedToken: matchedToken}
+}
+
 // RegisterBuilder registers a builder for generating Lua NGINX configuration.
 func (l *Lua) RegisterBuilder() RegisterBuilder { //nolint:ireturn
     return BuildWithBuilder(l, l.directiveNames()...)
diff --git a/scanner.go b/scanner.go
index 683d1cb3..7afff69d 100644
--- a/scanner.go
+++ b/scanner.go
@@ -8,6 +8,18 @@ import (
     "strings"
 )

+type scannerOptions struct {
+    extensions map[string]ScannerExt
+}
+
+type ScannerOption interface {
+    applyScannerOptions(options *scannerOptions)
+}
+
+type scannerOptionFunc func(*scannerOptions)
+
+func (opt scannerOptionFunc) applyScannerOptions(opts *scannerOptions) { opt(opts) }
+
 // Token is a lexical token of the NGINX configuration syntax.
 type Token struct {
     // Text is the string corresponding to the token. It could be a directive or symbol. The value is the actual token
@@ -20,6 +32,8 @@ type Token struct {
     IsQuoted bool
 }

+func (t Token) String() string { return fmt.Sprintf("{%d, %s, %t}", t.Line, t.Text, t.IsQuoted) }
+
 type scannerError struct {
     msg  string
     line int
@@ -52,23 +66,33 @@ func LineNumber(err error) (int, bool) {
 //
 // Use NewScanner to construct a Scanner.
 type Scanner struct {
-    scanner            *bufio.Scanner
-    lineno             int
-    tokenStartLine     int
-    tokenDepth         int
-    repeateSpecialChar bool // only '}' can be repeated
-    prev               string
-    err                error
+    scanner              *bufio.Scanner
+    lineno               int
+    tokenStartLine       int
+    tokenDepth           int
+    repeateSpecialChar   bool // only '}' can be repeated
+    nextTokenIsDirective bool
+    prev                 string
+    err                  error
+    options              *scannerOptions
+    ext                  Tokenizer
 }

 // NewScanner returns a new Scanner to read from r.
-func NewScanner(r io.Reader) *Scanner {
+func NewScanner(r io.Reader, options ...ScannerOption) *Scanner {
+    opts := &scannerOptions{}
+    for _, opt := range options {
+        opt.applyScannerOptions(opts)
+    }
+
     s := &Scanner{
-        scanner:            bufio.NewScanner(r),
-        lineno:             1,
-        tokenStartLine:     1,
-        tokenDepth:         0,
-        repeateSpecialChar: false,
+        scanner:              bufio.NewScanner(r),
+        lineno:               1,
+        tokenStartLine:       1,
+        tokenDepth:           0,
+        repeateSpecialChar:   false,
+        nextTokenIsDirective: true,
+        options:              opts,
     }

     s.scanner.Split(bufio.ScanRunes)
@@ -93,6 +117,20 @@ func (s *Scanner) setErr(err error) {
 // Scan reads the next token from source and returns it.. It returns io.EOF at the end of the source. Scanner errors are
 // returned when encountered.
 func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
+    if s.ext != nil {
+        t, err := s.ext.Next()
+        if err != nil {
+            if !errors.Is(err, TokenizerDone) {
+                s.setErr(err)
+                return Token{}, s.err
+            }
+
+            s.ext = nil
+        } else {
+            return t, nil
+        }
+    }
+
     var tok strings.Builder
     lexState := skipSpace

@@ -129,6 +167,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
             r = nextRune
             if isEOL(r) {
                 s.lineno++
+                s.nextTokenIsDirective = true
             }
         default:
             readNext = true
@@ -149,6 +188,16 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
         r = "\\" + r
     }

+    if tok.Len() > 0 {
+        t := tok.String()
+        if s.nextTokenIsDirective {
+            if ext, ok := s.options.extensions[t]; ok {
+                s.ext = ext.Tokenizer(&SubScanner{parent: s, tokenLine: s.tokenStartLine}, t)
+                return Token{Text: t, Line: s.tokenStartLine}, nil
+            }
+        }
+    }
+
     switch lexState {
     case skipSpace:
         if !isSpace(r) {
@@ -166,11 +215,13 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
                 tok.WriteString(r)
                 lexState = inComment
                 s.tokenStartLine = s.lineno
+                s.nextTokenIsDirective = false
                 continue
             }
         }

         if isSpace(r) {
+            s.nextTokenIsDirective = false
             return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
         }

@@ -179,6 +230,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
             tok.WriteString(r)
             lexState = inVar
             s.repeateSpecialChar = false
+            s.nextTokenIsDirective = false
             continue
         }

@@ -223,6 +275,7 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
         }

         tok.WriteString(r)
+        s.nextTokenIsDirective = true
         return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
     }

@@ -250,3 +303,51 @@ func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo
         }
     }
 }
+
+// ScannerExt is the interface that describes an extension for the [Scanner]. Scanner extensions enable scanning of
+// configurations that contain syntaxes that do not follow the usual grammar.
+type ScannerExt interface {
+    Tokenizer(s *SubScanner, matchedToken string) Tokenizer
+}
+
+// TokenizerDone is returned by [Tokenizer] when tokenization is complete.
+var TokenizerDone = errors.New("done")
+
+// Tokenizer is the interface that wraps the Next method.
+//
+// Next returns the next token scanned from the NGINX configuration or an error if the configuration cannot be
+// tokenized. Return the special error [TokenizerDone] when finished tokenizing.
+type Tokenizer interface {
+    Next() (Token, error)
+}
+
+// LexerScanner is a compatibility layer between Lexers and Scanner.
+type LexerScanner struct {
+    lexer        Lexer
+    scanner      *SubScanner
+    matchedToken string
+    ch           <-chan NgxToken
+}
+
+func (s *LexerScanner) Tokenizer(scanner *SubScanner, matchedToken string) Tokenizer {
+    s.scanner = scanner
+    s.matchedToken = matchedToken
+    return s
+}
+
+func (s *LexerScanner) Next() (Token, error) {
+    if s.ch == nil {
+        s.ch = s.lexer.Lex(s.scanner, s.matchedToken)
+    }
+
+    ngxTok, ok := <-s.ch
+    if !ok {
+        return Token{}, TokenizerDone
+    }
+
+    if ngxTok.Error != nil {
+        return Token{}, newScannerErrf(ngxTok.Line, ngxTok.Error.Error())
+    }
+
+    return Token{Text: ngxTok.Value, Line: ngxTok.Line, IsQuoted: ngxTok.IsQuoted}, nil
+}
diff --git a/scanner_test.go b/scanner_test.go
index 0774a50a..7f2f5b1e 100644
--- a/scanner_test.go
+++ b/scanner_test.go
@@ -15,6 +15,7 @@ func TestScanner(t *testing.T) {

     for _, f := range lexFixtures {
         f := f
+
         t.Run(f.name, func(t *testing.T) {
             t.Parallel()

@@ -25,7 +26,7 @@ func TestScanner(t *testing.T) {
             }
             defer file.Close()

-            s := NewScanner(file)
+            s := NewScanner(file, lua.RegisterLexer())

             i := 0
             for {
@@ -42,8 +43,8 @@ func TestScanner(t *testing.T) {
                 }

                 want := f.tokens[i]
-                require.Equal(t, want.value, got.Text)
-                require.Equal(t, want.line, got.Line)
+                require.Equal(t, want.value, got.Text, "got=%s", got)
+                require.Equal(t, want.line, got.Line, "got=%s", got)
                 i++
             }
         })
@@ -58,7 +59,7 @@ func TestScanner_unhappy(t *testing.T) {
         t.Run(name, func(t *testing.T) {
             t.Parallel()

-            s := NewScanner(strings.NewReader(c))
+            s := NewScanner(strings.NewReader(c), lua.RegisterLexer())
             for {
                 _, err := s.Scan()
                 if err == io.EOF {
@@ -83,38 +84,61 @@ func TestScanner_unhappy(t *testing.T) {
     }
 }

-var t Token //nolint: gochecknoglobals // trying to avoid return value being optimzed away
+func benchmarkScanner(b *testing.B, path string, options ...ScannerOption) {
+    var t Token

-func BenchmarkScan(b *testing.B) {
-    for _, bm := range lexFixtures {
-        b.Run(bm.name, func(b *testing.B) {
-            path := getTestConfigPath(bm.name, "nginx.conf")
-            file, err := os.Open(path)
+    file, err := os.Open(path)
+    if err != nil {
+        b.Fatal(err)
+    }
+    defer file.Close()
+
+    b.ResetTimer()
+
+    for i := 0; i < b.N; i++ {
+        if _, err := file.Seek(0, 0); err != nil {
+            b.Fatal(err)
+        }
+
+        s := NewScanner(file, options...)
+
+        for {
+            tok, err := s.Scan()
+            if err == io.EOF {
+                break
+            }
             if err != nil {
                 b.Fatal(err)
             }
-            defer file.Close()
+            t = tok
+        }
+    }

-            b.ResetTimer()
+    _ = t
+}

-            for i := 0; i < b.N; i++ {
-                if _, err := file.Seek(0, 0); err != nil {
-                    b.Fatal(err)
-                }
+func BenchmarkScanner(b *testing.B) {
+    for _, bm := range lexFixtures {
+        if strings.HasPrefix(bm.name, "lua") {
+            continue
+        }

-                s := NewScanner(file)
+        b.Run(bm.name, func(b *testing.B) {
+            path := getTestConfigPath(bm.name, "nginx.conf")
+            benchmarkScanner(b, path)
+        })
+    }
+}

-                for {
-                    tok, err := s.Scan()
-                    if err == io.EOF {
-                        break
-                    }
-                    if err != nil {
-                        b.Fatal(err)
-                    }
-                    t = tok
-                }
-            }
+func BenchmarkScannerWithLua(b *testing.B) {
+    for _, bm := range lexFixtures {
+        if !strings.HasPrefix(bm.name, "lua") {
+            continue
+        }
+
+        b.Run(bm.name, func(b *testing.B) {
+            path := getTestConfigPath(bm.name, "nginx.conf")
+            benchmarkScanner(b, path, lua.RegisterLexer())
         })
     }
 }
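
Usage sketch (not part of the patch): the snippet below shows how a caller would wire the new ScannerOption plumbing to the existing Lua lexer, mirroring the updated scanner_test.go above. The module import path, the &Lua{} construction, and the nginx.conf path are assumptions for illustration; what the patch itself establishes is that a RegisterLexer value now satisfies both the Lex options and the Scanner options, so lua.RegisterLexer() can be passed straight to NewScanner.

package main

import (
    "fmt"
    "io"
    "log"
    "os"

    crossplane "github.com/nginxinc/nginx-go-crossplane" // assumed import path
)

func main() {
    // Hypothetical config containing *_by_lua_block directives.
    file, err := os.Open("nginx.conf")
    if err != nil {
        log.Fatal(err)
    }
    defer file.Close()

    lua := &crossplane.Lua{} // assumes a zero-value Lua is usable, as in the package tests

    // Registering the Lua lexer as a ScannerOption routes matched directives to the
    // Lua sub-grammar; everything else follows the normal NGINX grammar.
    s := crossplane.NewScanner(file, lua.RegisterLexer())

    for {
        tok, err := s.Scan()
        if err == io.EOF {
            break
        }
        if err != nil {
            log.Fatal(err)
        }
        fmt.Printf("line %d: %q (quoted=%t)\n", tok.Line, tok.Text, tok.IsQuoted)
    }
}

Tokens produced by the Lua sub-grammar and by the regular NGINX grammar come out of the same Scan loop; when the extension reports TokenizerDone, Scan resumes the normal grammar transparently.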
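
Because LexerScanner adapts any channel-based Lexer to the new Tokenizer interface, a third-party grammar does not need to implement ScannerExt directly. The sketch below is hypothetical: the upstreamList type and its "upstream_list" trigger token are invented for illustration, it assumes the Lexer interface matches the Lua.Lex signature shown above (Lex(*SubScanner, string) <-chan NgxToken), and it is written as if it lived in the same package so NgxToken, SubScanner, and the strings import are already in scope.

// upstreamList is a hypothetical Lexer: it collects everything up to the next semicolon
// after its trigger token and emits it as a single NgxToken. The SubScanner yields one
// rune per Scan because the parent scanner splits on bufio.ScanRunes.
type upstreamList struct{}

func (upstreamList) Lex(s *SubScanner, matchedToken string) <-chan NgxToken {
    tokenCh := make(chan NgxToken)

    go func() {
        defer close(tokenCh)

        var text strings.Builder
        for s.Scan() {
            r := s.Text()
            if r == ";" {
                break
            }
            text.WriteString(r)
        }

        // Emit the collected argument, then the terminator, so the caller still
        // sees the end of the directive.
        tokenCh <- NgxToken{Value: strings.TrimSpace(text.String()), Line: s.Line()}
        tokenCh <- NgxToken{Value: ";", Line: s.Line()}
    }()

    return tokenCh
}

Registering it is then the same call used for Lua: pass LexWithLexer(upstreamList{}, "upstream_list") to NewScanner (the returned RegisterLexer is a valid ScannerOption) or supply it wherever LexOptions are built.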