From 5575a252eb4708750c479e788449be7a12dbb227 Mon Sep 17 00:00:00 2001 From: Alec Thomas Date: Sun, 4 Sep 2022 20:41:48 +1000 Subject: [PATCH] Allow lexers to be code-generated from JSON. - Add a CLI tool that can ingest the JSON and dump out the generated code. - Lexers can now be JSON marshalled. - Add a goreleaser step for the binary. As discussed in #213 --- .github/workflows/release.yml | 17 + .goreleaser.yml | 37 ++ COPYING | 2 +- bin/.goreleaser-1.11.2.pkg | 1 + bin/.jq-1.6.pkg | 1 + bin/goreleaser | 1 + bin/hermit.hcl | 3 + bin/jq | 1 + .../participle/gen_lexer_cmd.go | 108 +++- cmd/participle/go.mod | 10 + cmd/participle/go.sum | 17 + cmd/participle/main.go | 22 + go.mod | 7 +- go.sum | 4 + lexer/internal/basiclexer.go | 561 ++++++++++++++++++ lexer/internal/basiclexer.json | 32 + lexer/internal/codegen_gen_test.go | 336 ----------- lexer/internal/codegen_test.go | 106 ---- lexer/stateful.go | 113 +++- lexer/stateful_codegen_test.go | 437 -------------- lexer/stateful_test.go | 39 +- scripts/participle | 4 + scripts/regen-lexer | 3 + 23 files changed, 914 insertions(+), 948 deletions(-) create mode 100644 .github/workflows/release.yml create mode 100644 .goreleaser.yml create mode 120000 bin/.goreleaser-1.11.2.pkg create mode 120000 bin/.jq-1.6.pkg create mode 120000 bin/goreleaser create mode 120000 bin/jq rename lexer/codegen.go => cmd/participle/gen_lexer_cmd.go (80%) create mode 100644 cmd/participle/go.mod create mode 100644 cmd/participle/go.sum create mode 100644 cmd/participle/main.go create mode 100644 lexer/internal/basiclexer.go create mode 100644 lexer/internal/basiclexer.json delete mode 100644 lexer/internal/codegen_gen_test.go delete mode 100644 lexer/internal/codegen_test.go delete mode 100644 lexer/stateful_codegen_test.go create mode 100755 scripts/participle create mode 100755 scripts/regen-lexer diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000..3d06f8fa --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,17 @@ +name: Release +on: + push: + tags: + - 'v*' +jobs: + release: + name: Release + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + - run: ./bin/hermit env --raw >> $GITHUB_ENV + - run: goreleaser release + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.goreleaser.yml b/.goreleaser.yml new file mode 100644 index 00000000..4018712d --- /dev/null +++ b/.goreleaser.yml @@ -0,0 +1,37 @@ +project_name: participle +release: + github: + owner: alecthomas + name: participle +brews: + - + install: bin.install "participle" +env: + - CGO_ENABLED=0 +builds: +- goos: + - linux + - darwin + - windows + goarch: + - arm64 + - amd64 + - "386" + goarm: + - "6" + dir: ./cmd/participle + main: . + ldflags: -s -w -X main.version={{.Version}} + binary: participle +archives: + - + format: tar.gz + name_template: '{{ .Binary }}-{{ .Version }}-{{ .Os }}-{{ .Arch }}{{ if .Arm }}v{{ + .Arm }}{{ end }}' + files: + - COPYING + - README* +snapshot: + name_template: SNAPSHOT-{{ .Commit }} +checksum: + name_template: '{{ .ProjectName }}-{{ .Version }}-checksums.txt' diff --git a/COPYING b/COPYING index 92dc39f7..44fed8b5 100644 --- a/COPYING +++ b/COPYING @@ -1,4 +1,4 @@ -Copyright (C) 2017 Alec Thomas +Copyright (C) 2017-2022 Alec Thomas Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/bin/.goreleaser-1.11.2.pkg b/bin/.goreleaser-1.11.2.pkg new file mode 120000 index 00000000..383f4511 --- /dev/null +++ b/bin/.goreleaser-1.11.2.pkg @@ -0,0 +1 @@ +hermit \ No newline at end of file diff --git a/bin/.jq-1.6.pkg b/bin/.jq-1.6.pkg new file mode 120000 index 00000000..383f4511 --- /dev/null +++ b/bin/.jq-1.6.pkg @@ -0,0 +1 @@ +hermit \ No newline at end of file diff --git a/bin/goreleaser b/bin/goreleaser new file mode 120000 index 00000000..10561a7b --- /dev/null +++ b/bin/goreleaser @@ -0,0 +1 @@ +.goreleaser-1.11.2.pkg \ No newline at end of file diff --git a/bin/hermit.hcl b/bin/hermit.hcl index e69de29b..60844159 100644 --- a/bin/hermit.hcl +++ b/bin/hermit.hcl @@ -0,0 +1,3 @@ +env = { + "PATH": "${HERMIT_ENV}/scripts:${PATH}", +} diff --git a/bin/jq b/bin/jq new file mode 120000 index 00000000..d7e067b8 --- /dev/null +++ b/bin/jq @@ -0,0 +1 @@ +.jq-1.6.pkg \ No newline at end of file diff --git a/lexer/codegen.go b/cmd/participle/gen_lexer_cmd.go similarity index 80% rename from lexer/codegen.go rename to cmd/participle/gen_lexer_cmd.go index 1acadc48..c8951cba 100644 --- a/lexer/codegen.go +++ b/cmd/participle/gen_lexer_cmd.go @@ -1,33 +1,81 @@ -package lexer +package main import ( + "encoding/json" "fmt" "io" + "os" "regexp" "regexp/syntax" "sort" "text/template" "unicode/utf8" + + "github.com/alecthomas/participle/v2/lexer" ) +type genLexerCmd struct { + Name string `help:"Name of the lexer."` + Output string `short:"o" help:"Output file."` + Package string `arg:"" required:"" help:"Go package for generated code."` + Lexer string `arg:"" required:"" default:"-" type:"existingfile" help:"JSON representation of a Participle lexer."` +} + +func (c *genLexerCmd) Help() string { + return ` +Generates Go code implementing the given JSON representation of a lexer. The +generated code should in general by around 10x faster and produce zero garbage +per token. +` +} + +func (c *genLexerCmd) Run() error { + var r *os.File + if c.Lexer == "-" { + r = os.Stdin + } else { + var err error + r, err = os.Open(c.Lexer) + if err != nil { + return err + } + defer r.Close() + } + + rules := lexer.Rules{} + err := json.NewDecoder(r).Decode(&rules) + if err != nil { + return err + } + def, err := lexer.New(rules) + if err != nil { + return err + } + err = generateLexer(os.Stdout, c.Package, def, c.Name) + if err != nil { + return err + } + return nil +} + var codegenBackrefRe = regexp.MustCompile(`(\\+)(\d)`) var codegenTemplate *template.Template = template.Must(template.New("lexgen").Funcs(template.FuncMap{ - "IsPush": func(r Rule) string { - if p, ok := r.Action.(ActionPush); ok { + "IsPush": func(r lexer.Rule) string { + if p, ok := r.Action.(lexer.ActionPush); ok { return p.State } return "" }, - "IsPop": func(r Rule) bool { - _, ok := r.Action.(ActionPop) + "IsPop": func(r lexer.Rule) bool { + _, ok := r.Action.(lexer.ActionPop) return ok }, - "IsReturn": func(r Rule) bool { - return r == ReturnRule + "IsReturn": func(r lexer.Rule) bool { + return r == lexer.ReturnRule }, "OrderRules": orderRules, - "HaveBackrefs": func(def *StatefulDefinition, state string) bool { + "HaveBackrefs": func(def *lexer.StatefulDefinition, state string) bool { for _, rule := range def.Rules()[state] { if codegenBackrefRe.MatchString(rule.Pattern) { return true @@ -51,11 +99,11 @@ import ( var _ syntax.Op -var Lexer lexer.Definition = definitionImpl{} +var {{.Name}}Lexer lexer.Definition = lexer{{.Name}}DefinitionImpl{} -type definitionImpl struct {} +type lexer{{.Name}}DefinitionImpl struct {} -func (definitionImpl) Symbols() map[string]lexer.TokenType { +func (lexer{{.Name}}DefinitionImpl) Symbols() map[string]lexer.TokenType { return map[string]lexer.TokenType{ {{- range $sym, $rn := .Def.Symbols}} "{{$sym}}": {{$rn}}, @@ -63,23 +111,23 @@ func (definitionImpl) Symbols() map[string]lexer.TokenType { } } -func (definitionImpl) LexString(filename string, s string) (lexer.Lexer, error) { - return &lexerImpl{ +func (lexer{{.Name}}DefinitionImpl) LexString(filename string, s string) (lexer.Lexer, error) { + return &lexer{{.Name}}Impl{ s: s, pos: lexer.Position{ Filename: filename, Line: 1, Column: 1, }, - states: []lexerState{lexerState{name: "Root"}}, + states: []lexer{{.Name}}State{lexer{{.Name}}State{name: "Root"}}, }, nil } -func (d definitionImpl) LexBytes(filename string, b []byte) (lexer.Lexer, error) { +func (d lexer{{.Name}}DefinitionImpl) LexBytes(filename string, b []byte) (lexer.Lexer, error) { return d.LexString(filename, string(b)) } -func (d definitionImpl) Lex(filename string, r io.Reader) (lexer.Lexer, error) { +func (d lexer{{.Name}}DefinitionImpl) Lex(filename string, r io.Reader) (lexer.Lexer, error) { s := &strings.Builder{} _, err := io.Copy(s, r) if err != nil { @@ -88,19 +136,19 @@ func (d definitionImpl) Lex(filename string, r io.Reader) (lexer.Lexer, error) { return d.LexString(filename, s.String()) } -type lexerState struct { +type lexer{{.Name}}State struct { name string groups []string } -type lexerImpl struct { +type lexer{{.Name}}Impl struct { s string p int pos lexer.Position - states []lexerState + states []lexer{{.Name}}State } -func (l *lexerImpl) Next() (lexer.Token, error) { +func (l *lexer{{.Name}}Impl) Next() (lexer.Token, error) { if l.p == len(l.s) { return lexer.EOFToken(l.pos), nil } @@ -122,7 +170,7 @@ func (l *lexerImpl) Next() (lexer.Token, error) { if true { {{- end}} {{- if .|IsPush}} - l.states = append(l.states, lexerState{name: "{{.|IsPush}}"{{if HaveBackrefs $.Def $state.Name}}, groups: l.sgroups(groups){{end}}}) + l.states = append(l.states, lexer{{.Name}}State{name: "{{.|IsPush}}"{{if HaveBackrefs $.Def $state.Name}}, groups: l.sgroups(groups){{end}}}) {{- else if (or (.|IsPop) (.|IsReturn))}} l.states = l.states[:len(l.states)-1] {{- if .|IsReturn}} @@ -154,7 +202,7 @@ func (l *lexerImpl) Next() (lexer.Token, error) { }, nil } -func (l *lexerImpl) sgroups(match []int) []string { +func (l *lexer{{.Name}}Impl) sgroups(match []int) []string { sgroups := make([]string, len(match)/2) for i := 0; i < len(match)-1; i += 2 { sgroups[i/2] = l.s[l.p+match[i]:l.p+match[i+1]] @@ -164,18 +212,14 @@ func (l *lexerImpl) sgroups(match []int) []string { `)) -// ExperimentalGenerateLexer generates Go code implementing the given stateful lexer. -// -// The generated code should in general by around 10x faster and produce zero garbage per token. -// -// NOTE: This is an experimental interface and subject to change. -func ExperimentalGenerateLexer(w io.Writer, pkg string, def *StatefulDefinition) error { +func generateLexer(w io.Writer, pkg string, def *lexer.StatefulDefinition, name string) error { type ctx struct { Package string - Def *StatefulDefinition + Name string + Def *lexer.StatefulDefinition } rules := def.Rules() - err := codegenTemplate.Execute(w, ctx{pkg, def}) + err := codegenTemplate.Execute(w, ctx{pkg, name, def}) if err != nil { return err } @@ -201,10 +245,10 @@ func ExperimentalGenerateLexer(w io.Writer, pkg string, def *StatefulDefinition) type orderedRule struct { Name string - Rules []Rule + Rules []lexer.Rule } -func orderRules(rules Rules) []orderedRule { +func orderRules(rules lexer.Rules) []orderedRule { orderedRules := []orderedRule{} for name, rules := range rules { orderedRules = append(orderedRules, orderedRule{ diff --git a/cmd/participle/go.mod b/cmd/participle/go.mod new file mode 100644 index 00000000..be2c477e --- /dev/null +++ b/cmd/participle/go.mod @@ -0,0 +1,10 @@ +module github.com/alecthomas/participle/v2/cmd/participle + +go 1.18 + +require ( + github.com/alecthomas/kong v0.6.1 + github.com/alecthomas/participle/v2 v2.0.0-00010101000000-000000000000 +) + +replace github.com/alecthomas/participle/v2 => ../.. diff --git a/cmd/participle/go.sum b/cmd/participle/go.sum new file mode 100644 index 00000000..73c82483 --- /dev/null +++ b/cmd/participle/go.sum @@ -0,0 +1,17 @@ +github.com/alecthomas/assert/v2 v2.1.0 h1:tbredtNcQnoSd3QBhQWI7QZ3XHOVkw1Moklp2ojoH/0= +github.com/alecthomas/kong v0.6.1 h1:1kNhcFepkR+HmasQpbiKDLylIL8yh5B5y1zPp5bJimA= +github.com/alecthomas/kong v0.6.1/go.mod h1:JfHWDzLmbh/puW6I3V7uWenoh56YNVONW+w8eKeUr9I= +github.com/alecthomas/repr v0.0.0-20210801044451-80ca428c5142/go.mod h1:2kn6fqh/zIyPLmm3ugklbEi5hg5wS435eygvNfaDQL8= +github.com/alecthomas/repr v0.1.0 h1:ENn2e1+J3k09gyj2shc0dHr/yjaWSHRlrJ4DPMevDqE= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.7.2 h1:4jaiDzPyXQvSd7D0EjG45355tLlV3VOECpq10pLC+8s= +github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/cmd/participle/main.go b/cmd/participle/main.go new file mode 100644 index 00000000..3a215317 --- /dev/null +++ b/cmd/participle/main.go @@ -0,0 +1,22 @@ +package main + +import "github.com/alecthomas/kong" + +var ( + version string = "dev" + cli struct { + Version kong.VersionFlag + Gen struct { + Lexer genLexerCmd `cmd:""` + } `cmd:"" help:"Generate code to accelerate Participle."` + } +) + +func main() { + kctx := kong.Parse(&cli, + kong.Description(`A command-line tool for Participle.`), + kong.Vars{"version": version}, + ) + err := kctx.Run() + kctx.FatalIfErrorf(err) +} diff --git a/go.mod b/go.mod index 8a1ec6d6..22a51ed3 100644 --- a/go.mod +++ b/go.mod @@ -3,8 +3,11 @@ module github.com/alecthomas/participle/v2 go 1.18 require ( - github.com/alecthomas/assert/v2 v2.0.3 + github.com/alecthomas/assert/v2 v2.1.0 github.com/alecthomas/repr v0.1.0 ) -require github.com/hexops/gotextdiff v1.0.3 // indirect +require ( + github.com/hexops/gotextdiff v1.0.3 // indirect + github.com/mitchellh/mapstructure v1.5.0 +) diff --git a/go.sum b/go.sum index f5017061..e2a72a07 100644 --- a/go.sum +++ b/go.sum @@ -1,6 +1,10 @@ github.com/alecthomas/assert/v2 v2.0.3 h1:WKqJODfOiQG0nEJKFKzDIG3E29CN2/4zR9XGJzKIkbg= github.com/alecthomas/assert/v2 v2.0.3/go.mod h1:b/+1DI2Q6NckYi+3mXyH3wFb8qG37K/DuK80n7WefXA= +github.com/alecthomas/assert/v2 v2.1.0 h1:tbredtNcQnoSd3QBhQWI7QZ3XHOVkw1Moklp2ojoH/0= +github.com/alecthomas/assert/v2 v2.1.0/go.mod h1:b/+1DI2Q6NckYi+3mXyH3wFb8qG37K/DuK80n7WefXA= github.com/alecthomas/repr v0.1.0 h1:ENn2e1+J3k09gyj2shc0dHr/yjaWSHRlrJ4DPMevDqE= github.com/alecthomas/repr v0.1.0/go.mod h1:2kn6fqh/zIyPLmm3ugklbEi5hg5wS435eygvNfaDQL8= github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg= +github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= +github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= diff --git a/lexer/internal/basiclexer.go b/lexer/internal/basiclexer.go new file mode 100644 index 00000000..d4cfa46e --- /dev/null +++ b/lexer/internal/basiclexer.go @@ -0,0 +1,561 @@ +// Code generated by Participle. DO NOT EDIT. +package internal + +import ( + "io" + "regexp/syntax" + "strings" + "unicode/utf8" + + "github.com/alecthomas/participle/v2" + "github.com/alecthomas/participle/v2/lexer" +) + +var _ syntax.Op + +var GeneratedBasicLexer lexer.Definition = lexerGeneratedBasicDefinitionImpl{} + +type lexerGeneratedBasicDefinitionImpl struct{} + +func (lexerGeneratedBasicDefinitionImpl) Symbols() map[string]lexer.TokenType { + return map[string]lexer.TokenType{ + "Comment": -7, + "EOF": -1, + "EOL": -6, + "Ident": -4, + "Number": -3, + "Punct": -5, + "String": -2, + "Whitespace": -8, + } +} + +func (lexerGeneratedBasicDefinitionImpl) LexString(filename string, s string) (lexer.Lexer, error) { + return &lexerGeneratedBasicImpl{ + s: s, + pos: lexer.Position{ + Filename: filename, + Line: 1, + Column: 1, + }, + states: []lexerGeneratedBasicState{lexerGeneratedBasicState{name: "Root"}}, + }, nil +} + +func (d lexerGeneratedBasicDefinitionImpl) LexBytes(filename string, b []byte) (lexer.Lexer, error) { + return d.LexString(filename, string(b)) +} + +func (d lexerGeneratedBasicDefinitionImpl) Lex(filename string, r io.Reader) (lexer.Lexer, error) { + s := &strings.Builder{} + _, err := io.Copy(s, r) + if err != nil { + return nil, err + } + return d.LexString(filename, s.String()) +} + +type lexerGeneratedBasicState struct { + name string + groups []string +} + +type lexerGeneratedBasicImpl struct { + s string + p int + pos lexer.Position + states []lexerGeneratedBasicState +} + +func (l *lexerGeneratedBasicImpl) Next() (lexer.Token, error) { + if l.p == len(l.s) { + return lexer.EOFToken(l.pos), nil + } + var ( + state = l.states[len(l.states)-1] + groups []int + sym lexer.TokenType + ) + switch state.name { + case "Root": + if match := matchString(l.s, l.p); match[1] != 0 { + sym = -2 + groups = match[:] + } else if match := matchNumber(l.s, l.p); match[1] != 0 { + sym = -3 + groups = match[:] + } else if match := matchIdent(l.s, l.p); match[1] != 0 { + sym = -4 + groups = match[:] + } else if match := matchPunct(l.s, l.p); match[1] != 0 { + sym = -5 + groups = match[:] + } else if match := matchEOL(l.s, l.p); match[1] != 0 { + sym = -6 + groups = match[:] + } else if match := matchComment(l.s, l.p); match[1] != 0 { + sym = -7 + groups = match[:] + } else if match := matchWhitespace(l.s, l.p); match[1] != 0 { + sym = -8 + groups = match[:] + } + } + if groups == nil { + sample := []rune(l.s[l.p:]) + if len(sample) > 16 { + sample = append(sample[:16], []rune("...")...) + } + return lexer.Token{}, participle.Errorf(l.pos, "invalid input text %q", sample) + } + pos := l.pos + span := l.s[groups[0]:groups[1]] + l.p = groups[1] + l.pos.Advance(span) + return lexer.Token{ + Type: sym, + Value: span, + Pos: pos, + }, nil +} + +func (l *lexerGeneratedBasicImpl) sgroups(match []int) []string { + sgroups := make([]string, len(match)/2) + for i := 0; i < len(match)-1; i += 2 { + sgroups[i/2] = l.s[l.p+match[i] : l.p+match[i+1]] + } + return sgroups +} + +// "(\\"|[^"])*" +func matchString(s string, p int) (groups [4]int) { + // " (Literal) + l0 := func(s string, p int) int { + if p < len(s) && s[p] == '"' { + return p + 1 + } + return -1 + } + // \\" (Literal) + l1 := func(s string, p int) int { + if p+2 < len(s) && s[p:p+2] == "\\\"" { + return p + 2 + } + return -1 + } + // [^"] (CharClass) + l2 := func(s string, p int) int { + if len(s) <= p { + return -1 + } + var ( + rn rune + n int + ) + if s[p] < utf8.RuneSelf { + rn, n = rune(s[p]), 1 + } else { + rn, n = utf8.DecodeRuneInString(s[p:]) + } + switch { + case rn >= '\x00' && rn <= '!': + return p + 1 + case rn >= '#' && rn <= '\U0010ffff': + return p + n + } + return -1 + } + // \\"|[^"] (Alternate) + l3 := func(s string, p int) int { + if np := l1(s, p); np != -1 { + return np + } + if np := l2(s, p); np != -1 { + return np + } + return -1 + } + // (\\"|[^"]) (Capture) + l4 := func(s string, p int) int { + np := l3(s, p) + if np != -1 { + groups[2] = p + groups[3] = np + } + return np + } + // (\\"|[^"])* (Star) + l5 := func(s string, p int) int { + for len(s) > p { + if np := l4(s, p); np == -1 { + return p + } else { + p = np + } + } + return p + } + // "(\\"|[^"])*" (Concat) + l6 := func(s string, p int) int { + if p = l0(s, p); p == -1 { + return -1 + } + if p = l5(s, p); p == -1 { + return -1 + } + if p = l0(s, p); p == -1 { + return -1 + } + return p + } + np := l6(s, p) + if np == -1 { + return + } + groups[0] = p + groups[1] = np + return +} + +// [\+\-]?([0-9]*\.)?[0-9]+ +func matchNumber(s string, p int) (groups [4]int) { + // [\+\-] (CharClass) + l0 := func(s string, p int) int { + if len(s) <= p { + return -1 + } + rn := s[p] + switch { + case rn == '+': + return p + 1 + case rn == '-': + return p + 1 + } + return -1 + } + // [\+\-]? (Quest) + l1 := func(s string, p int) int { + if np := l0(s, p); np != -1 { + return np + } + return p + } + // [0-9] (CharClass) + l2 := func(s string, p int) int { + if len(s) <= p { + return -1 + } + rn := s[p] + switch { + case rn >= '0' && rn <= '9': + return p + 1 + } + return -1 + } + // [0-9]* (Star) + l3 := func(s string, p int) int { + for len(s) > p { + if np := l2(s, p); np == -1 { + return p + } else { + p = np + } + } + return p + } + // \. (Literal) + l4 := func(s string, p int) int { + if p < len(s) && s[p] == '.' { + return p + 1 + } + return -1 + } + // [0-9]*\. (Concat) + l5 := func(s string, p int) int { + if p = l3(s, p); p == -1 { + return -1 + } + if p = l4(s, p); p == -1 { + return -1 + } + return p + } + // ([0-9]*\.) (Capture) + l6 := func(s string, p int) int { + np := l5(s, p) + if np != -1 { + groups[2] = p + groups[3] = np + } + return np + } + // ([0-9]*\.)? (Quest) + l7 := func(s string, p int) int { + if np := l6(s, p); np != -1 { + return np + } + return p + } + // [0-9]+ (Plus) + l8 := func(s string, p int) int { + if p = l2(s, p); p == -1 { + return -1 + } + for len(s) > p { + if np := l2(s, p); np == -1 { + return p + } else { + p = np + } + } + return p + } + // [\+\-]?([0-9]*\.)?[0-9]+ (Concat) + l9 := func(s string, p int) int { + if p = l1(s, p); p == -1 { + return -1 + } + if p = l7(s, p); p == -1 { + return -1 + } + if p = l8(s, p); p == -1 { + return -1 + } + return p + } + np := l9(s, p) + if np == -1 { + return + } + groups[0] = p + groups[1] = np + return +} + +// [A-Z_a-z][0-9A-Z_a-z]* +func matchIdent(s string, p int) (groups [2]int) { + // [A-Z_a-z] (CharClass) + l0 := func(s string, p int) int { + if len(s) <= p { + return -1 + } + rn := s[p] + switch { + case rn >= 'A' && rn <= 'Z': + return p + 1 + case rn == '_': + return p + 1 + case rn >= 'a' && rn <= 'z': + return p + 1 + } + return -1 + } + // [0-9A-Z_a-z] (CharClass) + l1 := func(s string, p int) int { + if len(s) <= p { + return -1 + } + rn := s[p] + switch { + case rn >= '0' && rn <= '9': + return p + 1 + case rn >= 'A' && rn <= 'Z': + return p + 1 + case rn == '_': + return p + 1 + case rn >= 'a' && rn <= 'z': + return p + 1 + } + return -1 + } + // [0-9A-Z_a-z]* (Star) + l2 := func(s string, p int) int { + for len(s) > p { + if np := l1(s, p); np == -1 { + return p + } else { + p = np + } + } + return p + } + // [A-Z_a-z][0-9A-Z_a-z]* (Concat) + l3 := func(s string, p int) int { + if p = l0(s, p); p == -1 { + return -1 + } + if p = l2(s, p); p == -1 { + return -1 + } + return p + } + np := l3(s, p) + if np == -1 { + return + } + groups[0] = p + groups[1] = np + return +} + +// [!-/:-@\[-`\{-~]+ +func matchPunct(s string, p int) (groups [2]int) { + // [!-/:-@\[-`\{-~] (CharClass) + l0 := func(s string, p int) int { + if len(s) <= p { + return -1 + } + rn := s[p] + switch { + case rn >= '!' && rn <= '/': + return p + 1 + case rn >= ':' && rn <= '@': + return p + 1 + case rn >= '[' && rn <= '`': + return p + 1 + case rn >= '{' && rn <= '~': + return p + 1 + } + return -1 + } + // [!-/:-@\[-`\{-~]+ (Plus) + l1 := func(s string, p int) int { + if p = l0(s, p); p == -1 { + return -1 + } + for len(s) > p { + if np := l0(s, p); np == -1 { + return p + } else { + p = np + } + } + return p + } + np := l1(s, p) + if np == -1 { + return + } + groups[0] = p + groups[1] = np + return +} + +// \n +func matchEOL(s string, p int) (groups [2]int) { + if p < len(s) && s[p] == '\n' { + groups[0] = p + groups[1] = p + 1 + } + return +} + +// (?i:REM)[^\n]*(?i:\n) +func matchComment(s string, p int) (groups [2]int) { + // (?i:REM) (Literal) + l0 := func(s string, p int) int { + if p+3 < len(s) && s[p:p+3] == "REM" { + return p + 3 + } + return -1 + } + // [^\n] (CharClass) + l1 := func(s string, p int) int { + if len(s) <= p { + return -1 + } + var ( + rn rune + n int + ) + if s[p] < utf8.RuneSelf { + rn, n = rune(s[p]), 1 + } else { + rn, n = utf8.DecodeRuneInString(s[p:]) + } + switch { + case rn >= '\x00' && rn <= '\t': + return p + 1 + case rn >= '\v' && rn <= '\U0010ffff': + return p + n + } + return -1 + } + // [^\n]* (Star) + l2 := func(s string, p int) int { + for len(s) > p { + if np := l1(s, p); np == -1 { + return p + } else { + p = np + } + } + return p + } + // (?i:\n) (Literal) + l3 := func(s string, p int) int { + if p < len(s) && s[p] == '\n' { + return p + 1 + } + return -1 + } + // (?i:REM)[^\n]*(?i:\n) (Concat) + l4 := func(s string, p int) int { + if p = l0(s, p); p == -1 { + return -1 + } + if p = l2(s, p); p == -1 { + return -1 + } + if p = l3(s, p); p == -1 { + return -1 + } + return p + } + np := l4(s, p) + if np == -1 { + return + } + groups[0] = p + groups[1] = np + return +} + +// [\t ]+ +func matchWhitespace(s string, p int) (groups [2]int) { + // [\t ] (CharClass) + l0 := func(s string, p int) int { + if len(s) <= p { + return -1 + } + rn := s[p] + switch { + case rn == '\t': + return p + 1 + case rn == ' ': + return p + 1 + } + return -1 + } + // [\t ]+ (Plus) + l1 := func(s string, p int) int { + if p = l0(s, p); p == -1 { + return -1 + } + for len(s) > p { + if np := l0(s, p); np == -1 { + return p + } else { + p = np + } + } + return p + } + np := l1(s, p) + if np == -1 { + return + } + groups[0] = p + groups[1] = np + return +} diff --git a/lexer/internal/basiclexer.json b/lexer/internal/basiclexer.json new file mode 100644 index 00000000..91964342 --- /dev/null +++ b/lexer/internal/basiclexer.json @@ -0,0 +1,32 @@ +{ + "Root": [ + { + "name": "String", + "pattern": "\"(\\\\\"|[^\"])*\"" + }, + { + "name": "Number", + "pattern": "[-+]?(\\d*\\.)?\\d+" + }, + { + "name": "Ident", + "pattern": "[a-zA-Z_]\\w*" + }, + { + "name": "Punct", + "pattern": "[!-/:-@[-`{-~]+" + }, + { + "name": "EOL", + "pattern": "\\n" + }, + { + "name": "Comment", + "pattern": "(?i)rem[^\\n]*\\n" + }, + { + "name": "Whitespace", + "pattern": "[ \\t]+" + } + ] +} \ No newline at end of file diff --git a/lexer/internal/codegen_gen_test.go b/lexer/internal/codegen_gen_test.go deleted file mode 100644 index 3aa76a68..00000000 --- a/lexer/internal/codegen_gen_test.go +++ /dev/null @@ -1,336 +0,0 @@ - -// Code generated by Participle. DO NOT EDIT. -package internal_test - -import ( - "io" - "strings" - "unicode/utf8" - "regexp/syntax" - - "github.com/alecthomas/participle/v2" - "github.com/alecthomas/participle/v2/lexer" -) - -var _ syntax.Op - -var Lexer lexer.Definition = definitionImpl{} - -type definitionImpl struct {} - -func (definitionImpl) Symbols() map[string]lexer.TokenType { - return map[string]lexer.TokenType{ - "Char": -11, - "EOF": -1, - "Escaped": -8, - "Expr": -10, - "ExprEnd": -6, - "Ident": -5, - "Oper": -4, - "String": -7, - "StringEnd": -9, - "Whitespace": -3, - } -} - -func (definitionImpl) LexString(filename string, s string) (lexer.Lexer, error) { - return &lexerImpl{ - s: s, - pos: lexer.Position{ - Filename: filename, - Line: 1, - Column: 1, - }, - states: []lexerState{lexerState{name: "Root"}}, - }, nil -} - -func (d definitionImpl) LexBytes(filename string, b []byte) (lexer.Lexer, error) { - return d.LexString(filename, string(b)) -} - -func (d definitionImpl) Lex(filename string, r io.Reader) (lexer.Lexer, error) { - s := &strings.Builder{} - _, err := io.Copy(s, r) - if err != nil { - return nil, err - } - return d.LexString(filename, s.String()) -} - -type lexerState struct { - name string - groups []string -} - -type lexerImpl struct { - s string - p int - pos lexer.Position - states []lexerState -} - -func (l *lexerImpl) Next() (lexer.Token, error) { - if l.p == len(l.s) { - return lexer.EOFToken(l.pos), nil - } - var ( - state = l.states[len(l.states)-1] - groups []int - sym lexer.TokenType - ) - switch state.name { - case "Expr":if match := matchString(l.s, l.p); match[1] != 0 { - sym = -7 - groups = match[:] - l.states = append(l.states, lexerState{name: "String"}) - } else if match := matchWhitespace(l.s, l.p); match[1] != 0 { - sym = -3 - groups = match[:] - } else if match := matchOper(l.s, l.p); match[1] != 0 { - sym = -4 - groups = match[:] - } else if match := matchIdent(l.s, l.p); match[1] != 0 { - sym = -5 - groups = match[:] - } else if match := matchExprEnd(l.s, l.p); match[1] != 0 { - sym = -6 - groups = match[:] - l.states = l.states[:len(l.states)-1] - } - case "Root":if match := matchString(l.s, l.p); match[1] != 0 { - sym = -7 - groups = match[:] - l.states = append(l.states, lexerState{name: "String"}) - } - case "String":if match := matchEscaped(l.s, l.p); match[1] != 0 { - sym = -8 - groups = match[:] - } else if match := matchStringEnd(l.s, l.p); match[1] != 0 { - sym = -9 - groups = match[:] - l.states = l.states[:len(l.states)-1] - } else if match := matchExpr(l.s, l.p); match[1] != 0 { - sym = -10 - groups = match[:] - l.states = append(l.states, lexerState{name: "Expr"}) - } else if match := matchChar(l.s, l.p); match[1] != 0 { - sym = -11 - groups = match[:] - } - } - if groups == nil { - sample := []rune(l.s[l.p:]) - if len(sample) > 16 { - sample = append(sample[:16], []rune("...")...) - } - return lexer.Token{}, participle.Errorf(l.pos, "invalid input text %q", sample) - } - pos := l.pos - span := l.s[groups[0]:groups[1]] - l.p = groups[1] - l.pos.Advance(span) - return lexer.Token{ - Type: sym, - Value: span, - Pos: pos, - }, nil -} - -func (l *lexerImpl) sgroups(match []int) []string { - sgroups := make([]string, len(match)/2) - for i := 0; i < len(match)-1; i += 2 { - sgroups[i/2] = l.s[l.p+match[i]:l.p+match[i+1]] - } - return sgroups -} - - -// " -func matchString(s string, p int) (groups [2]int) { -if p < len(s) && s[p] == '"' { -groups[0] = p -groups[1] = p + 1 -} -return -} - -// [\t-\n\f-\r ]+ -func matchWhitespace(s string, p int) (groups [2]int) { -// [\t-\n\f-\r ] (CharClass) -l0 := func(s string, p int) int { -if len(s) <= p { return -1 } -rn := s[p] -switch { -case rn >= '\t' && rn <= '\n': return p+1 -case rn >= '\f' && rn <= '\r': return p+1 -case rn == ' ': return p+1 -} -return -1 -} -// [\t-\n\f-\r ]+ (Plus) -l1 := func(s string, p int) int { -if p = l0(s, p); p == -1 { return -1 } -for len(s) > p { -if np := l0(s, p); np == -1 { return p } else { p = np } -} -return p -} -np := l1(s, p) -if np == -1 { - return -} -groups[0] = p -groups[1] = np -return -} - -// [%\*-\+\-/] -func matchOper(s string, p int) (groups [2]int) { -// [%\*-\+\-/] (CharClass) -l0 := func(s string, p int) int { -if len(s) <= p { return -1 } -rn := s[p] -switch { -case rn == '%': return p+1 -case rn >= '*' && rn <= '+': return p+1 -case rn == '-': return p+1 -case rn == '/': return p+1 -} -return -1 -} -np := l0(s, p) -if np == -1 { - return -} -groups[0] = p -groups[1] = np -return -} - -// [0-9A-Z_a-z]+ -func matchIdent(s string, p int) (groups [2]int) { -// [0-9A-Z_a-z] (CharClass) -l0 := func(s string, p int) int { -if len(s) <= p { return -1 } -rn := s[p] -switch { -case rn >= '0' && rn <= '9': return p+1 -case rn >= 'A' && rn <= 'Z': return p+1 -case rn == '_': return p+1 -case rn >= 'a' && rn <= 'z': return p+1 -} -return -1 -} -// [0-9A-Z_a-z]+ (Plus) -l1 := func(s string, p int) int { -if p = l0(s, p); p == -1 { return -1 } -for len(s) > p { -if np := l0(s, p); np == -1 { return p } else { p = np } -} -return p -} -np := l1(s, p) -if np == -1 { - return -} -groups[0] = p -groups[1] = np -return -} - -// \} -func matchExprEnd(s string, p int) (groups [2]int) { -if p < len(s) && s[p] == '}' { -groups[0] = p -groups[1] = p + 1 -} -return -} - -// \\(?-s:.) -func matchEscaped(s string, p int) (groups [2]int) { -// \\ (Literal) -l0 := func(s string, p int) int { -if p < len(s) && s[p] == '\\' { return p+1 } -return -1 -} -// (?-s:.) (AnyCharNotNL) -l1 := func(s string, p int) int { -var (rn rune; n int) -if s[p] < utf8.RuneSelf { - rn, n = rune(s[p]), 1 -} else { - rn, n = utf8.DecodeRuneInString(s[p:]) -} -if len(s) <= p+n || rn == '\n' { return -1 } -return p+n -} -// \\(?-s:.) (Concat) -l2 := func(s string, p int) int { -if p = l0(s, p); p == -1 { return -1 } -if p = l1(s, p); p == -1 { return -1 } -return p -} -np := l2(s, p) -if np == -1 { - return -} -groups[0] = p -groups[1] = np -return -} - -// " -func matchStringEnd(s string, p int) (groups [2]int) { -if p < len(s) && s[p] == '"' { -groups[0] = p -groups[1] = p + 1 -} -return -} - -// \$\{ -func matchExpr(s string, p int) (groups [2]int) { -if p+2 < len(s) && s[p:p+2] == "${" { -groups[0] = p -groups[1] = p + 2 -} -return -} - -// [^"\$\\]+ -func matchChar(s string, p int) (groups [2]int) { -// [^"\$\\] (CharClass) -l0 := func(s string, p int) int { -if len(s) <= p { return -1 } -var (rn rune; n int) -if s[p] < utf8.RuneSelf { - rn, n = rune(s[p]), 1 -} else { - rn, n = utf8.DecodeRuneInString(s[p:]) -} -switch { -case rn >= '\x00' && rn <= '!': return p+1 -case rn == '#': return p+1 -case rn >= '%' && rn <= '[': return p+1 -case rn >= ']' && rn <= '\U0010ffff': return p+n -} -return -1 -} -// [^"\$\\]+ (Plus) -l1 := func(s string, p int) int { -if p = l0(s, p); p == -1 { return -1 } -for len(s) > p { -if np := l0(s, p); np == -1 { return p } else { p = np } -} -return p -} -np := l1(s, p) -if np == -1 { - return -} -groups[0] = p -groups[1] = np -return -} diff --git a/lexer/internal/codegen_test.go b/lexer/internal/codegen_test.go deleted file mode 100644 index 00d04e66..00000000 --- a/lexer/internal/codegen_test.go +++ /dev/null @@ -1,106 +0,0 @@ -package internal_test - -import ( - "os" - "os/exec" - "strings" - "testing" - "time" - - require "github.com/alecthomas/assert/v2" - "github.com/alecthomas/participle/v2/lexer" -) - -var ( - testInput = `hello ${name} world what's the song that you're singing, come on get ${emotion}` - benchmarkInput = `"` + strings.Repeat(testInput, 1000) + `"` - exprLexer = lexer.MustStateful(lexer.Rules{ - "Root": { - {`String`, `"`, lexer.Push("String")}, - }, - "String": { - {"Escaped", `\\.`, nil}, - {"StringEnd", `"`, lexer.Pop()}, - {"Expr", `\${`, lexer.Push("Expr")}, - {"Char", `[^$"\\]+`, nil}, - }, - "Expr": { - lexer.Include("Root"), - {`Whitespace`, `\s+`, nil}, - {`Oper`, `[-+/*%]`, nil}, - {"Ident", `\w+`, nil}, - {"ExprEnd", `}`, lexer.Pop()}, - }, - }) -) - -func TestGenerate(t *testing.T) { - w, err := os.Create("codegen_gen_test.go~") - require.NoError(t, err) - defer w.Close() - defer os.Rename("codegen_gen_test.go~", "codegen_gen_test.go") // nolint - err = lexer.ExperimentalGenerateLexer(w, "internal_test", exprLexer) - require.NoError(t, err) - err = exec.Command("gofmt", "-w", "codegen_gen_test.go").Run() - require.NoError(t, err) - // cmd.Stdin = strings.NewReader(source) - // err = cmd.Run() - // require.NoError(t, err) -} - -func TestIdentical(t *testing.T) { - lex, err := exprLexer.LexString("", `"`+testInput+`"`) - require.NoError(t, err) - expected, err := lexer.ConsumeAll(lex) - require.NoError(t, err) - - lex, err = Lexer.Lex("", strings.NewReader(`"`+testInput+`"`)) - require.NoError(t, err) - actual, err := lexer.ConsumeAll(lex) - require.NoError(t, err) - - require.Equal(t, expected, actual) -} - -func BenchmarkStatefulGenerated(b *testing.B) { - b.ReportAllocs() - slex := Lexer.(lexer.StringDefinition) - start := time.Now() - for i := 0; i < b.N; i++ { - lex, err := slex.LexString("", benchmarkInput) - if err != nil { - b.Fatal(err) - } - for { - t, err := lex.Next() - if err != nil { - b.Fatal(err) - } - if t.EOF() { - break - } - } - } - b.ReportMetric(float64(len(benchmarkInput)*b.N)*float64(time.Since(start)/time.Second)/1024/1024, "MiB/s") -} - -func BenchmarkStatefulRegex(b *testing.B) { - b.ReportAllocs() - start := time.Now() - for i := 0; i < b.N; i++ { - lex, err := exprLexer.LexString("", benchmarkInput) - if err != nil { - b.Fatal(err) - } - for { - t, err := lex.Next() - if err != nil { - b.Fatal(err) - } - if t.EOF() { - break - } - } - } - b.ReportMetric(float64(len(benchmarkInput)*b.N)/float64(time.Since(start)/time.Second)/1024/1024, "MiB/s") -} diff --git a/lexer/stateful.go b/lexer/stateful.go index d9e4837d..210b3f33 100644 --- a/lexer/stateful.go +++ b/lexer/stateful.go @@ -1,6 +1,7 @@ package lexer import ( + "encoding/json" "errors" "fmt" "io" @@ -21,9 +22,99 @@ type Option func(d *StatefulDefinition) // A Rule matching input and possibly changing state. type Rule struct { - Name string - Pattern string - Action Action + Name string `json:"name"` + Pattern string `json:"pattern"` + Action Action `json:"action"` +} + +var _ json.Marshaler = &Rule{} +var _ json.Unmarshaler = &Rule{} + +type jsonRule struct { + Name string `json:"name,omitempty"` + Pattern string `json:"pattern,omitempty"` + Action json.RawMessage `json:"action,omitempty"` +} + +func (r *Rule) UnmarshalJSON(data []byte) error { + jrule := jsonRule{} + err := json.Unmarshal(data, &jrule) + if err != nil { + return err + } + r.Name = jrule.Name + r.Pattern = jrule.Pattern + jaction := struct { + Kind string `json:"kind"` + }{} + if jrule.Action == nil { + return nil + } + err = json.Unmarshal(jrule.Action, &jaction) + if err != nil { + return fmt.Errorf("could not unmarshal action %q: %w", string(jrule.Action), err) + } + var action Action + switch jaction.Kind { + case "push": + actual := ActionPush{} + if err := json.Unmarshal(jrule.Action, &actual); err != nil { + return err + } + action = actual + case "pop": + actual := ActionPop{} + if err := json.Unmarshal(jrule.Action, &actual); err != nil { + return err + } + action = actual + case "include": + actual := include{} + if err := json.Unmarshal(jrule.Action, &actual); err != nil { + return err + } + action = actual + case "": + default: + return fmt.Errorf("unknown action %q", jaction.Kind) + } + r.Action = action + return nil +} + +func (r *Rule) MarshalJSON() ([]byte, error) { + jrule := jsonRule{ + Name: r.Name, + Pattern: r.Pattern, + } + if r.Action != nil { + actionData, err := json.Marshal(r.Action) + if err != nil { + return nil, fmt.Errorf("failed to map action: %w", err) + } + jaction := map[string]interface{}{} + err = json.Unmarshal(actionData, &jaction) + if err != nil { + return nil, fmt.Errorf("failed to map action: %w", err) + } + switch r.Action.(type) { + case nil: + case ActionPop: + jaction["kind"] = "pop" + case ActionPush: + jaction["kind"] = "push" + case include: + jaction["kind"] = "include" + default: + return nil, fmt.Errorf("unsupported action %T", r.Action) + } + actionJSON, err := json.Marshal(jaction) + if err != nil { + return nil, err + } + jrule.Action = actionJSON + } + return json.Marshal(&jrule) } // Rules grouped by name. @@ -92,7 +183,9 @@ var ReturnRule = Rule{"returnToParent", "", nil} func Return() Rule { return ReturnRule } // ActionPush pushes the current state and switches to "State" when the Rule matches. -type ActionPush struct{ State string } +type ActionPush struct { + State string `json:"state"` +} func (p ActionPush) applyAction(lexer *StatefulLexer, groups []string) error { if groups[0] == "" { @@ -110,16 +203,18 @@ func Push(state string) Action { return ActionPush{state} } -type include struct{ state string } +type include struct { + State string `json:"state"` +} func (i include) applyAction(lexer *StatefulLexer, groups []string) error { panic("should not be called") } func (i include) applyRules(state string, rule int, rules compiledRules) error { - includedRules, ok := rules[i.state] + includedRules, ok := rules[i.State] if !ok { - return fmt.Errorf("invalid include state %q", i.state) + return fmt.Errorf("invalid include state %q", i.State) } clone := make([]compiledRule, len(includedRules)) copy(clone, includedRules) @@ -218,6 +313,10 @@ restart: return d, nil } +func (d *StatefulDefinition) MarshalJSON() ([]byte, error) { + return json.Marshal(d.rules) +} + // Rules returns the user-provided Rules used to construct the lexer. func (d *StatefulDefinition) Rules() Rules { out := Rules{} diff --git a/lexer/stateful_codegen_test.go b/lexer/stateful_codegen_test.go deleted file mode 100644 index e184f6e9..00000000 --- a/lexer/stateful_codegen_test.go +++ /dev/null @@ -1,437 +0,0 @@ - -// Code generated by Participle. DO NOT EDIT. -package lexer_test - -import ( - "io" - "strings" - "unicode/utf8" - - "github.com/alecthomas/participle/v2" - "github.com/alecthomas/participle/v2/lexer" -) - -var Lexer lexer.Definition = definitionImpl{} - -type definitionImpl struct {} - -func (definitionImpl) Symbols() map[string]lexer.TokenType { - return map[string]lexer.TokenType{ - "Comment": -7, - "EOF": -1, - "EOL": -6, - "Ident": -4, - "Number": -3, - "Punct": -5, - "String": -2, - "Whitespace": -8, - } -} - -func (definitionImpl) LexString(filename string, s string) (lexer.Lexer, error) { - return &lexerImpl{ - s: s, - pos: lexer.Position{ - Filename: filename, - Line: 1, - Column: 1, - }, - states: []lexerState{lexerState{name: "Root"}}, - }, nil -} - -func (d definitionImpl) LexBytes(filename string, b []byte) (lexer.Lexer, error) { - return d.LexString(filename, string(b)) -} - -func (d definitionImpl) Lex(filename string, r io.Reader) (lexer.Lexer, error) { - s := &strings.Builder{} - _, err := io.Copy(s, r) - if err != nil { - return nil, err - } - return d.LexString(filename, s.String()) -} - -type lexerState struct { - name string - groups []string -} - -type lexerImpl struct { - s string - p int - pos lexer.Position - states []lexerState -} - -func (l *lexerImpl) Next() (lexer.Token, error) { - if l.p == len(l.s) { - return lexer.EOFToken(l.pos), nil - } - var ( - state = l.states[len(l.states)-1] - groups []int - sym lexer.TokenType - ) - switch state.name { - case "Root":if match := matchString(l.s, l.p); match[1] != 0 { - sym = -2 - groups = match[:] - } else if match := matchNumber(l.s, l.p); match[1] != 0 { - sym = -3 - groups = match[:] - } else if match := matchIdent(l.s, l.p); match[1] != 0 { - sym = -4 - groups = match[:] - } else if match := matchPunct(l.s, l.p); match[1] != 0 { - sym = -5 - groups = match[:] - } else if match := matchEOL(l.s, l.p); match[1] != 0 { - sym = -6 - groups = match[:] - } else if match := matchComment(l.s, l.p); match[1] != 0 { - sym = -7 - groups = match[:] - } else if match := matchWhitespace(l.s, l.p); match[1] != 0 { - sym = -8 - groups = match[:] - } - } - if groups == nil { - sample := []rune(l.s[l.p:]) - if len(sample) > 16 { - sample = append(sample[:16], []rune("...")...) - } - return lexer.Token{}, participle.Errorf(l.pos, "invalid input text %q", sample) - } - pos := l.pos - span := l.s[groups[0]:groups[1]] - l.p = groups[1] - l.pos.Advance(span) - return lexer.Token{ - Type: sym, - Value: span, - Pos: pos, - }, nil -} - -func (l *lexerImpl) sgroups(match []int) []string { - sgroups := make([]string, len(match)/2) - for i := 0; i < len(match)-1; i += 2 { - sgroups[i/2] = l.s[l.p+match[i]:l.p+match[i+1]] - } - return sgroups -} - - -// "(\\"|[^"])*" -func matchString(s string, p int) (groups [4]int) { -// " (Literal) -l0 := func(s string, p int) int { -if p < len(s) && s[p] == '"' { return p+1 } -return -1 -} -// \\" (Literal) -l1 := func(s string, p int) int { -if p+2 < len(s) && s[p:p+2] == "\\\"" { return p+2 } -return -1 -} -// [^"] (CharClass) -l2 := func(s string, p int) int { -if len(s) <= p { return -1 } -var (rn rune; n int) -if s[p] < utf8.RuneSelf { - rn, n = rune(s[p]), 1 -} else { - rn, n = utf8.DecodeRuneInString(s[p:]) -} -switch { -case rn >= '\x00' && rn <= '!': return p+1 -case rn >= '#' && rn <= '\U0010ffff': return p+n -} -return -1 -} -// \\"|[^"] (Alternate) -l3 := func(s string, p int) int { -if np := l1(s, p); np != -1 { return np } -if np := l2(s, p); np != -1 { return np } -return -1 -} -// (\\"|[^"]) (Capture) -l4 := func(s string, p int) int { -np := l3(s, p) -if np != -1 { - groups[2] = p - groups[3] = np -} -return np} -// (\\"|[^"])* (Star) -l5 := func(s string, p int) int { -for len(s) > p { -if np := l4(s, p); np == -1 { return p } else { p = np } -} -return p -} -// "(\\"|[^"])*" (Concat) -l6 := func(s string, p int) int { -if p = l0(s, p); p == -1 { return -1 } -if p = l5(s, p); p == -1 { return -1 } -if p = l0(s, p); p == -1 { return -1 } -return p -} -np := l6(s, p) -if np == -1 { - return -} -groups[0] = p -groups[1] = np -return -} - -// [\+\-]?([0-9]*\.)?[0-9]+ -func matchNumber(s string, p int) (groups [4]int) { -// [\+\-] (CharClass) -l0 := func(s string, p int) int { -if len(s) <= p { return -1 } -rn := s[p] -switch { -case rn == '+': return p+1 -case rn == '-': return p+1 -} -return -1 -} -// [\+\-]? (Quest) -l1 := func(s string, p int) int { -if np := l0(s, p); np != -1 { return np } -return p -} -// [0-9] (CharClass) -l2 := func(s string, p int) int { -if len(s) <= p { return -1 } -rn := s[p] -switch { -case rn >= '0' && rn <= '9': return p+1 -} -return -1 -} -// [0-9]* (Star) -l3 := func(s string, p int) int { -for len(s) > p { -if np := l2(s, p); np == -1 { return p } else { p = np } -} -return p -} -// \. (Literal) -l4 := func(s string, p int) int { -if p < len(s) && s[p] == '.' { return p+1 } -return -1 -} -// [0-9]*\. (Concat) -l5 := func(s string, p int) int { -if p = l3(s, p); p == -1 { return -1 } -if p = l4(s, p); p == -1 { return -1 } -return p -} -// ([0-9]*\.) (Capture) -l6 := func(s string, p int) int { -np := l5(s, p) -if np != -1 { - groups[2] = p - groups[3] = np -} -return np} -// ([0-9]*\.)? (Quest) -l7 := func(s string, p int) int { -if np := l6(s, p); np != -1 { return np } -return p -} -// [0-9]+ (Plus) -l8 := func(s string, p int) int { -if p = l2(s, p); p == -1 { return -1 } -for len(s) > p { -if np := l2(s, p); np == -1 { return p } else { p = np } -} -return p -} -// [\+\-]?([0-9]*\.)?[0-9]+ (Concat) -l9 := func(s string, p int) int { -if p = l1(s, p); p == -1 { return -1 } -if p = l7(s, p); p == -1 { return -1 } -if p = l8(s, p); p == -1 { return -1 } -return p -} -np := l9(s, p) -if np == -1 { - return -} -groups[0] = p -groups[1] = np -return -} - -// [A-Z_a-z][0-9A-Z_a-z]* -func matchIdent(s string, p int) (groups [2]int) { -// [A-Z_a-z] (CharClass) -l0 := func(s string, p int) int { -if len(s) <= p { return -1 } -rn := s[p] -switch { -case rn >= 'A' && rn <= 'Z': return p+1 -case rn == '_': return p+1 -case rn >= 'a' && rn <= 'z': return p+1 -} -return -1 -} -// [0-9A-Z_a-z] (CharClass) -l1 := func(s string, p int) int { -if len(s) <= p { return -1 } -rn := s[p] -switch { -case rn >= '0' && rn <= '9': return p+1 -case rn >= 'A' && rn <= 'Z': return p+1 -case rn == '_': return p+1 -case rn >= 'a' && rn <= 'z': return p+1 -} -return -1 -} -// [0-9A-Z_a-z]* (Star) -l2 := func(s string, p int) int { -for len(s) > p { -if np := l1(s, p); np == -1 { return p } else { p = np } -} -return p -} -// [A-Z_a-z][0-9A-Z_a-z]* (Concat) -l3 := func(s string, p int) int { -if p = l0(s, p); p == -1 { return -1 } -if p = l2(s, p); p == -1 { return -1 } -return p -} -np := l3(s, p) -if np == -1 { - return -} -groups[0] = p -groups[1] = np -return -} - -// [!-/:-@\[-`\{-~]+ -func matchPunct(s string, p int) (groups [2]int) { -// [!-/:-@\[-`\{-~] (CharClass) -l0 := func(s string, p int) int { -if len(s) <= p { return -1 } -rn := s[p] -switch { -case rn >= '!' && rn <= '/': return p+1 -case rn >= ':' && rn <= '@': return p+1 -case rn >= '[' && rn <= '`': return p+1 -case rn >= '{' && rn <= '~': return p+1 -} -return -1 -} -// [!-/:-@\[-`\{-~]+ (Plus) -l1 := func(s string, p int) int { -if p = l0(s, p); p == -1 { return -1 } -for len(s) > p { -if np := l0(s, p); np == -1 { return p } else { p = np } -} -return p -} -np := l1(s, p) -if np == -1 { - return -} -groups[0] = p -groups[1] = np -return -} - -// \n -func matchEOL(s string, p int) (groups [2]int) { -if p < len(s) && s[p] == '\n' { -groups[0] = p -groups[1] = p + 1 -} -return -} - -// (?i:REM)[^\n]*(?i:\n) -func matchComment(s string, p int) (groups [2]int) { -// (?i:REM) (Literal) -l0 := func(s string, p int) int { -if p+3 < len(s) && s[p:p+3] == "REM" { return p+3 } -return -1 -} -// [^\n] (CharClass) -l1 := func(s string, p int) int { -if len(s) <= p { return -1 } -var (rn rune; n int) -if s[p] < utf8.RuneSelf { - rn, n = rune(s[p]), 1 -} else { - rn, n = utf8.DecodeRuneInString(s[p:]) -} -switch { -case rn >= '\x00' && rn <= '\t': return p+1 -case rn >= '\v' && rn <= '\U0010ffff': return p+n -} -return -1 -} -// [^\n]* (Star) -l2 := func(s string, p int) int { -for len(s) > p { -if np := l1(s, p); np == -1 { return p } else { p = np } -} -return p -} -// (?i:\n) (Literal) -l3 := func(s string, p int) int { -if p < len(s) && s[p] == '\n' { return p+1 } -return -1 -} -// (?i:REM)[^\n]*(?i:\n) (Concat) -l4 := func(s string, p int) int { -if p = l0(s, p); p == -1 { return -1 } -if p = l2(s, p); p == -1 { return -1 } -if p = l3(s, p); p == -1 { return -1 } -return p -} -np := l4(s, p) -if np == -1 { - return -} -groups[0] = p -groups[1] = np -return -} - -// [\t ]+ -func matchWhitespace(s string, p int) (groups [2]int) { -// [\t ] (CharClass) -l0 := func(s string, p int) int { -if len(s) <= p { return -1 } -rn := s[p] -switch { -case rn == '\t': return p+1 -case rn == ' ': return p+1 -} -return -1 -} -// [\t ]+ (Plus) -l1 := func(s string, p int) int { -if p = l0(s, p); p == -1 { return -1 } -for len(s) > p { -if np := l0(s, p); np == -1 { return p } else { p = np } -} -return p -} -np := l1(s, p) -if np == -1 { - return -} -groups[0] = p -groups[1] = np -return -} diff --git a/lexer/stateful_test.go b/lexer/stateful_test.go index be7b5e44..c22b5a6e 100644 --- a/lexer/stateful_test.go +++ b/lexer/stateful_test.go @@ -1,15 +1,15 @@ package lexer_test import ( - "flag" + "encoding/json" "log" - "os" "strings" "testing" require "github.com/alecthomas/assert/v2" "github.com/alecthomas/participle/v2" "github.com/alecthomas/participle/v2/lexer" + "github.com/alecthomas/participle/v2/lexer/internal" "github.com/alecthomas/repr" ) @@ -32,6 +32,15 @@ var interpolatedRules = lexer.Rules{ }, } +func TestMarshalUnmarshal(t *testing.T) { + data, err := json.MarshalIndent(interpolatedRules, "", " ") + require.NoError(t, err) + unmarshalledRules := lexer.Rules{} + err = json.Unmarshal(data, &unmarshalledRules) + require.NoError(t, err) + require.Equal(t, interpolatedRules, unmarshalledRules) +} + func TestStatefulLexer(t *testing.T) { tests := []struct { name string @@ -408,30 +417,6 @@ func BenchmarkStatefulBackrefs(b *testing.B) { } } -var generateLexer = flag.Bool("generate", false, "generate lexer") - -func TestGenerate(t *testing.T) { - if !*generateLexer { - return - } - def, err := lexer.New(lexer.Rules{"Root": []lexer.Rule{ - {"String", `"(\\"|[^"])*"`, nil}, - {"Number", `[-+]?(\d*\.)?\d+`, nil}, - {"Ident", `[a-zA-Z_]\w*`, nil}, - {"Punct", `[!-/:-@[-` + "`" + `{-~]+`, nil}, - {"EOL", `\n`, nil}, - {"Comment", `(?i)rem[^\n]*\n`, nil}, - {"Whitespace", `[ \t]+`, nil}, - }}) - require.NoError(t, err) - w, err := os.Create("stateful_codegen_test.go~") - require.NoError(t, err) - err = lexer.ExperimentalGenerateLexer(w, "lexer_test", def) - require.NoError(t, err) - err = os.Rename("stateful_codegen_test.go~", "stateful_codegen_test.go") - require.NoError(t, err) -} - func basicBenchmark(b *testing.B, def lexer.Definition) { b.Helper() source := strings.Repeat(` @@ -485,5 +470,5 @@ func BenchmarkStatefulBASIC(b *testing.B) { } func BenchmarkStatefulGeneratedBASIC(b *testing.B) { - basicBenchmark(b, Lexer) + basicBenchmark(b, internal.GeneratedBasicLexer) } diff --git a/scripts/participle b/scripts/participle new file mode 100755 index 00000000..eccdee5d --- /dev/null +++ b/scripts/participle @@ -0,0 +1,4 @@ +#!/bin/bash +set -euo pipefail +(cd "$(dirname $0)/../cmd/participle" && go install github.com/alecthomas/participle/v2/cmd/participle) +exec "$(go env GOBIN)/participle" "$@" diff --git a/scripts/regen-lexer b/scripts/regen-lexer new file mode 100755 index 00000000..8daa6cc0 --- /dev/null +++ b/scripts/regen-lexer @@ -0,0 +1,3 @@ +#!/bin/bash +set -euo pipefail +participle gen lexer --name GeneratedBasic internal < lexer/internal/basiclexer.json | gofmt > lexer/internal/basiclexer.go