Skip to content

Commit

Permalink
fix(parser/lexer): ID_Start & ID_Continue checks (#524)
Browse files Browse the repository at this point in the history
Fix the checks for ID_Start & ID_Continue to match the definitions in
https://www.unicode.org/reports/tr31/
  • Loading branch information
filips authored Jun 12, 2024
1 parent 2d23528 commit d4edd51
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 2 deletions.
51 changes: 49 additions & 2 deletions parser/lexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,21 +38,68 @@ func digitValue(chr rune) int {
return 16 // Larger than any legal digit value
}

// Range tables used by the ID_Start / ID_Continue checks.
// See https://www.unicode.org/reports/tr31/ for reference on both properties.
var (
	// includeIDStart holds the tables whose members qualify as ID_Start:
	// the letter categories, letter numbers and Other_ID_Start.
	includeIDStart = []*unicode.RangeTable{
		unicode.Lu, unicode.Ll, unicode.Lt, unicode.Lm, unicode.Lo,
		unicode.Nl,
		unicode.Other_ID_Start,
	}

	// includeIDContinue holds the tables whose members qualify as
	// ID_Continue: every ID_Start table plus marks, decimal digits,
	// connector punctuation and Other_ID_Continue.
	includeIDContinue = []*unicode.RangeTable{
		unicode.Lu, unicode.Ll, unicode.Lt, unicode.Lm, unicode.Lo,
		unicode.Nl,
		unicode.Other_ID_Start,
		unicode.Mn, unicode.Mc,
		unicode.Nd,
		unicode.Pc,
		unicode.Other_ID_Continue,
	}

	// exclude holds the tables whose members are never accepted by the
	// identifier checks, even when they also appear in an include table.
	exclude = []*unicode.RangeTable{
		unicode.Pattern_Syntax,
		unicode.Pattern_White_Space,
	}
)

// unicodeIDStart reports whether r is accepted as an ID_Start character:
// it must not belong to any table in exclude and must belong to at least
// one table in includeIDStart.
func unicodeIDStart(r rune) bool {
	// The exclusion tables take precedence over the inclusion tables.
	excluded := unicode.In(r, exclude...)
	return !excluded && unicode.In(r, includeIDStart...)
}

// unicodeIDContinue reports whether r is accepted as an ID_Continue
// character: it must not belong to any table in exclude and must belong to
// at least one table in includeIDContinue.
func unicodeIDContinue(r rune) bool {
	// The exclusion tables take precedence over the inclusion tables.
	excluded := unicode.In(r, exclude...)
	return !excluded && unicode.In(r, includeIDContinue...)
}

// isDigit reports whether chr is a digit in the given base, i.e. whether
// digitValue(chr) is strictly less than base.
func isDigit(chr rune, base int) bool {
	value := digitValue(chr)
	return value < base
}

// isIdentifierStart reports whether chr may begin an identifier: '$', '_',
// '\\', an ASCII letter, or a non-ASCII rune (chr >= utf8.RuneSelf) that
// passes the Unicode check.
//
// NOTE(review): this is a rendered diff whose +/- markers were stripped. Of the
// two adjacent `chr >= utf8.RuneSelf && ...` lines below, the unicode.IsLetter
// one is presumably the removed line and the unicodeIDStart one its
// replacement — confirm against the commit before treating this as live code.
func isIdentifierStart(chr rune) bool {
return chr == '$' || chr == '_' || chr == '\\' ||
'a' <= chr && chr <= 'z' || 'A' <= chr && chr <= 'Z' ||
chr >= utf8.RuneSelf && unicode.IsLetter(chr)
chr >= utf8.RuneSelf && unicodeIDStart(chr)
}

// isIdentifierPart reports whether chr may appear inside an identifier: '$',
// '_', '\\', an ASCII letter, an ASCII digit, or a non-ASCII rune
// (chr >= utf8.RuneSelf) that passes the Unicode check.
//
// NOTE(review): this is a rendered diff whose +/- markers were stripped. Of the
// two adjacent `chr >= utf8.RuneSelf && ...` lines below, the
// unicode.IsLetter/unicode.IsDigit one is presumably the removed line and the
// unicodeIDContinue one its replacement — confirm against the commit before
// treating this as live code.
func isIdentifierPart(chr rune) bool {
return chr == '$' || chr == '_' || chr == '\\' ||
'a' <= chr && chr <= 'z' || 'A' <= chr && chr <= 'Z' ||
'0' <= chr && chr <= '9' ||
chr >= utf8.RuneSelf && (unicode.IsLetter(chr) || unicode.IsDigit(chr))
chr >= utf8.RuneSelf && unicodeIDContinue(chr)
}

func (p *parser) scanIdentifier() (string, error) {
Expand Down
25 changes: 25 additions & 0 deletions parser/lexer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,21 @@ Second line \
token.RIGHT_BRACKET, "", 6,
)

// Identifier from Unicode Nl
test("\u16ee",
token.IDENTIFIER, "ᛮ", 1,
)

// Identifier from Unicode Other_ID_Start
test("\u212e",
token.IDENTIFIER, "℮", 1,
)

// Using char from ID_Continue after valid start char
test("a\u0300",
token.IDENTIFIER, "à", 1,
)

// ILLEGAL

test(`3ea`,
Expand Down Expand Up @@ -383,5 +398,15 @@ Second line \
token.STRING, "\"\\x0G\"", 1,
token.EOF, "", 7,
)

// Starting identifier with ID_Continue char from Mn
test("\u0300",
token.ILLEGAL,
)

// Starting identifier with Pattern_Syntax
test("'",
token.ILLEGAL,
)
})
}

0 comments on commit d4edd51

Please sign in to comment.