From d4edd51c4f2cd542a7101eb296eb9fe55421c0f9 Mon Sep 17 00:00:00 2001 From: Filip Sandborg Date: Wed, 12 Jun 2024 11:20:22 +0200 Subject: [PATCH] fix(parser/lexer): ID_Start & ID_Continue checks (#524) Fix the checks for ID_Start & ID_Continue to match the definitions in https://www.unicode.org/reports/tr31/ --- parser/lexer.go | 51 ++++++++++++++++++++++++++++++++++++++++++-- parser/lexer_test.go | 25 ++++++++++++++++++++++ 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/parser/lexer.go b/parser/lexer.go index 3ff62d02..0d9132f7 100644 --- a/parser/lexer.go +++ b/parser/lexer.go @@ -38,6 +38,53 @@ func digitValue(chr rune) int { return 16 // Larger than any legal digit value } +// See https://www.unicode.org/reports/tr31/ for reference on ID_Start and ID_Continue. +var includeIDStart = []*unicode.RangeTable{ + unicode.Lu, + unicode.Ll, + unicode.Lt, + unicode.Lm, + unicode.Lo, + unicode.Nl, + unicode.Other_ID_Start, +} + +var includeIDContinue = []*unicode.RangeTable{ + unicode.Lu, + unicode.Ll, + unicode.Lt, + unicode.Lm, + unicode.Lo, + unicode.Nl, + unicode.Other_ID_Start, + unicode.Mn, + unicode.Mc, + unicode.Nd, + unicode.Pc, + unicode.Other_ID_Continue, +} + +var exclude = []*unicode.RangeTable{ + unicode.Pattern_Syntax, + unicode.Pattern_White_Space, +} + +func unicodeIDStart(r rune) bool { + if unicode.In(r, exclude...) { + return false + } + + return unicode.In(r, includeIDStart...) +} + +func unicodeIDContinue(r rune) bool { + if unicode.In(r, exclude...) { + return false + } + + return unicode.In(r, includeIDContinue...) 
+} + func isDigit(chr rune, base int) bool { return digitValue(chr) < base } @@ -45,14 +92,14 @@ func isDigit(chr rune, base int) bool { func isIdentifierStart(chr rune) bool { return chr == '$' || chr == '_' || chr == '\\' || 'a' <= chr && chr <= 'z' || 'A' <= chr && chr <= 'Z' || - chr >= utf8.RuneSelf && unicode.IsLetter(chr) + chr >= utf8.RuneSelf && unicodeIDStart(chr) } func isIdentifierPart(chr rune) bool { return chr == '$' || chr == '_' || chr == '\\' || 'a' <= chr && chr <= 'z' || 'A' <= chr && chr <= 'Z' || '0' <= chr && chr <= '9' || - chr >= utf8.RuneSelf && (unicode.IsLetter(chr) || unicode.IsDigit(chr)) + chr >= utf8.RuneSelf && unicodeIDContinue(chr) } func (p *parser) scanIdentifier() (string, error) { diff --git a/parser/lexer_test.go b/parser/lexer_test.go index faa98a48..7e1e5faf 100644 --- a/parser/lexer_test.go +++ b/parser/lexer_test.go @@ -351,6 +351,21 @@ Second line \ token.RIGHT_BRACKET, "", 6, ) + // Identifier from Unicode Nl + test("\u16ee", + token.IDENTIFIER, "ᛮ", 1, + ) + + // Identifier from Unicode Other_ID_Start + test("\u212e", + token.IDENTIFIER, "℮", 1, + ) + + // Using char from ID_Continue after valid start char + test("a\u0300", + token.IDENTIFIER, "à", 1, + ) + // ILLEGAL test(`3ea`, @@ -383,5 +398,15 @@ Second line \ token.STRING, "\"\\x0G\"", 1, token.EOF, "", 7, ) + + // Starting identifier with ID_Continue char from Mn + test("\u0300", + token.ILLEGAL, + ) + + // Starting identifier with Pattern_Syntax + test("'", + token.ILLEGAL, + ) }) }