From 5867b4d8a91982c4d07c88222422e24f9fed641f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Ros?= Date: Sat, 1 Jan 2022 15:54:37 -0800 Subject: [PATCH] Correctly escape non-unicode regex (#218) --- src/Esprima/Messages.cs | 5 + src/Esprima/ParserOptions.cs | 2 +- src/Esprima/Scanner.cs | 280 ++++++++++++++---- src/Esprima/Utils/AstJson.cs | 11 +- .../migrated_0005.tree.json | 2 +- .../u-flag-surrogate-pair.tree.failure.json | 6 + .../u-flag-surrogate-pair.tree.json | 176 ----------- .../u-flag-valid-range.tree.json | 2 +- test/Esprima.Tests/RegExpTests.cs | 73 +++++ test/Esprima.Tests/ScannerTests.cs | 8 - 10 files changed, 319 insertions(+), 246 deletions(-) create mode 100644 test/Esprima.Tests/Fixtures/expression/primary/literal/regular-expression/u-flag-surrogate-pair.tree.failure.json delete mode 100644 test/Esprima.Tests/Fixtures/expression/primary/literal/regular-expression/u-flag-surrogate-pair.tree.json diff --git a/src/Esprima/Messages.cs b/src/Esprima/Messages.cs index dcd8ee9e..0bffbf28 100644 --- a/src/Esprima/Messages.cs +++ b/src/Esprima/Messages.cs @@ -46,6 +46,11 @@ public static class Messages public const string ParameterAfterRestParameter = "Rest parameter must be last formal parameter"; public const string PropertyAfterRestProperty = "Unexpected token"; public const string Redeclaration = "{0} \"{1}\" has already been declared"; + public const string RegexLoneQuantifierBrackets = "Invalid regular expression: Lone quantifier brackets"; + public const string RegexIncompleteQuantifier = "Invalid regular expression: Incomplete quantifier"; + public const string RegexUnterminatedGroup = "Invalid regular expression: Unterminated group"; + public const string RegexUnterminatedCharacterClass = "Invalid regular expression: Unterminated character class"; + public const string RegexUnmatchedOpenParen = "Invalid regular expression: Unmatched ')'"; public const string StaticPrototype = "Classes may not have static property named prototype"; public const string StrictCatchVariable = "Catch variable may not be eval or arguments in strict mode"; public const string StrictDelete = "Delete of an unqualified identifier in strict mode."; diff --git a/src/Esprima/ParserOptions.cs b/src/Esprima/ParserOptions.cs index 26a43867..0b982123 100644 --- a/src/Esprima/ParserOptions.cs +++ b/src/Esprima/ParserOptions.cs @@ -52,6 +52,6 @@ public ParserOptions(IErrorHandler errorHandler) /// /// Gets or sets whether the Regular Expression syntax should be converted to a .NET compatible one. /// - public bool AdaptRegexp { get; set; } + public bool AdaptRegexp { get; set; } = true; } } diff --git a/src/Esprima/Scanner.cs b/src/Esprima/Scanner.cs index 0a32f38d..69f60225 100644 --- a/src/Esprima/Scanner.cs +++ b/src/Esprima/Scanner.cs @@ -1,5 +1,6 @@ using System; using System.Collections.Generic; +using System.ComponentModel; using System.Globalization; using System.Numerics; using System.Runtime.CompilerServices; @@ -961,7 +962,7 @@ private Token ScanBigIntLiteral(int start, ReadOnlySpan number, NumberStyl if (style == NumberStyles.None) { // binary - foreach(var c in number) + foreach (var c in number) { bigInt <<= 1; bigInt += c == '1' ? 1 : 0; @@ -1601,58 +1602,121 @@ public Token ScanTemplate() }; } - // https://tc39.github.io/ecma262/#sec-literals-regular-expression-literals + private static string FromCharCode(uint[] codeUnits) + { + var chars = new char[codeUnits.Length]; + for (var i = 0; i < chars.Length; i++) + { + chars[i] = (char) codeUnits[i]; + } + + return new string(chars); + } - public Regex? TestRegExp(string pattern, string flags) + private string FromCodePoint(params uint[] codePoints) + { + var codeUnits = new List(); + var result = ""; + + foreach (var codePoint in codePoints) + { + if (codePoint < 0 || codePoint > 0x10FFFF) + { + EsprimaExceptionHelper.ThrowArgumentOutOfRangeException(nameof(codePoint), codePoint, "Invalid code point."); + } + + var point = codePoint; + if (point <= 0xFFFF) + { + // BMP code point + codeUnits.Add(point); + } + else + { + // Astral code point; split in surrogate halves + // https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae + point -= 0x10000; + codeUnits.Add((point >> 10) + 0xD800); // highSurrogate + codeUnits.Add((point % 0x400) + 0xDC00); // lowSurrogate + } + if (codeUnits.Count >= 0x3fff) + { + result += FromCharCode(codeUnits.ToArray()); + codeUnits.Clear(); + } + } + + return result + FromCharCode(codeUnits.ToArray()); + } + + /// + /// Converts an ECMAScript regular expression to a instance. + /// + public Regex ParseRegex(string pattern, string flags) { - // The BMP character to use as a replacement for astral symbols when - // translating an ES6 "u"-flagged pattern to an ES5-compatible - // approximation. - // Note: replacing with '\uFFFF' enables false positives in unlikely - // scenarios. For example, `[\u{1044f}-\u{10440}]` is an invalid - // pattern that would not be detected by this substitution. - var astralSubstitute = "\uFFFF"; var tmp = pattern; var self = this; - if (flags.IndexOf('u') >= 0) + var isUnicode = flags.IndexOf('u') >= 0; + + CheckBracesBalance(pattern, isUnicode); + + if (isUnicode) { + if (Regex.IsMatch(tmp, @"\\0[0-9]+")) + { + throw new ParserException("Invalid decimal escape"); + } + + if (Regex.IsMatch(tmp, @"\\[1-9]\d*")) + { + throw new ParserException("Invalid escape"); + } + tmp = Regex // Replace every Unicode escape sequence with the equivalent // BMP character or a constant ASCII code point in the case of // astral symbols. (See the above note on `astralSubstitute` // for more information.) - .Replace(tmp, @"\\u\{([0-9a-fA-F]+)\}|\\u([a-fA-F0-9]{4})", (match) => + .Replace(tmp, @"\\u\{([0-9a-fA-F]+)\}", (match) => { - int codePoint; - if (!string.IsNullOrEmpty(match.Groups[1].Value)) - { - codePoint = Convert.ToInt32(match.Groups[1].Value, 16); - } - else - { - codePoint = Convert.ToInt32(match.Groups[2].Value, 16); - } + var codePoint = Convert.ToUInt32(match.Groups[1].Value, 16); if (codePoint > 0x10FFFF) { ThrowUnexpectedToken(Messages.InvalidRegExp); } - if (codePoint <= 0xFFFF) - { - return ParserExtensions.CharToString((char) codePoint); - } - - return astralSubstitute; + return FromCodePoint(codePoint); }); - // Replace each paired surrogate with a single ASCII symbol to - // avoid throwing on regular expressions that are only valid in - // combination with the "u" flag. - tmp = Regex.Replace(tmp, "[\uD800-\uDBFF][\uDC00-\uDFFF]", astralSubstitute); + tmp = ConvertUnicodeRegexRanges(tmp); } + tmp = Regex + .Replace(tmp, @"(\\u[a-fA-F0-9]{4})+", (match) => + { + // e.g., \uD83D\uDE80 (which is equivalent to \u{1F680} + var codePoints = new uint[match.Value.Length / 6]; + + for (var i = 0; i < codePoints.Length; i++) + { + codePoints[i] = Convert.ToUInt32(match.Value.Substring(i * 6 + 2, 4), 16); + } + + var sub = FromCodePoint(codePoints); + + return sub; + }); + + // \u is a valid escape sequence in JS, but not in .NET + // search for any of these that are not valid \uxxxx values + + tmp = Regex.Replace(tmp, @"(\\+)u(?![a-fA-F0-9]{4})", (match) => + { + return new String('\\', match.Groups[1].Value.Length / 2 * 2) + 'u'; + }); + // First, detect invalid regular expressions. var options = ParseRegexOptions(flags); @@ -1674,44 +1738,145 @@ public Token ScanTemplate() } } - // Return a regular expression object for this pattern-flag pair, or - // `null` in case the current environment doesn't support the flags it - // uses. - try + // Replace all non-escaped $ occurences by \r?$ + // c.f. http://programmaticallyspeaking.com/regular-expression-multiline-mode-whats-a-newline.html + + var index = 0; + var newPattern = tmp; + + if (options.HasFlag(RegexOptions.Multiline)) { - // Do we need to convert the expression to its .NET equivalent? - if (_adaptRegexp && options.HasFlag(RegexOptions.Multiline)) + while ((index = newPattern.IndexOf("$", index, StringComparison.Ordinal)) != -1) { - // Replace all non-escaped $ occurences by \r?$ - // c.f. http://programmaticallyspeaking.com/regular-expression-multiline-mode-whats-a-newline.html + if (index > 0 && newPattern[index - 1] != '\\') + { + newPattern = newPattern.Substring(0, index) + @"\r?" + newPattern.Substring(index); + index += 4; + } + else + { + index++; + } + } + } + + pattern = newPattern; + + return new Regex(pattern, options); + } - var index = 0; - var newPattern = pattern; - while ((index = newPattern.IndexOf("$", index, StringComparison.Ordinal)) != -1) + /// + /// Ensures the braces are balanced in a unicode Regex + /// + private void CheckBracesBalance(string pattern, bool unicode) + { + int paren = 0; + int curly = 0; + int square = 0; + + for (var i = 0; i < pattern.Length; i++) + { + var ch = pattern[i]; + + if (ch == '\\') + { + // Skip escape + + i++; + continue; + } + + switch (ch) + { + case '(': if (square == 0) paren++; break; + case ')': if (square == 0) paren--; break; + case '{': if (square == 0) curly++; break; + case '}': if (square == 0) curly--; break; + case '[': if (square == 0) square++; break; + case ']': square--; break; + default: break; + } + + if (paren < 0) + { + throw new ParserException(Messages.RegexUnmatchedOpenParen); + } + + if (unicode) + { + if (curly < 0 || square < 0) { - if (index > 0 && newPattern[index - 1] != '\\') - { - newPattern = newPattern.Substring(0, index) + @"\r?" + newPattern.Substring(index); - index += 4; - } - else - { - index++; - } + throw new ParserException(Messages.RegexLoneQuantifierBrackets); } + } + } + + if (paren > 0) + { + throw new ParserException(Messages.RegexUnterminatedGroup); + } + + if (unicode) + { + if (curly > 0) + { + throw new ParserException(Messages.RegexIncompleteQuantifier); + } - pattern = newPattern; + if (square > 0) + { + throw new ParserException(Messages.RegexUnterminatedCharacterClass); } + } + } - return new Regex(pattern, options); + private string ConvertUnicodeRegexRanges(string pattern) + { + if (String.IsNullOrEmpty(pattern)) + { + return pattern; } - catch + + bool converted = false; + + var sb = GetStringBuilder(); + + for (var i = 0; i < pattern.Length; i++) { - return null; + var ch = pattern[i]; + + if (ch == '.') + { + converted = true; + + sb.Append("(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|.)"); + } + else if (ch == '\\' && i + 1 < pattern.Length) + { + ch = pattern[++i]; + if (ch == 'D' || ch == 'S' || ch == 'W') + { + converted = true; + + sb.Append("(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|\\" + ch + ")"); + } + else + { + converted = true; + + sb.Append('\\').Append(ch); + } + } + else + { + sb.Append(ch); + } } + + return converted ? sb.ToString() : pattern; } - public string EscapeFailingRegex(string pattern) + internal string EscapeFailingRegex(string pattern) { // .NET 4.x doesn't support [^] which should match any character including newline // c.f. https://github.com/sebastienros/esprima-dotnet/issues/146 @@ -1851,12 +2016,11 @@ public Token ScanRegExp() var body = ScanRegExpBody(); var flags = ScanRegExpFlags(); var flagsValue = (string) flags.Value!; - var value = TestRegExp((string) body.Value!, flagsValue); return new Token { Type = TokenType.RegularExpression, - Value = value, + Value = _adaptRegexp ? ParseRegex((string) body.Value!, flagsValue) : null, Literal = body.Literal + flags.Literal, RegexValue = new RegexValue((string) body.Value!, flagsValue), LineNumber = LineNumber, diff --git a/src/Esprima/Utils/AstJson.cs b/src/Esprima/Utils/AstJson.cs index be2613d1..eec3a16c 100644 --- a/src/Esprima/Utils/AstJson.cs +++ b/src/Esprima/Utils/AstJson.cs @@ -592,10 +592,19 @@ protected internal override void VisitLiteral(Literal literal) { _writer.Member("value"); var value = literal.Value; + switch (value) { case null: - _writer.Null(); + if (literal.TokenType == TokenType.RegularExpression) + { + // This is how esprima.org actually renders regexes since it relies on Regex.toString + _writer.String(literal.Raw); + } + else + { + _writer.Null(); + } break; case bool b: _writer.Boolean(b); diff --git a/test/Esprima.Tests/Fixtures/expression/primary/literal/regular-expression/migrated_0005.tree.json b/test/Esprima.Tests/Fixtures/expression/primary/literal/regular-expression/migrated_0005.tree.json index 50217409..a7d0d6c1 100644 --- a/test/Esprima.Tests/Fixtures/expression/primary/literal/regular-expression/migrated_0005.tree.json +++ b/test/Esprima.Tests/Fixtures/expression/primary/literal/regular-expression/migrated_0005.tree.json @@ -26,7 +26,7 @@ }, "init": { "type": "Literal", - "value": null, + "value": { }, "raw": "/[\\u{0000000000000061}-\\u{7A}]/u", "regex": { "pattern": "[\\u{0000000000000061}-\\u{7A}]", diff --git a/test/Esprima.Tests/Fixtures/expression/primary/literal/regular-expression/u-flag-surrogate-pair.tree.failure.json b/test/Esprima.Tests/Fixtures/expression/primary/literal/regular-expression/u-flag-surrogate-pair.tree.failure.json new file mode 100644 index 00000000..e3922f28 --- /dev/null +++ b/test/Esprima.Tests/Fixtures/expression/primary/literal/regular-expression/u-flag-surrogate-pair.tree.failure.json @@ -0,0 +1,6 @@ +{ + "index": 7, + "lineNumber": 1, + "column": 8, + "message": "Error: Line 1: Invalid regular expression" +} diff --git a/test/Esprima.Tests/Fixtures/expression/primary/literal/regular-expression/u-flag-surrogate-pair.tree.json b/test/Esprima.Tests/Fixtures/expression/primary/literal/regular-expression/u-flag-surrogate-pair.tree.json deleted file mode 100644 index 74098cfe..00000000 --- a/test/Esprima.Tests/Fixtures/expression/primary/literal/regular-expression/u-flag-surrogate-pair.tree.json +++ /dev/null @@ -1,176 +0,0 @@ -{ - "type": "Program", - "body": [ - { - "type": "VariableDeclaration", - "declarations": [ - { - "type": "VariableDeclarator", - "id": { - "type": "Identifier", - "name": "x", - "range": [ - 4, - 5 - ], - "loc": { - "start": { - "line": 1, - "column": 4 - }, - "end": { - "line": 1, - "column": 5 - } - } - }, - "init": { - "type": "Literal", - "value": null, - "raw": "/[\\uD834\\uDF06-\\uD834\\uDF08a-z]/u", - "regex": { - "pattern": "[\\uD834\\uDF06-\\uD834\\uDF08a-z]", - "flags": "u" - }, - "range": [ - 8, - 41 - ], - "loc": { - "start": { - "line": 1, - "column": 8 - }, - "end": { - "line": 1, - "column": 41 - } - } - }, - "range": [ - 4, - 41 - ], - "loc": { - "start": { - "line": 1, - "column": 4 - }, - "end": { - "line": 1, - "column": 41 - } - } - } - ], - "kind": "var", - "range": [ - 0, - 41 - ], - "loc": { - "start": { - "line": 1, - "column": 0 - }, - "end": { - "line": 1, - "column": 41 - } - } - } - ], - "sourceType": "script", - "tokens": [ - { - "type": "Keyword", - "value": "var", - "range": [ - 0, - 3 - ], - "loc": { - "start": { - "line": 1, - "column": 0 - }, - "end": { - "line": 1, - "column": 3 - } - } - }, - { - "type": "Identifier", - "value": "x", - "range": [ - 4, - 5 - ], - "loc": { - "start": { - "line": 1, - "column": 4 - }, - "end": { - "line": 1, - "column": 5 - } - } - }, - { - "type": "Punctuator", - "value": "=", - "range": [ - 6, - 7 - ], - "loc": { - "start": { - "line": 1, - "column": 6 - }, - "end": { - "line": 1, - "column": 7 - } - } - }, - { - "type": "RegularExpression", - "value": "/[\\uD834\\uDF06-\\uD834\\uDF08a-z]/u", - "regex": { - "pattern": "[\\uD834\\uDF06-\\uD834\\uDF08a-z]", - "flags": "u" - }, - "range": [ - 8, - 41 - ], - "loc": { - "start": { - "line": 1, - "column": 8 - }, - "end": { - "line": 1, - "column": 41 - } - } - } - ], - "range": [ - 0, - 41 - ], - "loc": { - "start": { - "line": 1, - "column": 0 - }, - "end": { - "line": 1, - "column": 41 - } - } -} diff --git a/test/Esprima.Tests/Fixtures/expression/primary/literal/regular-expression/u-flag-valid-range.tree.json b/test/Esprima.Tests/Fixtures/expression/primary/literal/regular-expression/u-flag-valid-range.tree.json index 5dd777ee..d81fd1bc 100644 --- a/test/Esprima.Tests/Fixtures/expression/primary/literal/regular-expression/u-flag-valid-range.tree.json +++ b/test/Esprima.Tests/Fixtures/expression/primary/literal/regular-expression/u-flag-valid-range.tree.json @@ -26,7 +26,7 @@ }, "init": { "type": "Literal", - "value": null, + "value": { }, "raw": "/[\\u{61}-b][\\u0061-b][a-\\u{62}][a-\\u0062]\\u{1ffff}/u", "regex": { "pattern": "[\\u{61}-b][\\u0061-b][a-\\u{62}][a-\\u0062]\\u{1ffff}", diff --git a/test/Esprima.Tests/RegExpTests.cs b/test/Esprima.Tests/RegExpTests.cs index f0beacc2..bbbb392e 100644 --- a/test/Esprima.Tests/RegExpTests.cs +++ b/test/Esprima.Tests/RegExpTests.cs @@ -33,5 +33,78 @@ public void ShouldParseRegularExpression(string regexp) Assert.NotNull(program); } + + [Fact] + public void ShouldParseUnicodeEscapesWithoutFlag() + { + Assert.Matches(CreateRegex(@"/^\u{3}$/"), "uuu"); + Assert.Matches(CreateRegex(@"/^\\u{3}$/"), @"\uuu"); + } + + [Fact] + public void ShouldEscapeUnicodeEscapesWithoutFlag() + { + Assert.Matches(CreateRegex(@"/^\\u{3}$/"), @"\uuu"); + Assert.Matches(CreateRegex(@"/^\\\u{3}$/"), @"\uuu"); + Assert.Matches(CreateRegex(@"/^\\\\u{3}$/"), @"\\uuu"); + Assert.Matches(CreateRegex(@"/^\\\\\u{3}$/"), @"\\uuu"); + } + + [Fact] + public void ShouldParseUnicodeEscapesWithFlag() + { + Assert.Matches(CreateRegex(@"/^\u{1F680}$/u"), "🚀"); + } + + [Fact] + public void ShouldParseSurrogatePairs() + { + Assert.Matches(CreateRegex(@"/^\uD83D\uDE80$/u"), "🚀"); + } + + + [Fact] + public void ShouldNotAcceptOctalEspacesWithUnicodeFlag() + { + Assert.Throws(() => CreateRegex(@"/\1/u")); + Assert.Throws(() => CreateRegex(@"/\251/u")); + Assert.Throws(() => CreateRegex(@"/\00/u")); + Assert.NotNull(CreateRegex(@"/\0/u")); // NULL == \u0000 + Assert.NotNull(CreateRegex(@"/\1/")); + } + + [Fact] + public void ShouldCheckGroupBalance() + { + Assert.Throws(() => CreateRegex(@"/(/")); + Assert.Throws(() => CreateRegex(@"/)/")); + Assert.Throws(() => CreateRegex(@"/[/")); + Assert.NotNull(CreateRegex(@"/]/")); + Assert.NotNull(CreateRegex(@"/{/")); + Assert.NotNull(CreateRegex(@"/}/")); + + Assert.NotNull(CreateRegex(@"/[(]/")); + Assert.NotNull(CreateRegex(@"/[)]/")); + Assert.NotNull(CreateRegex(@"/[{]/")); + Assert.NotNull(CreateRegex(@"/[}]/")); + Assert.NotNull(CreateRegex(@"/[[]/")); + + Assert.Throws(() => CreateRegex(@"/(/u")); + Assert.Throws(() => CreateRegex(@"/)/u")); + Assert.Throws(() => CreateRegex(@"/[/u")); + Assert.Throws(() => CreateRegex(@"/]/u")); + Assert.Throws(() => CreateRegex(@"/{/u")); + Assert.Throws(() => CreateRegex(@"/}/u")); + + Assert.NotNull(CreateRegex(@"/([-.*+?^${}()|[\]\/\\])/")); + } + + [Fact] + public void ShouldPreventInfiniteLoopWhenAdaptingMultiLine() + { + var scanner = new Scanner("", new ParserOptions { AdaptRegexp = true }); + var regex = scanner.ParseRegex("\\$", "gm"); + Assert.NotNull(regex); + } } } diff --git a/test/Esprima.Tests/ScannerTests.cs b/test/Esprima.Tests/ScannerTests.cs index c39a136f..e4cebfc6 100644 --- a/test/Esprima.Tests/ScannerTests.cs +++ b/test/Esprima.Tests/ScannerTests.cs @@ -24,13 +24,5 @@ public void CanScanMultiLineComment() Assert.Equal(new[] { "11-28" }, results); } - - [Fact] - public void ShouldPreventInfiniteLoopWhenAdaptingMultiLine() - { - var scanner = new Scanner("", new ParserOptions { AdaptRegexp = true }); - var regex = scanner.TestRegExp("\\$", "gm"); - Assert.NotNull(regex); - } } }