Skip to content

Commit

Permalink
Correctly escape non-unicode regex (#218)
Browse files Browse the repository at this point in the history
  • Loading branch information
sebastienros authored Jan 1, 2022
1 parent 68f28de commit 5867b4d
Show file tree
Hide file tree
Showing 10 changed files with 319 additions and 246 deletions.
5 changes: 5 additions & 0 deletions src/Esprima/Messages.cs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ public static class Messages
public const string ParameterAfterRestParameter = "Rest parameter must be last formal parameter";
public const string PropertyAfterRestProperty = "Unexpected token";
public const string Redeclaration = "{0} \"{1}\" has already been declared";
public const string RegexLoneQuantifierBrackets = "Invalid regular expression: Lone quantifier brackets";
public const string RegexIncompleteQuantifier = "Invalid regular expression: Incomplete quantifier";
public const string RegexUnterminatedGroup = "Invalid regular expression: Unterminated group";
public const string RegexUnterminatedCharacterClass = "Invalid regular expression: Unterminated character class";
public const string RegexUnmatchedOpenParen = "Invalid regular expression: Unmatched ')'";
public const string StaticPrototype = "Classes may not have static property named prototype";
public const string StrictCatchVariable = "Catch variable may not be eval or arguments in strict mode";
public const string StrictDelete = "Delete of an unqualified identifier in strict mode.";
Expand Down
2 changes: 1 addition & 1 deletion src/Esprima/ParserOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,6 @@ public ParserOptions(IErrorHandler errorHandler)
/// <summary>
/// Gets or sets whether the Regular Expression syntax should be converted to a .NET compatible one.
/// </summary>
public bool AdaptRegexp { get; set; }
public bool AdaptRegexp { get; set; } = true;
}
}
280 changes: 222 additions & 58 deletions src/Esprima/Scanner.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Globalization;
using System.Numerics;
using System.Runtime.CompilerServices;
Expand Down Expand Up @@ -961,7 +962,7 @@ private Token ScanBigIntLiteral(int start, ReadOnlySpan<char> number, NumberStyl
if (style == NumberStyles.None)
{
// binary
foreach(var c in number)
foreach (var c in number)
{
bigInt <<= 1;
bigInt += c == '1' ? 1 : 0;
Expand Down Expand Up @@ -1601,58 +1602,121 @@ public Token ScanTemplate()
};
}

// https://tc39.github.io/ecma262/#sec-literals-regular-expression-literals
private static string FromCharCode(uint[] codeUnits)
{
var chars = new char[codeUnits.Length];
for (var i = 0; i < chars.Length; i++)
{
chars[i] = (char) codeUnits[i];
}

return new string(chars);
}

public Regex? TestRegExp(string pattern, string flags)
private string FromCodePoint(params uint[] codePoints)
{
var codeUnits = new List<uint>();
var result = "";

foreach (var codePoint in codePoints)
{
if (codePoint < 0 || codePoint > 0x10FFFF)
{
EsprimaExceptionHelper.ThrowArgumentOutOfRangeException(nameof(codePoint), codePoint, "Invalid code point.");
}

var point = codePoint;
if (point <= 0xFFFF)
{
// BMP code point
codeUnits.Add(point);
}
else
{
// Astral code point; split in surrogate halves
// https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
point -= 0x10000;
codeUnits.Add((point >> 10) + 0xD800); // highSurrogate
codeUnits.Add((point % 0x400) + 0xDC00); // lowSurrogate
}
if (codeUnits.Count >= 0x3fff)
{
result += FromCharCode(codeUnits.ToArray());
codeUnits.Clear();
}
}

return result + FromCharCode(codeUnits.ToArray());
}

/// <summary>
/// Converts an ECMAScript regular expression to a <see cref="Regex"/> instance.
/// </summary>
public Regex ParseRegex(string pattern, string flags)
{
// The BMP character to use as a replacement for astral symbols when
// translating an ES6 "u"-flagged pattern to an ES5-compatible
// approximation.
// Note: replacing with '\uFFFF' enables false positives in unlikely
// scenarios. For example, `[\u{1044f}-\u{10440}]` is an invalid
// pattern that would not be detected by this substitution.
var astralSubstitute = "\uFFFF";
var tmp = pattern;
var self = this;

if (flags.IndexOf('u') >= 0)
var isUnicode = flags.IndexOf('u') >= 0;

CheckBracesBalance(pattern, isUnicode);

if (isUnicode)
{
if (Regex.IsMatch(tmp, @"\\0[0-9]+"))
{
throw new ParserException("Invalid decimal escape");
}

if (Regex.IsMatch(tmp, @"\\[1-9]\d*"))
{
throw new ParserException("Invalid escape");
}

tmp = Regex
// Replace every Unicode escape sequence with the equivalent
// BMP character or a constant ASCII code point in the case of
// astral symbols. (See the above note on `astralSubstitute`
// for more information.)
.Replace(tmp, @"\\u\{([0-9a-fA-F]+)\}|\\u([a-fA-F0-9]{4})", (match) =>
.Replace(tmp, @"\\u\{([0-9a-fA-F]+)\}", (match) =>
{
int codePoint;
if (!string.IsNullOrEmpty(match.Groups[1].Value))
{
codePoint = Convert.ToInt32(match.Groups[1].Value, 16);
}
else
{
codePoint = Convert.ToInt32(match.Groups[2].Value, 16);
}
var codePoint = Convert.ToUInt32(match.Groups[1].Value, 16);
if (codePoint > 0x10FFFF)
{
ThrowUnexpectedToken(Messages.InvalidRegExp);
}
if (codePoint <= 0xFFFF)
{
return ParserExtensions.CharToString((char) codePoint);
}
return astralSubstitute;
return FromCodePoint(codePoint);
});

// Replace each paired surrogate with a single ASCII symbol to
// avoid throwing on regular expressions that are only valid in
// combination with the "u" flag.
tmp = Regex.Replace(tmp, "[\uD800-\uDBFF][\uDC00-\uDFFF]", astralSubstitute);
tmp = ConvertUnicodeRegexRanges(tmp);
}

tmp = Regex
.Replace(tmp, @"(\\u[a-fA-F0-9]{4})+", (match) =>
{
// e.g., \uD83D\uDE80 (which is equivalent to \u{1F680}
var codePoints = new uint[match.Value.Length / 6];
for (var i = 0; i < codePoints.Length; i++)
{
codePoints[i] = Convert.ToUInt32(match.Value.Substring(i * 6 + 2, 4), 16);
}
var sub = FromCodePoint(codePoints);
return sub;
});

// \u is a valid escape sequence in JS, but not in .NET
// search for any of these that are not valid \uxxxx values

tmp = Regex.Replace(tmp, @"(\\+)u(?![a-fA-F0-9]{4})", (match) =>
{
return new String('\\', match.Groups[1].Value.Length / 2 * 2) + 'u';
});

// First, detect invalid regular expressions.
var options = ParseRegexOptions(flags);

Expand All @@ -1674,44 +1738,145 @@ public Token ScanTemplate()
}
}

// Return a regular expression object for this pattern-flag pair, or
// `null` in case the current environment doesn't support the flags it
// uses.
try
// Replace all non-escaped $ occurences by \r?$
// c.f. http://programmaticallyspeaking.com/regular-expression-multiline-mode-whats-a-newline.html

var index = 0;
var newPattern = tmp;

if (options.HasFlag(RegexOptions.Multiline))
{
// Do we need to convert the expression to its .NET equivalent?
if (_adaptRegexp && options.HasFlag(RegexOptions.Multiline))
while ((index = newPattern.IndexOf("$", index, StringComparison.Ordinal)) != -1)
{
// Replace all non-escaped $ occurences by \r?$
// c.f. http://programmaticallyspeaking.com/regular-expression-multiline-mode-whats-a-newline.html
if (index > 0 && newPattern[index - 1] != '\\')
{
newPattern = newPattern.Substring(0, index) + @"\r?" + newPattern.Substring(index);
index += 4;
}
else
{
index++;
}
}
}

pattern = newPattern;

return new Regex(pattern, options);
}

var index = 0;
var newPattern = pattern;
while ((index = newPattern.IndexOf("$", index, StringComparison.Ordinal)) != -1)
/// <summary>
/// Ensures the braces are balanced in a unicode Regex
/// </summary>
private void CheckBracesBalance(string pattern, bool unicode)
{
int paren = 0;
int curly = 0;
int square = 0;

for (var i = 0; i < pattern.Length; i++)
{
var ch = pattern[i];

if (ch == '\\')
{
// Skip escape

i++;
continue;
}

switch (ch)
{
case '(': if (square == 0) paren++; break;
case ')': if (square == 0) paren--; break;
case '{': if (square == 0) curly++; break;
case '}': if (square == 0) curly--; break;
case '[': if (square == 0) square++; break;
case ']': square--; break;
default: break;
}

if (paren < 0)
{
throw new ParserException(Messages.RegexUnmatchedOpenParen);
}

if (unicode)
{
if (curly < 0 || square < 0)
{
if (index > 0 && newPattern[index - 1] != '\\')
{
newPattern = newPattern.Substring(0, index) + @"\r?" + newPattern.Substring(index);
index += 4;
}
else
{
index++;
}
throw new ParserException(Messages.RegexLoneQuantifierBrackets);
}
}
}

if (paren > 0)
{
throw new ParserException(Messages.RegexUnterminatedGroup);
}

if (unicode)
{
if (curly > 0)
{
throw new ParserException(Messages.RegexIncompleteQuantifier);
}

pattern = newPattern;
if (square > 0)
{
throw new ParserException(Messages.RegexUnterminatedCharacterClass);
}
}
}

return new Regex(pattern, options);
private string ConvertUnicodeRegexRanges(string pattern)
{
if (String.IsNullOrEmpty(pattern))
{
return pattern;
}
catch

bool converted = false;

var sb = GetStringBuilder();

for (var i = 0; i < pattern.Length; i++)
{
return null;
var ch = pattern[i];

if (ch == '.')
{
converted = true;

sb.Append("(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|.)");
}
else if (ch == '\\' && i + 1 < pattern.Length)
{
ch = pattern[++i];
if (ch == 'D' || ch == 'S' || ch == 'W')
{
converted = true;

sb.Append("(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|\\" + ch + ")");
}
else
{
converted = true;

sb.Append('\\').Append(ch);
}
}
else
{
sb.Append(ch);
}
}

return converted ? sb.ToString() : pattern;
}

public string EscapeFailingRegex(string pattern)
internal string EscapeFailingRegex(string pattern)
{
// .NET 4.x doesn't support [^] which should match any character including newline
// c.f. https://github.com/sebastienros/esprima-dotnet/issues/146
Expand Down Expand Up @@ -1851,12 +2016,11 @@ public Token ScanRegExp()
var body = ScanRegExpBody();
var flags = ScanRegExpFlags();
var flagsValue = (string) flags.Value!;
var value = TestRegExp((string) body.Value!, flagsValue);

return new Token
{
Type = TokenType.RegularExpression,
Value = value,
Value = _adaptRegexp ? ParseRegex((string) body.Value!, flagsValue) : null,
Literal = body.Literal + flags.Literal,
RegexValue = new RegexValue((string) body.Value!, flagsValue),
LineNumber = LineNumber,
Expand Down
11 changes: 10 additions & 1 deletion src/Esprima/Utils/AstJson.cs
Original file line number Diff line number Diff line change
Expand Up @@ -592,10 +592,19 @@ protected internal override void VisitLiteral(Literal literal)
{
_writer.Member("value");
var value = literal.Value;

switch (value)
{
case null:
_writer.Null();
if (literal.TokenType == TokenType.RegularExpression)
{
// This is how esprima.org actually renders regexes since it relies on Regex.toString
_writer.String(literal.Raw);
}
else
{
_writer.Null();
}
break;
case bool b:
_writer.Boolean(b);
Expand Down
Loading

0 comments on commit 5867b4d

Please sign in to comment.