Skip to content

Commit

Permalink
Improve identifier parsing (#386)
Browse files Browse the repository at this point in the history
* Improves whitespace and identifier parsing to match the spec (using Unicode 15.0)

* Optimize whitespace and identifier parsing in the BMP range
  • Loading branch information
adams85 authored May 6, 2023
1 parent e95ed5a commit 81f25cc
Show file tree
Hide file tree
Showing 9 changed files with 1,450 additions and 1,918 deletions.
10 changes: 5 additions & 5 deletions samples/Esprima.Benchmark/CharBenchmark.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public void IsHexDigit_Old()
var source = FullRange ? _fullRange : _asciiRange;
foreach (var i in source)
{
CharMaskGeneratorTest.IsHexDigit(i);
CharacterOld.IsHexDigit(i);
}
}

Expand All @@ -46,7 +46,7 @@ public void IsDecimalDigit_Old()
var source = FullRange ? _fullRange : _asciiRange;
foreach (var i in source)
{
CharMaskGeneratorTest.IsDecimalDigit(i);
CharacterOld.IsDecimalDigit(i);
}
}

Expand All @@ -66,7 +66,7 @@ public void IsWhiteSpace_Old()
var source = FullRange ? _fullRange : _asciiRange;
foreach (var i in source)
{
CharMaskGeneratorTest.IsWhiteSpace(i);
CharacterOld.IsWhiteSpace(i);
}
}

Expand All @@ -86,7 +86,7 @@ public void IsIdentifierStart_Old()
var source = FullRange ? _fullRange : _asciiRange;
foreach (var i in source)
{
CharMaskGeneratorTest.IsIdentifierStart(i);
CharacterOld.IsIdentifierStart(i);
}
}

Expand All @@ -106,7 +106,7 @@ public void IsIdentifierPart_Old()
var source = FullRange ? _fullRange : _asciiRange;
foreach (var i in source)
{
CharMaskGeneratorTest.IsIdentifierPart(i);
CharacterOld.IsIdentifierPart(i);
}
}

Expand Down
68 changes: 68 additions & 0 deletions samples/Esprima.Benchmark/CharacterOld.cs

Large diffs are not rendered by default.

3 changes: 0 additions & 3 deletions samples/Esprima.Benchmark/Esprima.Benchmark.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,4 @@
<ItemGroup>
<PackageReference Include="BenchmarkDotNet" Version="0.13.2" />
</ItemGroup>
<ItemGroup>
<Compile Include="..\..\test\Esprima.Tests\CharMaskGeneratorTest.cs" Link="CharMaskGeneratorTest.cs" />
</ItemGroup>
</Project>
46 changes: 46 additions & 0 deletions src/Esprima/CharRange.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
using System.Diagnostics;
using System.Runtime.CompilerServices;

namespace Esprima;

/// <summary>
/// An inclusive range of code points, together with a membership test over a
/// packed set of such ranges.
/// </summary>
internal readonly partial struct CharRange
{
    /// <summary>First code point of the range.</summary>
    public readonly int Start;

    /// <summary>Last code point of the range (inclusive).</summary>
    public readonly int End;

    public CharRange(int start, int end)
    {
        Start = start;
        End = end;
    }

    /// <summary>Number of code points covered; both ends are counted.</summary>
    public int Length => End - Start + 1;

    /// <summary>Returns true when <paramref name="codePoint"/> lies within [Start, End].</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public bool Contains(int codePoint) => codePoint >= Start && codePoint <= End;

    /// <summary>
    /// Tests whether <paramref name="codePoint"/> falls into any range of a packed range set.
    /// </summary>
    /// <param name="codePoint">The code point to look up.</param>
    /// <param name="charSet">
    /// Packed ranges: each entry holds the range start in the bits above the low 8 and an index
    /// into <paramref name="rangeLengthLookup"/> in the low 8 bits. The array is searched with
    /// <see cref="Array.BinarySearch(Array, object)"/>, so it is expected to be sorted ascending.
    /// </param>
    /// <param name="rangeLengthLookup">
    /// Table whose selected value is added to a range's start to obtain its inclusive end.
    /// </param>
    /// <returns>true if some range in <paramref name="charSet"/> contains the code point.</returns>
    internal static bool CharSetContains(int codePoint, int[] charSet, int[] rangeLengthLookup)
    {
        Debug.Assert(codePoint is >= 0 and <= Character.UnicodeLastCodePoint);

        // Same packing as the entries, with a zero lookup index in the low byte.
        var searchKey = codePoint << 8;

        var position = Array.BinarySearch(charSet, searchKey);
        if (position >= 0)
        {
            // An entry starting exactly at the code point exists.
            return true;
        }

        // No exact hit: the complement is the index of the first entry greater than the key.
        position = ~position;

        // That entry may still start at the code point (with a non-zero lookup index).
        if (position < charSet.Length
            && DecodeCharRange(charSet[position], rangeLengthLookup).Contains(codePoint))
        {
            return true;
        }

        // Otherwise only the preceding entry (which starts earlier) can cover the code point.
        return position > 0
            && DecodeCharRange(charSet[position - 1], rangeLengthLookup).Contains(codePoint);
    }

    /// <summary>
    /// Unpacks one entry of a packed range set into a <see cref="CharRange"/>.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static CharRange DecodeCharRange(int data, int[] rangeLengths)
    {
        var first = data >> 8;
        var last = first + rangeLengths[data & 0xFF];
        return new CharRange(first, last);
    }
}
2,724 changes: 1,082 additions & 1,642 deletions src/Esprima/Character.Generated.cs

Large diffs are not rendered by default.

91 changes: 16 additions & 75 deletions src/Esprima/Character.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using System.Globalization;
using System.Diagnostics;
using System.Runtime.CompilerServices;

namespace Esprima;
Expand All @@ -16,6 +16,12 @@ public static partial class Character
{
internal const int UnicodeLastCodePoint = 0x10FFFF;

/// <summary>
/// Tests whether <paramref name="flag"/> is set for <paramref name="ch"/> in the character data table.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static bool HasCharacterFlag(char ch, CharacterMask flag)
{
    // Each table entry is shared by two consecutive characters (index = ch / 2).
    var packed = s_characterData[ch >> 1];

    // Even characters use the flag bits as-is; odd characters use them shifted left by 4.
    var shift = (ch & 1) << 2;
    var mask = (byte) flag << shift;

    return (packed & mask) != 0;
}

// https://tc39.github.io/ecma262/#sec-line-terminators

internal static bool IsLineTerminator(char ch)
Expand All @@ -31,35 +37,31 @@ internal static bool IsLineTerminator(char ch)

internal static bool IsWhiteSpace(char ch)
{
return (_characterData[ch] & (byte) CharacterMask.WhiteSpace) != 0;
return HasCharacterFlag(ch, CharacterMask.WhiteSpace);
}

// https://tc39.github.io/ecma262/#sec-names-and-keywords

internal static bool IsIdentifierStart(char ch)
{
return (_characterData[ch] & (byte) CharacterMask.IdentifierStart) != 0;
return HasCharacterFlag(ch, CharacterMask.IdentifierStart);
}

internal static bool IsIdentifierStart(string s, int index)
internal static bool IsIdentifierStartAstral(int cp)
{
var ch = s[index];
return !char.IsHighSurrogate(ch)
? IsIdentifierStart(ch)
: IsIdentifierStartUnicodeCategory(CharUnicodeInfo.GetUnicodeCategory(s, index));
Debug.Assert(cp > char.MaxValue);
return CharRange.CharSetContains(cp, s_identifierStartAstralRanges, s_rangeLengthLookup);
}

internal static bool IsIdentifierPart(char ch)
{
return (_characterData[ch] & (byte) CharacterMask.IdentifierPart) != 0;
return HasCharacterFlag(ch, CharacterMask.IdentifierPart);
}

internal static bool IsIdentifierPart(string s, int index)
internal static bool IsIdentifierPartAstral(int cp)
{
var ch = s[index];
return !char.IsHighSurrogate(ch)
? IsIdentifierPart(ch)
: IsIdentifierPartUnicodeCategory(CharUnicodeInfo.GetUnicodeCategory(s, index));
Debug.Assert(cp > char.MaxValue);
return CharRange.CharSetContains(cp, s_identifierPartAstralRanges, s_rangeLengthLookup);
}

// https://tc39.github.io/ecma262/#sec-literals-numeric-literals
Expand All @@ -78,65 +80,4 @@ internal static bool IsIdentifierPart(string s, int index)

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsInRange(char c, char min, char max) => c - (uint) min <= max - (uint) min;

internal static bool IsIdentifierStartUnicodeCategory(UnicodeCategory cat)
{
return IsLetterChar(cat) || cat is UnicodeCategory.ModifierLetter or UnicodeCategory.NonSpacingMark;
}

internal static bool IsIdentifierPartUnicodeCategory(UnicodeCategory cat)
{
return IsLetterChar(cat)
|| IsDecimalDigitChar(cat)
|| IsConnectingChar(cat)
|| IsCombiningChar(cat)
|| IsFormattingChar(cat);
}

internal static bool IsLetterChar(UnicodeCategory cat)
{
return cat switch
{
UnicodeCategory.UppercaseLetter => true,
UnicodeCategory.LowercaseLetter => true,
UnicodeCategory.TitlecaseLetter => true,
UnicodeCategory.OtherLetter => true,
UnicodeCategory.LetterNumber => true,
UnicodeCategory.Surrogate => true,
UnicodeCategory.OtherNotAssigned => true,
UnicodeCategory.OtherNumber => true,

UnicodeCategory.MathSymbol => true,
UnicodeCategory.OtherSymbol => true,
UnicodeCategory.ModifierSymbol => true,

_ => false
};
}

private static bool IsCombiningChar(UnicodeCategory cat)
{
return cat is UnicodeCategory.NonSpacingMark or UnicodeCategory.SpacingCombiningMark;
}

private static bool IsDecimalDigitChar(UnicodeCategory cat)
{
return cat == UnicodeCategory.DecimalDigitNumber;
}

private static bool IsConnectingChar(UnicodeCategory cat)
{
return cat is UnicodeCategory.ConnectorPunctuation or UnicodeCategory.OtherPunctuation;
}

internal static bool IsFormattingChar(char ch)
{
// There are no FormattingChars in ASCII range
return ch > 127 && IsFormattingChar(CharUnicodeInfo.GetUnicodeCategory(ch));
}

internal static bool IsFormattingChar(UnicodeCategory cat)
{
return cat == UnicodeCategory.Format;
}
}
78 changes: 45 additions & 33 deletions src/Esprima/Scanner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -620,15 +620,10 @@ private string GetIdentifier()
while (!Eof())
{
var ch = _source[_index];
if (ch == 0x5C)
if ((ushort) ch is
0x5C // Backslash (U+005C) marks Unicode escape sequence.
or >= 0xD800 and <= 0xDFFF) // Need to handle surrogate pairs.
{
// Backslash (U+005C) marks Unicode escape sequence.
_index = start;
return GetComplexIdentifier();
}
else if ((ushort) ch is >= 0xD800 and <= 0xDFFF)
{
// Need to handle surrogate pairs.
_index = start;
return GetComplexIdentifier();
}
Expand Down Expand Up @@ -661,13 +656,27 @@ private string GetComplexIdentifier()

if (cp != 0x5C)
{
ch = ParserExtensions.CodePointOrSurrogateToString(cp);
if (ch.Length == 1 && char.IsSurrogate(ch[0]))
if (cp <= char.MaxValue)
{
ThrowUnexpectedToken();
if (char.IsSurrogate((char) cp))
{
ThrowUnexpectedToken();
}

sb.Append((char) cp);
_index++;
}
else
{
if (!Character.IsIdentifierStartAstral(cp))
{
ThrowUnexpectedToken();
}

ch = ParserExtensions.CodePointToString(cp);
sb.Append(ch);
_index += ch.Length;
}
sb.Append(ch);
_index += ch.Length;
}
// '\u' (U+005C, U+0075) denotes an escaped character.
else
Expand All @@ -689,14 +698,13 @@ private string GetComplexIdentifier()
: Messages.InvalidUnicodeEscapeSequence);
}

if (chcp > 0xFFFF)
if (chcp > char.MaxValue)
{
ch = ParserExtensions.CodePointToString(chcp);
if (!Character.IsIdentifierStart(ch, 0))
if (!Character.IsIdentifierStartAstral(chcp))
{
ThrowUnexpectedToken();
}
sb.Append(ch);
sb.Append(ParserExtensions.CodePointToString(chcp));
}
else
{
Expand All @@ -723,27 +731,32 @@ private string GetComplexIdentifier()

if (cp != 0x5C)
{
ch = ParserExtensions.CodePointOrSurrogateToString(cp);

if (ch.Length == 1)
if (cp <= char.MaxValue)
{
// IsIdentifierPart also matches the surrogate range (U+D800..U+DFFF) currently.
if (!Character.IsIdentifierPart(ch[0]))
// IsIdentifierPart also matches the surrogate range (U+D800..U+DFFF).
if (!Character.IsIdentifierPart((char) cp))
{
break;
}
else if (char.IsSurrogate(ch[0]))
else if (char.IsSurrogate((char) cp))
{
ThrowUnexpectedToken();
}

sb.Append((char) cp);
_index++;
}
else if (!Character.IsIdentifierPart(ch, 0))
else
{
break;
}
if (!Character.IsIdentifierPartAstral(cp))
{
break;
}

sb.Append(ch);
_index += ch.Length;
ch = ParserExtensions.CodePointToString(cp);
sb.Append(ch);
_index += ch.Length;
}
}
// '\u' (U+005C, U+0075) denotes an escaped character.
else
Expand All @@ -765,14 +778,13 @@ private string GetComplexIdentifier()
: Messages.InvalidUnicodeEscapeSequence);
}

if (chcp > 0xFFFF)
if (chcp > char.MaxValue)
{
ch = ParserExtensions.CodePointToString(chcp);
if (!Character.IsIdentifierPart(ch, 0))
if (!Character.IsIdentifierPartAstral(chcp))
{
ThrowUnexpectedToken();
}
sb.Append(ch);
sb.Append(ParserExtensions.CodePointToString(chcp));
}
else
{
Expand Down Expand Up @@ -2576,7 +2588,7 @@ internal Token Lex(in LexOptions options)

var cp = _source[_index];

// IsIdentifierStart also matches the surrogate range (U+D800..U+DFFF) currently.
// IsIdentifierStart also matches backslash and the surrogate range (U+D800..U+DFFF).
if (Character.IsIdentifierStart(cp))
{
return ScanIdentifier(options.AllowIdentifierEscape);
Expand Down
Loading

0 comments on commit 81f25cc

Please sign in to comment.