From e9ace1a5a130b49ee020e316e87a4d6b0d1076db Mon Sep 17 00:00:00 2001 From: adams85 <31276480+adams85@users.noreply.github.com> Date: Mon, 7 Aug 2023 23:21:28 +0200 Subject: [PATCH] Fix capturing group numbering bug and a few more regex-related issues (#1614) --- .../Test262Harness.settings.json | 6 -- Jint/Jint.csproj | 2 +- Jint/Native/RegExp/JsRegExp.cs | 3 + Jint/Native/RegExp/RegExpConstructor.cs | 12 +-- Jint/Native/RegExp/RegExpPrototype.cs | 80 ++++++++----------- .../Expressions/JintLiteralExpression.cs | 7 +- 6 files changed, 47 insertions(+), 63 deletions(-) diff --git a/Jint.Tests.Test262/Test262Harness.settings.json b/Jint.Tests.Test262/Test262Harness.settings.json index 54a56f0059..88fa50bd24 100644 --- a/Jint.Tests.Test262/Test262Harness.settings.json +++ b/Jint.Tests.Test262/Test262Harness.settings.json @@ -44,14 +44,8 @@ "language/literals/regexp/named-groups/forward-reference.js", // RegExp handling problems - "built-ins/RegExp/named-groups/non-unicode-match.js", - "built-ins/RegExp/named-groups/non-unicode-property-names-valid.js", - "built-ins/RegExp/named-groups/unicode-match.js", - "built-ins/RegExp/named-groups/unicode-property-names-valid.js", "built-ins/RegExp/prototype/exec/S15.10.6.2_A1_T6.js", - "built-ins/String/prototype/split/separator-regexp.js", "language/literals/regexp/u-case-mapping.js", - "language/literals/regexp/u-surrogate-pairs-atom-escape-decimal.js", // requires investigation how to process complex function name evaluation for property "built-ins/Function/prototype/toString/method-computed-property-name.js", diff --git a/Jint/Jint.csproj b/Jint/Jint.csproj index 5dfab6951e..cc591bc37b 100644 --- a/Jint/Jint.csproj +++ b/Jint/Jint.csproj @@ -16,7 +16,7 @@ - + diff --git a/Jint/Native/RegExp/JsRegExp.cs b/Jint/Native/RegExp/JsRegExp.cs index bacd74c5cb..629fabae0f 100644 --- a/Jint/Native/RegExp/JsRegExp.cs +++ b/Jint/Native/RegExp/JsRegExp.cs @@ -1,4 +1,5 @@ using System.Text.RegularExpressions; +using Esprima; using Jint.Native.Object; using Jint.Runtime; using Jint.Runtime.Descriptors; @@ -62,6 +63,8 @@ public string Flags } } + public RegExpParseResult ParseResult { get; set; } + public bool DotAll { get; private set; } public bool Global { get; private set; } public bool Indices { get; private set; } diff --git a/Jint/Native/RegExp/RegExpConstructor.cs b/Jint/Native/RegExp/RegExpConstructor.cs index 3a339fc672..9e0a174377 100644 --- a/Jint/Native/RegExp/RegExpConstructor.cs +++ b/Jint/Native/RegExp/RegExpConstructor.cs @@ -104,14 +104,15 @@ private ObjectInstance RegExpInitialize(JsRegExp r, JsValue pattern, JsValue fla try { - var regExp = Scanner.AdaptRegExp(p, f, compiled: false, _engine.Options.Constraints.RegexTimeout); + var regExpParseResult = Scanner.AdaptRegExp(p, f, compiled: false, _engine.Options.Constraints.RegexTimeout); - if (regExp is null) + if (!regExpParseResult.Success) { - ExceptionHelper.ThrowSyntaxError(_realm, $"Unsupported regular expression: '/{p}/{flags}'"); + ExceptionHelper.ThrowSyntaxError(_realm, $"Unsupported regular expression. {regExpParseResult.ConversionError!.Description}"); } - r.Value = regExp; + r.Value = regExpParseResult.Regex!; + r.ParseResult = regExpParseResult; } catch (Exception ex) { @@ -135,12 +136,13 @@ private JsRegExp RegExpAlloc(JsValue newTarget) return r; } - public JsRegExp Construct(Regex regExp, string source, string flags) + public JsRegExp Construct(Regex regExp, string source, string flags, RegExpParseResult regExpParseResult = default) { var r = RegExpAlloc(this); r.Value = regExp; r.Source = source; r.Flags = flags; + r.ParseResult = regExpParseResult; RegExpInitialize(r); diff --git a/Jint/Native/RegExp/RegExpPrototype.cs b/Jint/Native/RegExp/RegExpPrototype.cs index 65fbe03f07..7912a64859 100644 --- a/Jint/Native/RegExp/RegExpPrototype.cs +++ b/Jint/Native/RegExp/RegExpPrototype.cs @@ -1,5 +1,4 @@ using System.Diagnostics.CodeAnalysis; -using System.Text; using System.Text.RegularExpressions; using Jint.Collections; using Jint.Native.Number; @@ -168,16 +167,17 @@ private JsValue Replace(JsValue thisObject, JsValue[] arguments) { string Evaluator(Match match) { - var replacerArgs = new List(match.Groups.Count + 2); + var actualGroupCount = GetActualRegexGroupCount(rei, match); + var replacerArgs = new List(actualGroupCount + 2); replacerArgs.Add(match.Value); ObjectInstance? groups = null; - for (var i = 1; i < match.Groups.Count; i++) + for (var i = 1; i < actualGroupCount; i++) { var capture = match.Groups[i]; replacerArgs.Add(capture.Success ? capture.Value : Undefined); - var groupName = GetRegexGroupName(rei.Value, i); + var groupName = GetRegexGroupName(rei, i); if (!string.IsNullOrWhiteSpace(groupName)) { groups ??= OrdinaryObjectCreate(_engine, null); @@ -475,21 +475,13 @@ private JsValue Split(JsValue thisObject, JsValue[] arguments) } var a = _realm.Intrinsics.Array.Construct(Arguments.Empty); - var match = R.Value.Match(s, 0); - - if (!match.Success) // No match at all return the string in an array - { - a.SetIndexValue(0, s, updateLength: true); - return a; - } int lastIndex = 0; uint index = 0; - while (match.Success && index < lim) + for (var match = R.Value.Match(s, 0); match.Success; match = match.NextMatch()) { if (match.Length == 0 && (match.Index == 0 || match.Index == s.Length || match.Index == lastIndex)) { - match = match.NextMatch(); continue; } @@ -502,7 +494,8 @@ private JsValue Split(JsValue thisObject, JsValue[] arguments) } lastIndex = match.Index + match.Length; - for (int i = 1; i < match.Groups.Count; i++) + var actualGroupCount = GetActualRegexGroupCount(R, match); + for (int i = 1; i < actualGroupCount; i++) { var group = match.Groups[i]; var item = Undefined; @@ -518,14 +511,11 @@ private JsValue Split(JsValue thisObject, JsValue[] arguments) return a; } } - - match = match.NextMatch(); - if (!match.Success) // Add the last part of the split - { - a.SetIndexValue(index++, s.Substring(lastIndex), updateLength: true); - } } + // Add the last part of the split + a.SetIndexValue(index, s.Substring(lastIndex), updateLength: true); + return a; } @@ -720,7 +710,7 @@ private JsValue Match(JsValue thisObject, JsValue[] arguments) match = match.NextMatch(); if (!match.Success || match.Index != ++li) break; - a.SetIndexValue(li, match.Value, updateLength: false); + a.SetIndexValue(li, match.Value, updateLength: false); } a.SetLength(li); return a; @@ -898,7 +888,7 @@ private static JsValue RegExpBuiltinExec(JsRegExp R, string s) return Null; } - return CreateReturnValueArray(R.Engine, matcher, m, s, fullUnicode: false, hasIndices: false); + return CreateReturnValueArray(R, m, s, fullUnicode: false, hasIndices: false); } // the stateful version @@ -949,25 +939,26 @@ private static JsValue RegExpBuiltinExec(JsRegExp R, string s) R.Set(JsRegExp.PropertyLastIndex, e, true); } - return CreateReturnValueArray(R.Engine, matcher, match, s, fullUnicode, hasIndices); + return CreateReturnValueArray(R, match, s, fullUnicode, hasIndices); } private static JsArray CreateReturnValueArray( - Engine engine, - Regex regex, + JsRegExp rei, Match match, string s, bool fullUnicode, bool hasIndices) { - var array = engine.Realm.Intrinsics.Array.ArrayCreate((ulong) match.Groups.Count); + var engine = rei.Engine; + var actualGroupCount = GetActualRegexGroupCount(rei, match); + var array = engine.Realm.Intrinsics.Array.ArrayCreate((ulong) actualGroupCount); array.CreateDataProperty(PropertyIndex, match.Index); array.CreateDataProperty(PropertyInput, s); ObjectInstance? groups = null; List? groupNames = null; - var indices = hasIndices ? new List(match.Groups.Count) : null; - for (uint i = 0; i < match.Groups.Count; i++) + var indices = hasIndices ? new List(actualGroupCount) : null; + for (uint i = 0; i < actualGroupCount; i++) { var capture = match.Groups[(int) i]; var capturedValue = Undefined; @@ -988,7 +979,7 @@ private static JsArray CreateReturnValueArray( } } - var groupName = GetRegexGroupName(regex, (int) i); + var groupName = GetRegexGroupName(rei, (int) i); if (!string.IsNullOrWhiteSpace(groupName)) { groups ??= OrdinaryObjectCreate(engine, null); @@ -1055,35 +1046,28 @@ private static JsValue GetMatchIndexPair(Engine engine, string s, JsNumber[] mat return engine.Realm.Intrinsics.Array.CreateArrayFromList(match); } - private static string? GetRegexGroupName(Regex regex, int index) + private static int GetActualRegexGroupCount(JsRegExp rei, Match match) + { + return rei.ParseResult.Success ? rei.ParseResult.ActualRegexGroupCount : match.Groups.Count; + } + + private static string? GetRegexGroupName(JsRegExp rei, int index) { if (index == 0) { return null; } + var regex = rei.Value; + if (rei.ParseResult.Success) + { + return rei.ParseResult.GetRegexGroupName(index); + } + var groupNameFromNumber = regex.GroupNameFromNumber(index); if (groupNameFromNumber.Length == 1 && groupNameFromNumber[0] == 48 + index) { // regex defaults to index as group name when it's not a named group return null; - - } - - // The characters allowed in group names differs between the JS and .NET regex engines. - // For example the group name "$group" is valid in JS but invalid in .NET. - // As a workaround for this issue, the parser make an attempt to encode the problematic group names to - // names which are valid in .NET and probably won't collide with other group names present in the pattern - // (https://github.com/sebastienros/esprima-dotnet/blob/v3.0.0-rc-03/src/Esprima/Scanner.RegExpParser.cs#L942). - // We need to decode such group names. - const string encodedGroupNamePrefix = "__utf8_"; - if (groupNameFromNumber.StartsWith(encodedGroupNamePrefix, StringComparison.Ordinal)) - { - try - { - var bytes = groupNameFromNumber.AsSpan(encodedGroupNamePrefix.Length).BytesFromHexString(); - groupNameFromNumber = Encoding.UTF8.GetString(bytes); - } - catch { /* intentional no-op */ } } return groupNameFromNumber; diff --git a/Jint/Runtime/Interpreter/Expressions/JintLiteralExpression.cs b/Jint/Runtime/Interpreter/Expressions/JintLiteralExpression.cs index de5de54832..04211a6294 100644 --- a/Jint/Runtime/Interpreter/Expressions/JintLiteralExpression.cs +++ b/Jint/Runtime/Interpreter/Expressions/JintLiteralExpression.cs @@ -76,12 +76,13 @@ private JsValue ResolveValue(EvaluationContext context) if (expression.TokenType == TokenType.RegularExpression) { var regExpLiteral = (RegExpLiteral) _expression; - if (regExpLiteral.Value is System.Text.RegularExpressions.Regex regex) + var regExpParseResult = regExpLiteral.ParseResult; + if (regExpParseResult.Success) { - return context.Engine.Realm.Intrinsics.RegExp.Construct(regex, regExpLiteral.Regex.Pattern, regExpLiteral.Regex.Flags); + return context.Engine.Realm.Intrinsics.RegExp.Construct(regExpParseResult.Regex!, regExpLiteral.Regex.Pattern, regExpLiteral.Regex.Flags, regExpParseResult); } - ExceptionHelper.ThrowSyntaxError(context.Engine.Realm, $"Unsupported regular expression: '{regExpLiteral.Regex.Pattern}/{regExpLiteral.Regex.Flags}'"); + ExceptionHelper.ThrowSyntaxError(context.Engine.Realm, $"Unsupported regular expression. {regExpParseResult.ConversionError!.Description}"); } return JsValue.FromObject(context.Engine, expression.Value);