Keep the first of duplicate names per alternation path, excluding sub…

…routines
slevithan · Dec 25, 2024 · b8bbcb3 · b8bbcb3
1 parent 438f171
commit b8bbcb3
Show file tree

Hide file tree

Showing 5 changed files with 70 additions and 52 deletions.
diff --git a/README.md b/README.md
@@ -964,7 +964,7 @@ The following don't yet have any support, and throw errors. They're all infreque
 <a name="unicode"></a>
 ## ㊗️ Unicode
 
-Oniguruma-To-ES fully supports mixed case-sensitivity (through the use of pattern modifiers) and handles the Unicode edge cases regardless of JavaScript [target](#target).
+Oniguruma-To-ES fully supports mixed case-sensitivity (ex: `(?i)a(?-i)a`) and handles the Unicode edge cases regardless of JavaScript [target](#target).
 
 Oniguruma-To-ES focuses on being lightweight to make it better for use in browsers. This is partly achieved by not including heavyweight Unicode character data, which imposes a few minor/rare restrictions:
 

diff --git a/spec/match-capturing-group.spec.js b/spec/match-capturing-group.spec.js
@@ -14,55 +14,50 @@ describe('CapturingGroup', () => {
   // });
 
   describe('named', () => {
-    it('should keep only the last of duplicate names per alternation path with target ES2025', () => {
+    it('should keep only the first of duplicate names per alternation path with target ES2025', () => {
       const opts = {
         target: 'ES2025',
         verbose: true,
       };
       const tests = [
-        ['(?<a>)(?<a>)', '()(?<a>)'],
+        ['(?<a>)(?<a>)', '(?<a>)()'],
         ['(?<a>)|(?<a>)', '(?<a>)|(?<a>)'],
         ['((?<a>)|(?<a>))', '(?:(?<a>)|(?<a>))'],
-        ['(?<a>)(((?<a>)))', '()(?:(?:(?<a>)))'],
-        ['(((?<a>)))(?<a>)', '(?:(?:()))(?<a>)'],
+        ['(?<a>)(((?<a>)))', '(?<a>)(?:(?:()))'],
+        ['(((?<a>)))(?<a>)', '(?:(?:(?<a>)))()'],
         ['(?<a>)|(((?<a>)))', '(?<a>)|(?:(?:(?<a>)))'],
         ['(((?<a>)))|(?<a>)', '(?:(?:(?<a>)))|(?<a>)'],
-        ['(?<a>(?<a>))', '((?<a>))'],
-        ['(?<a>(?<a>))|(?<a>)', '((?<a>))|(?<a>)'],
-        ['(?<a>)(?<a>)(|(?<a>))(?<a>)', '()()(?:|())(?<a>)'],
-        ['((?<a>)(?<a>))(((?<a>)|(?<a>)))((?<a>))', '(?:()())(?:(?:()|()))(?:(?<a>))'],
-        ['(?<a>)(?<a>)((?<a>)|(?<a>))', '()()(?:(?<a>)|(?<a>))'],
+        ['(?<a>(?<a>))', '(?<a>())'],
+        ['(?<a>(?<a>))|(?<a>)', '(?<a>())|(?<a>)'],
+        ['(?<a>)(?<a>)(|(?<a>))(?<a>)', '(?<a>)()(?:|())()'],
+        ['((?<a>)(?<a>))(((?<a>)|(?<a>)))((?<a>))', '(?:(?<a>)())(?:(?:()|()))(?:())'],
+        ['(?<a>)(?<a>)((?<a>)|(?<a>))', '(?<a>)()(?:()|())'],
+        ['((?<a>)|(?<a>))(?<a>)(?<a>)', '(?:(?<a>)|(?<a>))()()'],
       ];
       for (const [pattern, output] of tests) {
         expect(toDetails(pattern, opts).pattern).toBe(output);
       }
     });
 
-    it('should convert all but one duplicate name to an unnamed capture with target < ES2025', () => {
+    it('should keep only the first of duplicate names with target < ES2025', () => {
       const opts = {
         target: 'ES2024',
         verbose: true,
       };
-      // Current behavior is to:
-      // 1. First, keep only the last instance of the name per alternation path (like for ES2025).
-      // 2. Next, strip duplicate names from all but the first alternative that includes it.
-      // Keeping only the last instance per alternation path (step 1) is important for correctness,
-      // but the choice of which path to keep the name for in step 2 is arbitrary. The current
-      // approach of keeping the first remaining instance of the name rather than the last is
-      // merely for implementation simplicity, and could change in the future
       const tests = [
-        ['(?<a>)(?<a>)', '()(?<a>)'],
+        ['(?<a>)(?<a>)', '(?<a>)()'],
         ['(?<a>)|(?<a>)', '(?<a>)|()'],
         ['((?<a>)|(?<a>))', '(?:(?<a>)|())'],
-        ['(?<a>)(((?<a>)))', '()(?:(?:(?<a>)))'],
-        ['(((?<a>)))(?<a>)', '(?:(?:()))(?<a>)'],
+        ['(?<a>)(((?<a>)))', '(?<a>)(?:(?:()))'],
+        ['(((?<a>)))(?<a>)', '(?:(?:(?<a>)))()'],
         ['(?<a>)|(((?<a>)))', '(?<a>)|(?:(?:()))'],
         ['(((?<a>)))|(?<a>)', '(?:(?:(?<a>)))|()'],
-        ['(?<a>(?<a>))', '((?<a>))'],
-        ['(?<a>(?<a>))|(?<a>)', '((?<a>))|()'],
-        ['(?<a>)(?<a>)(|(?<a>))(?<a>)', '()()(?:|())(?<a>)'],
-        ['((?<a>)(?<a>))(((?<a>)|(?<a>)))((?<a>))', '(?:()())(?:(?:()|()))(?:(?<a>))'],
-        ['(?<a>)(?<a>)((?<a>)|(?<a>))', '()()(?:(?<a>)|())'],
+        ['(?<a>(?<a>))', '(?<a>())'],
+        ['(?<a>(?<a>))|(?<a>)', '(?<a>())|()'],
+        ['(?<a>)(?<a>)(|(?<a>))(?<a>)', '(?<a>)()(?:|())()'],
+        ['((?<a>)(?<a>))(((?<a>)|(?<a>)))((?<a>))', '(?:(?<a>)())(?:(?:()|()))(?:())'],
+        ['(?<a>)(?<a>)((?<a>)|(?<a>))', '(?<a>)()(?:()|())'],
+        ['((?<a>)|(?<a>))(?<a>)(?<a>)', '(?:(?<a>)|())()()'],
       ];
       for (const [pattern, output] of tests) {
         expect(toDetails(pattern, opts).pattern).toBe(output);

diff --git a/spec/match-subroutine.spec.js b/spec/match-subroutine.spec.js
@@ -1,4 +1,4 @@
-import {toDetails} from '../dist/index.mjs';
+import {toDetails, toRegExp} from '../dist/index.mjs';
 import {r} from '../src/utils.js';
 import {matchers} from './helpers/matchers.js';
 
@@ -119,5 +119,15 @@ describe('Subroutine', () => {
       expect('baaba').toExactlyMatch(r`\g<b>(?<a>a)(?<b>b\g<a>)`);
       expect('abcbcc').toExactlyMatch(r`(?<a>a\g<b>)(?<b>b\g<c>)(?<c>c)`);
     });
+
+    it('should not use duplicate names for subroutines', () => {
+      expect(toRegExp(r`(?<n>[ab])\g<n>`).exec('ab').groups.n).toBe('a');
+      expect(toRegExp(r`\g<n>(?<n>[ab])`).exec('ab').groups.n).toBe('b');
+    });
+
+    it('should not use duplicate names for subroutine child captures', () => {
+      expect(toRegExp(r`(?<n1>(?<n2>[ab]))\g<n1>`).exec('ab').groups.n2).toBe('a');
+      expect(toRegExp(r`\g<n1>(?<n1>(?<n2>[ab]))`).exec('ab').groups.n2).toBe('b');
+    });
   });
 });
diff --git a/src/index.js b/src/index.js
@@ -10,7 +10,7 @@ import {recursion} from 'regex-recursion';
 // The transformation and error checking for Oniguruma's unique syntax and behavior differences
 // compared to native JS RegExp is layered into all steps of the compilation process:
 // 1. Tokenizer: Understands Oniguruma syntax, with many large and small differences from JS.
-// 2. Parser: Builds an Oniguruma AST from the tokens with understanding of Oniguruma differences.
+// 2. Parser: Builds an Oniguruma AST from the tokens, with understanding of Oniguruma differences.
 // 3. Transformer: Converts the Oniguruma AST to a Regex+ AST that preserves all Oniguruma
 //    behavior. This is true even in cases of non-native-JS features that are supported by both
 //    Regex+ and Oniguruma but with subtly different behavior in each (subroutines, flag x).
@@ -98,6 +98,11 @@ function toOnigurumaAst(pattern, options) {
   return parse(tokenize(pattern, flags, {captureGroup}));
 }
 
+// // Returns a Regex+ AST generated from an Oniguruma pattern
+// function toRegexAst(pattern, options) {
+//   return transform(toOnigurumaAst(pattern, options));
+// }
+
 /**
 Accepts an Oniguruma pattern and returns an equivalent JavaScript `RegExp`.
 @param {string} pattern Oniguruma regex pattern.
@@ -116,5 +121,6 @@ export {
   EmulatedRegExp,
   toDetails,
   toOnigurumaAst,
+  // toRegexAst,
   toRegExp,
 };
diff --git a/src/transform.js b/src/transform.js
@@ -374,23 +374,6 @@ const SecondPassVisitor = {
     }
   },
 
-  Recursion({node, parent}, {reffedNodesByReferencer}) {
-    // Recursion nodes are created during the current traversal; they're only traversed here if a
-    // recursion node created during traversal is then copied by a subroutine expansion, e.g. with
-    // `(?<a>\g<a>)\g<a>`
-    const {ref} = node;
-    // Immediate parent is an alternative or quantifier; can skip
-    let reffed = parent;
-    while ((reffed = reffed.parent)) {
-      if (reffed.type === AstTypes.CapturingGroup && (reffed.name === ref || reffed.number === ref)) {
-        break;
-      }
-    }
-    // Track the referenced node because `ref`s are rewritten in a subsequent pass; capturing group
-    // names and numbers might change due to subroutine expansion and duplicate group names
-    reffedNodesByReferencer.set(node, reffed);
-  },
-
   CapturingGroup: {
     enter(
       { node,
@@ -459,15 +442,22 @@ const SecondPassVisitor = {
       // subroutine expansion)
       if (node.name) {
         const groupsWithSameName = getOrCreate(groupsByName, node.name, new Map());
-        for (const groupInfo of groupsWithSameName.values()) {
-          if (!groupInfo.hasDuplicateNameToRemove && canParticipateWithNode(groupInfo.node, node, {
-            ancestorsParticipate: true,
-          })) {
-            // Will change the earlier instance with this name to an unnamed capture in a later pass
-            groupInfo.hasDuplicateNameToRemove = true;
+        let hasDuplicateNameToRemove = false;
+        if (origin) {
+          // Subroutines and their child captures shouldn't hold duplicate names in the final state
+          hasDuplicateNameToRemove = true;
+        } else {
+          for (const groupInfo of groupsWithSameName.values()) {
+            if (!groupInfo.hasDuplicateNameToRemove && canParticipateWithNode(groupInfo.node, node, {
+              ancestorsParticipate: true,
+            })) {
+              // Will change to an unnamed capture in a later pass
+              hasDuplicateNameToRemove = true;
+              break;
+            }
           }
         }
-        groupsByName.get(node.name).set(node, {node});
+        groupsByName.get(node.name).set(node, {node, hasDuplicateNameToRemove});
       }
     },
     exit({node}, {openRefs}) {
@@ -488,6 +478,23 @@ const SecondPassVisitor = {
     },
   },
 
+  Recursion({node, parent}, {reffedNodesByReferencer}) {
+    // Recursion nodes are created during the current traversal; they're only traversed here if a
+    // recursion node created during traversal is then copied by a subroutine expansion, e.g. with
+    // `(?<a>\g<a>)\g<a>`
+    const {ref} = node;
+    // Immediate parent is an alternative or quantifier; can skip
+    let reffed = parent;
+    while ((reffed = reffed.parent)) {
+      if (reffed.type === AstTypes.CapturingGroup && (reffed.name === ref || reffed.number === ref)) {
+        break;
+      }
+    }
+    // Track the referenced node because `ref`s are rewritten in a subsequent pass; capturing group
+    // names and numbers might change due to subroutine expansion and duplicate group names
+    reffedNodesByReferencer.set(node, reffed);
+  },
+
   Subroutine(path, state) {
     const {node, replaceWith} = path;
     const {ref} = node;