From b8bbcb315964319c383482ac58d54df86349e8e2 Mon Sep 17 00:00:00 2001 From: Steven Levithan Date: Wed, 25 Dec 2024 22:35:27 +0100 Subject: [PATCH] Keep the first of duplicate names per alternation path, excluding subroutines --- README.md | 2 +- spec/match-capturing-group.spec.js | 45 +++++++++++------------- spec/match-subroutine.spec.js | 12 ++++++- src/index.js | 8 ++++- src/transform.js | 55 +++++++++++++++++------------- 5 files changed, 70 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index 1283176..62e2b87 100644 --- a/README.md +++ b/README.md @@ -964,7 +964,7 @@ The following don't yet have any support, and throw errors. They're all infreque ## ㊗️ Unicode -Oniguruma-To-ES fully supports mixed case-sensitivity (through the use of pattern modifiers) and handles the Unicode edge cases regardless of JavaScript [target](#target). +Oniguruma-To-ES fully supports mixed case-sensitivity (ex: `(?i)a(?-i)a`) and handles the Unicode edge cases regardless of JavaScript [target](#target). Oniguruma-To-ES focuses on being lightweight to make it better for use in browsers. This is partly achieved by not including heavyweight Unicode character data, which imposes a few minor/rare restrictions: diff --git a/spec/match-capturing-group.spec.js b/spec/match-capturing-group.spec.js index e6b2ff7..9677a01 100644 --- a/spec/match-capturing-group.spec.js +++ b/spec/match-capturing-group.spec.js @@ -14,55 +14,50 @@ describe('CapturingGroup', () => { // }); describe('named', () => { - it('should keep only the last of duplicate names per alternation path with target ES2025', () => { + it('should keep only the first of duplicate names per alternation path with target ES2025', () => { const opts = { target: 'ES2025', verbose: true, }; const tests = [ - ['(?)(?)', '()(?)'], + ['(?)(?)', '(?)()'], ['(?)|(?)', '(?)|(?)'], ['((?)|(?))', '(?:(?)|(?))'], - ['(?)(((?)))', '()(?:(?:(?)))'], - ['(((?)))(?)', '(?:(?:()))(?)'], + ['(?)(((?)))', '(?)(?:(?:()))'], + ['(((?)))(?)', '(?:(?:(?)))()'], ['(?)|(((?)))', '(?)|(?:(?:(?)))'], ['(((?)))|(?)', '(?:(?:(?)))|(?)'], - ['(?(?))', '((?))'], - ['(?(?))|(?)', '((?))|(?)'], - ['(?)(?)(|(?))(?)', '()()(?:|())(?)'], - ['((?)(?))(((?)|(?)))((?))', '(?:()())(?:(?:()|()))(?:(?))'], - ['(?)(?)((?)|(?))', '()()(?:(?)|(?))'], + ['(?(?))', '(?())'], + ['(?(?))|(?)', '(?())|(?)'], + ['(?)(?)(|(?))(?)', '(?)()(?:|())()'], + ['((?)(?))(((?)|(?)))((?))', '(?:(?)())(?:(?:()|()))(?:())'], + ['(?)(?)((?)|(?))', '(?)()(?:()|())'], + ['((?)|(?))(?)(?)', '(?:(?)|(?))()()'], ]; for (const [pattern, output] of tests) { expect(toDetails(pattern, opts).pattern).toBe(output); } }); - it('should convert all but one duplicate name to an unnamed capture with target < ES2025', () => { + it('should keep only the first of duplicate names with target < ES2025', () => { const opts = { target: 'ES2024', verbose: true, }; - // Current behavior is to: - // 1. First, keep only the last instance of the name per alternation path (like for ES2025). - // 2. Next, strip duplicate names from all but the first alternative that includes it. - // Keeping only the last instance per alternation path (step 1) is important for correctness, - // but the choice of which path to keep the name for in step 2 is arbitrary. The current - // approach of keeping the first remaining instance of the name rather than the last is - // merely for implementation simplicity, and could change in the future const tests = [ - ['(?)(?)', '()(?)'], + ['(?)(?)', '(?)()'], ['(?)|(?)', '(?)|()'], ['((?)|(?))', '(?:(?)|())'], - ['(?)(((?)))', '()(?:(?:(?)))'], - ['(((?)))(?)', '(?:(?:()))(?)'], + ['(?)(((?)))', '(?)(?:(?:()))'], + ['(((?)))(?)', '(?:(?:(?)))()'], ['(?)|(((?)))', '(?)|(?:(?:()))'], ['(((?)))|(?)', '(?:(?:(?)))|()'], - ['(?(?))', '((?))'], - ['(?(?))|(?)', '((?))|()'], - ['(?)(?)(|(?))(?)', '()()(?:|())(?)'], - ['((?)(?))(((?)|(?)))((?))', '(?:()())(?:(?:()|()))(?:(?))'], - ['(?)(?)((?)|(?))', '()()(?:(?)|())'], + ['(?(?))', '(?())'], + ['(?(?))|(?)', '(?())|()'], + ['(?)(?)(|(?))(?)', '(?)()(?:|())()'], + ['((?)(?))(((?)|(?)))((?))', '(?:(?)())(?:(?:()|()))(?:())'], + ['(?)(?)((?)|(?))', '(?)()(?:()|())'], + ['((?)|(?))(?)(?)', '(?:(?)|())()()'], ]; for (const [pattern, output] of tests) { expect(toDetails(pattern, opts).pattern).toBe(output); diff --git a/spec/match-subroutine.spec.js b/spec/match-subroutine.spec.js index 1b57825..4e0ebac 100644 --- a/spec/match-subroutine.spec.js +++ b/spec/match-subroutine.spec.js @@ -1,4 +1,4 @@ -import {toDetails} from '../dist/index.mjs'; +import {toDetails, toRegExp} from '../dist/index.mjs'; import {r} from '../src/utils.js'; import {matchers} from './helpers/matchers.js'; @@ -119,5 +119,15 @@ describe('Subroutine', () => { expect('baaba').toExactlyMatch(r`\g(?a)(?b\g)`); expect('abcbcc').toExactlyMatch(r`(?a\g)(?b\g)(?c)`); }); + + it('should not use duplicate names for subroutines', () => { + expect(toRegExp(r`(?[ab])\g`).exec('ab').groups.n).toBe('a'); + expect(toRegExp(r`\g(?[ab])`).exec('ab').groups.n).toBe('b'); + }); + + it('should not use duplicate names for subroutine child captures', () => { + expect(toRegExp(r`(?(?[ab]))\g`).exec('ab').groups.n2).toBe('a'); + expect(toRegExp(r`\g(?(?[ab]))`).exec('ab').groups.n2).toBe('b'); + }); }); }); diff --git a/src/index.js b/src/index.js index b4aeb26..93f48c0 100644 --- a/src/index.js +++ b/src/index.js @@ -10,7 +10,7 @@ import {recursion} from 'regex-recursion'; // The transformation and error checking for Oniguruma's unique syntax and behavior differences // compared to native JS RegExp is layered into all steps of the compilation process: // 1. Tokenizer: Understands Oniguruma syntax, with many large and small differences from JS. -// 2. Parser: Builds an Oniguruma AST from the tokens with understanding of Oniguruma differences. +// 2. Parser: Builds an Oniguruma AST from the tokens, with understanding of Oniguruma differences. // 3. Transformer: Converts the Oniguruma AST to a Regex+ AST that preserves all Oniguruma // behavior. This is true even in cases of non-native-JS features that are supported by both // Regex+ and Oniguruma but with subtly different behavior in each (subroutines, flag x). @@ -98,6 +98,11 @@ function toOnigurumaAst(pattern, options) { return parse(tokenize(pattern, flags, {captureGroup})); } +// // Returns a Regex+ AST generated from an Oniguruma pattern +// function toRegexAst(pattern, options) { +// return transform(toOnigurumaAst(pattern, options)); +// } + /** Accepts an Oniguruma pattern and returns an equivalent JavaScript `RegExp`. @param {string} pattern Oniguruma regex pattern. @@ -116,5 +121,6 @@ export { EmulatedRegExp, toDetails, toOnigurumaAst, + // toRegexAst, toRegExp, }; diff --git a/src/transform.js b/src/transform.js index 9ee151d..afbce94 100644 --- a/src/transform.js +++ b/src/transform.js @@ -374,23 +374,6 @@ const SecondPassVisitor = { } }, - Recursion({node, parent}, {reffedNodesByReferencer}) { - // Recursion nodes are created during the current traversal; they're only traversed here if a - // recursion node created during traversal is then copied by a subroutine expansion, e.g. with - // `(?\g)\g` - const {ref} = node; - // Immediate parent is an alternative or quantifier; can skip - let reffed = parent; - while ((reffed = reffed.parent)) { - if (reffed.type === AstTypes.CapturingGroup && (reffed.name === ref || reffed.number === ref)) { - break; - } - } - // Track the referenced node because `ref`s are rewritten in a subsequent pass; capturing group - // names and numbers might change due to subroutine expansion and duplicate group names - reffedNodesByReferencer.set(node, reffed); - }, - CapturingGroup: { enter( { node, @@ -459,15 +442,22 @@ const SecondPassVisitor = { // subroutine expansion) if (node.name) { const groupsWithSameName = getOrCreate(groupsByName, node.name, new Map()); - for (const groupInfo of groupsWithSameName.values()) { - if (!groupInfo.hasDuplicateNameToRemove && canParticipateWithNode(groupInfo.node, node, { - ancestorsParticipate: true, - })) { - // Will change the earlier instance with this name to an unnamed capture in a later pass - groupInfo.hasDuplicateNameToRemove = true; + let hasDuplicateNameToRemove = false; + if (origin) { + // Subroutines and their child captures shouldn't hold duplicate names in the final state + hasDuplicateNameToRemove = true; + } else { + for (const groupInfo of groupsWithSameName.values()) { + if (!groupInfo.hasDuplicateNameToRemove && canParticipateWithNode(groupInfo.node, node, { + ancestorsParticipate: true, + })) { + // Will change to an unnamed capture in a later pass + hasDuplicateNameToRemove = true; + break; + } } } - groupsByName.get(node.name).set(node, {node}); + groupsByName.get(node.name).set(node, {node, hasDuplicateNameToRemove}); } }, exit({node}, {openRefs}) { @@ -488,6 +478,23 @@ const SecondPassVisitor = { }, }, + Recursion({node, parent}, {reffedNodesByReferencer}) { + // Recursion nodes are created during the current traversal; they're only traversed here if a + // recursion node created during traversal is then copied by a subroutine expansion, e.g. with + // `(?\g)\g` + const {ref} = node; + // Immediate parent is an alternative or quantifier; can skip + let reffed = parent; + while ((reffed = reffed.parent)) { + if (reffed.type === AstTypes.CapturingGroup && (reffed.name === ref || reffed.number === ref)) { + break; + } + } + // Track the referenced node because `ref`s are rewritten in a subsequent pass; capturing group + // names and numbers might change due to subroutine expansion and duplicate group names + reffedNodesByReferencer.set(node, reffed); + }, + Subroutine(path, state) { const {node, replaceWith} = path; const {ref} = node;