Skip to content

Commit

Permalink
Keep the first of duplicate names per alternation path, excluding sub…
Browse files Browse the repository at this point in the history
…routines
  • Loading branch information
slevithan committed Dec 25, 2024
1 parent 438f171 commit b8bbcb3
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 52 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -964,7 +964,7 @@ The following don't yet have any support, and throw errors. They're all infreque
<a name="unicode"></a>
## ㊗️ Unicode

Oniguruma-To-ES fully supports mixed case-sensitivity (through the use of pattern modifiers) and handles the Unicode edge cases regardless of JavaScript [target](#target).
Oniguruma-To-ES fully supports mixed case-sensitivity (ex: `(?i)a(?-i)a`) and handles the Unicode edge cases regardless of JavaScript [target](#target).

Oniguruma-To-ES focuses on being lightweight to make it better for use in browsers. This is partly achieved by not including heavyweight Unicode character data, which imposes a few minor/rare restrictions:

Expand Down
45 changes: 20 additions & 25 deletions spec/match-capturing-group.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,55 +14,50 @@ describe('CapturingGroup', () => {
// });

describe('named', () => {
it('should keep only the last of duplicate names per alternation path with target ES2025', () => {
it('should keep only the first of duplicate names per alternation path with target ES2025', () => {
const opts = {
target: 'ES2025',
verbose: true,
};
const tests = [
['(?<a>)(?<a>)', '()(?<a>)'],
['(?<a>)(?<a>)', '(?<a>)()'],
['(?<a>)|(?<a>)', '(?<a>)|(?<a>)'],
['((?<a>)|(?<a>))', '(?:(?<a>)|(?<a>))'],
['(?<a>)(((?<a>)))', '()(?:(?:(?<a>)))'],
['(((?<a>)))(?<a>)', '(?:(?:()))(?<a>)'],
['(?<a>)(((?<a>)))', '(?<a>)(?:(?:()))'],
['(((?<a>)))(?<a>)', '(?:(?:(?<a>)))()'],
['(?<a>)|(((?<a>)))', '(?<a>)|(?:(?:(?<a>)))'],
['(((?<a>)))|(?<a>)', '(?:(?:(?<a>)))|(?<a>)'],
['(?<a>(?<a>))', '((?<a>))'],
['(?<a>(?<a>))|(?<a>)', '((?<a>))|(?<a>)'],
['(?<a>)(?<a>)(|(?<a>))(?<a>)', '()()(?:|())(?<a>)'],
['((?<a>)(?<a>))(((?<a>)|(?<a>)))((?<a>))', '(?:()())(?:(?:()|()))(?:(?<a>))'],
['(?<a>)(?<a>)((?<a>)|(?<a>))', '()()(?:(?<a>)|(?<a>))'],
['(?<a>(?<a>))', '(?<a>())'],
['(?<a>(?<a>))|(?<a>)', '(?<a>())|(?<a>)'],
['(?<a>)(?<a>)(|(?<a>))(?<a>)', '(?<a>)()(?:|())()'],
['((?<a>)(?<a>))(((?<a>)|(?<a>)))((?<a>))', '(?:(?<a>)())(?:(?:()|()))(?:())'],
['(?<a>)(?<a>)((?<a>)|(?<a>))', '(?<a>)()(?:()|())'],
['((?<a>)|(?<a>))(?<a>)(?<a>)', '(?:(?<a>)|(?<a>))()()'],
];
for (const [pattern, output] of tests) {
expect(toDetails(pattern, opts).pattern).toBe(output);
}
});

it('should convert all but one duplicate name to an unnamed capture with target < ES2025', () => {
it('should keep only the first of duplicate names with target < ES2025', () => {
const opts = {
target: 'ES2024',
verbose: true,
};
// Current behavior is to:
// 1. First, keep only the last instance of the name per alternation path (like for ES2025).
// 2. Next, strip duplicate names from all but the first alternative that includes it.
// Keeping only the last instance per alternation path (step 1) is important for correctness,
// but the choice of which path to keep the name for in step 2 is arbitrary. The current
// approach of keeping the first remaining instance of the name rather than the last is
// merely for implementation simplicity, and could change in the future
const tests = [
['(?<a>)(?<a>)', '()(?<a>)'],
['(?<a>)(?<a>)', '(?<a>)()'],
['(?<a>)|(?<a>)', '(?<a>)|()'],
['((?<a>)|(?<a>))', '(?:(?<a>)|())'],
['(?<a>)(((?<a>)))', '()(?:(?:(?<a>)))'],
['(((?<a>)))(?<a>)', '(?:(?:()))(?<a>)'],
['(?<a>)(((?<a>)))', '(?<a>)(?:(?:()))'],
['(((?<a>)))(?<a>)', '(?:(?:(?<a>)))()'],
['(?<a>)|(((?<a>)))', '(?<a>)|(?:(?:()))'],
['(((?<a>)))|(?<a>)', '(?:(?:(?<a>)))|()'],
['(?<a>(?<a>))', '((?<a>))'],
['(?<a>(?<a>))|(?<a>)', '((?<a>))|()'],
['(?<a>)(?<a>)(|(?<a>))(?<a>)', '()()(?:|())(?<a>)'],
['((?<a>)(?<a>))(((?<a>)|(?<a>)))((?<a>))', '(?:()())(?:(?:()|()))(?:(?<a>))'],
['(?<a>)(?<a>)((?<a>)|(?<a>))', '()()(?:(?<a>)|())'],
['(?<a>(?<a>))', '(?<a>())'],
['(?<a>(?<a>))|(?<a>)', '(?<a>())|()'],
['(?<a>)(?<a>)(|(?<a>))(?<a>)', '(?<a>)()(?:|())()'],
['((?<a>)(?<a>))(((?<a>)|(?<a>)))((?<a>))', '(?:(?<a>)())(?:(?:()|()))(?:())'],
['(?<a>)(?<a>)((?<a>)|(?<a>))', '(?<a>)()(?:()|())'],
['((?<a>)|(?<a>))(?<a>)(?<a>)', '(?:(?<a>)|())()()'],
];
for (const [pattern, output] of tests) {
expect(toDetails(pattern, opts).pattern).toBe(output);
Expand Down
12 changes: 11 additions & 1 deletion spec/match-subroutine.spec.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import {toDetails} from '../dist/index.mjs';
import {toDetails, toRegExp} from '../dist/index.mjs';
import {r} from '../src/utils.js';
import {matchers} from './helpers/matchers.js';

Expand Down Expand Up @@ -119,5 +119,15 @@ describe('Subroutine', () => {
expect('baaba').toExactlyMatch(r`\g<b>(?<a>a)(?<b>b\g<a>)`);
expect('abcbcc').toExactlyMatch(r`(?<a>a\g<b>)(?<b>b\g<c>)(?<c>c)`);
});

it('should not use duplicate names for subroutines', () => {
expect(toRegExp(r`(?<n>[ab])\g<n>`).exec('ab').groups.n).toBe('a');
expect(toRegExp(r`\g<n>(?<n>[ab])`).exec('ab').groups.n).toBe('b');
});

it('should not use duplicate names for subroutine child captures', () => {
expect(toRegExp(r`(?<n1>(?<n2>[ab]))\g<n1>`).exec('ab').groups.n2).toBe('a');
expect(toRegExp(r`\g<n1>(?<n1>(?<n2>[ab]))`).exec('ab').groups.n2).toBe('b');
});
});
});
8 changes: 7 additions & 1 deletion src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import {recursion} from 'regex-recursion';
// The transformation and error checking for Oniguruma's unique syntax and behavior differences
// compared to native JS RegExp is layered into all steps of the compilation process:
// 1. Tokenizer: Understands Oniguruma syntax, with many large and small differences from JS.
// 2. Parser: Builds an Oniguruma AST from the tokens with understanding of Oniguruma differences.
// 2. Parser: Builds an Oniguruma AST from the tokens, with understanding of Oniguruma differences.
// 3. Transformer: Converts the Oniguruma AST to a Regex+ AST that preserves all Oniguruma
// behavior. This is true even in cases of non-native-JS features that are supported by both
// Regex+ and Oniguruma but with subtly different behavior in each (subroutines, flag x).
Expand Down Expand Up @@ -98,6 +98,11 @@ function toOnigurumaAst(pattern, options) {
return parse(tokenize(pattern, flags, {captureGroup}));
}

// // Returns a Regex+ AST generated from an Oniguruma pattern
// function toRegexAst(pattern, options) {
// return transform(toOnigurumaAst(pattern, options));
// }

/**
Accepts an Oniguruma pattern and returns an equivalent JavaScript `RegExp`.
@param {string} pattern Oniguruma regex pattern.
Expand All @@ -116,5 +121,6 @@ export {
EmulatedRegExp,
toDetails,
toOnigurumaAst,
// toRegexAst,
toRegExp,
};
55 changes: 31 additions & 24 deletions src/transform.js
Original file line number Diff line number Diff line change
Expand Up @@ -374,23 +374,6 @@ const SecondPassVisitor = {
}
},

Recursion({node, parent}, {reffedNodesByReferencer}) {
// Recursion nodes are created during the current traversal; they're only traversed here if a
// recursion node created during traversal is then copied by a subroutine expansion, e.g. with
// `(?<a>\g<a>)\g<a>`
const {ref} = node;
// Immediate parent is an alternative or quantifier; can skip
let reffed = parent;
while ((reffed = reffed.parent)) {
if (reffed.type === AstTypes.CapturingGroup && (reffed.name === ref || reffed.number === ref)) {
break;
}
}
// Track the referenced node because `ref`s are rewritten in a subsequent pass; capturing group
// names and numbers might change due to subroutine expansion and duplicate group names
reffedNodesByReferencer.set(node, reffed);
},

CapturingGroup: {
enter(
{ node,
Expand Down Expand Up @@ -459,15 +442,22 @@ const SecondPassVisitor = {
// subroutine expansion)
if (node.name) {
const groupsWithSameName = getOrCreate(groupsByName, node.name, new Map());
for (const groupInfo of groupsWithSameName.values()) {
if (!groupInfo.hasDuplicateNameToRemove && canParticipateWithNode(groupInfo.node, node, {
ancestorsParticipate: true,
})) {
// Will change the earlier instance with this name to an unnamed capture in a later pass
groupInfo.hasDuplicateNameToRemove = true;
let hasDuplicateNameToRemove = false;
if (origin) {
// Subroutines and their child captures shouldn't hold duplicate names in the final state
hasDuplicateNameToRemove = true;
} else {
for (const groupInfo of groupsWithSameName.values()) {
if (!groupInfo.hasDuplicateNameToRemove && canParticipateWithNode(groupInfo.node, node, {
ancestorsParticipate: true,
})) {
// Will change to an unnamed capture in a later pass
hasDuplicateNameToRemove = true;
break;
}
}
}
groupsByName.get(node.name).set(node, {node});
groupsByName.get(node.name).set(node, {node, hasDuplicateNameToRemove});
}
},
exit({node}, {openRefs}) {
Expand All @@ -488,6 +478,23 @@ const SecondPassVisitor = {
},
},

Recursion({node, parent}, {reffedNodesByReferencer}) {
// Recursion nodes are created during the current traversal; they're only traversed here if a
// recursion node created during traversal is then copied by a subroutine expansion, e.g. with
// `(?<a>\g<a>)\g<a>`
const {ref} = node;
// Immediate parent is an alternative or quantifier; can skip
let reffed = parent;
while ((reffed = reffed.parent)) {
if (reffed.type === AstTypes.CapturingGroup && (reffed.name === ref || reffed.number === ref)) {
break;
}
}
// Track the referenced node because `ref`s are rewritten in a subsequent pass; capturing group
// names and numbers might change due to subroutine expansion and duplicate group names
reffedNodesByReferencer.set(node, reffed);
},

Subroutine(path, state) {
const {node, replaceWith} = path;
const {ref} = node;
Expand Down

0 comments on commit b8bbcb3

Please sign in to comment.