diff --git a/src/atomic-groups.js b/src/atomic-groups.js index 8665541..8416ffc 100644 --- a/src/atomic-groups.js +++ b/src/atomic-groups.js @@ -1,5 +1,5 @@ import {Context, hasUnescaped, replaceUnescaped} from 'regex-utilities'; -import {noncapturingStart} from './utils.js'; +import {noncapturingDelim} from './utils.js'; /** @param {string} expression @@ -9,7 +9,7 @@ export function atomicGroupsPostprocessor(expression) { if (!hasUnescaped(expression, '\\(\\?>', Context.DEFAULT)) { return expression; } - const token = new RegExp(String.raw`(?${noncapturingStart})|(?\((?:\?<[^>]+>)?)|(?\\[1-9]\d*)|\\?.`, 'gsu'); + const token = new RegExp(String.raw`(?${noncapturingDelim})|(?\((?:\?<[^>]+>)?)|(?\\[1-9]\d*)|\\?.`, 'gsu'); const aGDelim = '(?>'; const emulatedAGDelim = '(?:(?=('; let capturingGroupCount = 0; diff --git a/src/flag-n.js b/src/flag-n.js index 3a517d5..70e6366 100644 --- a/src/flag-n.js +++ b/src/flag-n.js @@ -1,7 +1,7 @@ -import {RegexContext, getEndContextForIncompleteExpression, noncapturingStart} from './utils.js'; +import {RegexContext, getEndContextForIncompleteExpression, noncapturingDelim} from './utils.js'; const token = new RegExp(String.raw` -${noncapturingStart} +${noncapturingDelim} | \(\?< | (?\\[1-9]\d*) | \\?. diff --git a/src/flag-x.js b/src/flag-x.js index 11322d6..a7439cb 100644 --- a/src/flag-x.js +++ b/src/flag-x.js @@ -1,5 +1,5 @@ import {Context, replaceUnescaped} from 'regex-utilities'; -import {CharClassContext, RegexContext, doublePunctuatorChars, getEndContextForIncompleteExpression, noncapturingStart, sandboxLoneDoublePunctuatorChar, sandboxUnsafeNulls} from './utils.js'; +import {CharClassContext, RegexContext, doublePunctuatorChars, getEndContextForIncompleteExpression, noncapturingDelim, sandboxLoneDoublePunctuatorChar, sandboxUnsafeNulls} from './utils.js'; const ws = /^\s$/; const escapedWsOrHash = /^\\[\s#]$/; @@ -14,7 +14,7 @@ const token = new RegExp(String.raw` | 0\d+ ) | \[\^ -| ${noncapturingStart} +| ${noncapturingDelim} | \(\?< | (?[${doublePunctuatorChars}])\k | -- diff --git a/src/subroutines.js b/src/subroutines.js index 0fccb78..92838d5 100644 --- a/src/subroutines.js +++ b/src/subroutines.js @@ -1,12 +1,12 @@ import {Context, execUnescaped, forEachUnescaped, getGroupContents, hasUnescaped} from 'regex-utilities'; -import {countCaptures} from './utils.js'; +import {capturingDelim, countCaptures, namedCapturingDelim} from './utils.js'; /** @param {string} expression @returns {string} */ export function subroutinesPostprocessor(expression) { - const namedGroups = getNamedCapturingGroups(expression); + const namedGroups = getNamedCapturingGroups(expression, true); return processDefinitionGroup( processSubroutines(expression, namedGroups), namedGroups @@ -16,22 +16,24 @@ export function subroutinesPostprocessor(expression) { // Explicitly exclude `&` from subroutine name chars because it's used by extension // `regex-recursion` for recursive subroutines via `\g` const subroutinePattern = String.raw`\\g<(?[^>&]+)>`; -const namedCapturingStartPattern = String.raw`\(\?<(?![=!])(?[^>]+)>`; -const capturingStartPattern = String.raw`\((?!\?)|${namedCapturingStartPattern}`; const token = new RegExp(String.raw` ${subroutinePattern} -| (?${capturingStartPattern}) +| (?${capturingDelim}) | \\(?[1-9]\d*) | \\k<(?[^>]+)> | \\?. `.replace(/\s+/g, ''), 'gsu'); /** -@typedef {Map} NamedCapturingGroupsMap +@typedef { + Map} NamedCapturingGroupsMap */ /** -Transform syntax `\g` +Transform `\g` @param {string} expression @param {NamedCapturingGroupsMap} namedGroups @returns {string} @@ -41,10 +43,10 @@ function processSubroutines(expression, namedGroups) { return expression; } const backrefIncrements = [0]; + const openSubroutinesMap = new Map(); + const openSubroutinesStack = []; let numCapturesPassedOutsideSubroutines = 0; let numCapturesPassedInsideSubroutines = 0; - let openSubroutinesMap = new Map(); - let openSubroutinesStack = []; let numCharClassesOpen = 0; let result = expression; let match; @@ -105,7 +107,8 @@ function processSubroutines(expression, namedGroups) { if (openSubroutinesMap.size) { const numCapturesBeforeReferencedGroup = countCapturesBeforeGroupName(expression, openSubroutinesStack[0]); if (num > numCapturesBeforeReferencedGroup) { - increment = numCapturesPassedOutsideSubroutines + + increment = + numCapturesPassedOutsideSubroutines + numCapturesPassedInsideSubroutines - numCapturesBeforeReferencedGroup - subroutine.numCaptures; @@ -167,11 +170,11 @@ Strip `(?(DEFINE)…)` @returns {string} */ function processDefinitionGroup(expression, namedGroups) { - const defineDelim = execUnescaped(expression, String.raw`\(\?\(DEFINE\)`, 0, Context.DEFAULT); - if (!defineDelim) { + const defineStart = execUnescaped(expression, String.raw`\(\?\(DEFINE\)`, 0, Context.DEFAULT); + if (!defineStart) { return expression; } - const defineGroup = getGroup(expression, defineDelim); + const defineGroup = getGroup(expression, defineStart); if (defineGroup.afterPos < expression.length) { // Supporting DEFINE at positions other than the end would significantly complicate edge-case // backref handling. Note: Flag x's preprocessing permits trailing whitespace and comments @@ -180,7 +183,7 @@ function processDefinitionGroup(expression, namedGroups) { throw new Error('DEFINE group is unclosed'); } // `(?:)` separators can be added by the flag x preprocessor - const contentsToken = new RegExp(String.raw`${namedCapturingStartPattern}|\(\?:\)|(?\\?.)`, 'gsu'); + const contentsToken = new RegExp(String.raw`${namedCapturingDelim}|\(\?:\)|(?\\?.)`, 'gsu'); let match; while (match = contentsToken.exec(defineGroup.contents)) { const {captureName, unsupported} = match.groups; @@ -199,7 +202,7 @@ function processDefinitionGroup(expression, namedGroups) { } } if (duplicateName) { - throw new Error(`Group names within DEFINE must be unique; has duplicate "${duplicateName}"`); + throw new Error(`Duplicate group name "${duplicateName}" within DEFINE"`); } contentsToken.lastIndex = group.afterPos; continue; @@ -211,7 +214,7 @@ function processDefinitionGroup(expression, namedGroups) { throw new Error(`DEFINE group includes unsupported syntax at top level`); } } - return expression.slice(0, defineDelim.index); + return expression.slice(0, defineStart.index); } /** @@ -238,7 +241,7 @@ function countCapturesBeforeGroupName(expression, groupName) { let num = 0; let pos = 0; let match; - while (match = execUnescaped(expression, capturingStartPattern, pos, Context.DEFAULT)) { + while (match = execUnescaped(expression, capturingDelim, pos, Context.DEFAULT)) { const {0: m, index, groups: {captureName}} = match; if (captureName === groupName) { break; @@ -258,7 +261,7 @@ function getCaptureNum(expression, groupName) { let num = 0; let pos = 0; let match; - while (match = execUnescaped(expression, capturingStartPattern, pos, Context.DEFAULT)) { + while (match = execUnescaped(expression, capturingDelim, pos, Context.DEFAULT)) { const {0: m, index, groups: {captureName}} = match; num++; if (captureName === groupName) { @@ -282,13 +285,14 @@ function spliceStr(str, pos, oldValue, newValue) { /** @param {string} expression +@param {boolean} [includeContents] Leave off if unneeded, for perf @returns {NamedCapturingGroupsMap} */ -function getNamedCapturingGroups(expression) { +function getNamedCapturingGroups(expression, includeContents) { const namedGroups = new Map(); forEachUnescaped( expression, - namedCapturingStartPattern, + namedCapturingDelim, ({0: m, index, groups: {captureName}}) => { // If there are duplicate capture names, subroutines refer to the first instance of the given // group (matching the behavior of PCRE and Perl) @@ -296,8 +300,12 @@ function getNamedCapturingGroups(expression) { namedGroups.get(captureName).isUnique = false; } else { namedGroups.set(captureName, { - contents: getGroupContents(expression, index + m.length), isUnique: true, + ...( + includeContents ? { + contents: getGroupContents(expression, index + m.length), + } : null + ), }); } }, diff --git a/src/utils.js b/src/utils.js index 02c84bf..d1da0d5 100644 --- a/src/utils.js +++ b/src/utils.js @@ -38,7 +38,9 @@ export const flagVSupported = (() => { export const doublePunctuatorChars = '&!#$%*+,.:;<=>?@^`~'; -export const noncapturingStart = String.raw`\(\?(?:[:=!>A-Za-z\-]|<[=!]|\(DEFINE\))`; +export const namedCapturingDelim = String.raw`\(\?<(?![=!])(?[^>]+)>`; +export const capturingDelim = String.raw`\((?!\?)|${namedCapturingDelim}`; +export const noncapturingDelim = String.raw`\(\?(?:[:=!>A-Za-z\-]|<[=!]|\(DEFINE\))`; /** Escape special characters for the given context, assuming flag v. @@ -225,7 +227,7 @@ export function getEndContextForIncompleteExpression(incompleteExpression, { */ export function countCaptures(expression) { let num = 0; - forEachUnescaped(expression, String.raw`\((?:(?!\?)|\?<[^>]+>)`, () => num++, Context.DEFAULT); + forEachUnescaped(expression, capturingDelim, () => num++, Context.DEFAULT); return num; }