Skip to content

Commit

Permalink
Rewrite subroutine and definition group backref handling for maintain…
Browse files Browse the repository at this point in the history
…ability and to handle complex edge cases
  • Loading branch information
slevithan committed Jul 24, 2024
1 parent 9b35ff5 commit 63fbd96
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 123 deletions.
12 changes: 6 additions & 6 deletions spec/subroutines-spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,11 @@ describe('subroutines', () => {
[String.raw`(?<a>)\g<a>\1\g<a>\1`, String.raw`(?<a>)()\1()\1`],
[String.raw`(?<a>)\g<a>()\1\g<a>()\1`, String.raw`(?<a>)()()\1()()\1`],
[String.raw`(?<a>)\g<a>()\2\g<a>()\3`, String.raw`(?<a>)()()\3()()\5`],
[String.raw`\1\2\3(?<a>\1\2\3()\1\2\3)\1\2\3\g<a>\1\2\3()\1\2\3\g<a>\1\2\3()\1\2\3`, String.raw`\1\2\3(?<a>\1\2\3()\1\2\3)\1\2\3(\3\4\5()\3\4\5)\1\2\5()\1\2\5(\6\7\8()\6\7\8)\1\2\5()\1\2\5`],
[String.raw`(?<a>\1\2\3())\g<a>()`, String.raw`(?<a>\1\2\5())(\3\4\5())()`],
[String.raw`\1\2\3(?<a>\1\2\3()\1\2\3)\1\2\3\g<a>\1\2\3()\1\2\3\g<a>\1\2\3()\1\2\3`, String.raw`\1\2\5(?<a>\1\2\5()\1\2\5)\1\2\5(\3\4\5()\3\4\5)\1\2\5()\1\2\5(\6\7\5()\6\7\5)\1\2\5()\1\2\5`],
[String.raw`\g<a>(?<a>\1)`, String.raw`(\1)(?<a>\2)`],
[String.raw`\g<a>()(?<a>\1)`, String.raw`(\2)()(?<a>\2)`],
[String.raw`\g<a>(?<a>\2)()`, String.raw`(\3)(?<a>\3)()`],
[String.raw`(?<a>\k<a>)\g<a>`, String.raw`(?<a>\k<a>)(\2)`],
[String.raw`\g<a>(?<a>\k<a>)`, String.raw`(\1)(?<a>\k<a>)`],
[String.raw`(?<a>(?<b>)\k<b>)\g<a>`, String.raw`(?<a>(?<b>)\k<b>)(()\4)`],
Expand All @@ -72,11 +75,8 @@ describe('subroutines', () => {

it('should throw with out of bounds numbered backreferences', () => {
const cases = [
String.raw`(?<a>)\g<a>\1\2`, // To: String.raw`(?<a>)()\1\3`
// TO FIX: The emitted \2→\3 should be \2→\4 (or otherwise throw) so it remains an out of
// bounds reference error. Low priority since `regex`'s implicit flag n prevents using this,
// plus out of bounds backrefs are invalid (with flag u/v) and this is an extreme edge case
// String.raw`(?<a>)\g<a>\1\2\g<a>`, // To: String.raw`(?<a>)()\1\3()`
String.raw`(?<a>)\g<a>\2`,
String.raw`(?<a>)\g<a>\2\g<a>`,
];
cases.forEach(input => {
expect(() => regex({__flagN: false})({raw: [input]})).toThrow();
Expand Down
245 changes: 128 additions & 117 deletions src/subroutines.js
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
import {Context, execUnescaped, forEachUnescaped, getGroupContents, hasUnescaped} from 'regex-utilities';
import {Context, execUnescaped, forEachUnescaped, getGroupContents, hasUnescaped, replaceUnescaped} from 'regex-utilities';
import {capturingDelim, countCaptures, namedCapturingDelim} from './utils.js';

/**
@param {string} expression
@returns {string}
*/
export function subroutinesPostprocessor(expression) {
// Note that subroutines and definition groups fully support numbered backreferences and unnamed
// captures (from interpolated regexes or from turning implicit flag n off), and all of the
// complex forward and backward backreference adjustments that can result
const namedGroups = getNamedCapturingGroups(expression, {includeContents: true});
return processDefinitionGroup(
processSubroutines(expression, namedGroups),
namedGroups
);
const transformed = processSubroutines(expression, namedGroups);
return processDefinitionGroup(transformed, namedGroups);
}

// Explicitly exclude `&` from subroutine name chars because it's used by extension
Expand All @@ -28,7 +29,9 @@ ${subroutinePattern}
@typedef {
Map<string, {
isUnique: boolean;
contents: string | null;
contents?: string;
groupNum?: number;
numCaptures?: number;
}>} NamedCapturingGroupsMap
*/

Expand All @@ -42,12 +45,16 @@ function processSubroutines(expression, namedGroups) {
if (!hasUnescaped(expression, '\\\\g<', Context.DEFAULT)) {
return expression;
}
// Can skip a lot of processing if there are no backrefs
const hasBackrefs = hasUnescaped(expression, '\\\\(?:[1-9]|k<[^>]+>)', Context.DEFAULT);
const backrefIncrements = [0];
const openSubroutinesMap = new Map();
const subroutineWrapper = hasBackrefs ? '(' : '(?:';
const openSubroutines = new Map();
const openSubroutinesStack = [];
const remappedGroupNums = [0];
let numCapturesPassedOutsideSubroutines = 0;
let numCapturesPassedInsideSubroutines = 0;
let numCapturesPassedInsideThisSubroutine = 0;
let numSubroutineCapturesTrackedInRemap = 0;
let numCharClassesOpen = 0;
let result = expression;
let match;
Expand All @@ -58,103 +65,91 @@ function processSubroutines(expression, namedGroups) {
numCharClassesOpen++;
} else if (!numCharClassesOpen) {

const subroutine = openSubroutinesMap.size ? openSubroutinesMap.get(lastOf(openSubroutinesStack)) : null;
if (subroutineName) {
if (!namedGroups.has(subroutineName)) {
throw new Error(`Invalid named capture referenced by subroutine ${m}`);
}
if (openSubroutinesMap.has(subroutineName)) {
if (openSubroutines.has(subroutineName)) {
throw new Error(`Subroutine ${m} followed a recursive reference`);
}
const contents = namedGroups.get(subroutineName).contents;
const numCaptures = countCaptures(contents) + 1; // Plus '(' wrapper
numCapturesPassedInsideSubroutines += numCaptures;
// Wrap value in case it has top-level alternation or is followed by a quantifier. The
// wrapper also marks the end of the expanded contents, which we'll track using
// `unclosedGroupCount`. If there are any backrefs in the expression, wrap with '()'
// instead of '(?:)' so that backrefs line up, in case there are backrefs inside the
// subroutine that refer to their parent capturing group
const subroutineValue = `(${hasBackrefs ? '' : '?:'}${contents})`;
openSubroutinesMap.set(subroutineName, {
contents,
// `unclosedGroupCount`. If there are any backrefs in the expression, wrap with `()`
// instead of `(?:)` in case there are backrefs inside the subroutine that refer to their
// containing capturing group
const subroutineValue = `${subroutineWrapper}${contents})`;
if (hasBackrefs) {
numCapturesPassedInsideThisSubroutine = 0;
numCapturesPassedInsideSubroutines++;
}
openSubroutines.set(subroutineName, {
// Incrementally decremented to track when we've left the group
unclosedGroupCount: countSubgroups(subroutineValue),
numCaptures,
});
openSubroutinesStack.push(subroutineName);
// Expand the subroutine's contents into the pattern we're looping over
result = spliceStr(result, index, m, subroutineValue);
token.lastIndex -= m.length;
token.lastIndex -= m.length - subroutineWrapper.length;
} else if (capturingStart) {
// Somewhere within an expanded subroutine
if (openSubroutinesMap.size) {
if (openSubroutines.size) {
if (hasBackrefs) {
numCapturesPassedInsideThisSubroutine++;
numCapturesPassedInsideSubroutines++;
}
// Named capturing group
if (m !== '(') {
// Replace named with unnamed capture. Subroutines ideally wouldn't create any new
// captures, but it can't be helped since we need any backrefs to this named capture to
// work. Given that flag n prevents unnamed capture and thereby requires you to rely on
// named backrefs and `groups`, switching to unnamed essentially accomplishes not
// creating a capture. Fully avoid capturing if there are no backrefs in the expression
result = spliceStr(result, index, m, '(' + (hasBackrefs ? '' : '?:'));
token.lastIndex -= m.length;
// captures, but it can't be helped since we need any backrefs to this capture to work.
// Given that flag n prevents unnamed capture and thereby requires you to rely on named
// backrefs and `groups`, switching to unnamed essentially accomplishes not creating a
// capture. Can fully avoid capturing if there are no backrefs in the expression
result = spliceStr(result, index, m, subroutineWrapper);
token.lastIndex -= m.length - subroutineWrapper.length;
}
backrefIncrements.push(lastOf(backrefIncrements) + subroutine.numCaptures);
} else {
} else if (hasBackrefs) {
remappedGroupNums.push(
lastOf(remappedGroupNums) + 1 +
numCapturesPassedInsideSubroutines -
numSubroutineCapturesTrackedInRemap
);
numSubroutineCapturesTrackedInRemap = numCapturesPassedInsideSubroutines;
numCapturesPassedOutsideSubroutines++;
if (backrefIncrements.length === numCapturesPassedOutsideSubroutines) {
backrefIncrements.push(lastOf(backrefIncrements));
}
}
} else if (backrefNum) {
const num = +backrefNum;
let increment = 0;
if (openSubroutinesMap.size) {
const numCapturesBeforeReferencedGroup = countCapturesBeforeGroupName(expression, openSubroutinesStack[0]);
if (num > numCapturesBeforeReferencedGroup) {
increment =
numCapturesPassedOutsideSubroutines +
numCapturesPassedInsideSubroutines -
numCapturesBeforeReferencedGroup -
subroutine.numCaptures;
}
} else {
increment = backrefIncrements[num];
}
if (increment) {
const adjusted = `\\${num + increment}`;
result = spliceStr(result, index, m, adjusted);
token.lastIndex += adjusted.length - m.length;
}
} else if (backrefName) {
if (openSubroutinesMap.size) {
let isGroupFromThisSubroutine = false;
if (backrefName === openSubroutinesStack[0]) {
} else if ((backrefNum || backrefName) && openSubroutines.size) {
// Unify handling for named and unnamed by always using the backref num
const num = backrefNum ? +backrefNum : namedGroups.get(backrefName)?.groupNum;
let isGroupFromThisSubroutine = false;
// Search for the group in the contents of the subroutine stack
for (const s of openSubroutinesStack) {
const group = namedGroups.get(s);
if (num >= group.groupNum && num <= (group.groupNum + group.numCaptures)) {
isGroupFromThisSubroutine = true;
} else {
// Search for the group in the contents of the subroutine stack
for (const s of openSubroutinesStack) {
if (hasUnescaped(
openSubroutinesMap.get(s).contents,
String.raw`\(\?<${backrefName}>`,
Context.DEFAULT
)) {
isGroupFromThisSubroutine = true;
break;
}
}
}
if (isGroupFromThisSubroutine) {
// Point to the group, then let normal renumbering work in the next loop iteration
const adjusted = `\\${getCaptureNum(expression, backrefName)}`;
result = spliceStr(result, index, m, adjusted);
token.lastIndex -= m.length;
break;
}
// Else, leave as is
}
if (isGroupFromThisSubroutine) {
const group = namedGroups.get(lastOf(openSubroutinesStack));
// Replace the backref with metadata we'll need to rewrite it later, using
// `\k<$$bNsNrNcN>` as a temporary wrapper:
// - b: The unmodified matched backref num, or the corresponding num of a named backref
// - s: The capture num of the subroutine we're most deeply nested in, including captures
// added by expanding the contents of preceding subroutines
// - r: The original capture num of the group that the subroutine we're most deeply
// nested in references, not counting the effects of subroutines
// - c: The number of captures within `r`, not counting the effects of subroutines
const subroutineNum = numCapturesPassedOutsideSubroutines + numCapturesPassedInsideSubroutines - numCapturesPassedInsideThisSubroutine;
const metadata = `\\k<$$b${num}s${subroutineNum}r${group.groupNum}c${group.numCaptures}>`;
result = spliceStr(result, index, m, metadata);
token.lastIndex += metadata.length - m.length;
}
} else if (m === ')') {
if (openSubroutinesMap.size) {
if (openSubroutines.size) {
const subroutine = openSubroutines.get(lastOf(openSubroutinesStack));
subroutine.unclosedGroupCount--;
if (!subroutine.unclosedGroupCount) {
openSubroutinesMap.delete(openSubroutinesStack.pop());
openSubroutines.delete(openSubroutinesStack.pop());
}
}
}
Expand All @@ -163,35 +158,66 @@ function processSubroutines(expression, namedGroups) {
numCharClassesOpen--;
}
}

if (hasBackrefs) {
// Second pass to adjust backrefs
result = replaceUnescaped(
result,
String.raw`\\(?:(?<bNum>[1-9]\d*)|k<\$\$b(?<bNumSub>\d+)s(?<subNum>\d+)r(?<refNum>\d+)c(?<refCaps>\d+)>)`,
({0: m, groups: {bNum, bNumSub, subNum, refNum, refCaps}}) => {
if (bNum) {
const backrefNum = +bNum;
if (backrefNum > remappedGroupNums.length - 1) {
throw new Error(`Backref "${m}" greater than number of captures`);
}
return `\\${remappedGroupNums[backrefNum]}`;
}
const backrefNumInSubroutine = +bNumSub;
const subroutineGroupNum = +subNum;
const refGroupNum = +refNum;
const numCapturesInRef = +refCaps;
if (backrefNumInSubroutine < refGroupNum || backrefNumInSubroutine > (refGroupNum + numCapturesInRef)) {
return `\\${remappedGroupNums[backrefNumInSubroutine]}`;
}
return `\\${subroutineGroupNum - refGroupNum + backrefNumInSubroutine}`;
},
Context.DEFAULT
);
}

return result;
}

// `(?:)` allowed because it can be added by flag x's preprocessing of whitespace and comments
const defineGroupToken = new RegExp(String.raw`${namedCapturingDelim}|\(\?:\)|(?<invalid>\\?.)`, 'gsu');

/**
Strip `(?(DEFINE)…)`
@param {string} expression
@param {NamedCapturingGroupsMap} namedGroups
IMPORTANT: Avoid using the `contents` property of `namedGroups` objects, because at this point
subroutine substitution has been performed on them in `expression`
@returns {string}
*/
function processDefinitionGroup(expression, namedGroups) {
const defineStart = execUnescaped(expression, String.raw`\(\?\(DEFINE\)`, 0, Context.DEFAULT);
if (!defineStart) {
const defineMatch = execUnescaped(expression, String.raw`\(\?\(DEFINE\)`, 0, Context.DEFAULT);
if (!defineMatch) {
return expression;
}
const defineGroup = getGroup(expression, defineStart);
const defineGroup = getGroup(expression, defineMatch);
if (defineGroup.afterPos < expression.length) {
// Supporting DEFINE at positions other than the end would significantly complicate edge-case
// backref handling. Note: Flag x's preprocessing permits trailing whitespace and comments
// Supporting DEFINE at positions other than the end would complicate backref handling.
// Note: Flag x's preprocessing permits trailing whitespace and comments
throw new Error('DEFINE group allowed only at the end of a regex');
} else if (defineGroup.afterPos > expression.length) {
throw new Error('DEFINE group is unclosed');
}
// `(?:)` separators can be added by the flag x preprocessor
const contentsToken = new RegExp(String.raw`${namedCapturingDelim}|\(\?:\)|(?<unsupported>\\?.)`, 'gsu');
let match;
while (match = contentsToken.exec(defineGroup.contents)) {
const {captureName, unsupported} = match.groups;
defineGroupToken.lastIndex = 0;
while (match = defineGroupToken.exec(defineGroup.contents)) {
const {captureName, invalid} = match.groups;
if (captureName) {
let group = getGroup(defineGroup.contents, match);
const group = getGroup(defineGroup.contents, match);
let duplicateName;
if (!namedGroups.get(captureName).isUnique) {
duplicateName = captureName;
Expand All @@ -207,40 +233,19 @@ function processDefinitionGroup(expression, namedGroups) {
if (duplicateName) {
throw new Error(`Duplicate group name "${duplicateName}" within DEFINE`);
}
contentsToken.lastIndex = group.afterPos;
continue;
}
if (unsupported) {
// Since a DEFINE group is stripped from its expression, we can't easily check if
defineGroupToken.lastIndex = group.afterPos;
} else if (invalid) {
// Since a DEFINE group is stripped from its expression, we can't easily determine whether
// unreferenced top-level syntax within it is valid. Such syntax serves no purpose, so it's
// easiest to not allow it
throw new Error(`DEFINE group includes unsupported syntax at top level`);
}
}
return expression.slice(0, defineStart.index);
}

/**
@param {string} expression
@param {string} groupName
@returns {number}
*/
function countCapturesBeforeGroupName(expression, groupName) {
let num = 0;
let pos = 0;
let match;
while (match = execUnescaped(expression, capturingDelim, pos, Context.DEFAULT)) {
const {0: m, index, groups: {captureName}} = match;
if (captureName === groupName) {
break;
}
num++;
pos = index + m.length;
}
return num;
return expression.slice(0, defineMatch.index);
}

/**
Counts unescaped open parens outside of character classes, regardless of group type
@param {string} expression
@returns {number}
*/
Expand Down Expand Up @@ -301,10 +306,16 @@ function getNamedCapturingGroups(expression, {includeContents} = {}) {
if (namedGroups.has(captureName)) {
namedGroups.get(captureName).isUnique = false;
} else {
namedGroups.set(captureName, {
isUnique: true,
contents: includeContents ? getGroupContents(expression, index + m.length) : null,
});
const group = {isUnique: true};
if (includeContents) {
const contents = getGroupContents(expression, index + m.length);
Object.assign(group, {
contents,
groupNum: getCaptureNum(expression, captureName),
numCaptures: countCaptures(contents),
});
}
namedGroups.set(captureName, group);
}
},
Context.DEFAULT
Expand Down

0 comments on commit 63fbd96

Please sign in to comment.