Skip to content

Commit

Permalink
Fix interpolating numbered backref with lookbehind, plus refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
slevithan committed Jul 21, 2024
1 parent b4734fd commit 520e487
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 29 deletions.
4 changes: 2 additions & 2 deletions src/atomic-groups.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import {Context, hasUnescaped, replaceUnescaped} from 'regex-utilities';
import {noncapturingStart} from './utils.js';
import {noncapturingDelim} from './utils.js';

/**
@param {string} expression
Expand All @@ -9,7 +9,7 @@ export function atomicGroupsPostprocessor(expression) {
if (!hasUnescaped(expression, '\\(\\?>', Context.DEFAULT)) {
return expression;
}
const token = new RegExp(String.raw`(?<noncapturingStart>${noncapturingStart})|(?<capturingStart>\((?:\?<[^>]+>)?)|(?<backrefNum>\\[1-9]\d*)|\\?.`, 'gsu');
const token = new RegExp(String.raw`(?<noncapturingStart>${noncapturingDelim})|(?<capturingStart>\((?:\?<[^>]+>)?)|(?<backrefNum>\\[1-9]\d*)|\\?.`, 'gsu');
const aGDelim = '(?>';
const emulatedAGDelim = '(?:(?=(';
let capturingGroupCount = 0;
Expand Down
4 changes: 2 additions & 2 deletions src/flag-n.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import {RegexContext, getEndContextForIncompleteExpression, noncapturingStart} from './utils.js';
import {RegexContext, getEndContextForIncompleteExpression, noncapturingDelim} from './utils.js';

const token = new RegExp(String.raw`
${noncapturingStart}
${noncapturingDelim}
| \(\?<
| (?<backrefNum>\\[1-9]\d*)
| \\?.
Expand Down
4 changes: 2 additions & 2 deletions src/flag-x.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import {Context, replaceUnescaped} from 'regex-utilities';
import {CharClassContext, RegexContext, doublePunctuatorChars, getEndContextForIncompleteExpression, noncapturingStart, sandboxLoneDoublePunctuatorChar, sandboxUnsafeNulls} from './utils.js';
import {CharClassContext, RegexContext, doublePunctuatorChars, getEndContextForIncompleteExpression, noncapturingDelim, sandboxLoneDoublePunctuatorChar, sandboxUnsafeNulls} from './utils.js';

const ws = /^\s$/;
const escapedWsOrHash = /^\\[\s#]$/;
Expand All @@ -14,7 +14,7 @@ const token = new RegExp(String.raw`
| 0\d+
)
| \[\^
| ${noncapturingStart}
| ${noncapturingDelim}
| \(\?<
| (?<dp>[${doublePunctuatorChars}])\k<dp>
| --
Expand Down
50 changes: 29 additions & 21 deletions src/subroutines.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import {Context, execUnescaped, forEachUnescaped, getGroupContents, hasUnescaped} from 'regex-utilities';
import {countCaptures} from './utils.js';
import {capturingDelim, countCaptures, namedCapturingDelim} from './utils.js';

/**
@param {string} expression
@returns {string}
*/
export function subroutinesPostprocessor(expression) {
const namedGroups = getNamedCapturingGroups(expression);
const namedGroups = getNamedCapturingGroups(expression, true);
return processDefinitionGroup(
processSubroutines(expression, namedGroups),
namedGroups
Expand All @@ -16,22 +16,24 @@ export function subroutinesPostprocessor(expression) {
// Explicitly exclude `&` from subroutine name chars because it's used by extension
// `regex-recursion` for recursive subroutines via `\g<name&R=N>`
const subroutinePattern = String.raw`\\g<(?<subroutineName>[^>&]+)>`;
const namedCapturingStartPattern = String.raw`\(\?<(?![=!])(?<captureName>[^>]+)>`;
const capturingStartPattern = String.raw`\((?!\?)|${namedCapturingStartPattern}`;
const token = new RegExp(String.raw`
${subroutinePattern}
| (?<capturingStart>${capturingStartPattern})
| (?<capturingStart>${capturingDelim})
| \\(?<backrefNum>[1-9]\d*)
| \\k<(?<backrefName>[^>]+)>
| \\?.
`.replace(/\s+/g, ''), 'gsu');

/**
@typedef {Map<string, {contents: string; isUnique: boolean}>} NamedCapturingGroupsMap
@typedef {
Map<string, {
isUnique: boolean;
contents?: string;
}>} NamedCapturingGroupsMap
*/

/**
Transform syntax `\g<name>`
Transform `\g<name>`
@param {string} expression
@param {NamedCapturingGroupsMap} namedGroups
@returns {string}
Expand All @@ -41,10 +43,10 @@ function processSubroutines(expression, namedGroups) {
return expression;
}
const backrefIncrements = [0];
const openSubroutinesMap = new Map();
const openSubroutinesStack = [];
let numCapturesPassedOutsideSubroutines = 0;
let numCapturesPassedInsideSubroutines = 0;
let openSubroutinesMap = new Map();
let openSubroutinesStack = [];
let numCharClassesOpen = 0;
let result = expression;
let match;
Expand Down Expand Up @@ -105,7 +107,8 @@ function processSubroutines(expression, namedGroups) {
if (openSubroutinesMap.size) {
const numCapturesBeforeReferencedGroup = countCapturesBeforeGroupName(expression, openSubroutinesStack[0]);
if (num > numCapturesBeforeReferencedGroup) {
increment = numCapturesPassedOutsideSubroutines +
increment =
numCapturesPassedOutsideSubroutines +
numCapturesPassedInsideSubroutines -
numCapturesBeforeReferencedGroup -
subroutine.numCaptures;
Expand Down Expand Up @@ -167,11 +170,11 @@ Strip `(?(DEFINE)…)`
@returns {string}
*/
function processDefinitionGroup(expression, namedGroups) {
const defineDelim = execUnescaped(expression, String.raw`\(\?\(DEFINE\)`, 0, Context.DEFAULT);
if (!defineDelim) {
const defineStart = execUnescaped(expression, String.raw`\(\?\(DEFINE\)`, 0, Context.DEFAULT);
if (!defineStart) {
return expression;
}
const defineGroup = getGroup(expression, defineDelim);
const defineGroup = getGroup(expression, defineStart);
if (defineGroup.afterPos < expression.length) {
// Supporting DEFINE at positions other than the end would significantly complicate edge-case
// backref handling. Note: Flag x's preprocessing permits trailing whitespace and comments
Expand All @@ -180,7 +183,7 @@ function processDefinitionGroup(expression, namedGroups) {
throw new Error('DEFINE group is unclosed');
}
// `(?:)` separators can be added by the flag x preprocessor
const contentsToken = new RegExp(String.raw`${namedCapturingStartPattern}|\(\?:\)|(?<unsupported>\\?.)`, 'gsu');
const contentsToken = new RegExp(String.raw`${namedCapturingDelim}|\(\?:\)|(?<unsupported>\\?.)`, 'gsu');
let match;
while (match = contentsToken.exec(defineGroup.contents)) {
const {captureName, unsupported} = match.groups;
Expand All @@ -199,7 +202,7 @@ function processDefinitionGroup(expression, namedGroups) {
}
}
if (duplicateName) {
throw new Error(`Group names within DEFINE must be unique; has duplicate "${duplicateName}"`);
throw new Error(`Duplicate group name "${duplicateName}" within DEFINE`);
}
contentsToken.lastIndex = group.afterPos;
continue;
Expand All @@ -211,7 +214,7 @@ function processDefinitionGroup(expression, namedGroups) {
throw new Error(`DEFINE group includes unsupported syntax at top level`);
}
}
return expression.slice(0, defineDelim.index);
return expression.slice(0, defineStart.index);
}

/**
Expand All @@ -238,7 +241,7 @@ function countCapturesBeforeGroupName(expression, groupName) {
let num = 0;
let pos = 0;
let match;
while (match = execUnescaped(expression, capturingStartPattern, pos, Context.DEFAULT)) {
while (match = execUnescaped(expression, capturingDelim, pos, Context.DEFAULT)) {
const {0: m, index, groups: {captureName}} = match;
if (captureName === groupName) {
break;
Expand All @@ -258,7 +261,7 @@ function getCaptureNum(expression, groupName) {
let num = 0;
let pos = 0;
let match;
while (match = execUnescaped(expression, capturingStartPattern, pos, Context.DEFAULT)) {
while (match = execUnescaped(expression, capturingDelim, pos, Context.DEFAULT)) {
const {0: m, index, groups: {captureName}} = match;
num++;
if (captureName === groupName) {
Expand All @@ -282,22 +285,27 @@ function spliceStr(str, pos, oldValue, newValue) {

/**
@param {string} expression
@param {boolean} [includeContents] Leave off if unneeded, for perf
@returns {NamedCapturingGroupsMap}
*/
function getNamedCapturingGroups(expression) {
function getNamedCapturingGroups(expression, includeContents) {
const namedGroups = new Map();
forEachUnescaped(
expression,
namedCapturingStartPattern,
namedCapturingDelim,
({0: m, index, groups: {captureName}}) => {
// If there are duplicate capture names, subroutines refer to the first instance of the given
// group (matching the behavior of PCRE and Perl)
if (namedGroups.has(captureName)) {
namedGroups.get(captureName).isUnique = false;
} else {
namedGroups.set(captureName, {
contents: getGroupContents(expression, index + m.length),
isUnique: true,
...(
includeContents ? {
contents: getGroupContents(expression, index + m.length),
} : null
),
});
}
},
Expand Down
6 changes: 4 additions & 2 deletions src/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ export const flagVSupported = (() => {

export const doublePunctuatorChars = '&!#$%*+,.:;<=>?@^`~';

export const noncapturingStart = String.raw`\(\?(?:[:=!>A-Za-z\-]|<[=!]|\(DEFINE\))`;
export const namedCapturingDelim = String.raw`\(\?<(?![=!])(?<captureName>[^>]+)>`;
export const capturingDelim = String.raw`\((?!\?)|${namedCapturingDelim}`;
export const noncapturingDelim = String.raw`\(\?(?:[:=!>A-Za-z\-]|<[=!]|\(DEFINE\))`;

/**
Escape special characters for the given context, assuming flag v.
Expand Down Expand Up @@ -225,7 +227,7 @@ export function getEndContextForIncompleteExpression(incompleteExpression, {
*/
export function countCaptures(expression) {
let num = 0;
forEachUnescaped(expression, String.raw`\((?:(?!\?)|\?<[^>]+>)`, () => num++, Context.DEFAULT);
forEachUnescaped(expression, capturingDelim, () => num++, Context.DEFAULT);
return num;
}

Expand Down

0 comments on commit 520e487

Please sign in to comment.