Skip to content

Commit

Permalink
Fix multiple use of possessive quantifiers
Browse files Browse the repository at this point in the history
  • Loading branch information
slevithan committed Sep 6, 2024
1 parent 9373b56 commit bad13cb
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 43 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
## Unreleased changes

-
### 🐞 Fixes

- Fix usage of more than one possessive quantifier in the same regex.

## Released changes

Expand Down
25 changes: 20 additions & 5 deletions spec/atomic.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,19 @@ describe('atomic groups', () => {
expect('aaaaaab').toMatch(regex`(?>a)+ab`);
});

it('should allow nested atomic groups', () => {
expect('integerrr+').toMatch(regex`\b(?>int(?>eger+)?|insert)\b(?>.)`);
expect('integerrr+').not.toMatch(regex`\b(?>int(?>eger+)??|insert)\b(?>.)`);
});

it('should allow quantifying atomic groups', () => {
expect('one two').toMatch(regex`^(?>\w+\s?)+$`);
});

it('should work for multiple atomic groups', () => {
expect('ab').toMatch(regex`^(?>a)(?>b)$`);
});

it('should work for nested atomic groups', () => {
expect('integerrr+').toMatch(regex`\b(?>int(?>eger+)?|insert)\b(?>.)`);
expect('integerrr+').not.toMatch(regex`\b(?>int(?>eger+)??|insert)\b(?>.)`);
});

it('should work when followed by a literal digit', () => {
expect('a0').toMatch(regex`^(?>a)0$`);
});
Expand Down Expand Up @@ -114,6 +118,17 @@ describe('possessive quantifiers', () => {
expect(() => regex`(++`).toThrow();
});

it('should work for multiple possessive quantifiers', () => {
expect('ab').toMatch(regex`^a++b++$`);
expect('ab').toMatch(regex`^[a]++[b]++$`);
expect('ab').toMatch(regex`^(a)++(b)++$`);
});

it('should work for nested possessive quantifiers', () => {
expect('ababb').toMatch(regex`^(ab++)++$`);
expect('ababb').toMatch(regex`^(a(b)++)++$`);
});

it('should not allow quantifying unquantifiable tokens', () => {
expect(() => regex`(?=a)++`).toThrow();
expect(() => regex`(?!a)++`).toThrow();
Expand Down
54 changes: 29 additions & 25 deletions src/atomic.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import {Context, replaceUnescaped} from 'regex-utilities';
import {emulationGroupMarker, noncapturingDelim} from './utils.js';
import {emulationGroupMarker, noncapturingDelim, spliceStr} from './utils.js';

const token = new RegExp(String.raw`(?<noncapturingStart>${noncapturingDelim})|(?<capturingStart>\((?:\?<[^>]+>)?)|\\?.`, 'gsu');
const atomicPluginToken = new RegExp(String.raw`(?<noncapturingStart>${noncapturingDelim})|(?<capturingStart>\((?:\?<[^>]+>)?)|\\?.`, 'gsu');

/**
@typedef {import('./regex.js').PluginData} PluginData
Expand Down Expand Up @@ -29,8 +29,8 @@ export function atomicPlugin(expression, data) {
let numGroupsOpenInAG = 0;
let inAG = false;
let match;
token.lastIndex = Number.isNaN(aGPos) ? 0 : aGPos + emulatedAGDelim.length;
while (match = token.exec(expression)) {
atomicPluginToken.lastIndex = Number.isNaN(aGPos) ? 0 : aGPos + emulatedAGDelim.length;
while (match = atomicPluginToken.exec(expression)) {
const {0: m, index, groups: {capturingStart, noncapturingStart}} = match;
if (m === '[') {
numCharClassesOpen++;
Expand Down Expand Up @@ -93,7 +93,7 @@ export function atomicPlugin(expression, data) {

const baseQuantifier = String.raw`(?:[?*+]|\{\d+(?:,\d*)?\})`;
// Complete tokenizer for base syntax; doesn't (need to) know about character-class-only syntax
const baseToken = new RegExp(String.raw`
const possessivePluginToken = new RegExp(String.raw`
\\(?: \d+
| c[A-Za-z]
| [gk]<[^>]+>
Expand All @@ -106,27 +106,31 @@ const baseToken = new RegExp(String.raw`
| [A-Za-z\-]+:
| \(DEFINE\)
))?
| (?<q>${baseQuantifier})(?<qMod>[?+]?)(?<invalidQ>[?*+\{]?)
| (?<qBase>${baseQuantifier})(?<qMod>[?+]?)(?<invalidQ>[?*+\{]?)
| \\?.
`.replace(/\s+/g, ''), 'gsu');

/**
Transform possessive quantifiers into atomic groups. The possessive quantifiers are:
`?+`, `*+`, `++`, `{N}+`, `{N,}+`, `{N,N}+`.
This follows Java, PCRE, Perl, and Python.
Possessive quantifiers in Oniguruma and Onigmo are only: `?+`, `*+`, `++`.
@param {string} expression
@returns {string}
*/
export function possessivePlugin(expression) {
if (!new RegExp(`${baseQuantifier}\\+`).test(expression)) {
if (!(new RegExp(`${baseQuantifier}\\+`).test(expression))) {
return expression;
}
const openGroupIndices = [];
let lastGroupIndex = null;
let lastCharClassIndex = null;
let lastToken = '';
let numCharClassesOpen = 0;
let transformed = '';
for (const {0: m, index, groups: {q, qMod, invalidQ}} of expression.matchAll(baseToken)) {
let match;
possessivePluginToken.lastIndex = 0;
while (match = possessivePluginToken.exec(expression)) {
const {0: m, index, groups: {qBase, qMod, invalidQ}} = match;
if (m === '[') {
if (!numCharClassesOpen) {
lastCharClassIndex = index;
Expand All @@ -146,24 +150,25 @@ export function possessivePlugin(expression) {
if (invalidQ) {
throw new Error(`Invalid quantifier "${m}"`);
}
let charsAdded = -1; // -1 for removed trailing `+`
// Possessivizing fixed repetition quantifiers like `{2}` doesn't change their behavior, so
// avoid doing so (convert them to greedy)
if (/^\{\d+\}$/.test(q)) {
transformed += q;
} else if (lastToken === ')' || lastToken === ']') {
const nodeIndex = lastToken === ')' ? lastGroupIndex : lastCharClassIndex;
// Unmatched `)` would break out of the wrapping group and mess with handling
if (nodeIndex === null) {
throw new Error(`Invalid unmatched "${lastToken}"`);
}
const node = expression.slice(nodeIndex, index);
transformed = `${expression.slice(0, nodeIndex)}(?>${node}${q})`;
if (/^\{\d+\}$/.test(qBase)) {
expression = spliceStr(expression, index + qBase.length, qMod, '');
} else {
transformed = `${expression.slice(0, transformed.length - lastToken.length)}(?>${lastToken}${q})`;
if (lastToken === ')' || lastToken === ']') {
const nodeIndex = lastToken === ')' ? lastGroupIndex : lastCharClassIndex;
// Unmatched `)` would break out of the wrapping group and mess with handling
if (nodeIndex === null) {
throw new Error(`Invalid unmatched "${lastToken}"`);
}
expression = `${expression.slice(0, nodeIndex)}(?>${expression.slice(nodeIndex, index)}${qBase})${expression.slice(index + m.length)}`;
} else {
expression = `${expression.slice(0, index - lastToken.length)}(?>${lastToken}${qBase})${expression.slice(index + m.length)}`;
}
charsAdded += 4; // `(?>)`
}
// Avoid adding the match to `transformed`
// Haven't updated `lastToken`, but it isn't needed
continue;
possessivePluginToken.lastIndex += charsAdded;
} else if (m[0] === '(') {
openGroupIndices.push(index);
} else if (m === ')') {
Expand All @@ -172,7 +177,6 @@ export function possessivePlugin(expression) {

}
lastToken = m;
transformed += m;
}
return transformed;
return expression;
}
13 changes: 1 addition & 12 deletions src/subroutines.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import {Context, execUnescaped, forEachUnescaped, getGroupContents, hasUnescaped, replaceUnescaped} from 'regex-utilities';
import {capturingDelim, countCaptures, emulationGroupMarker, namedCapturingDelim} from './utils.js';
import {capturingDelim, countCaptures, emulationGroupMarker, namedCapturingDelim, spliceStr} from './utils.js';

/**
@typedef {import('./regex.js').PluginData} PluginData
Expand Down Expand Up @@ -336,14 +336,3 @@ function lastOf(arr) {
// <https://caniuse.com/mdn-javascript_builtins_array_at>
return arr[arr.length - 1];
}

/**
Replaces the segment of `str` that begins at `pos` and is as long as `oldValue` with `newValue`.
Only the length of `oldValue` is used; the text at `pos` is not compared against it.
@param {string} str String to splice.
@param {number} pos Index in `str` where the replaced segment starts.
@param {string} oldValue Value whose length determines how many characters are dropped.
@param {string} newValue Text inserted in place of the dropped segment.
@returns {string} The spliced string.
*/
function spliceStr(str, pos, oldValue, newValue) {
  const before = str.slice(0, pos);
  const resumeAt = pos + oldValue.length;
  const after = str.slice(resumeAt);
  return before + newValue + after;
}
11 changes: 11 additions & 0 deletions src/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,17 @@ export function adjustNumberedBackrefs(expression, precedingCaptures) {
);
}

/**
Swaps out a segment of `str`: the segment starts at `pos` and spans as many characters as
`oldValue` has, and `newValue` is placed where it was. The characters at `pos` are not checked
against `oldValue`; only its length matters.
@param {string} str Source string.
@param {number} pos Start index of the segment to replace.
@param {string} oldValue Value whose length sets the size of the replaced segment.
@param {string} newValue Replacement text.
@returns {string} A new string with the segment replaced.
*/
export function spliceStr(str, pos, oldValue, newValue) {
  const head = str.slice(0, pos);
  const tailStart = pos + oldValue.length;
  const tail = str.slice(tailStart);
  return head + newValue + tail;
}

// Properties of strings as of ES2024
const stringPropertyNames = [
'Basic_Emoji',
Expand Down

0 comments on commit bad13cb

Please sign in to comment.