From bad13cba3245e60f51dce5ccd39d48e50312c8d4 Mon Sep 17 00:00:00 2001 From: Steven Levithan Date: Fri, 6 Sep 2024 21:43:32 +0200 Subject: [PATCH] Fix multiple use of possessive quantifiers --- CHANGELOG.md | 4 +++- spec/atomic.spec.js | 25 ++++++++++++++++----- src/atomic.js | 54 ++++++++++++++++++++++++--------------------- src/subroutines.js | 13 +---------- src/utils.js | 11 +++++++++ 5 files changed, 64 insertions(+), 43 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bd23593..146d7d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ ## Unreleased changes -- +### 🐞 Fixes + +- Fix usage of more than one possessive quantifier in the same regex. ## Released changes diff --git a/spec/atomic.spec.js b/spec/atomic.spec.js index 8ad061e..6f4e0d8 100644 --- a/spec/atomic.spec.js +++ b/spec/atomic.spec.js @@ -6,15 +6,19 @@ describe('atomic groups', () => { expect('aaaaaab').toMatch(regex`(?>a)+ab`); }); - it('should allow nested atomic groups', () => { - expect('integerrr+').toMatch(regex`\b(?>int(?>eger+)?|insert)\b(?>.)`); - expect('integerrr+').not.toMatch(regex`\b(?>int(?>eger+)??|insert)\b(?>.)`); - }); - it('should allow quantifying atomic groups', () => { expect('one two').toMatch(regex`^(?>\w+\s?)+$`); }); + it('should work for multiple atomic groups', () => { + expect('ab').toMatch(regex`^(?>a)(?>b)$`); + }); + + it('should work for nested atomic groups', () => { + expect('integerrr+').toMatch(regex`\b(?>int(?>eger+)?|insert)\b(?>.)`); + expect('integerrr+').not.toMatch(regex`\b(?>int(?>eger+)??|insert)\b(?>.)`); + }); + it('should work when followed by a literal digit', () => { expect('a0').toMatch(regex`^(?>a)0$`); }); @@ -114,6 +118,17 @@ describe('possessive quantifiers', () => { expect(() => regex`(++`).toThrow(); }); + it('should work for multiple possessive quantifiers', () => { + expect('ab').toMatch(regex`^a++b++$`); + expect('ab').toMatch(regex`^[a]++[b]++$`); + expect('ab').toMatch(regex`^(a)++(b)++$`); + }); + + it('should work 
for nested possessive quantifiers', () => { + expect('ababb').toMatch(regex`^(ab++)++$`); + expect('ababb').toMatch(regex`^(a(b)++)++$`); + }); + it('should not allow quantifying unquantifiable tokens', () => { expect(() => regex`(?=a)++`).toThrow(); expect(() => regex`(?!a)++`).toThrow(); diff --git a/src/atomic.js b/src/atomic.js index a65300b..2c4e61f 100644 --- a/src/atomic.js +++ b/src/atomic.js @@ -1,7 +1,7 @@ import {Context, replaceUnescaped} from 'regex-utilities'; -import {emulationGroupMarker, noncapturingDelim} from './utils.js'; +import {emulationGroupMarker, noncapturingDelim, spliceStr} from './utils.js'; -const token = new RegExp(String.raw`(?${noncapturingDelim})|(?\((?:\?<[^>]+>)?)|\\?.`, 'gsu'); +const atomicPluginToken = new RegExp(String.raw`(?${noncapturingDelim})|(?\((?:\?<[^>]+>)?)|\\?.`, 'gsu'); /** @typedef {import('./regex.js').PluginData} PluginData @@ -29,8 +29,8 @@ export function atomicPlugin(expression, data) { let numGroupsOpenInAG = 0; let inAG = false; let match; - token.lastIndex = Number.isNaN(aGPos) ? 0 : aGPos + emulatedAGDelim.length; - while (match = token.exec(expression)) { + atomicPluginToken.lastIndex = Number.isNaN(aGPos) ? 0 : aGPos + emulatedAGDelim.length; + while (match = atomicPluginToken.exec(expression)) { const {0: m, index, groups: {capturingStart, noncapturingStart}} = match; if (m === '[') { numCharClassesOpen++; @@ -93,7 +93,7 @@ export function atomicPlugin(expression, data) { const baseQuantifier = String.raw`(?:[?*+]|\{\d+(?:,\d*)?\})`; // Complete tokenizer for base syntax; doesn't (need to) know about character-class-only syntax -const baseToken = new RegExp(String.raw` +const possessivePluginToken = new RegExp(String.raw` \\(?: \d+ | c[A-Za-z] | [gk]<[^>]+> @@ -106,18 +106,20 @@ const baseToken = new RegExp(String.raw` | [A-Za-z\-]+: | \(DEFINE\) ))? -| (?${baseQuantifier})(?[?+]?)(?[?*+\{]?) +| (?${baseQuantifier})(?[?+]?)(?[?*+\{]?) | \\?. 
`.replace(/\s+/g, ''), 'gsu'); /** Transform posessive quantifiers into atomic groups. The posessessive quantifiers are: `?+`, `*+`, `++`, `{N}+`, `{N,}+`, `{N,N}+`. +This follows Java, PCRE, Perl, and Python. +Possessive quantifiers in Oniguruma and Onigmo are only: `?+`, `*+`, `++`. @param {string} expression @returns {string} */ export function possessivePlugin(expression) { - if (!new RegExp(`${baseQuantifier}\\+`).test(expression)) { + if (!(new RegExp(`${baseQuantifier}\\+`).test(expression))) { return expression; } const openGroupIndices = []; @@ -125,8 +127,10 @@ export function possessivePlugin(expression) { let lastCharClassIndex = null; let lastToken = ''; let numCharClassesOpen = 0; - let transformed = ''; - for (const {0: m, index, groups: {q, qMod, invalidQ}} of expression.matchAll(baseToken)) { + let match; + possessivePluginToken.lastIndex = 0; + while (match = possessivePluginToken.exec(expression)) { + const {0: m, index, groups: {qBase, qMod, invalidQ}} = match; if (m === '[') { if (!numCharClassesOpen) { lastCharClassIndex = index; @@ -146,24 +150,25 @@ export function possessivePlugin(expression) { if (invalidQ) { throw new Error(`Invalid quantifier "${m}"`); } + let charsAdded = -1; // -1 for removed trailing `+` // Possessivizing fixed repetition quantifiers like `{2}` does't change their behavior, so // avoid doing so (convert them to greedy) - if (/^\{\d+\}$/.test(q)) { - transformed += q; - } else if (lastToken === ')' || lastToken === ']') { - const nodeIndex = lastToken === ')' ? 
lastGroupIndex : lastCharClassIndex; - // Unmatched `)` would break out of the wrapping group and mess with handling - if (nodeIndex === null) { - throw new Error(`Invalid unmatched "${lastToken}"`); - } - const node = expression.slice(nodeIndex, index); - transformed = `${expression.slice(0, nodeIndex)}(?>${node}${q})`; + if (/^\{\d+\}$/.test(qBase)) { + expression = spliceStr(expression, index + qBase.length, qMod, ''); } else { - transformed = `${expression.slice(0, transformed.length - lastToken.length)}(?>${lastToken}${q})`; + if (lastToken === ')' || lastToken === ']') { + const nodeIndex = lastToken === ')' ? lastGroupIndex : lastCharClassIndex; + // Unmatched `)` would break out of the wrapping group and mess with handling + if (nodeIndex === null) { + throw new Error(`Invalid unmatched "${lastToken}"`); + } + expression = `${expression.slice(0, nodeIndex)}(?>${expression.slice(nodeIndex, index)}${qBase})${expression.slice(index + m.length)}`; + } else { + expression = `${expression.slice(0, index - lastToken.length)}(?>${lastToken}${qBase})${expression.slice(index + m.length)}`; + } + charsAdded += 4; // `(?>)` } - // Avoid adding the match to `transformed` - // Haven't updated `lastToken`, but it isn't needed - continue; + possessivePluginToken.lastIndex += charsAdded; } else if (m[0] === '(') { openGroupIndices.push(index); } else if (m === ')') { @@ -172,7 +177,6 @@ export function possessivePlugin(expression) { } lastToken = m; - transformed += m; } - return transformed; + return expression; } diff --git a/src/subroutines.js b/src/subroutines.js index 130dcd4..bcb7267 100644 --- a/src/subroutines.js +++ b/src/subroutines.js @@ -1,5 +1,5 @@ import {Context, execUnescaped, forEachUnescaped, getGroupContents, hasUnescaped, replaceUnescaped} from 'regex-utilities'; -import {capturingDelim, countCaptures, emulationGroupMarker, namedCapturingDelim} from './utils.js'; +import {capturingDelim, countCaptures, emulationGroupMarker, namedCapturingDelim, 
spliceStr} from './utils.js'; /** @typedef {import('./regex.js').PluginData} PluginData @@ -336,14 +336,3 @@ function lastOf(arr) { // return arr[arr.length - 1]; } - -/** -@param {string} str -@param {number} pos -@param {string} oldValue -@param {string} newValue -@returns {string} -*/ -function spliceStr(str, pos, oldValue, newValue) { - return str.slice(0, pos) + newValue + str.slice(pos + oldValue.length); -} diff --git a/src/utils.js b/src/utils.js index 8c529bc..cccf0e1 100644 --- a/src/utils.js +++ b/src/utils.js @@ -262,6 +262,17 @@ export function adjustNumberedBackrefs(expression, precedingCaptures) { ); } +/** +@param {string} str +@param {number} pos +@param {string} oldValue +@param {string} newValue +@returns {string} +*/ +export function spliceStr(str, pos, oldValue, newValue) { + return str.slice(0, pos) + newValue + str.slice(pos + oldValue.length); +} + // Properties of strings as of ES2024 const stringPropertyNames = [ 'Basic_Emoji',