Skip to content

Commit

Permalink
Add subclass-based emulation strategies
Browse files Browse the repository at this point in the history
  • Loading branch information
slevithan committed Nov 4, 2024
1 parent 4417e11 commit e9b3ff4
Show file tree
Hide file tree
Showing 5 changed files with 197 additions and 33 deletions.
1 change: 1 addition & 0 deletions scripts/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ async function onigurumaExec(pattern, str, pos = 0) {
// See <github.com/microsoft/vscode-oniguruma/blob/main/main.d.ts>
const re = new oniguruma.OnigScanner([pattern]);
const match = re.findNextMatchSync(str, pos);
re.dispose();
if (!match) {
return null;
}
Expand Down
26 changes: 25 additions & 1 deletion spec/match-assertion.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ describe('Assertion', () => {
expect(() => compile(r`(?:(?>a(?<n>\Gb)))`)).toThrow();
expect('a').toExactlyMatch(r`\Ga|(((\Gb)))`);
expect(() => compile(r`\Ga|(((b\Gc)))`)).toThrow();
expect(['ac', 'bc']).toExactlyMatch(r`((\Ga|\Gb)c)`);
expect(() => compile(r`((\Ga|b)c)`)).toThrow();
});

it('should throw if leading in a non-0-min quantified group', () => {
Expand All @@ -136,11 +138,33 @@ describe('Assertion', () => {
expect(() => compile(r`(?<!\G)a`)).toThrow();
});

// Documenting current behavior; supportable
// Just documenting current behavior; supportable
it('should throw for redundant assertions', () => {
expect(() => compile(r`\G\Ga`)).toThrow();
expect(() => compile(r`\Ga|\G\Gb`)).toThrow();
});

// Specs for patterns that are only converted when option `allowSubclass` is enabled. Every
// expectation goes through `toRegExp` with `opts` and checks where the resulting regex matches.
describe('subclass strategies', () => {
// Shared options; individual specs spread in extra options (e.g. `global`) as needed
const opts = {allowSubclass: true};

// Leading `(^|\G)` and similar
it('should apply start_of_search_or_line', () => {
// The subject starts with `b`, so the only expected match is the `a` that follows the newline
expect(toRegExp(r`(^|\G)a`, '', opts).exec('b\na')?.index).toBe(2);
// Should match first 3 and last 1
expect('aaabaaacaa\na'.match(toRegExp(
r`(^|\G)a`, '', {...opts, global: true}
))).toEqual(['a', 'a', 'a', 'a']);
// Same expectation with a noncapturing group, with the alternatives in reverse order, and with
// one additional wrapping group (noncapturing or capturing)
expect(toRegExp(r`(?:^|\G)a`, '', opts).exec('b\na')?.index).toBe(2);
expect(toRegExp(r`(\G|^)a`, '', opts).exec('b\na')?.index).toBe(2);
expect(toRegExp(r`(?:(\G|^)a)`, '', opts).exec('b\na')?.index).toBe(2);
expect(toRegExp(r`((\G|^)a)`, '', opts).exec('b\na')?.index).toBe(2); // TODO
});

// Leading `(?!\G)`
it('should apply not_search_start', () => {
// The `a` at index 0 is at the search start, so the expected match is the `a` at index 2
expect(toRegExp(r`(?!\G)a`, '', opts).exec('aba')?.index).toBe(2);
});
});
});

describe('string_end', () => {
Expand Down
15 changes: 14 additions & 1 deletion src/compile.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import {recursion} from 'regex-recursion';
/**
@typedef {{
allowBestEffort?: boolean;
allowSubclass?: boolean;
global?: boolean;
hasIndices?: boolean;
maxRecursionDepth?: number | null;
Expand All @@ -34,13 +35,23 @@ function compile(pattern, flags, options) {
});
const regexAst = transform(onigurumaAst, {
allowBestEffort: opts.allowBestEffort,
allowSubclass: opts.allowSubclass,
bestEffortTarget: opts.target,
});
const generated = generate(regexAst, opts);
return {
const result = {
pattern: atomic(possessive(recursion(generated.pattern))),
flags: `${opts.hasIndices ? 'd' : ''}${opts.global ? 'g' : ''}${generated.flags}${generated.options.disable.v ? 'u' : 'v'}`,
};
if (regexAst._strategy) {
result._internal = {
pattern: result.pattern,
strategy: regexAst._strategy,
};
// Hide the pattern since it's not accurate unless `toRegExp` constructs it with a subclass
result.pattern = null;
}
return result;
}

/**
Expand All @@ -57,6 +68,8 @@ function getOptions(options) {
// Allows results that differ from Oniguruma in rare cases. If `false`, throws if the pattern
// can't be emulated with identical behavior
allowBestEffort: true,
// Experimental
allowSubclass: false,
// Include JS flag `g` in results
global: false,
// Include JS flag `d` in results
Expand Down
57 changes: 57 additions & 0 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,66 @@ Transpiles an Oniguruma regex pattern and flags and returns a native JS RegExp.
*/
function toRegExp(pattern, flags, options) {
  const {pattern: source, flags: jsFlags, _internal: internal} = compile(pattern, flags, options);
  // `compile` only provides `_internal` when a subclass strategy was applied; in that case the
  // usable pattern lives there and has to be paired with the `RegExp` subclass
  return internal ?
    new WrappedRegExp(internal.pattern, jsFlags, internal.strategy) :
    new RegExp(source, jsFlags);
}

/**
`RegExp` subclass that overrides `exec` to emulate a leading `(^|\G)` (strategy
`start_of_search_or_line`) or a leading `(?!\G)` (strategy `not_search_start`). The pattern given
to the constructor is expected to already have the corresponding `\G` construct removed.
*/
class WrappedRegExp extends RegExp {
  /**
  Name of the emulation strategy, or `undefined` to behave like a plain `RegExp`.
  @type {string | undefined}
  */
  #strategy;

  /**
  @param {string | WrappedRegExp} pattern
  @param {string} [flags]
  @param {string} [strategy]
  */
  constructor(pattern, flags, strategy) {
    super(pattern, flags);
    if (strategy) {
      this.#strategy = strategy;
    // The third argument `strategy` isn't provided when regexes are copied as part of the internal
    // handling of string methods `matchAll` and `split`
    } else if (pattern instanceof WrappedRegExp) {
      // Can read private properties of the existing object since it was created by this class
      this.#strategy = pattern.#strategy;
    }
  }

  /**
  Called internally by all String/RegExp methods that use regexes.
  @override
  @param {string} str
  @returns {RegExpExecArray | null}
  */
  exec(str) {
    const nativeExec = RegExp.prototype.exec;
    // Native `exec` coerces its argument to a string; do the same before slicing it
    const input = `${str}`;
    const useLastIndex = this.global || this.sticky;
    const pos = this.lastIndex;

    if (this.#strategy === 'start_of_search_or_line' && useLastIndex && pos) {
      if (pos > input.length) {
        // Same outcome as native `exec` when `lastIndex` is beyond the end of the string
        this.lastIndex = 0;
        return null;
      }
      // Search a slice that starts at the search position so the remaining line-start assertion
      // also succeeds there, then translate the result back to positions in the full string
      this.lastIndex = 0;
      const match = nativeExec.call(this, input.slice(pos));
      if (match) {
        WrappedRegExp.#shiftMatch(match, pos, input);
        this.lastIndex += pos;
      }
      return match;
    }

    if (this.#strategy === 'not_search_start') {
      // Native `exec` ignores `lastIndex` unless flag g or y is set
      const searchStart = useLastIndex ? pos : 0;
      const fullUnicode = Boolean(this.unicode || this.unicodeSets);
      const match = nativeExec.call(this, input);
      if (match?.index !== searchStart) {
        return match;
      }
      if (this.sticky) {
        // A sticky match can only be found at the search start, which is the excluded position
        this.lastIndex = 0;
        return null;
      }
      if (this.global) {
        // Retry from the position right after the search start. The first attempt moved
        // `lastIndex` to the end of its match, which would skip candidates in between
        this.lastIndex = WrappedRegExp.#nextIndex(input, searchStart, fullUnicode);
        return nativeExec.call(this, input);
      }
      // Without flag g or y the search always starts at 0, so drop the first character instead
      const skipped = WrappedRegExp.#nextIndex(input, 0, fullUnicode);
      const retry = nativeExec.call(this, input.slice(skipped));
      if (retry) {
        WrappedRegExp.#shiftMatch(retry, skipped, input);
      }
      return retry;
    }

    return nativeExec.call(this, input);
  }

  /**
  Returns the index that follows `index`, stepping over a whole surrogate pair in Unicode mode so
  that a retry never starts in the middle of a code point.
  @param {string} input
  @param {number} index
  @param {boolean} fullUnicode
  @returns {number}
  */
  static #nextIndex(input, index, fullUnicode) {
    const codePoint = fullUnicode ? input.codePointAt(index) : undefined;
    return index + (codePoint > 0xFFFF ? 2 : 1);
  }

  /**
  Rewrites a match found in `input.slice(offset)` so it describes positions in `input`.
  @param {RegExpExecArray} match
  @param {number} offset
  @param {string} input
  */
  static #shiftMatch(match, offset, input) {
    match.input = input;
    match.index += offset;
    const {indices} = match;
    if (!indices) {
      return;
    }
    // Flag d: named-group entries can be the same array objects as the numbered entries, so track
    // what was already shifted to avoid applying the offset twice
    const shifted = new Set();
    const shiftPair = pair => {
      if (pair && !shifted.has(pair)) {
        shifted.add(pair);
        pair[0] += offset;
        pair[1] += offset;
      }
    };
    indices.forEach(pair => shiftPair(pair));
    Object.values(indices.groups ?? {}).forEach(pair => shiftPair(pair));
  }
}

export {
compile,
toOnigurumaAst,
Expand Down
131 changes: 100 additions & 31 deletions src/transform.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,6 @@ import {cp, getNewCurrentFlags, getOrCreate, isMinTarget, r, Target} from './uti
/**
Transforms an Oniguruma AST in-place to a `regex` AST. Targets `ESNext`, expecting the generator to
then down-convert to the desired JS target version.
A couple edge cases exist where options `allowBestEffort` and `bestEffortTarget` are used:
- `VariableLengthCharacterSet` kind `grapheme` (`\X`): An exact representation would require heavy
Unicode data; a best-effort approximation requires knowing the target.
- `CharacterSet` kind `posix` with values `graph` and `print`: Their complex exact representations
are hard to change after the fact in the generator to a best-effort approximation based on
the target, so produce the appropriate structure here.
@param {import('./parse.js').OnigurumaAst} ast
@param {{
allowBestEffort?: boolean;
Expand All @@ -33,10 +26,19 @@ A couple edge cases exist where options `allowBestEffort` and `bestEffortTarget`
*/
function transform(ast, options) {
const opts = {
// A couple edge cases exist where options `allowBestEffort` and `bestEffortTarget` are used:
// - `VariableLengthCharacterSet` kind `grapheme` (`\X`): An exact representation would require
// heavy Unicode data; a best-effort approximation requires knowing the target.
// - `CharacterSet` kind `posix` with values `graph` and `print`: Their complex exact
// representations are hard to change after the fact in the generator to a best-effort
// approximation based on the target, so produce the appropriate structure here.
allowBestEffort: true,
allowSubclass: false,
bestEffortTarget: 'ESNext',
...options,
};
// Experimental AST changes that work together with a `RegExp` subclass to add advanced emulation
const strategy = opts.allowSubclass ? applySubclassStrategies(ast) : null;
const firstPassState = {
allowBestEffort: opts.allowBestEffort,
flagDirectivesByAlt: new Map(),
Expand Down Expand Up @@ -76,6 +78,9 @@ function transform(ast, options) {
reffedNodesByBackreference: secondPassState.reffedNodesByBackreference,
};
traverse({node: ast}, thirdPassState, ThirdPassVisitor);
if (strategy) {
ast._strategy = strategy;
}
return ast;
}

Expand Down Expand Up @@ -262,18 +267,27 @@ const FirstPassVisitor = {
// For `\G` to be accurately emulatable using JS flag y, it must be at (and only at) the start
// of every top-level alternative (with complex rules for what determines being at the start).
// Additional `\G` error checking in `Assertion` visitor
const leadingGs = [];
let hasAltWithLeadG = false;
let hasAltWithoutLeadG = false;
for (const alt of node.alternatives) {
if (hasLeadingG(alt.elements, supportedGNodes)) {
const leadingG = getLeadingG(alt.elements);
if (leadingG) {
hasAltWithLeadG = true;
if (Array.isArray(leadingG)) {
leadingGs.push(...leadingG);
} else {
leadingGs.push(leadingG);
}
} else {
hasAltWithoutLeadG = true;
}
}
if (hasAltWithLeadG && hasAltWithoutLeadG) {
throw new Error(r`Uses "\G" in a way that's unsupported for conversion to JS`);
}
// These nodes will be removed when traversed; other `\G` nodes will error
leadingGs.forEach(g => supportedGNodes.add(g))
},

Quantifier({node}) {
Expand Down Expand Up @@ -528,6 +542,55 @@ function adoptAndSwapKids(parent, kids) {
return parent;
}

/**
Special case handling for common patterns that are otherwise unsupportable. Detects a leading
`(^|\G)`-like group or a leading `(?!\G)`, removes the `\G` part from the AST in place, and
reports which strategy the `RegExp` subclass has to apply at match time. Only one subclass
strategy is supported per pattern; see `WrappedRegExp` in `index.js`.
@param {import('./parse.js').OnigurumaAst} ast
@returns {'start_of_search_or_line' | 'not_search_start' | null} Strategy name, or `null` if the
  AST was left untouched.
*/
function applySubclassStrategies(ast) {
  const alts = ast.pattern.alternatives;
  // Only patterns with a single top-level alternative are handled
  if (alts.length !== 1) {
    return null;
  }
  const first = alts[0].elements[0];
  if (!first) {
    return null;
  }
  const isGroup = node => node.type === AstTypes.CapturingGroup || node.type === AstTypes.Group;
  // Look through one level of wrapping group, e.g. `((^|\G)a)`
  const hasWrappingGroup = isGroup(first) && first.alternatives.length === 1;
  // Undefined if the wrapping group is empty, e.g. `()a`
  const firstIn = hasWrappingGroup ? first.alternatives[0].elements[0] : first;
  // Strategy `start_of_search_or_line` adds support for leading `(^|\G)` and similar
  if (
    firstIn &&
    isGroup(firstIn) &&
    firstIn.alternatives.length === 2 &&
    firstIn.alternatives[0].elements.length === 1 &&
    firstIn.alternatives[1].elements.length === 1
  ) {
    const el1 = firstIn.alternatives[0].elements[0];
    const el2 = firstIn.alternatives[1].elements[0];
    if (
      (el1.kind === AstAssertionKinds.line_start && el2.kind === AstAssertionKinds.search_start) ||
      (el1.kind === AstAssertionKinds.search_start && el2.kind === AstAssertionKinds.line_start)
    ) {
      // Remove the `\G` and its container alternative
      if (el1.kind === AstAssertionKinds.line_start) {
        firstIn.alternatives.pop();
      } else {
        firstIn.alternatives.shift();
      }
      return 'start_of_search_or_line';
    }
  }
  // Strategy `not_search_start` adds support for leading `(?!\G)`
  if (
    isLookaround(first) &&
    first.negate &&
    first.alternatives.length === 1 &&
    first.alternatives[0].elements.length === 1 &&
    first.alternatives[0].elements[0].kind === AstAssertionKinds.search_start
  ) {
    // Remove the negative lookaround
    alts[0].elements.shift();
    return 'not_search_start';
  }
  return null;
}

// Two flag sets are considered equal when both `dotAll` and `ignoreCase` hold identical values
function areFlagsEqual(a, b) {
  const comparedFlags = ['dotAll', 'ignoreCase'];
  return comparedFlags.every(flag => a[flag] === b[flag]);
}
Expand Down Expand Up @@ -630,32 +693,20 @@ function getFlagModsFromFlags({dotAll, ignoreCase}) {
return mods;
}

// Finds the closest ancestor of `node` that is an alternative, or `null` if there isn't one.
// See also `getAllParents`
function getParentAlternative(node) {
  // Walk up the `parent` chain, passing over quantifiers and any other non-alternative containers
  for (let ancestor = node.parent; ancestor; ancestor = ancestor.parent) {
    if (ancestor.type === AstTypes.Alternative) {
      return ancestor;
    }
  }
  return null;
}

function hasLeadingG(els, supportedGNodes) {
function getLeadingG(els) {
if (!els.length) {
return false;
return null;
}
const first = els[0];
// Special case for leading positive lookaround with leading `\G`, else all leading assertions
// Special case for leading positive lookaround with leading `\G`; else all leading assertions
// are ignored when looking for `\G`
if (
isLookaround(first) &&
!first.negate &&
first.alternatives.length === 1 &&
first.alternatives[0].elements[0]?.kind === AstAssertionKinds.search_start
) {
supportedGNodes.add(first.alternatives[0].elements[0]);
return true;
return first.alternatives[0].elements[0];
}
const firstToConsider = els.find(el => {
return el.kind === AstAssertionKinds.search_start ?
Expand All @@ -666,21 +717,39 @@ function hasLeadingG(els, supportedGNodes) {
);
});
if (!firstToConsider) {
return false;
return null;
}
if (firstToConsider.kind === AstAssertionKinds.search_start) {
supportedGNodes.add(firstToConsider);
return true;
return firstToConsider;
}
if (firstToConsider.type === AstTypes.Group || firstToConsider.type === AstTypes.CapturingGroup) {
const gNodesForGroup = [];
for (const alt of firstToConsider.alternatives) {
if (!hasLeadingG(alt.elements, supportedGNodes)) {
return false;
const leadingG = getLeadingG(alt.elements);
if (!leadingG) {
// Don't return `gNodesForGroup` collected so far since this alt didn't qualify
return null;
}
if (Array.isArray(leadingG)) {
gNodesForGroup.push(...leadingG);
} else {
gNodesForGroup.push(leadingG);
}
}
return gNodesForGroup;
}
return null;
}

// See also `getAllParents`
function getParentAlternative(node) {
while ((node = node.parent)) {
// Skip past quantifiers, etc.
if (node.type === AstTypes.Alternative) {
return node;
}
return true;
}
return false;
return null;
}

function isValidGroupNameJs(name) {
Expand Down

0 comments on commit e9b3ff4

Please sign in to comment.