Skip to content

Commit

Permalink
Default on subclass strategies
Browse files (browse the repository at this point in the history)
  • Loading branch information
slevithan committed Nov 6, 2024
1 parent 3a56346 commit 7b1938f
Show file tree
Hide file tree
Showing 7 changed files with 49 additions and 46 deletions.
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ function toRegExp(
pattern: string,
flags?: OnigurumaFlags,
options?: (CompileOptions & {
allowSubclassBasedEmulation?: boolean;
avoidSubclass?: boolean;
})
): RegExp;
```
Expand Down Expand Up @@ -478,7 +478,7 @@ Notice that nearly every feature below has at least subtle differences from Java
✔ Scripts<br>
✔ Aliases<br>
✔ POSIX properties<br>
Negate with <code>\p{^…}</code>, <code>\P{^…}</code><br>
Invert with <code>\p{^…}</code>, <code>\P{^…}</code><br>
✔ Insignificant spaces, underscores, and casing in names<br>
✔ <code>\p</code>, <code>\P</code> without <code>{</code> is an identity escape<br>
✔ Error for key prefixes<br>
Expand Down Expand Up @@ -640,7 +640,7 @@ Notice that nearly every feature below has at least subtle differences from Java
<td align="middle">✅</td>
<td>
✔ Includes all JS forms<br>
✔ Adds form <code>{,n}</code> for implicit min 0<br>
✔ Adds <code>{,n}</code> for min 0<br>
✔ Explicit bounds have upper limit of 100,000 (unlimited in JS)<br>
✔ Error with assertions (same as JS with flag <code>u</code>, <code>v</code>)<br>
</td>
Expand Down Expand Up @@ -731,7 +731,7 @@ Notice that nearly every feature below has at least subtle differences from Java
✔ Error if named capture used<br>
✔ Allows leading 0s<br>
✔ Refs the most recent of a capture/subroutine set<br>
✔ <code>\k</code> without <code>&lt;</code>, <code>'</code> is an identity escape<br>
✔ <code>\k</code> without <code>&lt;</code> or <code>'</code> is an identity escape<br>
</td>
</tr>
<tr valign="top">
Expand Down Expand Up @@ -779,7 +779,7 @@ Notice that nearly every feature below has at least subtle differences from Java
✔ Doesn't alter backref nums<br>
✔ Reuses flags from the reffed group (ignores local flags)<br>
✔ Replaces most recent captured values (for backrefs)<br>
✔ <code>\g</code> without <code>&lt;</code>, <code>'</code> is an identity escape<br>
✔ <code>\g</code> without <code>&lt;</code> or <code>'</code> is an identity escape<br>
✔ Error if named capture used<br>
</td>
</tr>
Expand Down
4 changes: 2 additions & 2 deletions demo/demo.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ const state = {
},
opts: {
accuracy: getValue('option-accuracy'),
allowSubclassBasedEmulation: getValue('option-allowSubclassBasedEmulation'),
avoidSubclass: getValue('option-avoidSubclass'),
global: getValue('option-global'),
hasIndices: getValue('option-hasIndices'),
maxRecursionDepth: getValue('option-maxRecursionDepth'),
Expand Down Expand Up @@ -35,7 +35,7 @@ function showOutput(el) {
// Use `compile` but display output as if `toRegExp` was called. This avoids erroring when the
// selected `target` includes features that don't work in the user's browser
const compiled = OnigurumaToES.compile(input, flags, opts);
if (opts.allowSubclassBasedEmulation && compiled._internal) {
if (compiled._internal) {
infoEl.classList.remove('hidden');
outputEl.classList.add('subclass');
output = getFormattedSubclass(compiled.pattern, compiled.flags, {
Expand Down
4 changes: 2 additions & 2 deletions demo/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ <h2>Try it</h2>
<div>
<p>
<label>
<input type="checkbox" id="option-allowSubclassBasedEmulation" onchange="setOption('allowSubclassBasedEmulation', this.checked)">
<code>allowSubclassBasedEmulation</code>
<input type="checkbox" id="option-avoidSubclass" onchange="setOption('avoidSubclass', this.checked)">
<code>avoidSubclass</code>
</label>
</p>
<p>
Expand Down
2 changes: 1 addition & 1 deletion scripts/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ function getMatchDetails(match) {
const transpiledRegExpResult = (pattern, str, pos) => {
let result;
try {
const options = {allowSubclassBasedEmulation: true};
const options = {};
if (pos) {
options.global = true;
}
Expand Down
58 changes: 30 additions & 28 deletions spec/match-assertion.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ describe('Assertion', () => {
});

describe('search_start', () => {
// TODO: Consider enabling `avoidSubclass` for all of these except when specifically testing
// subclass strategies

it('should match at the start of the search', () => {
expect('a').toExactlyMatch(r`\Ga`);
expect([
Expand Down Expand Up @@ -133,7 +136,7 @@ describe('Assertion', () => {
r`(?:(?=\G))?a`,
r`(?=\G)a|b`,
].forEach(pattern => {
expect(() => compile(pattern)).toThrow();
expect(() => compile(pattern, '', {avoidSubclass: true})).toThrow();
});
});

Expand All @@ -149,19 +152,20 @@ describe('Assertion', () => {
r`(?:(?<=\G))?a`,
r`(?<=\G)a|b`,
].forEach(pattern => {
expect(() => compile(pattern)).toThrow();
expect(() => compile(pattern, '', {avoidSubclass: true})).toThrow();
});
});

it('should throw if leading in a leading positive lookbehind', () => {
// Matches at index 3 within `abc`, but doesn't match within `aabc`. Emulatable by replacing
// `\G` with `^`, slicing the string to `lastIndex`, and doing a non-sticky search
// [Oniguruma] Matches at index 3 within `abc`, but doesn't match within `aabc`
// [TODO] Emulatable by replacing `\G` with `^`, slicing the string to `lastIndex`, and doing
// a non-sticky search
expect(() => compile(r`(?<=\Gabc)`)).toThrow();
});

it('should throw if leading in a leading negative lookaround', () => {
expect(() => compile(r`(?!\G)a`)).toThrow();
expect(() => compile(r`(?<!\G)a`)).toThrow();
expect(() => compile(r`(?!\G)a`, '', {avoidSubclass: true})).toThrow();
expect(() => compile(r`(?<!\G)a`, '', {avoidSubclass: true})).toThrow();
});

// Just documenting current behavior
Expand Down Expand Up @@ -190,44 +194,42 @@ describe('Assertion', () => {
});

describe('subclass strategies', () => {
const opts = {allowSubclassBasedEmulation: true};

// Leading `(^|\G)` and similar
it('should apply line_or_search_start', () => {
// Matches with `^` since not global
expect(toRegExp(r`(^|\G)a`, '', opts).exec('b\na')?.index).toBe(2);
expect(toRegExp(r`(^|\G)a`).exec('b\na')?.index).toBe(2);
// Match the first 3 and last 1
expect('aaabaaacaa\na'.match(toRegExp(
r`(^|\G)a`, '', {...opts, global: true}
r`(^|\G)a`, '', {global: true}
))).toEqual(['a', 'a', 'a', 'a']);
expect(toRegExp(r`(?:^|\G)a`, '', opts).exec('b\na')?.index).toBe(2);
expect(toRegExp(r`(\G|^)a`, '', opts).exec('b\na')?.index).toBe(2);
expect(toRegExp(r`(?:(\G|^)a)`, '', opts).exec('b\na')?.index).toBe(2);
expect(toRegExp(r`((\G|^)a)`, '', opts).exec('b\na')?.index).toBe(2);
expect(toRegExp(r`(?:^|\G)a`).exec('b\na')?.index).toBe(2);
expect(toRegExp(r`(\G|^)a`).exec('b\na')?.index).toBe(2);
expect(toRegExp(r`(?:(\G|^)a)`).exec('b\na')?.index).toBe(2);
expect(toRegExp(r`((\G|^)a)`).exec('b\na')?.index).toBe(2);
});

// Leading `(?!\G)` and similar
it('should apply not_search_start', () => {
// Leading
expect(toRegExp(r`(?!\G)a`, '', opts).exec('aba')?.index).toBe(2);
expect(toRegExp(r`(?<!\G)a`, '', opts).exec('aba')?.index).toBe(2);
expect(toRegExp(r`(?:(?!\G)a)`, '', opts).exec('aba')?.index).toBe(2);
expect(toRegExp(r`((?!\G)a)`, '', opts).exec('aba')?.index).toBe(2);
expect(toRegExp(r`(?!\G)a`).exec('aba')?.index).toBe(2);
expect(toRegExp(r`(?<!\G)a`).exec('aba')?.index).toBe(2);
expect(toRegExp(r`(?:(?!\G)a)`).exec('aba')?.index).toBe(2);
expect(toRegExp(r`((?!\G)a)`).exec('aba')?.index).toBe(2);
// Only assertions
expect(toRegExp(r`(?<=;)(?!\G)`, '', opts).exec(';;')?.index).toBe(1);
expect(toRegExp(r`(?!\G)(?=;)^`, '', opts).exec(';;\n;')?.index).toBe(3);
expect(toRegExp(r`(?=;)(?!\G)^`, '', opts).exec(';;\n;')?.index).toBe(3);
expect(toRegExp(r`(?=;)^(?!\G)`, '', opts).exec(';;\n;')?.index).toBe(3);
expect(toRegExp(r`(?<=;)(?!\G)`).exec(';;')?.index).toBe(1);
expect(toRegExp(r`(?!\G)(?=;)^`).exec(';;\n;')?.index).toBe(3);
expect(toRegExp(r`(?=;)(?!\G)^`).exec(';;\n;')?.index).toBe(3);
expect(toRegExp(r`(?=;)^(?!\G)`).exec(';;\n;')?.index).toBe(3);
});

// Leading `(?<=\G|…)` and similar
it('should apply after_search_start_or_subpattern', () => {
expect(toRegExp(r`(?<=\G|a)b`, '', opts).exec('ba')?.index).toBe(0);
expect(toRegExp(r`(?<=\G|a)b`, '', opts).exec('aba')?.index).toBe(1);
expect(toRegExp(r`(?<=\G|a)b`, '', opts).exec('aaba')?.index).toBe(2);
expect(toRegExp(r`(?<=\G|a)b`, '', opts).exec('cbbab')?.index).toBe(4);
expect(toRegExp(r`((?<=xy?|\G|a)b)`, '', opts).exec('cbbab')?.index).toBe(4);
expect(toRegExp(r`(?<=\G|a)b`, '', opts).exec('cbba')).toBeNull();
expect(toRegExp(r`(?<=\G|a)b`).exec('ba')?.index).toBe(0);
expect(toRegExp(r`(?<=\G|a)b`).exec('aba')?.index).toBe(1);
expect(toRegExp(r`(?<=\G|a)b`).exec('aaba')?.index).toBe(2);
expect(toRegExp(r`(?<=\G|a)b`).exec('cbbab')?.index).toBe(4);
expect(toRegExp(r`((?<=xy?|\G|a)b)`).exec('cbbab')?.index).toBe(4);
expect(toRegExp(r`(?<=\G|a)b`).exec('cbba')).toBeNull();
});
});
});
Expand Down
10 changes: 5 additions & 5 deletions src/compile.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import {recursion} from 'regex-recursion';
tmGrammar?: boolean;
}} CompileOptions
@typedef {CompileOptions & {
allowSubclassBasedEmulation?: boolean;
avoidSubclass?: boolean;
}} ToRegExpOptions
*/

Expand Down Expand Up @@ -57,7 +57,7 @@ function compileInternal(pattern, flags, options) {
});
const regexAst = transform(onigurumaAst, {
accuracy: opts.accuracy,
allowSubclassBasedEmulation: opts.allowSubclassBasedEmulation,
avoidSubclass: opts.avoidSubclass,
bestEffortTarget: opts.target,
});
const generated = generate(regexAst, opts);
Expand Down Expand Up @@ -92,9 +92,9 @@ function getOptions(options) {
return {
// Sets the level of emulation rigor/strictness
accuracy: 'default',
// Allows advanced emulation strategies that rely on returning a `RegExp` subclass with an
// overridden `exec` method. A subclass is only used if needed for the given pattern
allowSubclassBasedEmulation: false,
// Prevents use of advanced emulation strategies that rely on returning a `RegExp` subclass,
// resulting in certain patterns not being emulatable
avoidSubclass: false,
// Include JS flag `g` in the result
global: false,
// Include JS flag `d` in the result
Expand Down
7 changes: 4 additions & 3 deletions src/transform.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ then down-convert to the desired JS target version.
@param {import('./parse.js').OnigurumaAst} ast
@param {{
accuracy?: keyof Accuracy;
allowSubclassBasedEmulation?: boolean;
avoidSubclass?: boolean;
bestEffortTarget?: keyof Target;
}} [options]
@returns {RegexAst}
Expand All @@ -35,12 +35,12 @@ function transform(ast, options) {
// representations are hard to change after the fact in the generator to a best-effort
// approximation based on the target, so produce the appropriate structure here.
accuracy: 'default',
allowSubclassBasedEmulation: false,
avoidSubclass: false,
bestEffortTarget: 'ESNext',
...options,
};
// AST changes that work together with a `RegExp` subclass to add advanced emulation
const strategy = opts.allowSubclassBasedEmulation ? applySubclassStrategies(ast, opts.accuracy) : null;
const strategy = opts.avoidSubclass ? null : applySubclassStrategies(ast, opts.accuracy);
const firstPassState = {
accuracy: opts.accuracy,
flagDirectivesByAlt: new Map(),
Expand Down Expand Up @@ -578,6 +578,7 @@ function applySubclassStrategies(ast, accuracy) {
return null;
}
const hasWrapperGroup =
alts[0].elements.length === 1 &&
(firstEl.type === AstTypes.CapturingGroup || firstEl.type === AstTypes.Group) &&
firstEl.alternatives.length === 1;
// First element within first group if the group doesn't contain top-level alternation, else just
Expand Down

0 comments on commit 7b1938f

Please sign in to comment.