From d594d532594d0e56d9c32501d5b0139cee81a21c Mon Sep 17 00:00:00 2001 From: Steven Levithan Date: Wed, 6 Nov 2024 12:38:02 +0100 Subject: [PATCH] Switch from allowBestEffort to 3 emulation modes --- README.md | 61 ++++++++++++++-------- demo/demo.css | 42 +++++++++------ demo/demo.js | 8 +-- demo/index.html | 89 ++++++++++++++++++-------------- spec/match-backreference.spec.js | 2 +- spec/match-recursion.spec.js | 8 +-- src/compile.js | 18 +++---- src/generate.js | 15 +++--- src/transform.js | 20 +++---- src/unicode.js | 2 +- 10 files changed, 154 insertions(+), 111 deletions(-) diff --git a/README.md b/README.md index 2e0ce8d..919173c 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Compared to running the actual [Oniguruma](https://github.com/kkos/oniguruma) C ### [Try the demo REPL](https://slevithan.github.io/oniguruma-to-es/demo/) -Oniguruma-To-ES deeply understands all of the hundreds of large and small differences in Oniguruma and JavaScript regex syntax and behavior across multiple JavaScript version targets. It's *obsessive* about precisely following Oniguruma syntax rules and ensuring that the emulated features it supports have **exactly the same behavior**, even in extreme edge cases. And it's battle-tested on thousands of real-world Oniguruma regexes used in TextMate grammars (via the Shiki library). A few uncommon features can't be perfectly emulated and allow rare differences, but if you don't want to allow this, you can disable the `allowBestEffort` option to throw for such patterns (see details below). +Oniguruma-To-ES deeply understands all of the hundreds of large and small differences in Oniguruma and JavaScript regex syntax and behavior across multiple JavaScript version targets. It's *obsessive* about precisely following Oniguruma syntax rules and ensuring that the emulated features it supports have **exactly the same behavior**, even in extreme edge cases. 
And it's battle-tested on thousands of real-world Oniguruma regexes used in TextMate grammars (via the Shiki library). A few uncommon features can't be perfectly emulated and allow rare differences, but if you don't want to allow this, you can set the `emulation` option to `'strict'` to throw an error for such patterns (see details below). ## 📜 Contents @@ -83,7 +83,7 @@ A string with `i`, `m`, and `x` in any order (all optional). ```ts type CompileOptions = { - allowBestEffort?: boolean; + emulation?: 'strict' | 'default' | 'loose'; global?: boolean; hasIndices?: boolean; maxRecursionDepth?: number | null; @@ -139,63 +139,82 @@ function toRegexAst( These options are shared by functions [`compile`](#compile) and [`toRegExp`](#toregexp). -### `allowBestEffort` +### `emulation` -Allows results that differ from Oniguruma in rare cases. If `false`, throws if the pattern can't be emulated with identical behavior for the given `target`. +One of `'strict'`, `'default'` *(default)*, or `'loose'`. -*Default: `true`.* +Sets the level of emulation strictness. + +- **Strict:** Throw if the pattern can't be emulated with identical behavior (even in rare edge cases) for the given target. +- **Default:** The best choice in most cases. Permits a few close approximations of Oniguruma in order to support additional features. +- **Loose:** Useful for non-critical matching like syntax highlighting where having some mismatches is better than not working. + +Each level of increased emulation strictness supports a subset of patterns supported by less strict modes. If a given pattern doesn't produce an error for a particular emulation mode, its generated result will be identical with all lower levels of strictness (given the same `target`).
More details -Specifically, this option enables the following additional features, depending on `target`: +#### `default` mode + +Supports all features of `strict` mode, plus the following additional features, depending on `target`: - All targets (`ESNext` and earlier): - Enables use of `\X` using a close approximation of a Unicode extended grapheme cluster. - - Enables recursion (e.g. via `\g<0>`) using a depth limit specified via option `maxRecursionDepth`. + - Enables recursion (e.g. via `\g<0>`) with a depth limit specified by option `maxRecursionDepth`. - `ES2024` and earlier: - Enables use of case-insensitive backreferences to case-sensitive groups. - `ES2018`: - Enables use of POSIX classes `[:graph:]` and `[:print:]` using ASCII-based versions rather than the Unicode versions available for `ES2024` and later. Other POSIX classes are always based on Unicode. + +#### `loose` mode + +Supports all features of `default` mode, plus the following: + +- Silences errors for unsupported uses of the search-start anchor `\G` (a flexible assertion that doesn't have a direct equivalent in JavaScript). + - Oniguruma-To-ES uses a variety of strategies to accurately emulate many common uses of `\G`. When using `loose` mode, if a `\G` assertion is found that doesn't have a known emulation strategy, the `\G` is simply removed and JavaScript's `y` (`sticky`) flag is added. This might lead to some false positives and negatives.
### `global` -Include JavaScript flag `g` (`global`) in the result. - *Default: `false`.* -### `hasIndices` +Include JavaScript flag `g` (`global`) in the result. -Include JavaScript flag `d` (`hasIndices`) in the result. +### `hasIndices` *Default: `false`.* -### `maxRecursionDepth` +Include JavaScript flag `d` (`hasIndices`) in the result. -If `null`, any use of recursion throws. If an integer between `2` and `100` (and `allowBestEffort` is `true`), common recursion forms are supported and recurse up to the specified max depth. +### `maxRecursionDepth` *Default: `6`.* +If an integer between `2` and `100`, common recursion forms are supported and recurse up to the specified depth limit. If set to `null`, any use of recursion results in an error. + +Since recursion isn't infinite-depth like in Oniguruma, use of recursion also results in an error if the `emulation` option is set to `'strict'`. +
More details -Using a high limit is not a problem if needed. Although there can be a performance cost (minor unless it's exacerbating an existing issue with runaway backtracking), there is no effect on regexes that don't use recursion. +Using a high limit has a (usually tiny) impact on transpilation and regex performance. Generally, this is only a problem if the regex has an existing issue with runaway backtracking that recursion exacerbates. + +Higher limits have no effect on regexes that don't use recursion, so you should feel free to increase this if helpful.
### `optimize` -Simplify the generated pattern when it doesn't change the meaning. - *Default: `true`.* -### `target` +Simplify the generated pattern when it doesn't change the meaning. -Sets the JavaScript language version for generated patterns and flags. Later targets allow faster processing, simpler generated source, and support for additional features. +### `target` *Default: `'ES2024'`.* -
+Sets the JavaScript language version for generated patterns and flags. Later targets allow faster processing, simpler generated source, and support for additional features. + +
More details - `ES2018`: Uses JS flag `u`. @@ -887,10 +906,10 @@ The table above doesn't include all aspects that Oniguruma-To-ES emulates (inclu 1. Target `ES2018` doesn't allow Unicode property names added in JavaScript specifications after ES2018 to be used. 2. Unicode blocks are easily emulatable but their character data would significantly increase library weight. They're also a deeply flawed and arguably-unuseful feature, given the ability to use Unicode scripts and other properties. -3. With target `ES2018`, the specific POSIX classes `[:graph:]` and `[:print:]` are an error if option `allowBestEffort` is `false`, and they use ASCII-based versions rather than the Unicode versions available for target `ES2024` and later. +3. With target `ES2018`, the specific POSIX classes `[:graph:]` and `[:print:]` are an error if option `emulation` is `'strict'`, and they use ASCII-based versions rather than the Unicode versions available for target `ES2024` and later. 4. Target `ES2018` doesn't support nested *negated* character classes. 5. It's not an error for *numbered* backreferences to come before their referenced group in Oniguruma, but an error is the best path for Oniguruma-To-ES because (1) most placements are mistakes and can never match (based on the Oniguruma behavior for backreferences to nonparticipating groups), (2) erroring matches the behavior of named backreferences, and (3) the edge cases where they're matchable rely on rules for backreference resetting within quantified groups that are different in JavaScript and aren't emulatable. Note that it's not a backreference in the first place if using `\10` or higher and not as many capturing groups are defined to the left (it's an octal or identity escape). -6. The maximum recursion depth is specified by option `maxRecursionDepth`. Use of recursion results in an error if `maxRecursionDepth` is `null` or `allowBestEffort` is `false`. 
Some forms of recursion (recursion with backreferences, and multiple recursions in the same pattern) aren't yet supported. Note that, because recursion is bounded, patterns that fail due to infinite recursion in Oniguruma might find a match in Oniguruma-To-ES. Future versions will detect this and throw an error. +6. The recursion depth limit is specified by option `maxRecursionDepth`. Some forms of recursion (multiple recursions in the same pattern, and recursion with backreferences) aren't yet supported. Patterns that would error in Oniguruma due to triggering infinite recursion might find a match in Oniguruma-To-ES since recursion is bounded (future versions will detect this and error at transpilation time). ## ㊗️ Unicode / mixed case-sensitivity diff --git a/demo/demo.css b/demo/demo.css index 8bc057c..18b412f 100644 --- a/demo/demo.css +++ b/demo/demo.css @@ -20,7 +20,7 @@ main { border-radius: 0 0 15px 15px; } -h1, h2, ul, p, pre, details, summary { +h1, h2, ul, p, pre, summary { margin-bottom: 12px; } @@ -37,20 +37,28 @@ code { background-color: #f6f6f6; } +kbd { + padding: 0 3px; +} + small { font-size: 0.8em; } -td { - padding-right: 3vw; +.hidden { + display: none; } summary { cursor: pointer; } -label, .label { - margin-right: 0.4em; +label { + margin-right: 0.5em; +} + +label img { + vertical-align: middle; } input[type='checkbox'] { @@ -62,20 +70,18 @@ input[type='checkbox'] { } input[type='number'] { + width: 3.5em; + padding: 3px; + font-size: 0.9em; border: 1px solid #bbb; - height: 1.6em; border-radius: 4px; - padding-left: 4px; - width: 3.5em; } select { - padding: 4px 35px 4px 10px; + padding: 3px; font-size: 0.9em; border: 1px solid #bbb; border-radius: 4px; - appearance: none; - background: url(https://upload.wikimedia.org/wikipedia/commons/9/99/Unofficial_JavaScript_logo_2.svg) 96% / 15% no-repeat #f6f6f6; } textarea { @@ -92,12 +98,20 @@ textarea:focus { box-shadow: 0 0 8px #80c0ff; } -pre, code, textarea { +pre, code, kbd, textarea { 
font-family: Consolas, "Source Code Pro", Monospace; font-size: 0.9em; border-radius: 0.375em; } +#more-options { + display: flex; +} + +#more-options div { + margin-right: 3%; +} + #output, textarea { padding: 0.6em; white-space: pre-wrap; @@ -133,7 +147,3 @@ pre, code, textarea { margin-top: -12px; padding: 0.6em; } - -.hidden { - display: none; -} diff --git a/demo/demo.js b/demo/demo.js index 72a8bb8..ddeb215 100644 --- a/demo/demo.js +++ b/demo/demo.js @@ -5,11 +5,11 @@ const state = { x: getValue('flag-x'), }, opts: { - allowBestEffort: getValue('option-allow-best-effort'), - allowSubclassBasedEmulation: getValue('option-subclass'), + allowSubclassBasedEmulation: getValue('option-allowSubclassBasedEmulation'), + emulation: getValue('option-emulation'), global: getValue('option-global'), - hasIndices: getValue('option-has-indices'), - maxRecursionDepth: getValue('option-max-recursion-depth'), + hasIndices: getValue('option-hasIndices'), + maxRecursionDepth: getValue('option-maxRecursionDepth'), optimize: getValue('option-optimize'), target: getValue('option-target'), }, diff --git a/demo/index.html b/demo/index.html index 4c9787f..23c549b 100644 --- a/demo/index.html +++ b/demo/index.html @@ -19,75 +19,88 @@

Try it

- Flags: +

- target: - + +

More options - - - - - - - - - - - -
- - +
+
+

-

+

+

-

+

+ +
+

-

+

+

-

+

+ +
+

-

+

+

+ +

+ +

-    
+    
     

The output shows the result of calling toRegExp. Oniguruma-To-ES includes functions to generate additional formats: compile, toOnigurumaAst, and toRegexAst (for an AST based on regex). You can run all of these from the console on this page, and you can pretty-print AST results by passing them to printAst. diff --git a/spec/match-backreference.spec.js b/spec/match-backreference.spec.js index ed1b970..3720de2 100644 --- a/spec/match-backreference.spec.js +++ b/spec/match-backreference.spec.js @@ -8,7 +8,7 @@ beforeEach(() => { }); describe('Backreference', () => { - // TODO: Test that case-insensitive backref to case-sensitive group requires allowBestEffort or ESNext + // TODO: Test that case-insensitive backref to case-sensitive group requires `ESNext` or non-`strict` emulation describe('numbered backref', () => { it('should rematch the captured text', () => { diff --git a/spec/match-recursion.spec.js b/spec/match-recursion.spec.js index fe38f1a..4f12dbe 100644 --- a/spec/match-recursion.spec.js +++ b/spec/match-recursion.spec.js @@ -7,12 +7,12 @@ beforeEach(() => { }); describe('Recursion', () => { - it('should throw if recursion used with allowBestEffort false', () => { - expect(() => compile(r`a\g<0>?`, '', {allowBestEffort: false})).toThrow(); - expect(() => compile('', '', {allowBestEffort: false})).not.toThrow(); + it('should throw if recursion used with strict emulation', () => { + expect(() => compile(r`a\g<0>?`, '', {emulation: 'strict'})).toThrow(); + expect(() => compile('', '', {emulation: 'strict'})).not.toThrow(); }); - it('should throw if recursion used with maxRecursionDepth null', () => { + it('should throw if recursion used with null maxRecursionDepth', () => { expect(() => compile(r`a\g<0>?`, '', {maxRecursionDepth: null})).toThrow(); expect(() => compile('', '', {maxRecursionDepth: null})).not.toThrow(); }); diff --git a/src/compile.js b/src/compile.js index a017df5..09398f5 100644 --- a/src/compile.js +++ b/src/compile.js @@ -8,7 +8,7 @@ import 
{recursion} from 'regex-recursion'; /** @typedef {{ - allowBestEffort?: boolean; + emulation?: 'strict' | 'default' | 'loose'; global?: boolean; hasIndices?: boolean; maxRecursionDepth?: number | null; @@ -56,8 +56,8 @@ function compileInternal(pattern, flags, options) { skipBackrefValidation: opts.tmGrammar, }); const regexAst = transform(onigurumaAst, { - allowBestEffort: opts.allowBestEffort, allowSubclassBasedEmulation: opts.allowSubclassBasedEmulation, + emulation: opts.emulation, bestEffortTarget: opts.target, }); const generated = generate(regexAst, opts); @@ -90,19 +90,19 @@ function getOptions(options) { } // Set default values return { - // Allows results that differ from Oniguruma in rare cases. If `false`, throws if the pattern - // can't be emulated with identical behavior - allowBestEffort: true, // Allows advanced emulation strategies that rely on returning a `RegExp` subclass with an // overridden `exec` method. A subclass is only used if needed for the given pattern allowSubclassBasedEmulation: false, + // Sets the level of emulation strictness; `default` is best in most cases. If `strict`, throws + // if the pattern can't be emulated with identical behavior (even in rare edge cases) for the + // given target + emulation: 'default', // Include JS flag `g` in the result global: false, // Include JS flag `d` in the result hasIndices: false, - // If `null`, any use of recursion throws. If an integer between `2` and `100` (and - // `allowBestEffort` is on), common recursion forms are supported and recurse up to the - // specified max depth + // If an integer between `2` and `100`, common recursion forms are supported and recurse up to + // the specified depth limit. 
If set to `null`, any use of recursion results in an error maxRecursionDepth: 6, // Simplify the generated pattern when it doesn't change the meaning optimize: true, @@ -110,7 +110,7 @@ function getOptions(options) { // faster processing, simpler generated source, and support for additional features target: 'ES2024', // Leave disabled unless the regex will be used in a TextMate grammar processor that merges - // `begin` and `end` patterns + // backreferences across `begin` and `end` patterns tmGrammar: false, ...options, }; diff --git a/src/generate.js b/src/generate.js index 710bfbd..afb3490 100644 --- a/src/generate.js +++ b/src/generate.js @@ -56,13 +56,13 @@ function generate(ast, options) { }; let lastNode = null; const state = { - allowBestEffort: opts.allowBestEffort, appliedGlobalFlags, captureFlagIMap: new Map(), currentFlags: { dotAll: ast.flags.dotAll, ignoreCase: ast.flags.ignoreCase, }, + emulation: opts.emulation, groupNames: new Set(), inCharClass: false, lastNode, @@ -226,11 +226,11 @@ function genBackreference({ref}, state) { } if ( !state.useFlagMods && - !state.allowBestEffort && + state.emulation === 'strict' && state.currentFlags.ignoreCase && !state.captureFlagIMap.get(ref) ) { - throw new Error('Use of case-insensitive backref to case-sensitive group requires option allowBestEffort or target ESNext'); + throw new Error('Use of case-insensitive backref to case-sensitive group requires target ESNext or non-strict emulation'); } return '\\' + ref; } @@ -342,8 +342,9 @@ function genCharacterSet({kind, negate, value, key}, state) { UnicodePropertiesWithSpecificCase.has(value) ) { // Support for this would require heavy Unicode data. Could change e.g. `\p{Lu}` to `\p{LC}` - // if `allowBestEffort` (since it's close but not 100%), but this wouldn't work for e.g. 
- // `\p{Lt}` and in any case it's probably a mistake if using these props case-insensitively + // if not using `strict` emulation (since it's close but not 100%), but this wouldn't work + // for e.g. `\p{Lt}`, and in any case, it's probably user error if using these case-specific + // props case-insensitively throw new Error(`Unicode property "${value}" can't be case-insensitive when other chars have specific case`); } return `${negate ? r`\P` : r`\p`}{${key ? `${key}=` : ''}${value}}`; @@ -392,8 +393,8 @@ function genRecursion({ref}, state) { if (!rDepth) { throw new Error('Use of recursion disabled'); } - if (!state.allowBestEffort) { - throw new Error('Use of recursion requires option allowBestEffort'); + if (state.emulation === 'strict') { + throw new Error('Use of recursion requires non-strict emulation'); } // Using the syntax supported by `regex-recursion` return ref === 0 ? `(?R=${rDepth})` : r`\g<${ref}&R=${rDepth}>`; diff --git a/src/transform.js b/src/transform.js index 603c864..d71644c 100644 --- a/src/transform.js +++ b/src/transform.js @@ -20,29 +20,29 @@ Transforms an Oniguruma AST in-place to a `regex` AST. Targets `ESNext`, expecti then down-convert to the desired JS target version. @param {import('./parse.js').OnigurumaAst} ast @param {{ - allowBestEffort?: boolean; allowSubclassBasedEmulation?: boolean; bestEffortTarget?: keyof Target; + emulation?: 'strict' | 'default' | 'loose'; }} [options] @returns {RegexAst} */ function transform(ast, options) { const opts = { - // A couple edge cases exist where options `allowBestEffort` and `bestEffortTarget` are used: + // A couple edge cases exist where options `emulation` and `bestEffortTarget` are used: // - `VariableLengthCharacterSet` kind `grapheme` (`\X`): An exact representation would require // heavy Unicode data; a best-effort approximation requires knowing the target. 
// - `CharacterSet` kind `posix` with values `graph` and `print`: Their complex exact // representations are hard to change after the fact in the generator to a best-effort // approximation based on the target, so produce the appropriate structure here. - allowBestEffort: true, allowSubclassBasedEmulation: false, bestEffortTarget: 'ESNext', + emulation: 'default', ...options, }; // AST changes that work together with a `RegExp` subclass to add advanced emulation const strategy = opts.allowSubclassBasedEmulation ? applySubclassStrategies(ast) : null; const firstPassState = { - allowBestEffort: opts.allowBestEffort, + emulation: opts.emulation, flagDirectivesByAlt: new Map(), minTargetEs2024: isMinTarget(opts.bestEffortTarget, 'ES2024'), // Subroutines can appear before the groups they ref, so collect reffed nodes for a second pass @@ -149,7 +149,7 @@ const FirstPassVisitor = { subroutineRefMap.set(name ?? number, node); }, - CharacterSet({node, replaceWith}, {allowBestEffort, minTargetEs2024}) { + CharacterSet({node, replaceWith}, {emulation, minTargetEs2024}) { const {kind, negate, value} = node; if (kind === AstCharacterSetKinds.any) { replaceWith(createUnicodeProperty('Any')); @@ -159,8 +159,8 @@ const FirstPassVisitor = { replaceWith(parseFragment(r`[^\n]`)); } else if (kind === AstCharacterSetKinds.posix) { if (!minTargetEs2024 && (value === 'graph' || value === 'print')) { - if (!allowBestEffort) { - throw new Error(`POSIX class "${value}" requires option allowBestEffort or min target ES2024`); + if (emulation === 'strict') { + throw new Error(`POSIX class "${value}" requires min target ES2024 or non-strict emulation`); } let ascii = { graph: '!-~', @@ -303,13 +303,13 @@ const FirstPassVisitor = { } }, - VariableLengthCharacterSet({node, replaceWith}, {allowBestEffort, minTargetEs2024}) { + VariableLengthCharacterSet({node, replaceWith}, {emulation, minTargetEs2024}) { const {kind} = node; if (kind === AstVariableLengthCharacterSetKinds.newline) { 
replaceWith(parseFragment('(?>\r\n?|[\n\v\f\x85\u2028\u2029])')); } else if (kind === AstVariableLengthCharacterSetKinds.grapheme) { - if (!allowBestEffort) { - throw new Error(r`Use of "\X" requires option allowBestEffort`); + if (emulation === 'strict') { + throw new Error(r`Use of "\X" requires non-strict emulation`); } // `emojiRegex` is more permissive than `\p{RGI_Emoji}` since it allows over/under-qualified // emoji using a general pattern that matches any Unicode sequence following the structure of diff --git a/src/unicode.js b/src/unicode.js index 287a82f..0679789 100644 --- a/src/unicode.js +++ b/src/unicode.js @@ -203,7 +203,7 @@ const LowerToTitleCaseMap = new Map([ // (see: POSIX bracket: Unicode Case) // Note: Handling in the transformer assumes all values here are a single, negateable node that's // not pre-negated at the top level. It also uses ASCII versions of `graph` and `print` for target -// `ES2018` (which doesn't allow intersection) if `allowBestEffort` +// `ES2018` (which doesn't allow intersection) if `emulation` is not `strict` const PosixClassesMap = new Map([ ['alnum', r`[\p{Alpha}\p{Nd}]`], ['alpha', r`\p{Alpha}`],