From d594d532594d0e56d9c32501d5b0139cee81a21c Mon Sep 17 00:00:00 2001
From: Steven Levithan
- Flags:
+
-
-
-
-
-
- More details
-Specifically, this option enables the following additional features, depending on `target`:
+#### `default` mode
+
+Supports all features of `strict` mode, plus the following additional features, depending on `target`:
- All targets (`ESNext` and earlier):
- Enables use of `\X` using a close approximation of a Unicode extended grapheme cluster.
- - Enables recursion (e.g. via `\g<0>`) using a depth limit specified via option `maxRecursionDepth`.
+ - Enables recursion (e.g. via `\g<0>`) with a depth limit specified by option `maxRecursionDepth`.
- `ES2024` and earlier:
- Enables use of case-insensitive backreferences to case-sensitive groups.
- `ES2018`:
- Enables use of POSIX classes `[:graph:]` and `[:print:]` using ASCII-based versions rather than the Unicode versions available for `ES2024` and later. Other POSIX classes are always based on Unicode.
+
+#### `loose` mode
+
+Supports all features of `default` mode, plus the following:
+
+- Silences errors for unsupported uses of the search-start anchor `\G` (a flexible assertion that doesn't have a direct equivalent in JavaScript).
+ - Oniguruma-To-ES uses a variety of strategies to accurately emulate many common uses of `\G`. When using `loose` mode, if a `\G` assertion is found that doesn't have a known emulation strategy, the `\G` is simply removed and JavaScript's `y` (`sticky`) flag is added. This might lead to some false positives and negatives.
More details
-Using a high limit is not a problem if needed. Although there can be a performance cost (minor unless it's exacerbating an existing issue with runaway backtracking), there is no effect on regexes that don't use recursion.
+Using a high limit has a (usually tiny) impact on transpilation and regex performance. Generally, this is only a problem if the regex has an existing issue with runaway backtracking that recursion exacerbates.
+
+Higher limits have no effect on regexes that don't use recursion, so you should feel free to increase this if helpful.
More details
- `ES2018`: Uses JS flag `u`.
@@ -887,10 +906,10 @@ The table above doesn't include all aspects that Oniguruma-To-ES emulates (inclu
1. Target `ES2018` doesn't allow Unicode property names added in JavaScript specifications after ES2018 to be used.
2. Unicode blocks are easily emulatable but their character data would significantly increase library weight. They're also a deeply flawed and arguably-unuseful feature, given the ability to use Unicode scripts and other properties.
-3. With target `ES2018`, the specific POSIX classes `[:graph:]` and `[:print:]` are an error if option `allowBestEffort` is `false`, and they use ASCII-based versions rather than the Unicode versions available for target `ES2024` and later.
+3. With target `ES2018`, the specific POSIX classes `[:graph:]` and `[:print:]` are an error if option `emulation` is `'strict'`, and they use ASCII-based versions rather than the Unicode versions available for target `ES2024` and later.
4. Target `ES2018` doesn't support nested *negated* character classes.
5. It's not an error for *numbered* backreferences to come before their referenced group in Oniguruma, but an error is the best path for Oniguruma-To-ES because (1) most placements are mistakes and can never match (based on the Oniguruma behavior for backreferences to nonparticipating groups), (2) erroring matches the behavior of named backreferences, and (3) the edge cases where they're matchable rely on rules for backreference resetting within quantified groups that are different in JavaScript and aren't emulatable. Note that it's not a backreference in the first place if using `\10` or higher and not as many capturing groups are defined to the left (it's an octal or identity escape).
-6. The maximum recursion depth is specified by option `maxRecursionDepth`. Use of recursion results in an error if `maxRecursionDepth` is `null` or `allowBestEffort` is `false`. Some forms of recursion (recursion with backreferences, and multiple recursions in the same pattern) aren't yet supported. Note that, because recursion is bounded, patterns that fail due to infinite recursion in Oniguruma might find a match in Oniguruma-To-ES. Future versions will detect this and throw an error.
+6. The recursion depth limit is specified by option `maxRecursionDepth`. Some forms of recursion (multiple recursions in the same pattern, and recursion with backreferences) aren't yet supported. Patterns that would error in Oniguruma due to triggering infinite recursion might find a match in Oniguruma-To-ES since recursion is bounded (future versions will detect this and error at transpilation time).
## ㊗️ Unicode / mixed case-sensitivity
diff --git a/demo/demo.css b/demo/demo.css
index 8bc057c..18b412f 100644
--- a/demo/demo.css
+++ b/demo/demo.css
@@ -20,7 +20,7 @@ main {
border-radius: 0 0 15px 15px;
}
-h1, h2, ul, p, pre, details, summary {
+h1, h2, ul, p, pre, summary {
margin-bottom: 12px;
}
@@ -37,20 +37,28 @@ code {
background-color: #f6f6f6;
}
+kbd {
+ padding: 0 3px;
+}
+
small {
font-size: 0.8em;
}
-td {
- padding-right: 3vw;
+.hidden {
+ display: none;
}
summary {
cursor: pointer;
}
-label, .label {
- margin-right: 0.4em;
+label {
+ margin-right: 0.5em;
+}
+
+label img {
+ vertical-align: middle;
}
input[type='checkbox'] {
@@ -62,20 +70,18 @@ input[type='checkbox'] {
}
input[type='number'] {
+ width: 3.5em;
+ padding: 3px;
+ font-size: 0.9em;
border: 1px solid #bbb;
- height: 1.6em;
border-radius: 4px;
- padding-left: 4px;
- width: 3.5em;
}
select {
- padding: 4px 35px 4px 10px;
+ padding: 3px;
font-size: 0.9em;
border: 1px solid #bbb;
border-radius: 4px;
- appearance: none;
- background: url(https://upload.wikimedia.org/wikipedia/commons/9/99/Unofficial_JavaScript_logo_2.svg) 96% / 15% no-repeat #f6f6f6;
}
textarea {
@@ -92,12 +98,20 @@ textarea:focus {
box-shadow: 0 0 8px #80c0ff;
}
-pre, code, textarea {
+pre, code, kbd, textarea {
font-family: Consolas, "Source Code Pro", Monospace;
font-size: 0.9em;
border-radius: 0.375em;
}
+#more-options {
+ display: flex;
+}
+
+#more-options div {
+ margin-right: 3%;
+}
+
#output, textarea {
padding: 0.6em;
white-space: pre-wrap;
@@ -133,7 +147,3 @@ pre, code, textarea {
margin-top: -12px;
padding: 0.6em;
}
-
-.hidden {
- display: none;
-}
diff --git a/demo/demo.js b/demo/demo.js
index 72a8bb8..ddeb215 100644
--- a/demo/demo.js
+++ b/demo/demo.js
@@ -5,11 +5,11 @@ const state = {
x: getValue('flag-x'),
},
opts: {
- allowBestEffort: getValue('option-allow-best-effort'),
- allowSubclassBasedEmulation: getValue('option-subclass'),
+ allowSubclassBasedEmulation: getValue('option-allowSubclassBasedEmulation'),
+ emulation: getValue('option-emulation'),
global: getValue('option-global'),
- hasIndices: getValue('option-has-indices'),
- maxRecursionDepth: getValue('option-max-recursion-depth'),
+ hasIndices: getValue('option-hasIndices'),
+ maxRecursionDepth: getValue('option-maxRecursionDepth'),
optimize: getValue('option-optimize'),
target: getValue('option-target'),
},
diff --git a/demo/index.html b/demo/index.html
index 4c9787f..23c549b 100644
--- a/demo/index.html
+++ b/demo/index.html
@@ -19,75 +19,88 @@
Try it
target
:
-
+
+
More options
-
-
+
-
-
-
-
-
+
-
+
+
-
-
-
+
+
+
-
+
+
-
+
+
+
-
+ +
+ + -✅ This regex is emulated through the combination of changes in the pattern and the use of a RegExp
subclass with custom logic.
✅ A RegExp
subclass instance (with a custom execution strategy) is returned for this pattern. It remains a native JavaScript regex and works the same as RegExp
in all contexts.
The output shows the result of calling toRegExp
. Oniguruma-To-ES includes functions to generate additional formats: compile
, toOnigurumaAst
, and toRegexAst
(for an AST based on regex
). You can run all of these from the console on this page, and you can pretty-print AST results by passing them to printAst
.
diff --git a/spec/match-backreference.spec.js b/spec/match-backreference.spec.js
index ed1b970..3720de2 100644
--- a/spec/match-backreference.spec.js
+++ b/spec/match-backreference.spec.js
@@ -8,7 +8,7 @@ beforeEach(() => {
});
describe('Backreference', () => {
- // TODO: Test that case-insensitive backref to case-sensitive group requires allowBestEffort or ESNext
+ // TODO: Test that case-insensitive backref to case-sensitive group requires `ESNext` or non-`strict` emulation
describe('numbered backref', () => {
it('should rematch the captured text', () => {
diff --git a/spec/match-recursion.spec.js b/spec/match-recursion.spec.js
index fe38f1a..4f12dbe 100644
--- a/spec/match-recursion.spec.js
+++ b/spec/match-recursion.spec.js
@@ -7,12 +7,12 @@ beforeEach(() => {
});
describe('Recursion', () => {
- it('should throw if recursion used with allowBestEffort false', () => {
- expect(() => compile(r`a\g<0>?`, '', {allowBestEffort: false})).toThrow();
- expect(() => compile('', '', {allowBestEffort: false})).not.toThrow();
+ it('should throw if recursion used with strict emulation', () => {
+ expect(() => compile(r`a\g<0>?`, '', {emulation: 'strict'})).toThrow();
+ expect(() => compile('', '', {emulation: 'strict'})).not.toThrow();
});
- it('should throw if recursion used with maxRecursionDepth null', () => {
+ it('should throw if recursion used with null maxRecursionDepth', () => {
expect(() => compile(r`a\g<0>?`, '', {maxRecursionDepth: null})).toThrow();
expect(() => compile('', '', {maxRecursionDepth: null})).not.toThrow();
});
diff --git a/src/compile.js b/src/compile.js
index a017df5..09398f5 100644
--- a/src/compile.js
+++ b/src/compile.js
@@ -8,7 +8,7 @@ import {recursion} from 'regex-recursion';
/**
@typedef {{
- allowBestEffort?: boolean;
+ emulation?: 'strict' | 'default' | 'loose';
global?: boolean;
hasIndices?: boolean;
maxRecursionDepth?: number | null;
@@ -56,8 +56,8 @@ function compileInternal(pattern, flags, options) {
skipBackrefValidation: opts.tmGrammar,
});
const regexAst = transform(onigurumaAst, {
- allowBestEffort: opts.allowBestEffort,
allowSubclassBasedEmulation: opts.allowSubclassBasedEmulation,
+ emulation: opts.emulation,
bestEffortTarget: opts.target,
});
const generated = generate(regexAst, opts);
@@ -90,19 +90,19 @@ function getOptions(options) {
}
// Set default values
return {
- // Allows results that differ from Oniguruma in rare cases. If `false`, throws if the pattern
- // can't be emulated with identical behavior
- allowBestEffort: true,
// Allows advanced emulation strategies that rely on returning a `RegExp` subclass with an
// overridden `exec` method. A subclass is only used if needed for the given pattern
allowSubclassBasedEmulation: false,
+ // Sets the level of emulation strictness; `default` is best in most cases. If `strict`, throws
+ // if the pattern can't be emulated with identical behavior (even in rare edge cases) for the
+ // given target
+ emulation: 'default',
// Include JS flag `g` in the result
global: false,
// Include JS flag `d` in the result
hasIndices: false,
- // If `null`, any use of recursion throws. If an integer between `2` and `100` (and
- // `allowBestEffort` is on), common recursion forms are supported and recurse up to the
- // specified max depth
+ // If an integer between `2` and `100`, common recursion forms are supported and recurse up to
+ // the specified depth limit. If set to `null`, any use of recursion results in an error
maxRecursionDepth: 6,
// Simplify the generated pattern when it doesn't change the meaning
optimize: true,
@@ -110,7 +110,7 @@ function getOptions(options) {
// faster processing, simpler generated source, and support for additional features
target: 'ES2024',
// Leave disabled unless the regex will be used in a TextMate grammar processor that merges
- // `begin` and `end` patterns
+ // backreferences across `begin` and `end` patterns
tmGrammar: false,
...options,
};
diff --git a/src/generate.js b/src/generate.js
index 710bfbd..afb3490 100644
--- a/src/generate.js
+++ b/src/generate.js
@@ -56,13 +56,13 @@ function generate(ast, options) {
};
let lastNode = null;
const state = {
- allowBestEffort: opts.allowBestEffort,
appliedGlobalFlags,
captureFlagIMap: new Map(),
currentFlags: {
dotAll: ast.flags.dotAll,
ignoreCase: ast.flags.ignoreCase,
},
+ emulation: opts.emulation,
groupNames: new Set(),
inCharClass: false,
lastNode,
@@ -226,11 +226,11 @@ function genBackreference({ref}, state) {
}
if (
!state.useFlagMods &&
- !state.allowBestEffort &&
+ state.emulation === 'strict' &&
state.currentFlags.ignoreCase &&
!state.captureFlagIMap.get(ref)
) {
- throw new Error('Use of case-insensitive backref to case-sensitive group requires option allowBestEffort or target ESNext');
+ throw new Error('Use of case-insensitive backref to case-sensitive group requires target ESNext or non-strict emulation');
}
return '\\' + ref;
}
@@ -342,8 +342,9 @@ function genCharacterSet({kind, negate, value, key}, state) {
UnicodePropertiesWithSpecificCase.has(value)
) {
// Support for this would require heavy Unicode data. Could change e.g. `\p{Lu}` to `\p{LC}`
- // if `allowBestEffort` (since it's close but not 100%), but this wouldn't work for e.g.
- // `\p{Lt}` and in any case it's probably a mistake if using these props case-insensitively
+ // if not using `strict` emulation (since it's close but not 100%), but this wouldn't work
+ // for e.g. `\p{Lt}`, and in any case, it's probably user error if using these case-specific
+ // props case-insensitively
throw new Error(`Unicode property "${value}" can't be case-insensitive when other chars have specific case`);
}
return `${negate ? r`\P` : r`\p`}{${key ? `${key}=` : ''}${value}}`;
@@ -392,8 +393,8 @@ function genRecursion({ref}, state) {
if (!rDepth) {
throw new Error('Use of recursion disabled');
}
- if (!state.allowBestEffort) {
- throw new Error('Use of recursion requires option allowBestEffort');
+ if (state.emulation === 'strict') {
+ throw new Error('Use of recursion requires non-strict emulation');
}
// Using the syntax supported by `regex-recursion`
return ref === 0 ? `(?R=${rDepth})` : r`\g<${ref}&R=${rDepth}>`;
diff --git a/src/transform.js b/src/transform.js
index 603c864..d71644c 100644
--- a/src/transform.js
+++ b/src/transform.js
@@ -20,29 +20,29 @@ Transforms an Oniguruma AST in-place to a `regex` AST. Targets `ESNext`, expecti
then down-convert to the desired JS target version.
@param {import('./parse.js').OnigurumaAst} ast
@param {{
- allowBestEffort?: boolean;
allowSubclassBasedEmulation?: boolean;
bestEffortTarget?: keyof Target;
+ emulation?: 'strict' | 'default' | 'loose';
}} [options]
@returns {RegexAst}
*/
function transform(ast, options) {
const opts = {
- // A couple edge cases exist where options `allowBestEffort` and `bestEffortTarget` are used:
+ // A couple edge cases exist where options `emulation` and `bestEffortTarget` are used:
// - `VariableLengthCharacterSet` kind `grapheme` (`\X`): An exact representation would require
// heavy Unicode data; a best-effort approximation requires knowing the target.
// - `CharacterSet` kind `posix` with values `graph` and `print`: Their complex exact
// representations are hard to change after the fact in the generator to a best-effort
// approximation based on the target, so produce the appropriate structure here.
- allowBestEffort: true,
allowSubclassBasedEmulation: false,
bestEffortTarget: 'ESNext',
+ emulation: 'default',
...options,
};
// AST changes that work together with a `RegExp` subclass to add advanced emulation
const strategy = opts.allowSubclassBasedEmulation ? applySubclassStrategies(ast) : null;
const firstPassState = {
- allowBestEffort: opts.allowBestEffort,
+ emulation: opts.emulation,
flagDirectivesByAlt: new Map(),
minTargetEs2024: isMinTarget(opts.bestEffortTarget, 'ES2024'),
// Subroutines can appear before the groups they ref, so collect reffed nodes for a second pass
@@ -149,7 +149,7 @@ const FirstPassVisitor = {
subroutineRefMap.set(name ?? number, node);
},
- CharacterSet({node, replaceWith}, {allowBestEffort, minTargetEs2024}) {
+ CharacterSet({node, replaceWith}, {emulation, minTargetEs2024}) {
const {kind, negate, value} = node;
if (kind === AstCharacterSetKinds.any) {
replaceWith(createUnicodeProperty('Any'));
@@ -159,8 +159,8 @@ const FirstPassVisitor = {
replaceWith(parseFragment(r`[^\n]`));
} else if (kind === AstCharacterSetKinds.posix) {
if (!minTargetEs2024 && (value === 'graph' || value === 'print')) {
- if (!allowBestEffort) {
- throw new Error(`POSIX class "${value}" requires option allowBestEffort or min target ES2024`);
+ if (emulation === 'strict') {
+ throw new Error(`POSIX class "${value}" requires min target ES2024 or non-strict emulation`);
}
let ascii = {
graph: '!-~',
@@ -303,13 +303,13 @@ const FirstPassVisitor = {
}
},
- VariableLengthCharacterSet({node, replaceWith}, {allowBestEffort, minTargetEs2024}) {
+ VariableLengthCharacterSet({node, replaceWith}, {emulation, minTargetEs2024}) {
const {kind} = node;
if (kind === AstVariableLengthCharacterSetKinds.newline) {
replaceWith(parseFragment('(?>\r\n?|[\n\v\f\x85\u2028\u2029])'));
} else if (kind === AstVariableLengthCharacterSetKinds.grapheme) {
- if (!allowBestEffort) {
- throw new Error(r`Use of "\X" requires option allowBestEffort`);
+ if (emulation === 'strict') {
+ throw new Error(r`Use of "\X" requires non-strict emulation`);
}
// `emojiRegex` is more permissive than `\p{RGI_Emoji}` since it allows over/under-qualified
// emoji using a general pattern that matches any Unicode sequence following the structure of
diff --git a/src/unicode.js b/src/unicode.js
index 287a82f..0679789 100644
--- a/src/unicode.js
+++ b/src/unicode.js
@@ -203,7 +203,7 @@ const LowerToTitleCaseMap = new Map([
//