Skip to content

Commit

Permalink
Add flag S; Unicode \s by default
Browse files Browse the repository at this point in the history
  • Loading branch information
slevithan committed Nov 21, 2024
1 parent db1d8bb commit f5bca8d
Show file tree
Hide file tree
Showing 8 changed files with 77 additions and 43 deletions.
16 changes: 13 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ Disables advanced emulation that relies on returning a `RegExp` subclass, result

### `flags`

Oniguruma flags; a string with `i`, `m`, `x`, and `W` in any order (all optional).
Oniguruma flags; a string with `i`, `m`, `x`, `D`, `S`, and `W` in any order (all optional).

Flags can also be specified via modifiers in the pattern.

Expand Down Expand Up @@ -265,7 +265,7 @@ Notice that nearly every feature below has at least subtle differences from Java
</tr>

<tr valign="top">
<th align="left" rowspan="7">Flags</th>
<th align="left" rowspan="8">Flags</th>
<td colspan="5"><i>Supported in top-level flags and pattern modifiers</i></td>
</tr>
<tr valign="top">
Expand Down Expand Up @@ -312,6 +312,15 @@ Notice that nearly every feature below has at least subtle differences from Java
✔ ASCII <code>\d</code>, <code>\p{Digit}</code>, <code>[[:digit:]]</code><br>
</td>
</tr>
<tr valign="top">
<td>Space is ASCII</td>
<td><code>S</code></td>
<td align="middle">✅</td>
<td align="middle">✅</td>
<td>
✔ ASCII <code>\s</code>, <code>\p{Space}</code>, <code>[[:space:]]</code><br>
</td>
</tr>
<tr valign="top">
<td>Word is ASCII</td>
<td><code>W</code></td>
Expand Down Expand Up @@ -471,7 +480,8 @@ Notice that nearly every feature below has at least subtle differences from Java
<td align="middle">✅</td>
<td align="middle">✅</td>
<td>
✔ ASCII (≠ JS)<br>
✔ Unicode by default<br>
✔ Compared to JS's Unicode <code>\s</code>: excludes <code>\uFEFF</code>, includes <code>\x85</code><br>
</td>
</tr>
<tr valign="top">
Expand Down
3 changes: 3 additions & 0 deletions demo/demo.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ const state = {
m: getValue('flag-m'),
x: getValue('flag-x'),
D: getValue('flag-D'),
S: getValue('flag-S'),
W: getValue('flag-W'),
},
opts: {
Expand Down Expand Up @@ -76,6 +77,8 @@ function showTranspiled() {
state.flags.x ? 'x' : ''
}${
state.flags.D ? 'D' : ''
}${
state.flags.S ? 'S' : ''
}${
state.flags.W ? 'W' : ''
}`,
Expand Down
5 changes: 5 additions & 0 deletions demo/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ <h2>Try it</h2>
<kbd>D</kbd>
<span class="tip tip-sm">Digit is ASCII</span>
</label>
<label>
<input type="checkbox" id="flag-S" onchange="setFlag('S', this.checked)">
<kbd>S</kbd>
<span class="tip tip-sm">Space is ASCII</span>
</label>
<label>
<input type="checkbox" id="flag-W" onchange="setFlag('W', this.checked)">
<kbd>W</kbd>
Expand Down
2 changes: 1 addition & 1 deletion src/options.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ function getOptions(options) {
// Disables advanced emulation that relies on returning a `RegExp` subclass, resulting in
// certain patterns not being emulatable.
avoidSubclass: false,
// Oniguruma flags; a string with `i`, `m`, `x`, and `W` in any order (all optional).
// Oniguruma flags; a string with `i`, `m`, `x`, `D`, `S`, and `W` in any order (all optional).
// Oniguruma's `m` is equivalent to JavaScript's `s` (`dotAll`).
flags: '',
// Include JavaScript flag `g` (`global`) in the result.
Expand Down
3 changes: 2 additions & 1 deletion src/parse.js
Original file line number Diff line number Diff line change
Expand Up @@ -542,13 +542,14 @@ function createDirectiveFromToken({kind, flags}) {
return node;
}

function createFlags({ignoreCase, dotAll, extended, digitIsAscii, wordIsAscii}) {
// Builds the AST node of type `AstTypes.Flags`. Each flag value is copied from the argument as-is
// (including `undefined` when a flag is not provided), with `type` as the first property.
function createFlags({ignoreCase, dotAll, extended, digitIsAscii, spaceIsAscii, wordIsAscii}) {
  // Collect the flag values in the order they appear on the resulting node
  const flagValues = {ignoreCase, dotAll, extended, digitIsAscii, spaceIsAscii, wordIsAscii};
  return {type: AstTypes.Flags, ...flagValues};
}
Expand Down
6 changes: 3 additions & 3 deletions src/tokenize.js
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ function tokenize(pattern, flags = '') {
if (typeof pattern !== 'string') {
throw new Error('String expected as pattern');
}
if (!/^[imxDW]*$/.test(flags)) {
if (!/^[imxDSW]*$/.test(flags)) {
throw new Error(`Flags "${flags}" includes unsupported value`);
}
const xStack = [flags.includes('x')];
Expand Down Expand Up @@ -196,9 +196,9 @@ function tokenize(pattern, flags = '') {
dotAll: flags.includes('m'),
// Flag x is fully handled during tokenization
extended: flags.includes('x'),
// Flag D is currently only supported as a top-level flag
// Flags D, S, W are currently only supported as top-level flags
digitIsAscii: flags.includes('D'),
// Flag W is currently only supported as a top-level flag
spaceIsAscii: flags.includes('S'),
wordIsAscii: flags.includes('W'),
},
};
Expand Down
75 changes: 47 additions & 28 deletions src/transform.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import {AstAssertionKinds, AstCharacterSetKinds, AstDirectiveKinds, AstTypes, As
import {applySubclassStrategies, isLoneGLookaround} from './subclass.js';
import {tokenize} from './tokenize.js';
import {traverse} from './traverse.js';
import {defaultWordChar, JsUnicodeProperties, PosixClassesMap} from './unicode.js';
import {JsUnicodeProperties, PosixClassesMap} from './unicode.js';
import {cp, getNewCurrentFlags, getOrCreate, isMinTarget, r} from './utils.js';
import {isLookaround, isZeroLengthNode} from './utils-node.js';
import emojiRegex from 'emoji-regex-xs';
Expand Down Expand Up @@ -58,6 +58,7 @@ function transform(ast, options) {
subroutineRefMap: new Map(),
supportedGNodes: new Set(),
digitIsAscii: ast.flags.digitIsAscii,
spaceIsAscii: ast.flags.spaceIsAscii,
wordIsAscii: ast.flags.wordIsAscii,
};
traverse({node: ast}, firstPassState, FirstPassVisitor);
Expand Down Expand Up @@ -156,16 +157,42 @@ const FirstPassVisitor = {
subroutineRefMap.set(name ?? number, node);
},

CharacterSet({node, replaceWith}, {accuracy, minTargetEs2024, digitIsAscii, wordIsAscii}) {
CharacterSet({node, replaceWith}, {accuracy, minTargetEs2024, digitIsAscii, spaceIsAscii, wordIsAscii}) {
const {kind, negate, value} = node;
// Flag D with `\d`, `\p{Digit}`, `[[:digit:]]`
if (digitIsAscii && (kind === AstCharacterSetKinds.digit || value === 'digit')) {
replaceWith(createCharacterSet(AstCharacterSetKinds.digit, {negate}));
return;
}
// Flag S with `\s`, `\p{Space}`, `[[:space:]]`
if (spaceIsAscii && (kind === AstCharacterSetKinds.space || value === 'space')) {
replaceWith(setNegate(parseFragment(asciiSpaceChar), negate));
return;
}
// Flag W with `\w`, `\p{Word}`, `[[:word:]]`
if (wordIsAscii && (kind === AstCharacterSetKinds.word || value === 'word')) {
replaceWith(createCharacterSet(AstCharacterSetKinds.word, {negate}));
return;
}
if (kind === AstCharacterSetKinds.any) {
replaceWith(createUnicodeProperty('Any'));
} else if (kind === AstCharacterSetKinds.digit && !digitIsAscii) {
} else if (kind === AstCharacterSetKinds.digit) {
replaceWith(createUnicodeProperty('Nd', {negate}));
} else if (kind === AstCharacterSetKinds.hex) {
replaceWith(createUnicodeProperty('AHex', {negate}));
} else if (kind === AstCharacterSetKinds.non_newline) {
replaceWith(parseFragment(r`[^\n]`));
} else if (kind === AstCharacterSetKinds.space) {
// Can't use JS's Unicode-based `\s` since unlike Onig it includes `\uFEFF`, excludes `\x85`
replaceWith(createUnicodeProperty('space', {negate}));
} else if (kind === AstCharacterSetKinds.word) {
replaceWith(setNegate(parseFragment(defaultWordChar), negate));
} else if (kind === AstCharacterSetKinds.property) {
if (!JsUnicodeProperties.has(value)) {
// Assume it's a script; no error checking is the price for avoiding heavyweight Unicode
// data for all script names
node.key = 'sc';
}
} else if (kind === AstCharacterSetKinds.posix) {
if (!minTargetEs2024 && (value === 'graph' || value === 'print')) {
if (accuracy === 'strict') {
Expand All @@ -177,33 +204,13 @@ const FirstPassVisitor = {
}[value];
if (negate) {
// POSIX classes are always nested in a char class; manually invert the range rather than
// using `[^...]` so it can be unwrapped, since ES2018 doesn't support nested classes
// using `[^]` so it can be unwrapped since ES2018 doesn't support nested classes
ascii = `\0-${cp(ascii.codePointAt(0) - 1)}${cp(ascii.codePointAt(2) + 1)}-\u{10FFFF}`;
}
replaceWith(parseFragment(`[${ascii}]`));
} else if (value === 'digit' && digitIsAscii) {
replaceWith(createCharacterSet(AstCharacterSetKinds.digit, {negate}));
} else if (value === 'word' && wordIsAscii) {
replaceWith(createCharacterSet(AstCharacterSetKinds.word, {negate}));
} else {
const negateableNode = parseFragment(PosixClassesMap.get(value));
negateableNode.negate = negate;
replaceWith(negateableNode);
}
} else if (kind === AstCharacterSetKinds.property) {
if (!JsUnicodeProperties.has(value)) {
// Assume it's a script
node.key = 'sc';
replaceWith(setNegate(parseFragment(PosixClassesMap.get(value)), negate));
}
} else if (kind === AstCharacterSetKinds.space) {
// Unlike JS, Onig's `\s` matches only ASCII tab, space, LF, VT, FF, and CR
const s = parseFragment('[ \t\n\v\f\r]');
s.negate = negate;
replaceWith(s);
} else if (kind === AstCharacterSetKinds.word && !wordIsAscii) {
const w = parseFragment(defaultWordChar);
w.negate = negate;
replaceWith(w);
}
},

Expand Down Expand Up @@ -232,9 +239,11 @@ const FirstPassVisitor = {

Flags({node, parent}) {
// Remove Onig flags that aren't available in JS
delete node.extended; // Flag x
delete node.digitIsAscii; // Flag D
delete node.wordIsAscii; // Flag W
[ 'digitIsAscii', // Flag D
'extended', // Flag x
'spaceIsAscii', // Flag S
'wordIsAscii', // Flag W
].forEach(f => delete node[f]);
Object.assign(node, {
// JS flag g; no Onig equiv
global: false,
Expand Down Expand Up @@ -567,6 +576,11 @@ const ThirdPassVisitor = {
},
};

// Character class source for ASCII whitespace: the range tab through carriage return
// (`\t\n\v\f\r`) plus space (`\x20`). Parsed and used as the replacement for space character
// sets when flag S (`spaceIsAscii`) is on
const asciiSpaceChar = '[\t-\r ]';
// Character class source used as the replacement for the `word` character set kind when it isn't
// handled as ASCII by flag W. Different than `PosixClassesMap`'s `word`
const defaultWordChar = r`[\p{L}\p{M}\p{N}\p{Pc}]`;

function adoptAndSwapKids(parent, kids) {
kids.forEach(kid => kid.parent = parent);
parent[getContainerAccessor(parent)] = kids;
Expand Down Expand Up @@ -788,6 +802,11 @@ function prepContainer(node, kids) {
return node;
}

// Stores `negate` on `node` (mutating it in place) and returns that same node, so the call can be
// nested directly inside another expression
function setNegate(node, negate) {
  return Object.assign(node, {negate});
}

function traverseReplacement(replacement, {parent, key, container}, state, visitor) {
traverse({
// Don't use the `node` from `path`
Expand Down
10 changes: 3 additions & 7 deletions src/unicode.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@ const CharsWithoutIgnoreCaseExpansion = new Set([
cp(0x131), // ı
]);

// Different than `PosixClassesMap`'s `word`
const defaultWordChar = r`[\p{L}\p{M}\p{N}\p{Pc}]`;

function getIgnoreCaseMatchChars(char) {
// Some chars should not match the chars they case swap to
if (CharsWithoutIgnoreCaseExpansion.has(char)) {
Expand Down Expand Up @@ -241,12 +238,12 @@ const PosixProperties = new Set([
'print',
'word',
'xdigit',
// The following are available with the same name in JS (see `JsUnicodeProperties`)
// Explicitly include `digit` for the sake of flag D (`digitIsAscii`) handling as POSIX
'digit', // (JS: digit)
// The following are available with the same name in JS (see `JsUnicodeProperties`), so can be
// handled as standard Unicode properties
// 'alpha', // (JS: Alpha)
// 'ascii', // (JS: ASCII)
// 'cntrl', // (JS: cntrl)
// 'digit', // (JS: digit)
// 'lower', // (JS: Lower)
// 'punct', // (JS: punct)
// 'space', // (JS: space)
Expand Down Expand Up @@ -290,7 +287,6 @@ const UnicodePropertiesWithSpecificCase = new Set([
]);

export {
defaultWordChar,
getIgnoreCaseMatchChars,
JsUnicodeProperties,
JsUnicodePropertiesMap,
Expand Down

0 comments on commit f5bca8d

Please sign in to comment.