Add flag S; Unicode \s by default

slevithan · Nov 21, 2024 · f5bca8d · f5bca8d
1 parent db1d8bb
commit f5bca8d
Show file tree

Hide file tree

Showing 8 changed files with 77 additions and 43 deletions.
diff --git a/README.md b/README.md
@@ -181,7 +181,7 @@ Disables advanced emulation that relies on returning a `RegExp` subclass, result
 
 ### `flags`
 
-Oniguruma flags; a string with `i`, `m`, `x`, and `W` in any order (all optional).
+Oniguruma flags; a string with `i`, `m`, `x`, `D`, `S`, and `W` in any order (all optional).
 
 Flags can also be specified via modifiers in the pattern.
 
@@ -265,7 +265,7 @@ Notice that nearly every feature below has at least subtle differences from Java
   </tr>
 
   <tr valign="top">
-    <th align="left" rowspan="7">Flags</th>
+    <th align="left" rowspan="8">Flags</th>
     <td colspan="5"><i>Supported in top-level flags and pattern modifiers</i></td>
   </tr>
   <tr valign="top">
@@ -312,6 +312,15 @@ Notice that nearly every feature below has at least subtle differences from Java
       ✔ ASCII <code>\d</code>, <code>\p{Digit}</code>, <code>[[:digit:]]</code><br>
     </td>
   </tr>
+  <tr valign="top">
+    <td>Space is ASCII</td>
+    <td><code>S</code></td>
+    <td align="middle">✅</td>
+    <td align="middle">✅</td>
+    <td>
+      ✔ ASCII <code>\s</code>, <code>\p{Space}</code>, <code>[[:space:]]</code><br>
+    </td>
+  </tr>
   <tr valign="top">
     <td>Word is ASCII</td>
     <td><code>W</code></td>
@@ -471,7 +480,8 @@ Notice that nearly every feature below has at least subtle differences from Java
     <td align="middle">✅</td>
     <td align="middle">✅</td>
     <td>
-      ✔ ASCII (≠ JS)<br>
+      ✔ Unicode by default<br>
+      ✔ Compared to JS's Unicode <code>\s</code>: excludes <code>\uFEFF</code>, includes <code>\x85</code><br>
     </td>
   </tr>
   <tr valign="top">

diff --git a/demo/demo.js b/demo/demo.js
@@ -11,6 +11,7 @@ const state = {
     m: getValue('flag-m'),
     x: getValue('flag-x'),
     D: getValue('flag-D'),
+    S: getValue('flag-S'),
     W: getValue('flag-W'),
   },
   opts: {
@@ -76,6 +77,8 @@ function showTranspiled() {
       state.flags.x ? 'x' : ''
     }${
       state.flags.D ? 'D' : ''
+    }${
+      state.flags.S ? 'S' : ''
     }${
       state.flags.W ? 'W' : ''
     }`,

diff --git a/demo/index.html b/demo/index.html
@@ -40,6 +40,11 @@ <h2>Try it</h2>
         <kbd>D</kbd>
         <span class="tip tip-sm">Digit is ASCII</span>
       </label>
+      <label>
+        <input type="checkbox" id="flag-S" onchange="setFlag('S', this.checked)">
+        <kbd>S</kbd>
+        <span class="tip tip-sm">Space is ASCII</span>
+      </label>
       <label>
         <input type="checkbox" id="flag-W" onchange="setFlag('W', this.checked)">
         <kbd>W</kbd>

diff --git a/src/options.js b/src/options.js
@@ -35,7 +35,7 @@ function getOptions(options) {
     // Disables advanced emulation that relies on returning a `RegExp` subclass, resulting in
     // certain patterns not being emulatable.
     avoidSubclass: false,
-    // Oniguruma flags; a string with `i`, `m`, `x`, and `W` in any order (all optional).
+    // Oniguruma flags; a string with `i`, `m`, `x`, `D`, `S`, and `W` in any order (all optional).
     // Oniguruma's `m` is equivalent to JavaScript's `s` (`dotAll`).
     flags: '',
     // Include JavaScript flag `g` (`global`) in the result.

diff --git a/src/parse.js b/src/parse.js
@@ -542,13 +542,14 @@ function createDirectiveFromToken({kind, flags}) {
   return node;
 }
 
-function createFlags({ignoreCase, dotAll, extended, digitIsAscii, wordIsAscii}) {
+function createFlags({ignoreCase, dotAll, extended, digitIsAscii, spaceIsAscii, wordIsAscii}) {
   return {
     type: AstTypes.Flags,
     ignoreCase,
     dotAll,
     extended,
     digitIsAscii,
+    spaceIsAscii,
     wordIsAscii,
   };
 }

diff --git a/src/tokenize.js b/src/tokenize.js
@@ -136,7 +136,7 @@ function tokenize(pattern, flags = '') {
   if (typeof pattern !== 'string') {
     throw new Error('String expected as pattern');
   }
-  if (!/^[imxDW]*$/.test(flags)) {
+  if (!/^[imxDSW]*$/.test(flags)) {
     throw new Error(`Flags "${flags}" includes unsupported value`);
   }
   const xStack = [flags.includes('x')];
@@ -196,9 +196,9 @@ function tokenize(pattern, flags = '') {
       dotAll: flags.includes('m'),
       // Flag x is fully handled during tokenization
       extended: flags.includes('x'),
-      // Flag D is currently only supported as a top-level flag
+      // Flags D, S, W are currently only supported as top-level flags
       digitIsAscii: flags.includes('D'),
-      // Flag W is currently only supported as a top-level flag
+      spaceIsAscii: flags.includes('S'),
       wordIsAscii: flags.includes('W'),
     },
   };

diff --git a/src/transform.js b/src/transform.js
@@ -3,7 +3,7 @@ import {AstAssertionKinds, AstCharacterSetKinds, AstDirectiveKinds, AstTypes, As
 import {applySubclassStrategies, isLoneGLookaround} from './subclass.js';
 import {tokenize} from './tokenize.js';
 import {traverse} from './traverse.js';
-import {defaultWordChar, JsUnicodeProperties, PosixClassesMap} from './unicode.js';
+import {JsUnicodeProperties, PosixClassesMap} from './unicode.js';
 import {cp, getNewCurrentFlags, getOrCreate, isMinTarget, r} from './utils.js';
 import {isLookaround, isZeroLengthNode} from './utils-node.js';
 import emojiRegex from 'emoji-regex-xs';
@@ -58,6 +58,7 @@ function transform(ast, options) {
     subroutineRefMap: new Map(),
     supportedGNodes: new Set(),
     digitIsAscii: ast.flags.digitIsAscii,
+    spaceIsAscii: ast.flags.spaceIsAscii,
     wordIsAscii: ast.flags.wordIsAscii,
   };
   traverse({node: ast}, firstPassState, FirstPassVisitor);
@@ -156,16 +157,42 @@ const FirstPassVisitor = {
     subroutineRefMap.set(name ?? number, node);
   },
 
-  CharacterSet({node, replaceWith}, {accuracy, minTargetEs2024, digitIsAscii, wordIsAscii}) {
+  CharacterSet({node, replaceWith}, {accuracy, minTargetEs2024, digitIsAscii, spaceIsAscii, wordIsAscii}) {
     const {kind, negate, value} = node;
+    // Flag D with `\d`, `\p{Digit}`, `[[:digit:]]`
+    if (digitIsAscii && (kind === AstCharacterSetKinds.digit || value === 'digit')) {
+      replaceWith(createCharacterSet(AstCharacterSetKinds.digit, {negate}));
+      return;
+    }
+    // Flag S with `\s`, `\p{Space}`, `[[:space:]]`
+    if (spaceIsAscii && (kind === AstCharacterSetKinds.space || value === 'space')) {
+      replaceWith(setNegate(parseFragment(asciiSpaceChar), negate));
+      return;
+    }
+    // Flag W with `\w`, `\p{Word}`, `[[:word:]]`
+    if (wordIsAscii && (kind === AstCharacterSetKinds.word || value === 'word')) {
+      replaceWith(createCharacterSet(AstCharacterSetKinds.word, {negate}));
+      return;
+    }
     if (kind === AstCharacterSetKinds.any) {
       replaceWith(createUnicodeProperty('Any'));
-    } else if (kind === AstCharacterSetKinds.digit && !digitIsAscii) {
+    } else if (kind === AstCharacterSetKinds.digit) {
       replaceWith(createUnicodeProperty('Nd', {negate}));
     } else if (kind === AstCharacterSetKinds.hex) {
       replaceWith(createUnicodeProperty('AHex', {negate}));
     } else if (kind === AstCharacterSetKinds.non_newline) {
       replaceWith(parseFragment(r`[^\n]`));
+    } else if (kind === AstCharacterSetKinds.space) {
+      // Can't use JS's Unicode-based `\s` since unlike Onig it includes `\uFEFF`, excludes `\x85`
+      replaceWith(createUnicodeProperty('space', {negate}));
+    } else if (kind === AstCharacterSetKinds.word) {
+      replaceWith(setNegate(parseFragment(defaultWordChar), negate));
+    } else if (kind === AstCharacterSetKinds.property) {
+      if (!JsUnicodeProperties.has(value)) {
+        // Assume it's a script; no error checking is the price for avoiding heavyweight Unicode
+        // data for all script names
+        node.key = 'sc';
+      }
     } else if (kind === AstCharacterSetKinds.posix) {
       if (!minTargetEs2024 && (value === 'graph' || value === 'print')) {
         if (accuracy === 'strict') {
@@ -177,33 +204,13 @@ const FirstPassVisitor = {
         }[value];
         if (negate) {
           // POSIX classes are always nested in a char class; manually invert the range rather than
-          // using `[^...]` so it can be unwrapped, since ES2018 doesn't support nested classes
+          // using `[^…]` so it can be unwrapped since ES2018 doesn't support nested classes
           ascii = `\0-${cp(ascii.codePointAt(0) - 1)}${cp(ascii.codePointAt(2) + 1)}-\u{10FFFF}`;
         }
         replaceWith(parseFragment(`[${ascii}]`));
-      } else if (value === 'digit' && digitIsAscii) {
-        replaceWith(createCharacterSet(AstCharacterSetKinds.digit, {negate}));
-      } else if (value === 'word' && wordIsAscii) {
-        replaceWith(createCharacterSet(AstCharacterSetKinds.word, {negate}));
       } else {
-        const negateableNode = parseFragment(PosixClassesMap.get(value));
-        negateableNode.negate = negate;
-        replaceWith(negateableNode);
-      }
-    } else if (kind === AstCharacterSetKinds.property) {
-      if (!JsUnicodeProperties.has(value)) {
-        // Assume it's a script
-        node.key = 'sc';
+        replaceWith(setNegate(parseFragment(PosixClassesMap.get(value)), negate));
       }
-    } else if (kind === AstCharacterSetKinds.space) {
-      // Unlike JS, Onig's `\s` matches only ASCII tab, space, LF, VT, FF, and CR
-      const s = parseFragment('[ \t\n\v\f\r]');
-      s.negate = negate;
-      replaceWith(s);
-    } else if (kind === AstCharacterSetKinds.word && !wordIsAscii) {
-      const w = parseFragment(defaultWordChar);
-      w.negate = negate;
-      replaceWith(w);
     }
   },
 
@@ -232,9 +239,11 @@ const FirstPassVisitor = {
 
   Flags({node, parent}) {
     // Remove Onig flags that aren't available in JS
-    delete node.extended; // Flag x
-    delete node.digitIsAscii; // Flag D
-    delete node.wordIsAscii; // Flag W
+    [ 'digitIsAscii', // Flag D
+      'extended', // Flag x
+      'spaceIsAscii', // Flag S
+      'wordIsAscii', // Flag W
+    ].forEach(f => delete node[f]);
     Object.assign(node, {
       // JS flag g; no Onig equiv
       global: false,
@@ -567,6 +576,11 @@ const ThirdPassVisitor = {
   },
 };
 
+// `\t\n\v\f\r\x20`
+const asciiSpaceChar = '[\t-\r ]';
+// Different than `PosixClassesMap`'s `word`
+const defaultWordChar = r`[\p{L}\p{M}\p{N}\p{Pc}]`;
+
 function adoptAndSwapKids(parent, kids) {
   kids.forEach(kid => kid.parent = parent);
   parent[getContainerAccessor(parent)] = kids;
@@ -788,6 +802,11 @@ function prepContainer(node, kids) {
   return node;
 }
 
+function setNegate(node, negate) {
+  node.negate = negate;
+  return node;
+}
+
 function traverseReplacement(replacement, {parent, key, container}, state, visitor) {
   traverse({
     // Don't use the `node` from `path`

diff --git a/src/unicode.js b/src/unicode.js
@@ -5,9 +5,6 @@ const CharsWithoutIgnoreCaseExpansion = new Set([
   cp(0x131), // ı
 ]);
 
-// Different than `PosixClassesMap`'s `word`
-const defaultWordChar = r`[\p{L}\p{M}\p{N}\p{Pc}]`;
-
 function getIgnoreCaseMatchChars(char) {
   // Some chars should not match the chars they case swap to
   if (CharsWithoutIgnoreCaseExpansion.has(char)) {
@@ -241,12 +238,12 @@ const PosixProperties = new Set([
   'print',
   'word',
   'xdigit',
-  // The following are available with the same name in JS (see `JsUnicodeProperties`)
-  // Explicitly include `digit` for the sake of flag D (`digitIsAscii`) handling as POSIX
-  'digit', // (JS: digit)
+  // The following are available with the same name in JS (see `JsUnicodeProperties`), so can be
+  // handled as standard Unicode properties
   // 'alpha', // (JS: Alpha)
   // 'ascii', // (JS: ASCII)
   // 'cntrl', // (JS: cntrl)
+  // 'digit', // (JS: digit)
   // 'lower', // (JS: Lower)
   // 'punct', // (JS: punct)
   // 'space', // (JS: space)
@@ -290,7 +287,6 @@ const UnicodePropertiesWithSpecificCase = new Set([
 ]);
 
 export {
-  defaultWordChar,
   getIgnoreCaseMatchChars,
   JsUnicodeProperties,
   JsUnicodePropertiesMap,