Error for invalid, incomplete, or unsupported octal code point

Commit 1406cdf · slevithan · Nov 11, 2024
1 parent: 32ee0b9
Show file tree

Hide file tree

Showing 3 changed files with 70 additions and 25 deletions.
diff --git a/README.md b/README.md
@@ -384,7 +384,7 @@ Notice that nearly every feature below has at least subtle differences from Java
     <td>
       ✔ Allows 1 hex digit<br>
       ✔ Above <code>7F</code>, is UTF-8 encoded byte (unlike JS)<br>
-      ✔ Error for invalid encoded byte/sequence<br>
+      ✔ Error for invalid encoded bytes<br>
     </td>
   </tr>
   <tr valign="top">
@@ -435,7 +435,7 @@ Notice that nearly every feature below has at least subtle differences from Java
       ● Non-A-Za-z with <code>\cx</code>, <code>\C-x</code><br>
       ● Meta <code>\M-x</code>, <code>\M-\C-x</code><br>
       ● Octal code point <code>\o{…}</code><br>
-      ● UTF-8 encoded bytes in octal<br>
+      ● Octal UTF-8 encoded bytes<br>
     </td>
   </tr>
 

diff --git a/spec/match-char.spec.js b/spec/match-char.spec.js
@@ -30,7 +30,7 @@ describe('Character', () => {
       expect('\x1A').toExactlyMatch(r`\C-z`);
     });
 
-    // Currently unsupported: control chars other than A-Za-z
+    // Not yet supported: control char identifier other than A-Za-z
     it('should throw for unsupported control char', () => {
       expect(() => toDetails(r`\c.`)).toThrow();
       expect(() => toDetails(r`\C-.`)).toThrow();
@@ -95,18 +95,24 @@ describe('Character', () => {
   });
 
   describe('meta', () => {
+    // Not yet supported
     it('should throw for unsupported meta', () => {
+      expect(() => toDetails(r`\M-\1`)).toThrow();
+    });
+
+    it('should throw for incomplete meta', () => {
       expect(() => toDetails(r`\M`)).toThrow();
       expect(() => toDetails(r`\M-`)).toThrow();
-      // Currently unsupported
-      expect(() => toDetails(r`\M-\1`)).toThrow();
     });
 
+    // Not yet supported
     it('should throw for unsupported meta control char', () => {
+      expect(() => toDetails(r`\M-\C-A`)).toThrow();
+    });
+
+    it('should throw for incomplete meta control char', () => {
       expect(() => toDetails(r`\M-\C`)).toThrow();
       expect(() => toDetails(r`\M-\C-`)).toThrow();
-      // Currently unsupported
-      expect(() => toDetails(r`\M-\C-A`)).toThrow();
     });
   });
 
@@ -234,4 +240,34 @@ describe('Character', () => {
       expect(() => toDetails(r`\x{110000}`)).toThrow();
     });
   });
+
+  describe('enclosed octal', () => {
+    // Not yet supported
+    it('should throw for unsupported octal code point', () => {
+      expect(() => toDetails(r`\o{0}`)).toThrow();
+      expect(() => toDetails(r`\o{177}`)).toThrow();
+      expect(() => toDetails(r`\o{7777}`)).toThrow();
+    });
+
+    it(r`should match \o without { as identity escape`, () => {
+      expect('o').toExactlyMatch(r`\o`);
+    });
+
+    // Not an error in Onig
+    it(r`should throw for incomplete \o{`, () => {
+      expect(() => toDetails(r`\o{`)).toThrow();
+      expect(() => toDetails(r`\o{-}`)).toThrow();
+      expect(() => toDetails(r`\o{A}`)).toThrow();
+      expect(() => toDetails(r`\o{ 1}`)).toThrow();
+      // Quantified identity escape!
+      expect(() => toDetails(r`\o{,1}`)).toThrow();
+    });
+
+    it(r`should throw for invalid \o{N...}`, () => {
+      expect(() => toDetails(r`\o{1,}`)).toThrow();
+      expect(() => toDetails(r`\o{8}`)).toThrow();
+      expect(() => toDetails(r`\o{18}`)).toThrow();
+      expect(() => toDetails(r`\o{1A}`)).toThrow();
+    });
+  });
 });
diff --git a/src/tokenize.js b/src/tokenize.js
@@ -58,23 +58,33 @@ const EscapeCharCodes = new Map([
   ['v', 11], // vertical tab
 ]);
 
-const controlCharPattern = 'c.? | C(?:-.?)?';
-// Onig considers `\p` an identity escape, but e.g. `\p{`, `\p{ ^L}`, and `\p{gc=L}` are invalid
-const unicodePropertyPattern = r`[pP]\{(?:\^?[\x20\w]+\})?`;
-const encodedByteValuePattern = r`x[89A-Fa-f]\p{AHex}(?:\\x[89A-Fa-f]\p{AHex})*`;
-const hexCharPattern = r`u(?:\p{AHex}{4})? | x\{[^\}]*\}? | x\p{AHex}{0,2}`;
-const escapedNumPattern = r`\d{1,3}`;
 const charClassOpenPattern = r`\[\^?\]?`;
+const sharedEscapesPattern = `${
+  // Control char
+  'c.? | C(?:-.?)?'
+}|${
+  // Unicode property; Onig considers `\p` an identity escape, but e.g. `\p{`, `\p{ ^L}`, and
+  // `\p{gc=L}` are invalid
+  r`[pP]\{(?:\^?[\x20\w]+\})?`
+}|${
+  // Hex encoded byte sequence; attempt match before other `\xNN` hex char
+  r`x[89A-Fa-f]\p{AHex}(?:\\x[89A-Fa-f]\p{AHex})*`
+}|${
+  // Hex char
+  r`u(?:\p{AHex}{4})? | x\{[^\}]*\}? | x\p{AHex}{0,2}`
+}|${
+  // Enclosed octal code point
+  r`o\{[^\}]*\}?`
+}|${
+  // Escaped number
+  r`\d{1,3}`
+}`;
 // Even with flag x, Onig doesn't allow whitespace to separate a quantifier from the `?` or `+`
 // that makes it lazy or possessive. Possessive suffixes don't apply to interval quantifiers
 const quantifierRe = /[?*+][?+]?|\{(?:\d+(?:,\d*)?|,\d+)\}\??/;
 const tokenRe = new RegExp(r`
   \\ (?:
-    ${controlCharPattern}
-    | ${unicodePropertyPattern}
-    | ${encodedByteValuePattern}
-    | ${hexCharPattern}
-    | ${escapedNumPattern}
+    ${sharedEscapesPattern}
     | [gk]<[^>]*>?
     | [gk]'[^']*'?
     | .
@@ -93,11 +103,7 @@ const tokenRe = new RegExp(r`
 `.replace(/\s+/g, ''), 'gsu');
 const charClassTokenRe = new RegExp(r`
   \\ (?:
-    ${controlCharPattern}
-    | ${unicodePropertyPattern}
-    | ${encodedByteValuePattern}
-    | ${hexCharPattern}
-    | ${escapedNumPattern}
+    ${sharedEscapesPattern}
     | .
   )
   | \[:[^:]*:\]
@@ -478,13 +484,16 @@ function createTokenForSharedEscape(raw, {inCharClass}) {
   if ('dDhHsSwW'.includes(char1)) {
     return createTokenForShorthandCharClass(raw);
   }
+  if (raw.startsWith(r`\o{`)) {
+    throw new Error(`Incomplete, invalid, or unsupported octal code point "${raw}"`);
+  }
   if (/^\\[pP]\{/.test(raw)) {
     if (raw.length === 3) {
       throw new Error('Incomplete or invalid Unicode property');
     }
     return createTokenForUnicodeProperty(raw);
   }
-  // UTF-8 encoded byte sequence
+  // Hex UTF-8 encoded byte sequence
   if (/^\\x[89A-Fa-f]\p{AHex}/u.test(raw)) {
     try {
       const bytes = raw.split(/\\x/).slice(1).map(hex => parseInt(hex, 16));
@@ -716,7 +725,7 @@ function splitEscapedNumToken(token, numCaptures) {
     if (i === 0 && m !== '8' && m !== '9') {
       value = parseInt(m, 8);
       if (value > 0o177) {
-        // UTF-8 encoded byte sequence in octal; unsupported
+        // Octal UTF-8 encoded byte sequence; not yet supported
         throw new Error(r`Octal encoded byte above 177 unsupported "${raw}"`);
       }
     } else {