Skip to content

Commit

Permalink
Error for invalid, incomplete, or unsupported octal code point
Browse files Browse the repository at this point in the history
  • Loading branch information
slevithan committed Nov 11, 2024
1 parent 32ee0b9 commit 1406cdf
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 25 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ Notice that nearly every feature below has at least subtle differences from Java
<td>
✔ Allows 1 hex digit<br>
✔ Above <code>7F</code>, is UTF-8 encoded byte (unlike JS)<br>
✔ Error for invalid encoded byte/sequence<br>
✔ Error for invalid encoded bytes<br>
</td>
</tr>
<tr valign="top">
Expand Down Expand Up @@ -435,7 +435,7 @@ Notice that nearly every feature below has at least subtle differences from Java
● Non-A-Za-z with <code>\cx</code>, <code>\C-x</code><br>
● Meta <code>\M-x</code>, <code>\M-\C-x</code><br>
● Octal code point <code>\o{…}</code><br>
● UTF-8 encoded bytes in octal<br>
Octal UTF-8 encoded bytes<br>
</td>
</tr>

Expand Down
46 changes: 41 additions & 5 deletions spec/match-char.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ describe('Character', () => {
expect('\x1A').toExactlyMatch(r`\C-z`);
});

// Currently unsupported: control chars other than A-Za-z
// Not yet supported: control char identifier other than A-Za-z
it('should throw for unsupported control char', () => {
expect(() => toDetails(r`\c.`)).toThrow();
expect(() => toDetails(r`\C-.`)).toThrow();
Expand Down Expand Up @@ -95,18 +95,24 @@ describe('Character', () => {
});

describe('meta', () => {
// Not yet supported
it('should throw for unsupported meta', () => {
expect(() => toDetails(r`\M-\1`)).toThrow();
});

it('should throw for incomplete meta', () => {
expect(() => toDetails(r`\M`)).toThrow();
expect(() => toDetails(r`\M-`)).toThrow();
// Currently unsupported
expect(() => toDetails(r`\M-\1`)).toThrow();
});

// Not yet supported
it('should throw for unsupported meta control char', () => {
expect(() => toDetails(r`\M-\C-A`)).toThrow();
});

it('should throw for incomplete meta control char', () => {
expect(() => toDetails(r`\M-\C`)).toThrow();
expect(() => toDetails(r`\M-\C-`)).toThrow();
// Currently unsupported
expect(() => toDetails(r`\M-\C-A`)).toThrow();
});
});

Expand Down Expand Up @@ -234,4 +240,34 @@ describe('Character', () => {
expect(() => toDetails(r`\x{110000}`)).toThrow();
});
});
// Specs for the enclosed octal escape `\o{…}`. Every pattern starting with `\o{` is expected to
// throw (the feature is not yet supported), whereas `\o` not followed by `{` matches a literal `o`
describe('enclosed octal', () => {
// Not yet supported: well-formed octal code points still throw
it('should throw for unsupported octal code point', () => {
expect(() => toDetails(r`\o{0}`)).toThrow();
expect(() => toDetails(r`\o{177}`)).toThrow();
expect(() => toDetails(r`\o{7777}`)).toThrow();
});
// Without an opening brace, `\o` is just an escaped literal `o`
it(r`should match \o without { as identity escape`, () => {
expect('o').toExactlyMatch(r`\o`);
});
// Not an error in Onig, but expected to throw here
it(r`should throw for incomplete \o{`, () => {
// Unclosed brace, or braces whose content doesn't start with an octal digit
expect(() => toDetails(r`\o{`)).toThrow();
expect(() => toDetails(r`\o{-}`)).toThrow();
expect(() => toDetails(r`\o{A}`)).toThrow();
expect(() => toDetails(r`\o{ 1}`)).toThrow();
// Quantified identity escape! (`{,1}` has the form of an interval quantifier following `\o`)
expect(() => toDetails(r`\o{,1}`)).toThrow();
});

// Braced content that starts with a digit but isn't a valid octal number
it(r`should throw for invalid \o{N...}`, () => {
expect(() => toDetails(r`\o{1,}`)).toThrow();
// 8 is not an octal digit
expect(() => toDetails(r`\o{8}`)).toThrow();
expect(() => toDetails(r`\o{18}`)).toThrow();
expect(() => toDetails(r`\o{1A}`)).toThrow();
});
});
});
45 changes: 27 additions & 18 deletions src/tokenize.js
Original file line number Diff line number Diff line change
Expand Up @@ -58,23 +58,33 @@ const EscapeCharCodes = new Map([
['v', 11], // vertical tab
]);

const controlCharPattern = 'c.? | C(?:-.?)?';
// Onig considers `\p` an identity escape, but e.g. `\p{`, `\p{ ^L}`, and `\p{gc=L}` are invalid
const unicodePropertyPattern = r`[pP]\{(?:\^?[\x20\w]+\})?`;
const encodedByteValuePattern = r`x[89A-Fa-f]\p{AHex}(?:\\x[89A-Fa-f]\p{AHex})*`;
const hexCharPattern = r`u(?:\p{AHex}{4})? | x\{[^\}]*\}? | x\p{AHex}{0,2}`;
const escapedNumPattern = r`\d{1,3}`;
const charClassOpenPattern = r`\[\^?\]?`;
// Regex source for the escapes (the part after `\`) that are shared by `tokenRe` and
// `charClassTokenRe`. The alternatives are joined with `|`, and their order matters since earlier
// alternatives are attempted first
const sharedEscapesPattern = [
  // Control char
  'c.? | C(?:-.?)?',
  // Unicode property; Onig considers `\p` an identity escape, but e.g. `\p{`, `\p{ ^L}`, and
  // `\p{gc=L}` are invalid
  r`[pP]\{(?:\^?[\x20\w]+\})?`,
  // Hex encoded byte sequence; attempt match before other `\xNN` hex char
  r`x[89A-Fa-f]\p{AHex}(?:\\x[89A-Fa-f]\p{AHex})*`,
  // Hex char
  r`u(?:\p{AHex}{4})? | x\{[^\}]*\}? | x\p{AHex}{0,2}`,
  // Enclosed octal code point
  r`o\{[^\}]*\}?`,
  // Escaped number
  r`\d{1,3}`,
].join('|');
// Matches a single quantifier: `?`, `*`, or `+` optionally followed by `?` (lazy) or `+`
// (possessive), or an interval `{n}`, `{n,}`, `{n,m}`, or `{,m}` optionally followed by `?` (lazy).
// Even with flag x, Onig doesn't allow whitespace to separate a quantifier from the `?` or `+`
// that makes it lazy or possessive. Possessive suffixes don't apply to interval quantifiers
const quantifierRe = /[?*+][?+]?|\{(?:\d+(?:,\d*)?|,\d+)\}\??/;
const tokenRe = new RegExp(r`
\\ (?:
${controlCharPattern}
| ${unicodePropertyPattern}
| ${encodedByteValuePattern}
| ${hexCharPattern}
| ${escapedNumPattern}
${sharedEscapesPattern}
| [gk]<[^>]*>?
| [gk]'[^']*'?
| .
Expand All @@ -93,11 +103,7 @@ const tokenRe = new RegExp(r`
`.replace(/\s+/g, ''), 'gsu');
const charClassTokenRe = new RegExp(r`
\\ (?:
${controlCharPattern}
| ${unicodePropertyPattern}
| ${encodedByteValuePattern}
| ${hexCharPattern}
| ${escapedNumPattern}
${sharedEscapesPattern}
| .
)
| \[:[^:]*:\]
Expand Down Expand Up @@ -478,13 +484,16 @@ function createTokenForSharedEscape(raw, {inCharClass}) {
if ('dDhHsSwW'.includes(char1)) {
return createTokenForShorthandCharClass(raw);
}
if (raw.startsWith(r`\o{`)) {
throw new Error(`Incomplete, invalid, or unsupported octal code point "${raw}"`);
}
if (/^\\[pP]\{/.test(raw)) {
if (raw.length === 3) {
throw new Error('Incomplete or invalid Unicode property');
}
return createTokenForUnicodeProperty(raw);
}
// UTF-8 encoded byte sequence
// Hex UTF-8 encoded byte sequence
if (/^\\x[89A-Fa-f]\p{AHex}/u.test(raw)) {
try {
const bytes = raw.split(/\\x/).slice(1).map(hex => parseInt(hex, 16));
Expand Down Expand Up @@ -716,7 +725,7 @@ function splitEscapedNumToken(token, numCaptures) {
if (i === 0 && m !== '8' && m !== '9') {
value = parseInt(m, 8);
if (value > 0o177) {
// UTF-8 encoded byte sequence in octal; unsupported
// Octal UTF-8 encoded byte sequence; not yet supported
throw new Error(r`Octal encoded byte above 177 unsupported "${raw}"`);
}
} else {
Expand Down

0 comments on commit 1406cdf

Please sign in to comment.