Skip to content

Commit

Permalink
Error for unsupported grapheme boundary
Browse files Browse the repository at this point in the history
  • Loading branch information
slevithan committed Nov 11, 2024
1 parent 9aa7683 commit 32ee0b9
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 20 deletions.
31 changes: 20 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ Notice that nearly every feature below has at least subtle differences from Java
<td>
✔ Allows 1 hex digit<br>
✔ Above <code>7F</code>, is UTF-8 encoded byte (unlike JS)<br>
✔ Error for invalid encoded bytes<br>
✔ Error for invalid encoded byte/sequence<br>
</td>
</tr>
<tr valign="top">
Expand Down Expand Up @@ -602,7 +602,7 @@ Notice that nearly every feature below has at least subtle differences from Java
</tr>

<tr valign="top">
<th align="left" rowspan="6">Assertions</th>
<th align="left" rowspan="7">Assertions</th>
<td>Line start, end</td>
<td><code>^</code>, <code>$</code></td>
<td align="middle">✅</td>
Expand Down Expand Up @@ -639,15 +639,6 @@ Notice that nearly every feature below has at least subtle differences from Java
● Common uses supported<br>
</td>
</tr>
<tr valign="top">
<td>Word boundary</td>
<td><code>\b</code>, <code>\B</code></td>
<td align="middle">✅</td>
<td align="middle">✅</td>
<td>
✔ Unicode based (unlike JS)<br>
</td>
</tr>
<tr valign="top">
<td>Lookaround</td>
<td>
Expand All @@ -663,6 +654,24 @@ Notice that nearly every feature below has at least subtle differences from Java
✔ Allows variable-length quantifiers and alternation within lookbehind<br>
</td>
</tr>
<tr valign="top">
<td>Word boundary</td>
<td><code>\b</code>, <code>\B</code></td>
<td align="middle">✅</td>
<td align="middle">✅</td>
<td>
✔ Unicode based (unlike JS)<br>
</td>
</tr>
<tr valign="top">
<td>Grapheme boundary (extremely rare)</td>
<td><code>\y</code>, <code>\Y</code></td>
<td align="middle">❌</td>
<td align="middle">❌</td>
<td>
● Not yet supported<br>
</td>
</tr>

<tr valign="top">
<th align="left" rowspan="3">Quantifiers</th>
Expand Down
10 changes: 9 additions & 1 deletion spec/match-assertion.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ beforeEach(() => {
});

describe('Assertion', () => {
// [Note] For kinds `lookahead` and `lookbehind`, see `match-lookaround.spec.js`
// [NOTE] For kinds `lookahead` and `lookbehind`, see `match-lookaround.spec.js`

describe('line_end', () => {
it('should match at the end of the string', () => {
Expand Down Expand Up @@ -341,4 +341,12 @@ describe('Assertion', () => {
});
});
});

describe('grapheme_boundary', () => {
// Grapheme boundaries (`\y`, `\Y`) could be supported with a close approximation, but they are
// extremely rare and have few use cases, so the expected behavior for now is to throw
it('should throw as unsupported', () => {
// Both `\y` and `\Y` must throw rather than be accepted
expect(() => toDetails(r`\y`)).toThrow();
expect(() => toDetails(r`\Y`)).toThrow();
});
});
});
3 changes: 2 additions & 1 deletion spec/match-char-class.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ describe('CharacterClass', () => {
expect(cp(0o177)).toExactlyMatch(r`[\177]`);
});

it(r`should throw for UTF-8 encoded byte sequence in octal (above \177)`, () => {
it(r`should throw for octal UTF-8 encoded byte above \177`, () => {
expect(() => toDetails(r`[\200]`)).toThrow();
expect(() => toDetails(r`[\777]`)).toThrow();
});
Expand Down Expand Up @@ -72,4 +72,5 @@ describe('CharacterClass', () => {

// TODO: Add remaining
// TODO: Test that nested negated classes throw for target ES2018
// TODO: Test assertions/var-length escapes are identity escapes
});
2 changes: 1 addition & 1 deletion spec/match-char.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ describe('Character', () => {
expect(cp(0o177)).toExactlyMatch(r`\177`);
});

it(r`should throw for UTF-8 encoded byte sequence in octal (above \177)`, () => {
it(r`should throw for octal UTF-8 encoded byte above \177`, () => {
expect(() => toDetails(r`\200`)).toThrow();
expect(() => toDetails(r`\777`)).toThrow();
});
Expand Down
16 changes: 10 additions & 6 deletions src/tokenize.js
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,10 @@ function getTokenWithDetails(context, pattern, m, lastIndex) {
}),
};
}
// Grapheme boundaries not yet supported; avoid treating as an identity escape
if ('yY'.includes(m1)) {
throw new Error(`Unsupported grapheme boundary "${m}"`);
}
// Run last since it assumes an identity escape as final condition
const result = createTokenForSharedEscape(m, {inCharClass: false});
return Array.isArray(result) ? {tokens: result} : {token: result};
Expand Down Expand Up @@ -523,7 +527,7 @@ function createTokenForSharedEscape(raw, {inCharClass}) {
}
// Meta `\M-x` and `\M-\C-x` are unsupported; avoid treating as an identity escape
if (char1 === 'M') {
// [TODO] Supportable; see <github.com/kkos/oniguruma/blob/master/doc/SYNTAX.md#12-onig_syn_op2_esc_capital_m_bar_meta-enable-m-x>, <github.com/kkos/oniguruma/blob/43a8c3f3daf263091f3a74019d4b32ebb6417093/src/regparse.c#L4695>
// [TODO] Supportable; see <github.com/kkos/oniguruma/blob/master/doc/SYNTAX.md#12-onig_syn_op2_esc_capital_m_bar_meta-enable-m-x>, <github.com/kkos/oniguruma/blob/43a8c3f3daf263091f3a74019d4b32ebb6417093/src/regparse.c#L4695>, <https://github.com/ammar/regexp_parser/blob/8851030feda68223d74f502335fb254a20d77016/lib/regexp_parser/expression/classes/escape_sequence.rb#L75>
throw new Error(`Unsupported meta "${raw}"`);
}
// Identity escape; count code point length
Expand Down Expand Up @@ -663,11 +667,11 @@ function getFlagPropsForToken(flags) {
function getValidatedHexCharCode(raw) {
// Note: Onig (tested 6.9.8) has a bug where bare `\u` and `\x` are identity escapes if they
// appear at the very end of the pattern, so e.g. `\u` matches `u`, but `\u0`, `\u.`, and `[\u]`
// are all errors, and `\x.` and `[\x]` are not errors but fail to match anything. Don't emulate
// these bugs, and just treat these cases as errors. Also, Onig treats incomplete `\x{` (with the
// brace and not immediately followed by a hex digit) as an identity escape, so e.g. `\x{`
// matches `x{` and `^\x{,2}$` matches `xx`, but `\x{2,}` and `\x{0,2}` are errors. Don't emulate
// this crazy and pointless ambiguity, and just treat incomplete `\x{` as an error
// are all errors, and `\x.` and `[\x]` are not errors but seemingly fail to match anything.
// Don't emulate these bugs, and just treat these cases as errors. Also, Onig treats incomplete
// `\x{` (with the brace and not immediately followed by a hex digit) as an identity escape, so
// e.g. `\x{` matches `x{` and `^\x{,2}$` matches `xx`, but `\x{2,}` and `\x{0,2}` are errors.
// Don't emulate this nasty/pointless ambiguity; just treat incomplete `\x{` as an error
if (/^(?:\\u(?!\p{AHex}{4})|\\x(?!\p{AHex}{1,2}|\{\p{AHex}{1,8}\}))/u.test(raw)) {
throw new Error(`Incomplete or invalid escape "${raw}"`);
}
Expand Down

0 comments on commit 32ee0b9

Please sign in to comment.