From 32ee0b9a4eac255726cb37ffb8ec28f9a188ca43 Mon Sep 17 00:00:00 2001 From: Steven Levithan Date: Mon, 11 Nov 2024 10:08:47 +0100 Subject: [PATCH] Error for unsupported grapheme boundary --- README.md | 31 ++++++++++++++++++++----------- spec/match-assertion.spec.js | 10 +++++++++- spec/match-char-class.spec.js | 3 ++- spec/match-char.spec.js | 2 +- src/tokenize.js | 16 ++++++++++------ 5 files changed, 42 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 97393f3..46c35a5 100644 --- a/README.md +++ b/README.md @@ -384,7 +384,7 @@ Notice that nearly every feature below has at least subtle differences from Java ✔ Allows 1 hex digit
✔ Above 7F, is UTF-8 encoded byte (unlike JS)
- ✔ Error for invalid encoded bytes
+ ✔ Error for invalid encoded byte/sequence
@@ -602,7 +602,7 @@ Notice that nearly every feature below has at least subtle differences from Java - Assertions + Assertions Line start, end ^, $ ✅ @@ -639,15 +639,6 @@ Notice that nearly every feature below has at least subtle differences from Java ● Common uses supported
- - Word boundary - \b, \B - ✅ - ✅ - - ✔ Unicode based (unlike JS)
- - Lookaround @@ -663,6 +654,24 @@ Notice that nearly every feature below has at least subtle differences from Java ✔ Allows variable-length quantifiers and alternation within lookbehind
+ + Word boundary + \b, \B + ✅ + ✅ + + ✔ Unicode based (unlike JS)
+ + + + Grapheme boundary (extremely rare) + \y, \Y + ❌ + ❌ + + ● Not yet supported
+ + Quantifiers diff --git a/spec/match-assertion.spec.js b/spec/match-assertion.spec.js index 0fbf55b..3399842 100644 --- a/spec/match-assertion.spec.js +++ b/spec/match-assertion.spec.js @@ -8,7 +8,7 @@ beforeEach(() => { }); describe('Assertion', () => { - // [Note] For kinds `lookahead` and `lookbehind`, see `match-lookaround.spec.js` + // [NOTE] For kinds `lookahead` and `lookbehind`, see `match-lookaround.spec.js` describe('line_end', () => { it('should match at the end of the string', () => { @@ -341,4 +341,12 @@ describe('Assertion', () => { }); }); }); + + describe('grapheme_boundary', () => { + // Supportable with close approximation, but extremely rare and not many use cases + it('should throw as unsupported', () => { + expect(() => toDetails(r`\y`)).toThrow(); + expect(() => toDetails(r`\Y`)).toThrow(); + }); + }); }); diff --git a/spec/match-char-class.spec.js b/spec/match-char-class.spec.js index 4e606ae..1df0a90 100644 --- a/spec/match-char-class.spec.js +++ b/spec/match-char-class.spec.js @@ -41,7 +41,7 @@ describe('CharacterClass', () => { expect(cp(0o177)).toExactlyMatch(r`[\177]`); }); - it(r`should throw for UTF-8 encoded byte sequence in octal (above \177)`, () => { + it(r`should throw for octal UTF-8 encoded byte above \177`, () => { expect(() => toDetails(r`[\200]`)).toThrow(); expect(() => toDetails(r`[\777]`)).toThrow(); }); @@ -72,4 +72,5 @@ describe('CharacterClass', () => { // TODO: Add remaining // TODO: Test that nested negated classes throw for target ES2018 + // TODO: Test assertions/var-length escapes are identity escapes }); diff --git a/spec/match-char.spec.js b/spec/match-char.spec.js index 386d188..071f560 100644 --- a/spec/match-char.spec.js +++ b/spec/match-char.spec.js @@ -136,7 +136,7 @@ describe('Character', () => { expect(cp(0o177)).toExactlyMatch(r`\177`); }); - it(r`should throw for UTF-8 encoded byte sequence in octal (above \177)`, () => { + it(r`should throw for octal UTF-8 encoded byte above \177`, () => { expect(() 
=> toDetails(r`\200`)).toThrow(); expect(() => toDetails(r`\777`)).toThrow(); }); diff --git a/src/tokenize.js b/src/tokenize.js index 8cd019f..3f23573 100644 --- a/src/tokenize.js +++ b/src/tokenize.js @@ -260,6 +260,10 @@ function getTokenWithDetails(context, pattern, m, lastIndex) { }), }; } + // Grapheme boundaries not yet supported; avoid treating as an identity escape + if ('yY'.includes(m1)) { + throw new Error(`Unsupported grapheme boundary "${m}"`); + } // Run last since it assumes an identity escape as final condition const result = createTokenForSharedEscape(m, {inCharClass: false}); return Array.isArray(result) ? {tokens: result} : {token: result}; @@ -523,7 +527,7 @@ function createTokenForSharedEscape(raw, {inCharClass}) { } // Meta `\M-x` and `\M-\C-x` are unsupported; avoid treating as an identity escape if (char1 === 'M') { - // [TODO] Supportable; see , + // [TODO] Supportable; see , , throw new Error(`Unsupported meta "${raw}"`); } // Identity escape; count code point length @@ -663,11 +667,11 @@ function getFlagPropsForToken(flags) { function getValidatedHexCharCode(raw) { // Note: Onig (tested 6.9.8) has a bug where bare `\u` and `\x` are identity escapes if they // appear at the very end of the pattern, so e.g. `\u` matches `u`, but `\u0`, `\u.`, and `[\u]` - // are all errors, and `\x.` and `[\x]` are not errors but fail to match anything. Don't emulate - // these bugs, and just treat these cases as errors. Also, Onig treats incomplete `\x{` (with the - // brace and not immediately followed by a hex digit) as an identity escape, so e.g. `\x{` - // matches `x{` and `^\x{,2}$` matches `xx`, but `\x{2,}` and `\x{0,2}` are errors. Don't emulate - // this crazy and pointless ambiguity, and just treat incomplete `\x{` as an error + // are all errors, and `\x.` and `[\x]` are not errors but seemingly fail to match anything. + // Don't emulate these bugs, and just treat these cases as errors. 
Also, Onig treats incomplete + // `\x{` (with the brace and not immediately followed by a hex digit) as an identity escape, so + // e.g. `\x{` matches `x{` and `^\x{,2}$` matches `xx`, but `\x{2,}` and `\x{0,2}` are errors. + // Don't emulate this nasty/pointless ambiguity; just treat incomplete `\x{` as an error if (/^(?:\\u(?!\p{AHex}{4})|\\x(?!\p{AHex}{1,2}|\{\p{AHex}{1,8}\}))/u.test(raw)) { throw new Error(`Incomplete or invalid escape "${raw}"`); }