diff --git a/README.md b/README.md
index 97393f3..46c35a5 100644
--- a/README.md
+++ b/README.md
@@ -384,7 +384,7 @@ Notice that nearly every feature below has at least subtle differences from Java
✔ Allows 1 hex digit
✔ Above 7F , is UTF-8 encoded byte (unlike JS)
- ✔ Error for invalid encoded bytes
+ ✔ Error for invalid encoded byte/sequence
|
@@ -602,7 +602,7 @@ Notice that nearly every feature below has at least subtle differences from Java
- Assertions |
+ Assertions |
Line start, end |
^ , $ |
✅ |
@@ -639,15 +639,6 @@ Notice that nearly every feature below has at least subtle differences from Java
● Common uses supported
-
- Word boundary |
- \b , \B |
- ✅ |
- ✅ |
-
- ✔ Unicode based (unlike JS)
- |
-
Lookaround |
@@ -663,6 +654,24 @@ Notice that nearly every feature below has at least subtle differences from Java
✔ Allows variable-length quantifiers and alternation within lookbehind
|
+
+ Word boundary |
+ \b , \B |
+ ✅ |
+ ✅ |
+
+ ✔ Unicode based (unlike JS)
+ |
+
+
+ Grapheme boundary (extremely rare) |
+ \y , \Y |
+ ❌ |
+ ❌ |
+
+ ● Not yet supported
+ |
+
Quantifiers |
diff --git a/spec/match-assertion.spec.js b/spec/match-assertion.spec.js
index 0fbf55b..3399842 100644
--- a/spec/match-assertion.spec.js
+++ b/spec/match-assertion.spec.js
@@ -8,7 +8,7 @@ beforeEach(() => {
});
describe('Assertion', () => {
- // [Note] For kinds `lookahead` and `lookbehind`, see `match-lookaround.spec.js`
+ // [NOTE] For kinds `lookahead` and `lookbehind`, see `match-lookaround.spec.js`
describe('line_end', () => {
it('should match at the end of the string', () => {
@@ -341,4 +341,12 @@ describe('Assertion', () => {
});
});
});
+
+ describe('grapheme_boundary', () => {
+ // Supportable with close approximation, but extremely rare and not many use cases
+ it('should throw as unsupported', () => {
+ expect(() => toDetails(r`\y`)).toThrow();
+ expect(() => toDetails(r`\Y`)).toThrow();
+ });
+ });
});
diff --git a/spec/match-char-class.spec.js b/spec/match-char-class.spec.js
index 4e606ae..1df0a90 100644
--- a/spec/match-char-class.spec.js
+++ b/spec/match-char-class.spec.js
@@ -41,7 +41,7 @@ describe('CharacterClass', () => {
expect(cp(0o177)).toExactlyMatch(r`[\177]`);
});
- it(r`should throw for UTF-8 encoded byte sequence in octal (above \177)`, () => {
+ it(r`should throw for octal UTF-8 encoded byte above \177`, () => {
expect(() => toDetails(r`[\200]`)).toThrow();
expect(() => toDetails(r`[\777]`)).toThrow();
});
@@ -72,4 +72,5 @@ describe('CharacterClass', () => {
// TODO: Add remaining
// TODO: Test that nested negated classes throw for target ES2018
+ // TODO: Test assertions/var-length escapes are identity escapes
});
diff --git a/spec/match-char.spec.js b/spec/match-char.spec.js
index 386d188..071f560 100644
--- a/spec/match-char.spec.js
+++ b/spec/match-char.spec.js
@@ -136,7 +136,7 @@ describe('Character', () => {
expect(cp(0o177)).toExactlyMatch(r`\177`);
});
- it(r`should throw for UTF-8 encoded byte sequence in octal (above \177)`, () => {
+ it(r`should throw for octal UTF-8 encoded byte above \177`, () => {
expect(() => toDetails(r`\200`)).toThrow();
expect(() => toDetails(r`\777`)).toThrow();
});
diff --git a/src/tokenize.js b/src/tokenize.js
index 8cd019f..3f23573 100644
--- a/src/tokenize.js
+++ b/src/tokenize.js
@@ -260,6 +260,10 @@ function getTokenWithDetails(context, pattern, m, lastIndex) {
}),
};
}
+ // Grapheme boundaries not yet supported; avoid treating as an identity escape
+ if ('yY'.includes(m1)) {
+ throw new Error(`Unsupported grapheme boundary "${m}"`);
+ }
// Run last since it assumes an identity escape as final condition
const result = createTokenForSharedEscape(m, {inCharClass: false});
return Array.isArray(result) ? {tokens: result} : {token: result};
@@ -523,7 +527,7 @@ function createTokenForSharedEscape(raw, {inCharClass}) {
}
// Meta `\M-x` and `\M-\C-x` are unsupported; avoid treating as an identity escape
if (char1 === 'M') {
- // [TODO] Supportable; see ,
+ // [TODO] Supportable; see , ,
throw new Error(`Unsupported meta "${raw}"`);
}
// Identity escape; count code point length
@@ -663,11 +667,11 @@ function getFlagPropsForToken(flags) {
function getValidatedHexCharCode(raw) {
// Note: Onig (tested 6.9.8) has a bug where bare `\u` and `\x` are identity escapes if they
// appear at the very end of the pattern, so e.g. `\u` matches `u`, but `\u0`, `\u.`, and `[\u]`
- // are all errors, and `\x.` and `[\x]` are not errors but fail to match anything. Don't emulate
- // these bugs, and just treat these cases as errors. Also, Onig treats incomplete `\x{` (with the
- // brace and not immediately followed by a hex digit) as an identity escape, so e.g. `\x{`
- // matches `x{` and `^\x{,2}$` matches `xx`, but `\x{2,}` and `\x{0,2}` are errors. Don't emulate
- // this crazy and pointless ambiguity, and just treat incomplete `\x{` as an error
+ // are all errors, and `\x.` and `[\x]` are not errors but seemingly fail to match anything.
+ // Don't emulate these bugs, and just treat these cases as errors. Also, Onig treats incomplete
+ // `\x{` (with the brace and not immediately followed by a hex digit) as an identity escape, so
+ // e.g. `\x{` matches `x{` and `^\x{,2}$` matches `xx`, but `\x{2,}` and `\x{0,2}` are errors.
+ // Don't emulate this nasty/pointless ambiguity; just treat incomplete `\x{` as an error
if (/^(?:\\u(?!\p{AHex}{4})|\\x(?!\p{AHex}{1,2}|\{\p{AHex}{1,8}\}))/u.test(raw)) {
throw new Error(`Incomplete or invalid escape "${raw}"`);
}