From e4a2bc62177feb0954db6d266fb9fdfdad6dbf0d Mon Sep 17 00:00:00 2001 From: Rob23oba Date: Mon, 26 Aug 2024 21:19:11 +0200 Subject: [PATCH 1/4] rhemyn: reworked and extended parser --- rhemyn/parse.js | 655 +++++++++++++++++++++++++++++++----------------- 1 file changed, 421 insertions(+), 234 deletions(-) diff --git a/rhemyn/parse.js b/rhemyn/parse.js index 7d8464a2..dd8504c7 100644 --- a/rhemyn/parse.js +++ b/rhemyn/parse.js @@ -1,14 +1,4 @@ -const State = { - none: 0, - insideSet: 1 -}; - -const Quantifiers = { - '*': [ 0 ], // 0 - - '+': [ 1 ], // 1 - - '?': [ 0, 1 ], // 0 - 1 -}; -const QuantifierKeys = Object.keys(Quantifiers); +"use strict"; const getArg = (name, def) => { const arg = (typeof process !== 'undefined' ? process.argv : Deno.args).find(x => x.startsWith(`--${name}=`)); @@ -56,268 +46,465 @@ const EscapeSequences = { '0': '\0' }; -const HexDigit = /[0-9a-fA-F]/; - -export default str => { - const Metachars = _Metachars(); - - const out = { - type: 'Expression', - body: [] - }; - let node = out, parents = []; - - let state = State.none, setIndex = 0, escape = false; - for (let i = 0; i < str.length; i++) { - const c = str[i]; - - const charNode = char => ({ - type: 'Character', - char - }); - - const rangeNode = (from, to) => ({ - type: 'Range', - from, - to - }); - - const addChar = (char = c) => { - node.body.push(charNode(char)); - }; +const charNode = char => ({ + type: 'Character', + char +}); - const addSet = (matches, negated = false) => { - let body = matches.map(x => x[1] ? 
rangeNode(x[0], x[1]) : charNode(x)); - if (state === State.insideSet) { - // if negated, mark each node as negated for merge - if (negated) body = body.map(x => { - x.negated = true; - return x; - }); +const rangeNode = (from, to) => ({ + type: 'Range', + from, + to +}); - // already in set, merge bodies - node.body.push(...body); - return; +const parseEscape = (str, index, inSet, unicodeMode, unicodeSetsMode) => { + const c = str[index++]; + if (!c) { + throw new SyntaxError('Unterminated escape sequence at end of pattern'); + } + if (EscapeSequences[c]) { + return [ EscapeSequences[c], index ]; + } + if (c === 'c') { + // \c (not [A-Za-z] ...) = literal \c... (WHY) + const next = str[index]; + if (next == null || /[^a-zA-Z]/.test(next)) { + if (unicodeMode) { + throw new SyntaxError('Invalid control character escape, expected /\\\\c[a-zA-Z]/'); } + if (!inSet || /[^0-9_]/.test(next)) { + return [ '\\', index - 1 ]; + } + // legacy: \c3 = \x13 and \c_ = \x1F + } + // \c[A-Za-z] + const code = next.charCodeAt(0); + return [ String.fromCharCode(code % 32), index + 1 ]; + } + if (c === 'x' || c === 'u') { + // \x = x + // \xH = xH + // \x[0-9a-zA-Z][0-9a-zA-Z] = \xAB + // '\u' = u + // '\uHHH' = uHHH + // '\uABCD' = \uABCD + if (unicodeMode && str[index] === '{') { + index++; + const endIndex = str.indexOf('}', index); + if (endIndex < 0) { + throw new SyntaxError('Unterminated unicode character escape'); + } + const hexStr = str.substring(index, endIndex); + if (/[^0-9a-fA-F]/.test(hexStr)) { + throw new SyntaxError('Invalid unicode character escape, expected /\\\\u\\{[0-9a-fA-F]*\\}/'); + } + const code = parseInt(hexStr, 16); + if (code >= 0x110000) { + throw new SyntaxError('Invalid unicode character escape, code point may not be above U+10FFFF'); + } + return [ String.fromCodePoint(code), endIndex + 1 ]; + } + const count = c === 'x' ? 
2 : 4; + const next = str.substr(index, count); - node.body.push({ - type: 'Set', - body, - negated - }); - }; - - const addMetachar = meta => { - const [ matches, negated = false ] = meta; - return addSet(matches, negated); - }; - - // get next char and consume it - const seek = (allowEscaped = true) => { - const cNext = str[++i]; - - if (cNext === '\\') return !allowEscaped ? undefined : [ str[++i], true ]; - return !allowEscaped ? cNext : [ cNext, false ]; - }; + // missing a char or invalid hex digit + if (next.length < count || /[^0-9a-fA-F]/.test(next)) { + if (unicodeMode) { + throw new SyntaxError(`Invalid hex character escape, expected /\\\\${c}[0-9a-fA-F]{${count}}/`); + } + return [ c, index ]; + } + const code = parseInt(next, 16); + return [ String.fromCharCode(code), index + count ]; + } + if (inSet && c === 'b') { + return [ '\b', index ]; + } + if (unicodeMode) { + let allowedSymbols = '^$\\.*+?()[]{}|/'; + if (inSet) { + if (unicodeSetsMode) { + allowedSymbols += '&-!#%,:;<=>@`~'; + } else { + allowedSymbols += '-'; + } + } + if (!allowedSymbols.includes(c)) { + throw new SyntaxError(`Invalid identity escape '${c}', expected one of: ${allowedSymbols}`); + } + } + return [ c, index ]; +}; - // get next char without consuming - const peek = (allowEscaped = true, offset = 0) => { - const cNext = str[i + 1 + offset]; +const classEscapeNode = (classType, negated) => ({ + type: 'CharacterClassEscape', + classType, + negated +}); - if (cNext === '\\') return !allowEscaped ? undefined : [ str[i + 2 + offset], true ]; - return !allowEscaped ? 
cNext : [ cNext, false ]; - }; +const unicodeClassEscapeNode = (property, value, negated) => ({ + type: 'CharacterClassEscape', + classType: 'UnicodeProperty', + property, + value, + negated +}); - if (escape) { - escape = false; - if (EscapeSequences[c]) { - addChar(EscapeSequences[c]); - continue; +const parseClassEscape = (str, index, unicodeMode) => { + switch (str[index]) { + case 'd': return [ classEscapeNode('Digit', false), index + 1 ]; + case 'D': return [ classEscapeNode('Digit', true), index + 1 ]; + case 's': return [ classEscapeNode('Whitespace', false), index + 1 ]; + case 'S': return [ classEscapeNode('Whitespace', true), index + 1 ]; + case 'w': return [ classEscapeNode('WordCharacter', false), index + 1 ]; + case 'W': return [ classEscapeNode('WordCharacter', true), index + 1 ]; + case 'p': + case 'P': + if (!unicodeMode) { + return [ null, index ]; } - - if (Metachars.escaped[c]) { - addMetachar(Metachars.escaped[c]); - continue; + const negated = str[index] === 'P'; + index++; + if (str[index] !== '{') { + throw new SyntaxError('Invalid escape sequence \\p, expected unicode property \\p{...}'); } + index++; + const endIndex = str.indexOf('}', index); + if (endIndex < 0) { + throw new SyntaxError('Unterminated unicode property escape sequence'); + } + let property = str.substring(index, endIndex); + let value = null; + const eq = property.indexOf('='); + if (eq >= 0) { + value = property.substring(eq + 1); + property = property.substring(0, eq); + } + // todo: validate unicode property + return [ unicodeClassEscapeNode(property, value, negated), index ]; + default: + return [ null, index ]; + } +}; - if (c === 'c') { - // \c (not [A-Za-z] ...) = literal \c... 
(WHY) - const next = peek(false); - if (next == null || /[^a-zA-Z]/.test(next)) { - addChar('\\'); - addChar('c'); - continue; - } +const parseSet = (str, index, unicodeMode, unicodeSetsMode) => { + let negated = false; + if (str[index] === '^') { + negated = true; + index++; + } + if (!unicodeSetsMode) { + // Simple character classes + + } +}; - // \c[A-Za-z] - const code = seek(false).charCodeAt(0); - addChar(String.fromCharCode(code % 32)); - continue; +const parseParenthesizedType = (str, index) => { + if (str[index] !== '?') { + return [{ + type: 'Group', + body: [], + capture: true + }, index ]; + } + // special + index++; + let c = str[index++]; + switch (c) { + case ':': + // non-capturing + return [ { + type: 'Group', + body: [] + }, index ]; + case '=': + // positive look-ahead + return [ { + type: 'LookAhead', + body: [], + negated: false + }, index ]; + case '!': + // negative look-ahead + return [ { + type: 'LookAhead', + body: [], + negated: true + }, index ]; + case '<': + // look-behind / group name + c = str[index]; + if (c === '=' || c === '!') { + return [ { + type: 'LookBehind', + body: [], + negated: c === '!' + }, index + 1 ]; } + const endIndex = str.indexOf('>'); + if (endIndex < 0) { + throw new SyntaxError('Expected group name after (?<, for look-behinds use (?<= or (? 
{ + let c = str[index++]; + switch (c) { + case '*': + return [ [ 0 ], index ]; // 0 or above + case '+': + return [ [ 1 ], index ]; // 1 or above + case '?': + return [ [ 0, 1 ], index ]; // 0 - 1 + case '{': + if (!(str[index] >= '0' && str[index] <= '9')) { + return [ new SyntaxError('Invalid quantifier, expected number'), -1 ]; + } + let min = 0; + while (str[index] >= '0' && str[index] <= '9') { + min *= 10; + min += str.charCodeAt(index++) - 48; + } + if (str[index] === '}') { + return [ [ min, min ], index + 1 ]; + } + if (str[index] !== ',') { + return [ new SyntaxError("Invalid quantifier, expected ',' or '}' after minimum count"), -1 ]; + } + index++; + if (!(str[index] >= '0' && str[index] <= '9')) { + if (str[index] !== '}') { + return [ new SyntaxError("Unclosed quantifier, expected '}'"), -1 ]; } - - const code = parseInt(seek(false) + seek(false), 16); - addChar(String.fromCodePoint(code)); - continue; + return [ [ min ], index + 1 ]; } + let max = 0; + do { + max *= 10; + max += str.charCodeAt(index++) - 48; + } while (str[index] >= '0' && str[index] <= '9'); + if (str[index] !== '}') { + return [ new SyntaxError("Unclosed quantifier, expected '}'"), -1 ]; + } + return [ [ min, max ], index + 1 ]; + default: + return [ null, index - 1 ]; + } +}; - if (c === 'u') { - // '\u' = u - // '\uHHH' = uHHH - // '\uABCD' = \uABCD - const next1 = peek(false); - const next2 = peek(false, 1); - const next3 = peek(false, 2); - const next4 = peek(false, 3); +export default (str, unicodeMode = false, unicodeSetsMode = false) => { + const Metachars = _Metachars(); - // missing a char or invalid hex digit - if (next1 == null || next2 == null || next3 == null || next4 == null || !HexDigit.test(next1) || !HexDigit.test(next2) || !HexDigit.test(next3) || !HexDigit.test(next4)) { - addChar('u'); - continue; + const out = { + type: 'Expression', + body: [] + }; + let node = out, target = node.body, parents = []; + + let i = 0; + const applyQuantifier = to => { + while 
(true) { + const [ quantifier, newIndex ] = parseQuantifier(str, i); + if (newIndex === -1) { + if (unicodeMode) { + throw quantifier; // error } - - const code = parseInt(seek(false) + seek(false) + seek(false) + seek(false), 16); - addChar(String.fromCodePoint(code)); + // assume '{' + to = charNode('{'); + target.push(to); + i++; + // perf: try repeating '{' using loop continue; } - - addChar(); - continue; + if (quantifier == null) { + return; + } + i = newIndex; + to.quantifier = quantifier; + if (str[i] === '?') { + to.lazy = true; + i++; + } + break; } - - if (c === '\\') { - escape = true; - continue; + }; + const addChar = (char, quantifiable = true) => { + const n = charNode(char); + if (quantifiable) { + applyQuantifier(n); } + target.push(n); + }; - switch (state) { - case State.none: - if (c === '[') { - parents.push(node); - node = { - type: 'Set', - body: [], - negated: false - }; - - parents.at(-1).body.push(node); + while (i < str.length) { + const c = str[i++]; - state = State.insideSet; - setIndex = 0; - continue; + switch (c) { + case '^': + target.push({ + type: 'Begin' + }); + break; + case '$': + target.push({ + type: 'End' + }); + break; + case '\\': { + if (str[i] >= '1' && str[i] <= '9') { + let number = str.charCodeAt(i++) - 48; + while (str[i] >= '0' && str[i] <= '9') { + number *= 10; + number += str.charCodeAt(i++) - 48; + } + const n = { + type: 'Backreference', + number + }; + target.push(n); + applyQuantifier(n); + break; } - - if (c === '(') { - parents.push(node); - node = { - type: 'Group', - body: [] + if (str[i] === 'k') { + i++; + if (str[i] !== '<') { + // todo: fail if there aren't any named groups + if (unicodeMode) { + throw new SyntaxError('Invalid named backreference, expected \\k<...>'); + } + addChar('k'); + break; + } + i++; + const endIndex = str.indexOf('>', i); + if (endIndex < 0) { + // todo: fail if there aren't any named groups + if (unicodeMode) { + throw new SyntaxError('Unclosed named backreference'); + } + 
addChar('k', false); + addChar('<'); + break; + } + const name = str.substring(i, endIndex); + // todo: validate name + const n = { + type: 'NamedBackreference', + name }; - - parents.at(-1).body.push(node); - continue; + target.push(n); + i = endIndex + 1; + applyQuantifier(n); + break; } - - if (c === ')') { - if (node.type !== 'Group') throw new SyntaxError('Unmatched closing parenthesis'); - - node = parents.pop(); - continue; + if (str[i] === 'b') { + target.push({ + type: 'WordBoundary', + negated: false + }); + i++; + break; } - - if (QuantifierKeys.includes(c)) { - const last = node.body.at(-1); - if (!last) continue; // ignore, maybe lookahead - - last.quantifier = Quantifiers[c]; - - // lazy modifier - if (peek(false) === '?') last.lazy = true; - - continue; + if (str[i] === 'B') { + target.push({ + type: 'WordBoundary', + negated: true + }); + i++; + break; } - - if (Metachars.unescaped[c]) { - addMetachar(Metachars.unescaped[c]); - continue; + const [ escape, newIndex ] = parseClassEscape(str, i, unicodeMode, unicodeSetsMode); + if (escape) { + target.push(escape); + i = newIndex; + applyQuantifier(escape); + break; } - - addChar(); - break; - - case State.insideSet: - setIndex++; - if (setIndex === 1) { - // first char in set - if (c === '^') { - node.negated = true; - continue; - } + const [ char, newIndex2 ] = parseEscape(str, i, false, unicodeMode, unicodeSetsMode); + i = newIndex2; + addChar(char); + } break; + case '[': { + const [ set, newIndex ] = parseSet(str, i, unicodeMode, unicodeSetsMode); + target.push(set); + i = newIndex; + applyQuantifier(set); + } break; + case '(': { + const [ newNode, newIndex ] = parseParenthesizedType(str, i); + parents.push(node); + node = newNode; + target = node.body; + i = newIndex; + } break; + case ')': { + let parent = parents.pop(); + if (node.type === 'Disjunction') { + node = parent; + parent = parents.pop(); } - - if (c === ']') { - state = State.none; - node = parents.pop(); - - continue; + if (!parent) 
{ + throw new SyntaxError("Unmatched ')'"); } - - // range - if (c === '-') { - // start of set (or not char), just literal - - if (node.body.at(-1)?.char == null) { - addChar(); // add - - continue; - } - - const from = node.body.pop().char; - const [ to, escaped ] = seek(); - - // end of set, just literal - - if (to == null || (!escaped && to === ']')) { - addChar(from); // add from char back - i--; // rollback seek - - addChar(); // add - - continue; - } - - // next char was escaped and a metachar, just literal - - if (escaped && Metachars.escaped[to] != null) { - i -= 2; // rollback seek - - addChar(); // add - - continue; + const newTarget = parent.type === 'Disjunction' ? parent.options.at(-1) : parent.body; + newTarget.push(node); + target = newTarget; + if (node.type === 'Group' || (!unicodeMode && node.type === 'LookAhead')) { + applyQuantifier(node); + } + node = parent; + } break; + case '*': case '+': case '?': { + throw new SyntaxError(`Unexpected quantifier '${c}'`); + } break; + case '{': { + const [ quantifier, newIndex ] = parseQuantifier(str, i - 1); + if (newIndex === -1) { + if (unicodeMode) { + throw quantifier; // error } - - if (to < from) throw new SyntaxError('Range out of order'); - - node.body.push(rangeNode(from, to)); - continue; + } else { + throw new SyntaxError(`Unexpected quantifier '${str.substring(i - 1, newIndex)}'`); } - - addChar(); - break; + } break; + case '}': case ']': { + if (unicodeMode) { + throw new SyntaxError(`Unmatched '${c}'`); + } + addChar(c); + } break; + case '|': { + if (node.type !== 'Disjunction') { + parents.push(node); + node.body = [ { + type: 'Disjunction', + options: [ target ] + } ]; + node = node.body[0]; + } + target = []; + node.options.push(target); + } break; + default: { + addChar(c); + } } } + if (node.type === 'Disjunction') node = parents.pop(); + // still in a group by the end if (node.type !== 'Expression') throw new SyntaxError('Unmatched opening parenthesis'); - // still in a set by the 
end - if (state === State.insideSet) throw new SyntaxError('Unmatched opening square bracket'); - return out; }; \ No newline at end of file From 1f1cc881dafa77a0ec46035087fb2aa6cf54c010 Mon Sep 17 00:00:00 2001 From: Rob23oba Date: Mon, 26 Aug 2024 22:36:45 +0200 Subject: [PATCH 2/4] rhemyn: re-add character classes --- rhemyn/parse.js | 125 ++++++++++++++++++++++++++++--------------- rhemyn/test/parse.js | 7 ++- 2 files changed, 86 insertions(+), 46 deletions(-) diff --git a/rhemyn/parse.js b/rhemyn/parse.js index dd8504c7..36ccdbfc 100644 --- a/rhemyn/parse.js +++ b/rhemyn/parse.js @@ -1,42 +1,3 @@ -"use strict"; - -const getArg = (name, def) => { - const arg = (typeof process !== 'undefined' ? process.argv : Deno.args).find(x => x.startsWith(`--${name}=`)); - if (arg) return arg.split('=')[0]; - - return def; -}; - -// full is spec-compliant but slower. not needed most of the time. (evil) -const DotChars = () => ({ - full: [ '\n', '\r', '\u2028', '\u2029' ], - fast: [ '\n', '\r' ] -})[getArg('regex-dot', 'fast')]; - -const WordChars = () => ({ - full: [ [ 'a', 'z' ], [ 'A', 'Z' ], [ '0', '9' ], '_' ], - fast: [ [ '_', 'z' ], [ 'A', 'Z' ], [ '0', '9' ] ] // skip individual _ with _-z BUT it also matches '`' -})[getArg('regex-word', 'full')]; - -const WhitespaceChars = () => ({ - full: [ ' ', '\t', '\n', '\r', '\u2028', '\u2029' ], - fast: [ ' ', '\t', '\n', '\r' ] -})[getArg('regex-ws', 'fast')]; - -const _Metachars = () => ({ - unescaped: { - '.': [ DotChars(), true ], // dot - }, - escaped: { - d: [ [ [ '0', '9' ] ], false ], // digit - D: [ [ [ '0', '9' ] ], true ], // not digit - w: [ WordChars(), false ], // word - W: [ WordChars(), true ], // not word - s: [ WhitespaceChars(), false ], // whitespace - S: [ WhitespaceChars(), true ], // not whitespace - } -}); - const EscapeSequences = { f: '\f', n: '\n', @@ -195,8 +156,83 @@ const parseSet = (str, index, unicodeMode, unicodeSetsMode) => { } if (!unicodeSetsMode) { // Simple character classes - + let node = 
{ + type: 'Set', + body: [], + negated + }; + while (index < str.length) { + let c = str[index++]; + if (c === ']') { + return [ node, index ]; + } + if (c === '\\') { + const [ escape, newIndex ] = parseClassEscape(str, index, unicodeMode); + if (escape) { + node.body.push(escape); + index = newIndex; + c = ''; + } else { + const [ char, newIndex2 ] = parseEscape(str, index, true, unicodeMode, false); + c = char; + index = newIndex2; + } + } + if (str[index] !== '-') { + if (c) { + node.body.push(charNode(c)); + } + continue; + } + // range + if (!c) { + if (unicodeMode) { + throw new SyntaxError('Cannot use class escape within range in character class'); + } + node.body.push(charNode('-')); + } + index++; + let c2 = str[index++]; + if (c2 === ']') { + if (c) { + node.body.push(charNode(c)); + } else if (unicodeMode) { + throw new SyntaxError('Cannot use class escape within range in character class'); + } + node.body.push(charNode('-')); + return [ node, index ]; + } + if (c2 === '\\') { + const [ escape, newIndex ] = parseClassEscape(str, index, unicodeMode); + if (escape) { + node.body.push(escape); + index = newIndex; + c2 = ''; + } else { + const [ char, newIndex2 ] = parseEscape(str, index, true, unicodeMode, false); + c2 = char; + index = newIndex2; + } + } + if (!c || !c2) { + if (unicodeMode) { + throw new SyntaxError('Cannot use class escape within range in character class'); + } + if (c) { + node.body.push(charNode(c)); + node.body.push(charNode('-')); + } + if (c2) node.body.push(charNode(c2)); + } else { + if (c > c2) { + throw new SyntaxError('Range out of order in character class'); + } + node.body.push(rangeNode(c, c2)); + } + } + throw new SyntaxError('Unclosed character class'); } + // todo: unicode sets }; const parseParenthesizedType = (str, index) => { @@ -303,8 +339,6 @@ const parseQuantifier = (str, index) => { }; export default (str, unicodeMode = false, unicodeSetsMode = false) => { - const Metachars = _Metachars(); - const out = { type: 
'Expression', body: [] @@ -464,6 +498,13 @@ export default (str, unicodeMode = false, unicodeSetsMode = false) => { } node = parent; } break; + case '.': { + const n = { + type: 'Dot' + }; + target.push(n); + applyQuantifier(n); + } break; case '*': case '+': case '?': { throw new SyntaxError(`Unexpected quantifier '${c}'`); } break; diff --git a/rhemyn/test/parse.js b/rhemyn/test/parse.js index 0ea22bd3..e0392119 100644 --- a/rhemyn/test/parse.js +++ b/rhemyn/test/parse.js @@ -42,16 +42,15 @@ const tests = { '\\u000g': {}, '\\u000a': {}, - /* // email regexes - '^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$': {}, + '^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$': {}, // input type=email from HTML spec // https://html.spec.whatwg.org/multipage/input.html#email-state-(type=email) // simpler form - '^[a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\\.[a-zA-Z0-9-]+)*$': {}, + '^[a-zA-Z0-9.!#$%&\'*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\\.[a-zA-Z0-9-]+)*$': {}, // full/complex form - '^[a-zA-Z0-9.!#$%&\'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$': {}*/ + '^[a-zA-Z0-9.!#$%&\'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$': {} }; for (const str in tests) { From b8a0142b381e8965cff372057df7e72091aad89d Mon Sep 17 00:00:00 2001 From: Rob23oba Date: Tue, 27 Aug 2024 00:15:18 +0200 Subject: [PATCH 3/4] rhemyn: update readme --- rhemyn/README.md | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/rhemyn/README.md b/rhemyn/README.md index 09a4fd51..92da8a22 100644 --- a/rhemyn/README.md +++ b/rhemyn/README.md @@ -15,26 +15,36 @@ Made for use with Porffor but could possibly be adapted, implementation/library - 🟢 character itself (eg `\.`) - 🟢 escape sequences (eg `\n`) - 🟢 control character (eg `\cJ`) - - 🟢 unicode code points (eg `\x00`, `\u0000`) + - 🟢 unicode code points (eg `\x00`, 
`\u0000`, `\u{10FFFF}`) - 🟢 sets (eg `[ab]`) - 🟢 ranges (eg `[a-z]`) - 🟢 negated sets (eg `[^ab]`) + - 🔴 unicode set expressions (eg `[[a-z]&&[b-e]]`) - 🟢 metacharacters - 🟢 dot (eg `a.b`) - 🟢 digit, not digit (eg `\d\D`) - 🟢 word, not word (eg `\w\W`) - 🟢 whitespace, not whitespace (eg `\s\S`) + - 🟠 unicode properties (eg `\p{ID_Continue}`) - 🟡 quantifiers - 🟡 star (eg `a*`) - 🟡 plus (eg `a+`) - 🟡 optional (eg `a?`) - 🟠 lazy modifier (eg `a*?`) - - 🔴 n repetitions (eg `a{4}`) - - 🔴 n-m repetitions (eg `a{2,4}`) + - 🟠 n repetitions (eg `a{4}`) + - 🟠 n-m repetitions (eg `a{2,4}`) +- 🟠 disjunctions (eg `a|b`) - 🟠 groups - - 🟠 capturing groups (`(a)`) - - 🔴 non-capturing groups (`(?:a)`) -- 🔴 assertions - - 🔴 beginning (eg `^a`) - - 🔴 end (eg `a$`) - - 🔴 word boundary assertion (eg `\b\B`) \ No newline at end of file + - 🟠 capturing groups (eg `(a)`) + - 🟠 named capturing groups (eg `(?<name>a)`) + - 🟠 non-capturing groups (eg `(?:a)`) + - 🟠 backreferences (eg `\1`) + - 🟠 named backreferences (eg `\k<name>`) +- 🟠 assertions + - 🟠 beginning (eg `^a`) + - 🟠 end (eg `a$`) + - 🟠 word boundary assertion (eg `\b\B`) + - 🟠 positive look-ahead (eg `(?=a)`) + - 🟠 negative look-ahead (eg `(?!a)`) + - 🟠 positive look-behind (eg `(?<=a)`) + - 🟠 negative look-behind (eg `(?<!a)`) Date: Tue, 27 Aug 2024 10:28:49 +0200 Subject: [PATCH 4/4] rhemyn: support unicode sets mode At this point, the rhemyn parser should be 99% spec compliant. 
--- rhemyn/README.md | 4 +- rhemyn/parse.js | 213 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 205 insertions(+), 12 deletions(-) diff --git a/rhemyn/README.md b/rhemyn/README.md index 92da8a22..a33fc687 100644 --- a/rhemyn/README.md +++ b/rhemyn/README.md @@ -8,7 +8,7 @@ Made for use with Porffor but could possibly be adapted, implementation/library - Wasm function returned expects an i32 pointer to a UTF-16 string (can add UTF-8 option later if someone else actually wants to use this) ## syntax -🟢 supported 🟡 partial 🟠 parsed only 🔴 unsupported +🟢 supported 🟡 partial 🟠 parsed only - 🟢 literal characters (eg `a`) - 🟢 escaping (eg `\.\n\cJ\x0a\u000a`) @@ -19,7 +19,7 @@ Made for use with Porffor but could possibly be adapted, implementation/library - 🟢 sets (eg `[ab]`) - 🟢 ranges (eg `[a-z]`) - 🟢 negated sets (eg `[^ab]`) - - 🔴 unicode set expressions (eg `[[a-z]&&[b-e]]`) + - 🟠 unicode set expressions (eg `[[a-z]&&[b-e]]`) - 🟢 metacharacters - 🟢 dot (eg `a.b`) - 🟢 digit, not digit (eg `\d\D`) diff --git a/rhemyn/parse.js b/rhemyn/parse.js index 36ccdbfc..0909e63e 100644 --- a/rhemyn/parse.js +++ b/rhemyn/parse.js @@ -56,8 +56,8 @@ const parseEscape = (str, index, inSet, unicodeMode, unicodeSetsMode) => { throw new SyntaxError('Unterminated unicode character escape'); } const hexStr = str.substring(index, endIndex); - if (/[^0-9a-fA-F]/.test(hexStr)) { - throw new SyntaxError('Invalid unicode character escape, expected /\\\\u\\{[0-9a-fA-F]*\\}/'); + if (hexStr.length === 0 || /[^0-9a-fA-F]/.test(hexStr)) { + throw new SyntaxError('Invalid unicode character escape, expected /\\\\u\\{[0-9a-fA-F]+\\}/'); } const code = parseInt(hexStr, 16); if (code >= 0x110000) { @@ -76,7 +76,19 @@ const parseEscape = (str, index, inSet, unicodeMode, unicodeSetsMode) => { return [ c, index ]; } const code = parseInt(next, 16); - return [ String.fromCharCode(code), index + count ]; + index += count; + if (unicodeMode && inSet && code >= 0xD800 && code <= 0xDBFF && 
str[index] === '\\' && str[index + 1] === 'u') { + // code point using 2 surrogates + // only matters within a character class + const hexStr = str.substr(index + 2, 4); + if (hexStr.length >= 4 && !/[^0-9a-fA-F]/.test(next)) { + const code2 = parseInt(hexStr, 16); + if (code2 >= 0xDC00 && code2 <= 0xDFFF) { + return [ String.fromCharCode(code) + String.fromCharCode(code2), index + 6 ]; + } + } + } + return [ String.fromCharCode(code), index ]; } if (inSet && c === 'b') { return [ '\b', index ]; @@ -111,7 +123,7 @@ const unicodeClassEscapeNode = (property, value, negated) => ({ negated }); -const parseClassEscape = (str, index, unicodeMode) => { +const parseClassEscape = (str, index, unicodeMode, unicodeSetsMode) => { switch (str[index]) { case 'd': return [ classEscapeNode('Digit', false), index + 1 ]; case 'D': return [ classEscapeNode('Digit', true), index + 1 ]; @@ -142,7 +154,7 @@ const parseClassEscape = (str, index, unicodeMode) => { property = property.substring(0, eq); } // todo: validate unicode property - return [ unicodeClassEscapeNode(property, value, negated), index ]; + return [ unicodeClassEscapeNode(property, value, negated), endIndex + 1 ]; default: return [ null, index ]; } @@ -166,6 +178,14 @@ const parseSet = (str, index, unicodeMode, unicodeSetsMode) => { if (c === ']') { return [ node, index ]; } + if (unicodeMode && c >= '\uD800' && c <= '\uDBFF') { + let cx = str[index]; + if (cx >= '\uDC00' && cx <= '\uDFFF') { + // surrogate pair + index++; + c += cx; + } + } if (c === '\\') { const [ escape, newIndex ] = parseClassEscape(str, index, unicodeMode); if (escape) { @@ -193,13 +213,19 @@ const parseSet = (str, index, unicodeMode, unicodeSetsMode) => { } index++; let c2 = str[index++]; + if (unicodeMode && c2 >= '\uD800' && c2 <= '\uDBFF') { + let cx = str[index]; + if (cx >= '\uDC00' && cx <= '\uDFFF') { + // surrogate pair + index++; + c2 += cx; + } + } if (c2 === ']') { if (c) { node.body.push(charNode(c)); - } else if (unicodeMode) { - 
throw new SyntaxError('Cannot use class escape within range in character class'); + node.body.push(charNode('-')); } - node.body.push(charNode('-')); return [ node, index ]; } if (c2 === '\\') { @@ -224,7 +250,7 @@ const parseSet = (str, index, unicodeMode, unicodeSetsMode) => { } if (c2) node.body.push(charNode(c2)); } else { - if (c > c2) { + if (c.codePointAt(0) > c2.codePointAt(0)) { throw new SyntaxError('Range out of order in character class'); } node.body.push(rangeNode(c, c2)); @@ -232,7 +258,174 @@ const parseSet = (str, index, unicodeMode, unicodeSetsMode) => { } throw new SyntaxError('Unclosed character class'); } - // todo: unicode sets + let node = { + type: 'Set', + body: [], + negated + }; + let parents = []; + let allowOperand = true; + while (index < str.length) { + let c = str[index++]; + if (c === ']') { + if (allowOperand && node.type !== 'Set') { + throw new SyntaxError('Trailing set operation ' + (node.type === 'SetIntersection' ? '&&' : '--')); + } + let parent = parents.pop(); + if (!parent) { + return [ node, index ]; + } + node = parent; + allowOperand = node.type === 'Set'; + continue; + } + if (c === str[index] && /[&\-!#\$%\*\+,\.:;<=>\?@\^`~]/.test(c)) { + // double punctuator + index++; + if (c !== '&' && c !== '-') { + throw new SyntaxError(`Invalid set operation ${c}${c}, only && (intersection) and -- (subtraction) are allowed`); + } + if (node.body.length === 0) { + throw new SyntaxError(`Unexpected set operation ${c}${c} at start of character class`); + } + if (node.type !== 'Set' && allowOperand) { + throw new SyntaxError(`Unexpected set operation ${c}${c} directly after other set operation`); + } + if (c === '&') { + if (node.body.length === 1) { + node.type = 'SetIntersection'; + } else if (node.type !== 'SetIntersection') { + throw new SyntaxError('Unexpected set intersection, previously ' + (node.type === 'Set' ? 
'union' : 'subtraction')); + } + } else if (c === '-') { + if (node.body.length === 1) { + node.type = 'SetSubtraction'; + } else if (node.type !== 'SetSubtraction') { + throw new SyntaxError('Unexpected set subtraction, previously ' + (node.type === 'Set' ? 'union' : 'intersection')); + } + } + if (node.body[0].type === 'Range') { + throw new SyntaxError('Range not allowed in set ' + (c === '&' ? 'intersection' : 'subtraction') + ', wrap in []'); + } + allowOperand = true; + continue; + } + if (!allowOperand) { + throw new SyntaxError('Unexpected set union, previously ' + (node.type === 'SetIntersection' ? 'intersection' : 'subtraction')); + } + if (c === '[') { + negated = false; + if (str[index] === '^') { + negated = true; + index++; + } + let newNode = { + type: 'Set', + body: [], + negated + }; + node.body.push(newNode); + parents.push(node); + node = newNode; + allowOperand = true; + continue; + } + if (c === '-') { + throw new SyntaxError("Range character '-' has no associated starting character"); + } + if (/[\(\)\{\}\/\|]/.test(c)) { + throw new SyntaxError(`Unexpected '${c}' in character class`); + } + if (c >= '\uD800' && c <= '\uDBFF') { + let cx = str[index]; + if (cx >= '\uDC00' && cx <= '\uDFFF') { + // surrogate pair + index++; + c += cx; + } + } else if (c === '\\') { + // escape sequence or \q{...} + if (str[index] === 'q') { + // class string disjunction + if (str[index + 1] !== '{') { + throw new SyntaxError('Invalid escape sequence \\q, expected class string disjunction \\q{...}'); + } + index += 2; + let string = ''; + while (index < str.length) { + c = str[index++]; + if (c === '}') { + node.body.push({ + type: 'ClassStringDisjunction', + string + }); + allowOperand = node.type === 'Set'; + continue; + } + if (c === str[index] && /[&\-!#\$%\*\+,\.:;<=>\?@\^`~]/.test(c)) { + throw new SyntaxError(`Class string disjunction may not contain set operation ${c}${c}, use escaping`); + } + if (/[\(\)\[\]\{\}\/\-\\\|]/.test(c)) { + throw new 
SyntaxError(`Class string disjunction may not contain set syntax character '${c}', use escaping`); + } + if (c === '\\') { + const [ char, newIndex ] = parseEscape(str, index, true, true, true); + c = char; + index = newIndex; + } + string += c; + } + throw new SyntaxError('Unclosed class string disjunction'); + } + const [ escape, newIndex ] = parseClassEscape(str, index, true, true); + if (escape) { + node.body.push(escape); + allowOperand = node.type === 'Set'; + index = newIndex; + continue; + } + const [ char, newIndex2 ] = parseEscape(str, index, true, true, true); + c = char; + index = newIndex2; + } + if (str[index] === '-') { + // range + if (node.type !== 'Set') { + throw new SyntaxError('Range not allowed in set ' + (node.type === 'SetIntersection' ? 'intersection' : 'subtraction') + ', wrap in []'); + } + index++; + let c2 = str[index++]; + if (!c2) { + throw new SyntaxError('Unexpected end after range'); + } + if (c2 === str[index] && /[&\-!#\$%\*\+,\.:;<=>\?@\^`~]/.test(c2)) { + throw new SyntaxError(`Range may not end with a set operation (${c2}${c2})`); + } + if (/[\(\)\[\]\{\}\/\-\\\|]/.test(c)) { + throw new SyntaxError(`Range may not contain set syntax character '${c2}', use escaping`); + } + if (c2 >= '\uD800' && c2 <= '\uDBFF') { + let cx = str[index]; + if (cx >= '\uDC00' && cx <= '\uDFFF') { + // surrogate pair + index++; + c2 += cx; + } + } else if (c2 === '\\') { + const [ char, newIndex ] = parseEscape(str, index, true, true, true); + c2 = char; + index = newIndex; + } + if (c.codePointAt(0) > c2.codePointAt(0)) { + throw new SyntaxError('Range out of order in character class'); + } + node.body.push(rangeNode(c, c2)); + } else { + node.body.push(charNode(c)); + } + allowOperand = node.type === 'Set'; + } + throw new SyntaxError('Unclosed character class'); }; const parseParenthesizedType = (str, index) => {