From e4a2bc62177feb0954db6d266fb9fdfdad6dbf0d Mon Sep 17 00:00:00 2001
From: Rob23oba <robin.arnez@web.de>
Date: Mon, 26 Aug 2024 21:19:11 +0200
Subject: [PATCH 1/4] rhemyn: reworked and extended parser

---
 rhemyn/parse.js | 655 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 421 insertions(+), 234 deletions(-)

diff --git a/rhemyn/parse.js b/rhemyn/parse.js
index 7d8464a2..dd8504c7 100644
--- a/rhemyn/parse.js
+++ b/rhemyn/parse.js
@@ -1,14 +1,4 @@
-const State = {
-  none: 0,
-  insideSet: 1
-};
-
-const Quantifiers = {
-  '*': [ 0 ], // 0 -
-  '+': [ 1 ], // 1 -
-  '?': [ 0, 1 ], // 0 - 1
-};
-const QuantifierKeys = Object.keys(Quantifiers);
+"use strict";
 
 const getArg = (name, def) => {
   const arg = (typeof process !== 'undefined' ? process.argv : Deno.args).find(x => x.startsWith(`--${name}=`));
@@ -56,268 +46,465 @@ const EscapeSequences = {
   '0': '\0'
 };
 
-const HexDigit = /[0-9a-fA-F]/;
-
-export default str => {
-  const Metachars = _Metachars();
-
-  const out = {
-    type: 'Expression',
-    body: []
-  };
-  let node = out, parents = [];
-
-  let state = State.none, setIndex = 0, escape = false;
-  for (let i = 0; i < str.length; i++) {
-    const c = str[i];
-
-    const charNode = char => ({
-      type: 'Character',
-      char
-    });
-
-    const rangeNode = (from, to) => ({
-      type: 'Range',
-      from,
-      to
-    });
-
-    const addChar = (char = c) => {
-      node.body.push(charNode(char));
-    };
+const charNode = char => ({
+  type: 'Character',
+  char
+});
 
-    const addSet = (matches, negated = false) => {
-      let body = matches.map(x => x[1] ? rangeNode(x[0], x[1]) : charNode(x));
-      if (state === State.insideSet) {
-        // if negated, mark each node as negated for merge
-        if (negated) body = body.map(x => {
-          x.negated = true;
-          return x;
-        });
+const rangeNode = (from, to) => ({
+  type: 'Range',
+  from,
+  to
+});
 
-        // already in set, merge bodies
-        node.body.push(...body);
-        return;
+const parseEscape = (str, index, inSet, unicodeMode, unicodeSetsMode) => {
+  const c = str[index++];
+  if (!c) {
+    throw new SyntaxError('Unterminated escape sequence at end of pattern');
+  }
+  if (EscapeSequences[c]) {
+    return [ EscapeSequences[c], index ];
+  }
+  if (c === 'c') {
+    // \c (not [A-Za-z] ...) = literal \c... (WHY)
+    const next = str[index];
+    if (next == null || /[^a-zA-Z]/.test(next)) {
+      if (unicodeMode) {
+        throw new SyntaxError('Invalid control character escape, expected /\\\\c[a-zA-Z]/');
       }
+      if (!inSet || /[^0-9_]/.test(next)) {
+        return [ '\\', index - 1 ];
+      }
+      // legacy: \c3 = \x13 and \c_ = \x1F
+    }
+    // \c[A-Za-z]
+    const code = next.charCodeAt(0);
+    return [ String.fromCharCode(code % 32), index + 1 ];
+  }
+  if (c === 'x' || c === 'u') {
+    // \x = x
+    // \xH = xH
+    // \x[0-9a-zA-Z][0-9a-zA-Z] = \xAB
+    // '\u' = u
+    // '\uHHH' = uHHH
+    // '\uABCD' = \uABCD
+    if (unicodeMode && str[index] === '{') {
+      index++;
+      const endIndex = str.indexOf('}', index);
+      if (endIndex < 0) {
+        throw new SyntaxError('Unterminated unicode character escape');
+      }
+      const hexStr = str.substring(index, endIndex);
+      if (/[^0-9a-fA-F]/.test(hexStr)) {
+        throw new SyntaxError('Invalid unicode character escape, expected /\\\\u\\{[0-9a-fA-F]*\\}/');
+      }
+      const code = parseInt(hexStr, 16);
+      if (code >= 0x110000) {
+        throw new SyntaxError('Invalid unicode character escape, code point may not be above U+10FFFF');
+      }
+      return [ String.fromCodePoint(code), endIndex + 1 ];
+    }
+    const count = c === 'x' ? 2 : 4;
+    const next = str.substr(index, count);
 
-      node.body.push({
-        type: 'Set',
-        body,
-        negated
-      });
-    };
-
-    const addMetachar = meta => {
-      const [ matches, negated = false ] = meta;
-      return addSet(matches, negated);
-    };
-
-    // get next char and consume it
-    const seek = (allowEscaped = true) => {
-      const cNext = str[++i];
-
-      if (cNext === '\\') return !allowEscaped ? undefined : [ str[++i], true ];
-      return !allowEscaped ? cNext : [ cNext, false ];
-    };
+    // missing a char or invalid hex digit
+    if (next.length < count || /[^0-9a-fA-F]/.test(next)) {
+      if (unicodeMode) {
+        throw new SyntaxError(`Invalid hex character escape, expected /\\\\${c}[0-9a-fA-F]{${count}}/`);
+      }
+      return [ c, index ];
+    }
+    const code = parseInt(next, 16);
+    return [ String.fromCharCode(code), index + count ];
+  }
+  if (inSet && c === 'b') {
+    return [ '\b', index ];
+  }
+  if (unicodeMode) {
+    let allowedSymbols = '^$\\.*+?()[]{}|/';
+    if (inSet) {
+      if (unicodeSetsMode) {
+        allowedSymbols += '&-!#%,:;<=>@`~';
+      } else {
+        allowedSymbols += '-';
+      }
+    }
+    if (!allowedSymbols.includes(c)) {
+      throw new SyntaxError(`Invalid identity escape '${c}', expected one of: ${allowedSymbols}`);
+    }
+  }
+  return [ c, index ];
+};
 
-    // get next char without consuming
-    const peek = (allowEscaped = true, offset = 0) => {
-      const cNext = str[i + 1 + offset];
+const classEscapeNode = (classType, negated) => ({
+  type: 'CharacterClassEscape',
+  classType,
+  negated
+});
 
-      if (cNext === '\\') return !allowEscaped ? undefined : [ str[i + 2 + offset], true ];
-      return !allowEscaped ? cNext : [ cNext, false ];
-    };
+const unicodeClassEscapeNode = (property, value, negated) => ({
+  type: 'CharacterClassEscape',
+  classType: 'UnicodeProperty',
+  property,
+  value,
+  negated
+});
 
-    if (escape) {
-      escape = false;
-      if (EscapeSequences[c]) {
-        addChar(EscapeSequences[c]);
-        continue;
+const parseClassEscape = (str, index, unicodeMode) => {
+  switch (str[index]) {
+    case 'd': return [ classEscapeNode('Digit', false), index + 1 ];
+    case 'D': return [ classEscapeNode('Digit', true), index + 1 ];
+    case 's': return [ classEscapeNode('Whitespace', false), index + 1 ];
+    case 'S': return [ classEscapeNode('Whitespace', true), index + 1 ];
+    case 'w': return [ classEscapeNode('WordCharacter', false), index + 1 ];
+    case 'W': return [ classEscapeNode('WordCharacter', true), index + 1 ];
+    case 'p':
+    case 'P':
+      if (!unicodeMode) {
+        return [ null, index ];
       }
-
-      if (Metachars.escaped[c]) {
-        addMetachar(Metachars.escaped[c]);
-        continue;
+      const negated = str[index] === 'P';
+      index++;
+      if (str[index] !== '{') {
+        throw new SyntaxError('Invalid escape sequence \\p, expected unicode property \\p{...}');
       }
+      index++;
+      const endIndex = str.indexOf('}', index);
+      if (endIndex < 0) {
+        throw new SyntaxError('Unterminated unicode property escape sequence');
+      }
+      let property = str.substring(index, endIndex);
+      let value = null;
+      const eq = property.indexOf('=');
+      if (eq >= 0) {
+        value = property.substring(eq + 1);
+        property = property.substring(0, eq);
+      }
+      // todo: validate unicode property
+      return [ unicodeClassEscapeNode(property, value, negated), index ];
+    default:
+      return [ null, index ];
+  }
+};
 
-      if (c === 'c') {
-        // \c (not [A-Za-z] ...) = literal \c... (WHY)
-        const next = peek(false);
-        if (next == null || /[^a-zA-Z]/.test(next)) {
-          addChar('\\');
-          addChar('c');
-          continue;
-        }
+const parseSet = (str, index, unicodeMode, unicodeSetsMode) => {
+  let negated = false;
+  if (str[index] === '^') {
+    negated = true;
+    index++;
+  }
+  if (!unicodeSetsMode) {
+    // Simple character classes
+    
+  }
+};
 
-        // \c[A-Za-z]
-        const code = seek(false).charCodeAt(0);
-        addChar(String.fromCharCode(code % 32));
-        continue;
+const parseParenthesizedType = (str, index) => { // index = position right after '('; returns [ emptyNode, nextIndex ]
+  if (str[index] !== '?') {
+    return [{
+      type: 'Group',
+      body: [],
+      capture: true
+    }, index ];
+  }
+  // '(?' prefix: the following character selects the group / assertion kind
+  index++;
+  let c = str[index++];
+  switch (c) {
+    case ':':
+      // non-capturing group (node carries no `capture` property)
+      return [ {
+        type: 'Group',
+        body: []
+      }, index ];
+    case '=':
+      // positive look-ahead
+      return [ {
+        type: 'LookAhead',
+        body: [],
+        negated: false
+      }, index ];
+    case '!':
+      // negative look-ahead
+      return [ {
+        type: 'LookAhead',
+        body: [],
+        negated: true
+      }, index ];
+    case '<':
+      // look-behind ('(?<=' / '(?<!') or named capturing group ('(?<name>')
+      c = str[index];
+      if (c === '=' || c === '!') {
+        return [ {
+          type: 'LookBehind',
+          body: [],
+          negated: c === '!'
+        }, index + 1 ];
       }
+      const endIndex = str.indexOf('>', index); // search from the name start so an earlier '>' in the pattern is not picked up
+      if (endIndex < 0) {
+        throw new SyntaxError('Expected group name after (?<, for look-behinds use (?<= or (?<!');
+      }
+      const name = str.substring(index, endIndex);
+      // todo: validate name
+      return [ {
+        type: 'Group',
+        body: [],
+        capture: name
+      }, endIndex + 1 ];
+    default:
+      throw new SyntaxError(`Invalid group specifier: Expected one of (?=, (?! or (?< but found (?${c}`);
+  }
+};
 
-      if (c === 'x') {
-        // \x = x
-        // \xH = xH
-        // \x[0-9a-zA-Z][0-9a-zA-Z] = \xAB
-        const next1 = peek(false);
-        const next2 = peek(false, 1);
-
-        // missing a char or invalid hex digit
-        if (next1 == null || next2 == null || !HexDigit.test(next1) || !HexDigit.test(next2)) {
-          addChar('x');
-          continue;
+const parseQuantifier = (str, index) => {
+  let c = str[index++];
+  switch (c) {
+    case '*':
+      return [ [ 0 ], index ]; // 0 or above
+    case '+':
+      return [ [ 1 ], index ]; // 1 or above
+    case '?':
+      return [ [ 0, 1 ], index ]; // 0 - 1
+    case '{':
+      if (!(str[index] >= '0' && str[index] <= '9')) {
+        return [ new SyntaxError('Invalid quantifier, expected number'), -1 ];
+      }
+      let min = 0;
+      while (str[index] >= '0' && str[index] <= '9') {
+        min *= 10;
+        min += str.charCodeAt(index++) - 48;
+      }
+      if (str[index] === '}') {
+        return [ [ min, min ], index + 1 ];
+      }
+      if (str[index] !== ',') {
+        return [ new SyntaxError("Invalid quantifier, expected ',' or '}' after minimum count"), -1 ];
+      }
+      index++;
+      if (!(str[index] >= '0' && str[index] <= '9')) {
+        if (str[index] !== '}') {
+          return [ new SyntaxError("Unclosed quantifier, expected '}'"), -1 ];
         }
-
-        const code = parseInt(seek(false) + seek(false), 16);
-        addChar(String.fromCodePoint(code));
-        continue;
+        return [ [ min ], index + 1 ];
       }
+      let max = 0;
+      do {
+        max *= 10;
+        max += str.charCodeAt(index++) - 48;
+      } while (str[index] >= '0' && str[index] <= '9');
+      if (str[index] !== '}') {
+        return [ new SyntaxError("Unclosed quantifier, expected '}'"), -1 ];
+      }
+      return [ [ min, max ], index + 1 ];
+    default:
+      return [ null, index - 1 ];
+  }
+};
 
-      if (c === 'u') {
-        // '\u' = u
-        // '\uHHH' = uHHH
-        // '\uABCD' = \uABCD
-        const next1 = peek(false);
-        const next2 = peek(false, 1);
-        const next3 = peek(false, 2);
-        const next4 = peek(false, 3);
+export default (str, unicodeMode = false, unicodeSetsMode = false) => {
+  const Metachars = _Metachars();
 
-        // missing a char or invalid hex digit
-        if (next1 == null || next2 == null || next3 == null || next4 == null || !HexDigit.test(next1) || !HexDigit.test(next2) || !HexDigit.test(next3) || !HexDigit.test(next4)) {
-          addChar('u');
-          continue;
+  const out = {
+    type: 'Expression',
+    body: []
+  };
+  let node = out, target = node.body, parents = [];
+
+  let i = 0;
+  const applyQuantifier = to => {
+    while (true) {
+      const [ quantifier, newIndex ] = parseQuantifier(str, i);
+      if (newIndex === -1) {
+        if (unicodeMode) {
+          throw quantifier; // error
         }
-
-        const code = parseInt(seek(false) + seek(false) + seek(false) + seek(false), 16);
-        addChar(String.fromCodePoint(code));
+        // assume '{'
+        to = charNode('{');
+        target.push(to);
+        i++;
+        // perf: try repeating '{' using loop
         continue;
       }
-
-      addChar();
-      continue;
+      if (quantifier == null) {
+        return;
+      }
+      i = newIndex;
+      to.quantifier = quantifier;
+      if (str[i] === '?') {
+        to.lazy = true;
+        i++;
+      }
+      break;
     }
-
-    if (c === '\\') {
-      escape = true;
-      continue;
+  };
+  const addChar = (char, quantifiable = true) => { // appends a Character node to the current target, optionally parsing a trailing quantifier
+    const n = charNode(char);
+    target.push(n); // push first: applyQuantifier may itself append a literal '{' node, which must come after this one
+    if (quantifiable) {
+      applyQuantifier(n);
+    }
+  };
 
-    switch (state) {
-      case State.none:
-        if (c === '[') {
-          parents.push(node);
-          node = {
-            type: 'Set',
-            body: [],
-            negated: false
-          };
-
-          parents.at(-1).body.push(node);
+  while (i < str.length) {
+    const c = str[i++];
 
-          state = State.insideSet;
-          setIndex = 0;
-          continue;
+    switch (c) {
+      case '^':
+        target.push({
+          type: 'Begin'
+        });
+        break;
+      case '$':
+        target.push({
+          type: 'End'
+        });
+        break;
+      case '\\': {
+        if (str[i] >= '1' && str[i] <= '9') {
+          let number = str.charCodeAt(i++) - 48;
+          while (str[i] >= '0' && str[i] <= '9') {
+            number *= 10;
+            number += str.charCodeAt(i++) - 48;
+          }
+          const n = {
+            type: 'Backreference',
+            number
+          };
+          target.push(n);
+          applyQuantifier(n);
+          break;
         }
-
-        if (c === '(') {
-          parents.push(node);
-          node = {
-            type: 'Group',
-            body: []
+        if (str[i] === 'k') {
+          i++;
+          if (str[i] !== '<') {
+            // todo: fail if there aren't any named groups
+            if (unicodeMode) {
+              throw new SyntaxError('Invalid named backreference, expected \\k<...>');
+            }
+            addChar('k');
+            break;
+          }
+          i++;
+          const endIndex = str.indexOf('>', i);
+          if (endIndex < 0) {
+            // todo: fail if there aren't any named groups
+            if (unicodeMode) {
+              throw new SyntaxError('Unclosed named backreference');
+            }
+            addChar('k', false);
+            addChar('<');
+            break;
+          }
+          const name = str.substring(i, endIndex);
+          // todo: validate name
+          const n = {
+            type: 'NamedBackreference',
+            name
           };
-
-          parents.at(-1).body.push(node);
-          continue;
+          target.push(n);
+          i = endIndex + 1;
+          applyQuantifier(n);
+          break;
         }
-
-        if (c === ')') {
-          if (node.type !== 'Group') throw new SyntaxError('Unmatched closing parenthesis');
-
-          node = parents.pop();
-          continue;
+        if (str[i] === 'b') {
+          target.push({
+            type: 'WordBoundary',
+            negated: false
+          });
+          i++;
+          break;
         }
-
-        if (QuantifierKeys.includes(c)) {
-          const last = node.body.at(-1);
-          if (!last) continue; // ignore, maybe lookahead
-
-          last.quantifier = Quantifiers[c];
-
-          // lazy modifier
-          if (peek(false) === '?') last.lazy = true;
-
-          continue;
+        if (str[i] === 'B') {
+          target.push({
+            type: 'WordBoundary',
+            negated: true
+          });
+          i++;
+          break;
         }
-
-        if (Metachars.unescaped[c]) {
-          addMetachar(Metachars.unescaped[c]);
-          continue;
+        const [ escape, newIndex ] = parseClassEscape(str, i, unicodeMode, unicodeSetsMode);
+        if (escape) {
+          target.push(escape);
+          i = newIndex;
+          applyQuantifier(escape);
+          break;
         }
-
-        addChar();
-        break;
-
-      case State.insideSet:
-        setIndex++;
-        if (setIndex === 1) {
-          // first char in set
-          if (c === '^') {
-            node.negated = true;
-            continue;
-          }
+        const [ char, newIndex2 ] = parseEscape(str, i, false, unicodeMode, unicodeSetsMode);
+        i = newIndex2;
+        addChar(char);
+      } break;
+      case '[': {
+        const [ set, newIndex ] = parseSet(str, i, unicodeMode, unicodeSetsMode);
+        target.push(set);
+        i = newIndex;
+        applyQuantifier(set);
+      } break;
+      case '(': {
+        const [ newNode, newIndex ] = parseParenthesizedType(str, i);
+        parents.push(node);
+        node = newNode;
+        target = node.body;
+        i = newIndex;
+      } break;
+      case ')': {
+        let parent = parents.pop();
+        if (node.type === 'Disjunction') {
+          node = parent;
+          parent = parents.pop();
         }
-
-        if (c === ']') {
-          state = State.none;
-          node = parents.pop();
-
-          continue;
+        if (!parent) {
+          throw new SyntaxError("Unmatched ')'");
         }
-
-        // range
-        if (c === '-') {
-          // start of set (or not char), just literal -
-          if (node.body.at(-1)?.char == null) {
-            addChar(); // add -
-            continue;
-          }
-
-          const from = node.body.pop().char;
-          const [ to, escaped ] = seek();
-
-          // end of set, just literal -
-          if (to == null || (!escaped && to === ']')) {
-            addChar(from); // add from char back
-            i--; // rollback seek
-
-            addChar(); // add -
-            continue;
-          }
-
-          // next char was escaped and a metachar, just literal -
-          if (escaped && Metachars.escaped[to] != null) {
-            i -= 2; // rollback seek
-
-            addChar(); // add -
-            continue;
+        const newTarget = parent.type === 'Disjunction' ? parent.options.at(-1) : parent.body;
+        newTarget.push(node);
+        target = newTarget;
+        if (node.type === 'Group' || (!unicodeMode && node.type === 'LookAhead')) {
+          applyQuantifier(node);
+        }
+        node = parent;
+      } break;
+      case '*': case '+': case '?': {
+        throw new SyntaxError(`Unexpected quantifier '${c}'`);
+      } break;
+      case '{': {
+        const [ quantifier, newIndex ] = parseQuantifier(str, i - 1);
+        if (newIndex === -1) {
+          if (unicodeMode) {
+            throw quantifier; // error
           }
-
-          if (to < from) throw new SyntaxError('Range out of order');
-
-          node.body.push(rangeNode(from, to));
-          continue;
+        } else {
+          throw new SyntaxError(`Unexpected quantifier '${str.substring(i - 1, newIndex)}'`);
         }
-
-        addChar();
-        break;
+      } break;
+      case '}': case ']': {
+        if (unicodeMode) {
+          throw new SyntaxError(`Unmatched '${c}'`);
+        }
+        addChar(c);
+      } break;
+      case '|': {
+        if (node.type !== 'Disjunction') {
+          parents.push(node);
+          node.body = [ {
+            type: 'Disjunction',
+            options: [ target ]
+          } ];
+          node = node.body[0];
+        }
+        target = [];
+        node.options.push(target);
+      } break;
+      default: {
+        addChar(c);
+      }
     }
   }
 
+  if (node.type === 'Disjunction') node = parents.pop();
+
   // still in a group by the end
   if (node.type !== 'Expression') throw new SyntaxError('Unmatched opening parenthesis');
 
-  // still in a set by the end
-  if (state === State.insideSet) throw new SyntaxError('Unmatched opening square bracket');
-
   return out;
 };
\ No newline at end of file

From 1f1cc881dafa77a0ec46035087fb2aa6cf54c010 Mon Sep 17 00:00:00 2001
From: Rob23oba <robin.arnez@web.de>
Date: Mon, 26 Aug 2024 22:36:45 +0200
Subject: [PATCH 2/4] rhemyn: re-add character classes

---
 rhemyn/parse.js      | 125 ++++++++++++++++++++++++++++---------------
 rhemyn/test/parse.js |   7 ++-
 2 files changed, 86 insertions(+), 46 deletions(-)

diff --git a/rhemyn/parse.js b/rhemyn/parse.js
index dd8504c7..36ccdbfc 100644
--- a/rhemyn/parse.js
+++ b/rhemyn/parse.js
@@ -1,42 +1,3 @@
-"use strict";
-
-const getArg = (name, def) => {
-  const arg = (typeof process !== 'undefined' ? process.argv : Deno.args).find(x => x.startsWith(`--${name}=`));
-  if (arg) return arg.split('=')[0];
-
-  return def;
-};
-
-// full is spec-compliant but slower. not needed most of the time. (evil)
-const DotChars = () => ({
-  full: [ '\n', '\r', '\u2028', '\u2029' ],
-  fast: [ '\n', '\r' ]
-})[getArg('regex-dot', 'fast')];
-
-const WordChars = () => ({
-  full: [ [ 'a', 'z' ], [ 'A', 'Z' ], [ '0', '9' ], '_' ],
-  fast: [ [ '_', 'z' ], [ 'A', 'Z' ], [ '0', '9' ] ] // skip individual _ with _-z BUT it also matches '`'
-})[getArg('regex-word', 'full')];
-
-const WhitespaceChars = () => ({
-  full: [ ' ', '\t', '\n', '\r', '\u2028', '\u2029' ],
-  fast: [ ' ', '\t', '\n', '\r' ]
-})[getArg('regex-ws', 'fast')];
-
-const _Metachars = () => ({
-  unescaped: {
-    '.': [ DotChars(), true ], // dot
-  },
-  escaped: {
-    d: [ [ [ '0', '9' ] ], false ], // digit
-    D: [ [ [ '0', '9' ] ], true ], // not digit
-    w: [ WordChars(), false ], // word
-    W: [ WordChars(), true ], // not word
-    s: [ WhitespaceChars(), false ], // whitespace
-    S: [ WhitespaceChars(), true ], // not whitespace
-  }
-});
-
 const EscapeSequences = {
   f: '\f',
   n: '\n',
@@ -195,8 +156,83 @@ const parseSet = (str, index, unicodeMode, unicodeSetsMode) => {
   }
   if (!unicodeSetsMode) {
     // Simple character classes
-    
+    let node = {
+      type: 'Set',
+      body: [],
+      negated
+    };
+    while (index < str.length) {
+      let c = str[index++];
+      if (c === ']') {
+        return [ node, index ];
+      }
+      if (c === '\\') {
+        const [ escape, newIndex ] = parseClassEscape(str, index, unicodeMode);
+        if (escape) {
+          node.body.push(escape);
+          index = newIndex;
+          c = '';
+        } else {
+          const [ char, newIndex2 ] = parseEscape(str, index, true, unicodeMode, false);
+          c = char;
+          index = newIndex2;
+        }
+      }
+      if (str[index] !== '-') {
+        if (c) {
+          node.body.push(charNode(c));
+        }
+        continue;
+      }
+      // range
+      if (!c) {
+        if (unicodeMode) {
+          throw new SyntaxError('Cannot use class escape within range in character class');
+        }
+        node.body.push(charNode('-'));
+      }
+      index++;
+      let c2 = str[index++];
+      if (c2 === ']') {
+        if (c) {
+          node.body.push(charNode(c));
+        } else if (unicodeMode) {
+          throw new SyntaxError('Cannot use class escape within range in character class');
+        }
+        node.body.push(charNode('-'));
+        return [ node, index ];
+      }
+      if (c2 === '\\') {
+        const [ escape, newIndex ] = parseClassEscape(str, index, unicodeMode);
+        if (escape) {
+          node.body.push(escape);
+          index = newIndex;
+          c2 = '';
+        } else {
+          const [ char, newIndex2 ] = parseEscape(str, index, true, unicodeMode, false);
+          c2 = char;
+          index = newIndex2;
+        }
+      }
+      if (!c || !c2) {
+        if (unicodeMode) {
+          throw new SyntaxError('Cannot use class escape within range in character class');
+        }
+        if (c) {
+          node.body.push(charNode(c));
+          node.body.push(charNode('-'));
+        }
+        if (c2) node.body.push(charNode(c2));
+      } else {
+        if (c > c2) {
+          throw new SyntaxError('Range out of order in character class');
+        }
+        node.body.push(rangeNode(c, c2));
+      }
+    }
+    throw new SyntaxError('Unclosed character class');
   }
+  // todo: unicode sets
 };
 
 const parseParenthesizedType = (str, index) => {
@@ -303,8 +339,6 @@ const parseQuantifier = (str, index) => {
 };
 
 export default (str, unicodeMode = false, unicodeSetsMode = false) => {
-  const Metachars = _Metachars();
-
   const out = {
     type: 'Expression',
     body: []
@@ -464,6 +498,13 @@ export default (str, unicodeMode = false, unicodeSetsMode = false) => {
         }
         node = parent;
       } break;
+      case '.': {
+        const n = {
+          type: 'Dot'
+        };
+        target.push(n);
+        applyQuantifier(n);
+      } break;
       case '*': case '+': case '?': {
         throw new SyntaxError(`Unexpected quantifier '${c}'`);
       } break;
diff --git a/rhemyn/test/parse.js b/rhemyn/test/parse.js
index 0ea22bd3..e0392119 100644
--- a/rhemyn/test/parse.js
+++ b/rhemyn/test/parse.js
@@ -42,16 +42,15 @@ const tests = {
   '\\u000g': {},
   '\\u000a': {},
 
-  /*
   // email regexes
-  '^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$': {},
+  '^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+$': {},
 
   // input type=email from HTML spec
   // https://html.spec.whatwg.org/multipage/input.html#email-state-(type=email)
   // simpler form
-  '^[a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\\.[a-zA-Z0-9-]+)*$': {},
+  '^[a-zA-Z0-9.!#$%&\'*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\\.[a-zA-Z0-9-]+)*$': {},
   // full/complex form
-  '^[a-zA-Z0-9.!#$%&\'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$': {}*/
+  '^[a-zA-Z0-9.!#$%&\'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$': {}
 };
 
 for (const str in tests) {

From b8a0142b381e8965cff372057df7e72091aad89d Mon Sep 17 00:00:00 2001
From: Rob23oba <robin.arnez@web.de>
Date: Tue, 27 Aug 2024 00:15:18 +0200
Subject: [PATCH 3/4] rhemyn: update readme

---
 rhemyn/README.md | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/rhemyn/README.md b/rhemyn/README.md
index 09a4fd51..92da8a22 100644
--- a/rhemyn/README.md
+++ b/rhemyn/README.md
@@ -15,26 +15,36 @@ Made for use with Porffor but could possibly be adapted, implementation/library
   - 🟢 character itself (eg `\.`)
   - 🟢 escape sequences (eg `\n`)
   - 🟢 control character (eg `\cJ`)
-  - 🟢 unicode code points (eg `\x00`, `\u0000`)
+  - 🟢 unicode code points (eg `\x00`, `\u0000`, `\u{10FFFF}`)
 - 🟢 sets (eg `[ab]`)
   - 🟢 ranges (eg `[a-z]`)
   - 🟢 negated sets (eg `[^ab]`)
+  - 🔴 unicode set expressions (eg `[[a-z]&&[b-e]]`)
 - 🟢 metacharacters
   - 🟢 dot (eg `a.b`)
   - 🟢 digit, not digit (eg `\d\D`)
   - 🟢 word, not word (eg `\w\W`)
   - 🟢 whitespace, not whitespace (eg `\s\S`)
+  - 🟠 unicode properties (eg `\p{ID_Continue}`)
 - 🟡 quantifiers
   - 🟡 star (eg `a*`)
   - 🟡 plus (eg `a+`)
   - 🟡 optional (eg `a?`)
   - 🟠 lazy modifier (eg `a*?`)
-  - 🔴 n repetitions (eg `a{4}`)
-  - 🔴 n-m repetitions (eg `a{2,4}`)
+  - 🟠 n repetitions (eg `a{4}`)
+  - 🟠 n-m repetitions (eg `a{2,4}`)
+- 🟠 disjunctions (eg `a|b`)
 - 🟠 groups
-  - 🟠 capturing groups (`(a)`)
-  - 🔴 non-capturing groups (`(?:a)`)
-- 🔴 assertions
-  - 🔴 beginning (eg `^a`)
-  - 🔴 end (eg `a$`)
-  - 🔴 word boundary assertion (eg `\b\B`)
\ No newline at end of file
+  - 🟠 capturing groups (eg `(a)`)
+  - 🟠 named capturing groups (eg `(?<foo>a)`)
+  - 🟠 non-capturing groups (eg `(?:a)`)
+  - 🟠 backreferences (eg `\1`)
+  - 🟠 named backreferences (eg `\k<foo>`)
+- 🟠 assertions
+  - 🟠 beginning (eg `^a`)
+  - 🟠 end (eg `a$`)
+  - 🟠 word boundary assertion (eg `\b\B`)
+  - 🟠 positive look-ahead (eg `(?=a)`)
+  - 🟠 negative look-ahead (eg `(?!a)`)
+  - 🟠 positive look-behind (eg `(?<=a)`)
+  - 🟠 negative look-behind (eg `(?<!a)`)

From 4c935f9927fb5c8f9b60a0ca5f3ad979cfdad46b Mon Sep 17 00:00:00 2001
From: Rob23oba <robin.arnez@web.de>
Date: Tue, 27 Aug 2024 10:28:49 +0200
Subject: [PATCH 4/4] rhemyn: support unicode sets mode

At this point, the rhemyn parser should be 99% spec compliant.
---
 rhemyn/README.md |   4 +-
 rhemyn/parse.js  | 213 ++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 205 insertions(+), 12 deletions(-)

diff --git a/rhemyn/README.md b/rhemyn/README.md
index 92da8a22..a33fc687 100644
--- a/rhemyn/README.md
+++ b/rhemyn/README.md
@@ -8,7 +8,7 @@ Made for use with Porffor but could possibly be adapted, implementation/library
 - Wasm function returned expects an i32 pointer to a UTF-16 string (can add UTF-8 option later if someone else actually wants to use this)
 
 ## syntax
-🟢 supported 🟡 partial 🟠 parsed only 🔴 unsupported
+🟢 supported 🟡 partial 🟠 parsed only
 
 - 🟢 literal characters (eg `a`)
 - 🟢 escaping (eg `\.\n\cJ\x0a\u000a`)
@@ -19,7 +19,7 @@ Made for use with Porffor but could possibly be adapted, implementation/library
 - 🟢 sets (eg `[ab]`)
   - 🟢 ranges (eg `[a-z]`)
   - 🟢 negated sets (eg `[^ab]`)
-  - 🔴 unicode set expressions (eg `[[a-z]&&[b-e]]`)
+  - 🟠 unicode set expressions (eg `[[a-z]&&[b-e]]`)
 - 🟢 metacharacters
   - 🟢 dot (eg `a.b`)
   - 🟢 digit, not digit (eg `\d\D`)
diff --git a/rhemyn/parse.js b/rhemyn/parse.js
index 36ccdbfc..0909e63e 100644
--- a/rhemyn/parse.js
+++ b/rhemyn/parse.js
@@ -56,8 +56,8 @@ const parseEscape = (str, index, inSet, unicodeMode, unicodeSetsMode) => {
         throw new SyntaxError('Unterminated unicode character escape');
       }
       const hexStr = str.substring(index, endIndex);
-      if (/[^0-9a-fA-F]/.test(hexStr)) {
-        throw new SyntaxError('Invalid unicode character escape, expected /\\\\u\\{[0-9a-fA-F]*\\}/');
+      if (hexStr.length === 0 || /[^0-9a-fA-F]/.test(hexStr)) {
+        throw new SyntaxError('Invalid unicode character escape, expected /\\\\u\\{[0-9a-fA-F]+\\}/');
       }
       const code = parseInt(hexStr, 16);
       if (code >= 0x110000) {
@@ -76,7 +76,19 @@ const parseEscape = (str, index, inSet, unicodeMode, unicodeSetsMode) => {
       return [ c, index ];
     }
     const code = parseInt(next, 16);
-    return [ String.fromCharCode(code), index + count ];
+    index += count;
+    if (unicodeMode && inSet && code >= 0xD800 && code <= 0xDBFF && str[index] === '\\' && str[index + 1] === 'u') {
+      // high surrogate followed by another \u escape: try to combine both into one code point
+      // only matters within a character class
+      const hexStr = str.substr(index + 2, 4);
+      if (hexStr.length >= 4 && !/[^0-9a-fA-F]/.test(hexStr)) { // validate the second escape's digits, not the first's
+        const code2 = parseInt(hexStr, 16);
+        if (code2 >= 0xDC00 && code2 <= 0xDFFF) {
+          return [ String.fromCharCode(code) + String.fromCharCode(code2), index + 6 ];
+        }
+      }
+    }
+    return [ String.fromCharCode(code), index ];
   }
   if (inSet && c === 'b') {
     return [ '\b', index ];
@@ -111,7 +123,7 @@ const unicodeClassEscapeNode = (property, value, negated) => ({
   negated
 });
 
-const parseClassEscape = (str, index, unicodeMode) => {
+const parseClassEscape = (str, index, unicodeMode, unicodeSetsMode) => {
   switch (str[index]) {
     case 'd': return [ classEscapeNode('Digit', false), index + 1 ];
     case 'D': return [ classEscapeNode('Digit', true), index + 1 ];
@@ -142,7 +154,7 @@ const parseClassEscape = (str, index, unicodeMode) => {
         property = property.substring(0, eq);
       }
       // todo: validate unicode property
-      return [ unicodeClassEscapeNode(property, value, negated), index ];
+      return [ unicodeClassEscapeNode(property, value, negated), endIndex + 1 ];
     default:
       return [ null, index ];
   }
@@ -166,6 +178,14 @@ const parseSet = (str, index, unicodeMode, unicodeSetsMode) => {
       if (c === ']') {
         return [ node, index ];
       }
+      if (unicodeMode && c >= '\uD800' && c <= '\uDBFF') {
+        let cx = str[index];
+        if (cx >= '\uDC00' && cx <= '\uDFFF') {
+          // surrogate pair
+          index++;
+          c += cx;
+        }
+      }
       if (c === '\\') {
         const [ escape, newIndex ] = parseClassEscape(str, index, unicodeMode);
         if (escape) {
@@ -193,13 +213,19 @@ const parseSet = (str, index, unicodeMode, unicodeSetsMode) => {
       }
       index++;
       let c2 = str[index++];
+      if (unicodeMode && c2 >= '\uD800' && c2 <= '\uDBFF') {
+        let cx = str[index];
+        if (cx >= '\uDC00' && cx <= '\uDFFF') {
+          // surrogate pair
+          index++;
+          c2 += cx;
+        }
+      }
       if (c2 === ']') {
         if (c) {
           node.body.push(charNode(c));
-        } else if (unicodeMode) {
-          throw new SyntaxError('Cannot use class escape within range in character class');
+          node.body.push(charNode('-'));
         }
-        node.body.push(charNode('-'));
         return [ node, index ];
       }
       if (c2 === '\\') {
@@ -224,7 +250,7 @@ const parseSet = (str, index, unicodeMode, unicodeSetsMode) => {
         }
         if (c2) node.body.push(charNode(c2));
       } else {
-        if (c > c2) {
+        if (c.codePointAt(0) > c2.codePointAt(0)) {
           throw new SyntaxError('Range out of order in character class');
         }
         node.body.push(rangeNode(c, c2));
@@ -232,7 +258,174 @@ const parseSet = (str, index, unicodeMode, unicodeSetsMode) => {
     }
     throw new SyntaxError('Unclosed character class');
   }
-  // todo: unicode sets
+  let node = {
+    type: 'Set',
+    body: [],
+    negated
+  };
+  let parents = [];
+  let allowOperand = true;
+  nextItem: while (index < str.length) {
+    let c = str[index++];
+    if (c === ']') {
+      if (allowOperand && node.type !== 'Set') {
+        throw new SyntaxError('Trailing set operation ' + (node.type === 'SetIntersection' ? '&&' : '--'));
+      }
+      let parent = parents.pop();
+      if (!parent) {
+        return [ node, index ];
+      }
+      node = parent;
+      allowOperand = node.type === 'Set';
+      continue;
+    }
+    if (c === str[index] && /[&\-!#\$%\*\+,\.:;<=>\?@\^`~]/.test(c)) {
+      // double punctuator
+      index++;
+      if (c !== '&' && c !== '-') {
+        throw new SyntaxError(`Invalid set operation ${c}${c}, only && (intersection) and -- (subtraction) are allowed`);
+      }
+      if (node.body.length === 0) {
+        throw new SyntaxError(`Unexpected set operation ${c}${c} at start of character class`);
+      }
+      if (node.type !== 'Set' && allowOperand) {
+        throw new SyntaxError(`Unexpected set operation ${c}${c} directly after other set operation`);
+      }
+      if (c === '&') {
+        if (node.body.length === 1) {
+          node.type = 'SetIntersection';
+        } else if (node.type !== 'SetIntersection') {
+          throw new SyntaxError('Unexpected set intersection, previously ' + (node.type === 'Set' ? 'union' : 'subtraction'));
+        }
+      } else if (c === '-') {
+        if (node.body.length === 1) {
+          node.type = 'SetSubtraction';
+        } else if (node.type !== 'SetSubtraction') {
+          throw new SyntaxError('Unexpected set subtraction, previously ' + (node.type === 'Set' ? 'union' : 'intersection'));
+        }
+      }
+      if (node.body[0].type === 'Range') {
+        throw new SyntaxError('Range not allowed in set ' + (c === '&' ? 'intersection' : 'subtraction') + ', wrap in []');
+      }
+      allowOperand = true;
+      continue;
+    }
+    if (!allowOperand) {
+      throw new SyntaxError('Unexpected set union, previously ' + (node.type === 'SetIntersection' ? 'intersection' : 'subtraction'));
+    }
+    if (c === '[') {
+      negated = false;
+      if (str[index] === '^') {
+        negated = true;
+        index++;
+      }
+      let newNode = {
+        type: 'Set',
+        body: [],
+        negated
+      };
+      node.body.push(newNode);
+      parents.push(node);
+      node = newNode;
+      allowOperand = true;
+      continue;
+    }
+    if (c === '-') {
+      throw new SyntaxError("Range character '-' has no associated starting character");
+    }
+    if (/[\(\)\{\}\/\|]/.test(c)) {
+      throw new SyntaxError(`Unexpected '${c}' in character class`);
+    }
+    if (c >= '\uD800' && c <= '\uDBFF') {
+      let cx = str[index];
+      if (cx >= '\uDC00' && cx <= '\uDFFF') {
+        // surrogate pair
+        index++;
+        c += cx;
+      }
+    } else if (c === '\\') {
+      // escape sequence or \q{...}
+      if (str[index] === 'q') {
+        // class string disjunction
+        if (str[index + 1] !== '{') {
+          throw new SyntaxError('Invalid escape sequence \\q, expected class string disjunction \\q{...}');
+        }
+        index += 2;
+        let string = '';
+        while (index < str.length) {
+          c = str[index++];
+          if (c === '}') {
+            node.body.push({
+              type: 'ClassStringDisjunction',
+              string
+            });
+            allowOperand = node.type === 'Set';
+            continue nextItem; // leave the \q{...} scanner and resume with the next class item
+          }
+          if (c === str[index] && /[&\-!#\$%\*\+,\.:;<=>\?@\^`~]/.test(c)) {
+            throw new SyntaxError(`Class string disjunction may not contain set operation ${c}${c}, use escaping`);
+          }
+          if (/[\(\)\[\]\{\}\/\-\|]/.test(c)) { // '\\' is excluded here: it starts an escape, handled below
+            throw new SyntaxError(`Class string disjunction may not contain set syntax character '${c}', use escaping`);
+          }
+          if (c === '\\') {
+            const [ char, newIndex ] = parseEscape(str, index, true, true, true);
+            c = char;
+            index = newIndex;
+          }
+          string += c;
+        }
+        throw new SyntaxError('Unclosed class string disjunction');
+      }
+      const [ escape, newIndex ] = parseClassEscape(str, index, true, true);
+      if (escape) {
+        node.body.push(escape);
+        allowOperand = node.type === 'Set';
+        index = newIndex;
+        continue;
+      }
+      const [ char, newIndex2 ] = parseEscape(str, index, true, true, true);
+      c = char;
+      index = newIndex2;
+    }
+    if (str[index] === '-' && str[index + 1] !== '-') { // a following '--' is the subtraction operator, not a range
+      // range
+      if (node.type !== 'Set') {
+        throw new SyntaxError('Range not allowed in set ' + (node.type === 'SetIntersection' ? 'intersection' : 'subtraction') + ', wrap in []');
+      }
+      index++;
+      let c2 = str[index++];
+      if (!c2) {
+        throw new SyntaxError('Unexpected end after range');
+      }
+      if (c2 === str[index] && /[&\-!#\$%\*\+,\.:;<=>\?@\^`~]/.test(c2)) {
+        throw new SyntaxError(`Range may not end with a set operation (${c2}${c2})`);
+      }
+      if (/[\(\)\[\]\{\}\/\-\|]/.test(c2)) { // check the raw range end; '\\' is handled as an escape below
+        throw new SyntaxError(`Range may not contain set syntax character '${c2}', use escaping`);
+      }
+      if (c2 >= '\uD800' && c2 <= '\uDBFF') {
+        let cx = str[index];
+        if (cx >= '\uDC00' && cx <= '\uDFFF') {
+          // surrogate pair
+          index++;
+          c2 += cx;
+        }
+      } else if (c2 === '\\') {
+        const [ char, newIndex ] = parseEscape(str, index, true, true, true);
+        c2 = char;
+        index = newIndex;
+      }
+      if (c.codePointAt(0) > c2.codePointAt(0)) {
+        throw new SyntaxError('Range out of order in character class');
+      }
+      node.body.push(rangeNode(c, c2));
+    } else {
+      node.body.push(charNode(c));
+    }
+    allowOperand = node.type === 'Set';
+  }
+  throw new SyntaxError('Unclosed character class');
 };
 
 const parseParenthesizedType = (str, index) => {