Write a virtual machine for C++ regexes (#502)

The C++ standard library comes with a very slow regular-expression engine in most implementations. For example, the implementation shipped with the GNU C++ compiler seems to run in exponential time (see this bug report: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93502 ). This is impractical for most real-world inputs. Therefore, we implement a virtual machine for matching regular expressions which runs in ``O(m * n)`` time (where ``m`` is the number of instructions in the regular expression and ``n`` is the length of the input). The original approach was described in: Thompson, K., "Regular expression search algorithm", Communications of the ACM 11(6), 1968. We follow this blog post: https://swtch.com/~rsc/regexp/regexp2.html
aas-core-works · Jun 20, 2024 · a2f329f · a2f329f
1 parent 1a50e13
commit a2f329f
Show file tree

Hide file tree

Showing 73 changed files with 75,798 additions and 2,285 deletions.
diff --git a/aas_core_codegen/cpp/common.py b/aas_core_codegen/cpp/common.py
@@ -65,7 +65,13 @@ def wstring_literal(text: str) -> Stripped:
         elif code_point < 32:
             # Non-printable ASCII characters
             escaped.append(f"\\x{ord(character):x}")
-        elif 255 < code_point < 65536:
+        elif code_point <= 127:
+            # ASCII
+            escaped.append(character)
+        elif 127 < code_point < 255:
+            # Above ASCII, but can be encoded as a single byte
+            escaped.append(f"\\x{ord(character):x}")
+        elif 255 <= code_point < 65536:
             # Above ASCII
             escaped.append(f"\\u{ord(character):04x}")
         elif code_point >= 65536:
@@ -79,6 +85,64 @@ def wstring_literal(text: str) -> Stripped:
     return Stripped('L"{}"'.format("".join(escaped)))
 
 
+@require(lambda character: len(character) == 1)
+def wchar_literal(character: str) -> Stripped:
+    """Generate a C++ wide character literal from the ``character``."""
+    code_point = ord(character)
+
+    escaped: str
+
+    if character == "\a":
+        escaped = "L'\\a'"
+    elif character == "\b":
+        escaped = "L'\\b'"
+    elif character == "\f":
+        escaped = "L'\\f'"
+    elif character == "\n":
+        escaped = "L'\\n'"
+    elif character == "\r":
+        escaped = "L'\\r'"
+    elif character == "\t":
+        escaped = "L'\\t'"
+    elif character == "\v":
+        escaped = "L'\\v'"
+    elif character == "'":
+        escaped = "L'\\''"
+    elif character == "\\":
+        escaped = "L'\\\\'"
+    elif code_point < 32:
+        # Non-printable ASCII characters
+        escaped = f"L'\\x{ord(character):x}'"
+    elif code_point <= 127:
+        # ASCII
+        escaped = f"L'{character}'"
+    elif 127 < code_point < 255:
+        # Above ASCII, but can be encoded as a single byte
+        escaped = f"L'\\x{ord(character):x}'"
+    elif 0xD800 <= code_point <= 0xDFFF:
+        # NOTE (mristin):
+        # These are the surrogate code points and cannot be represented as wide
+        # character literals directly as common compilers such as MSVC++ will complain.
+        #
+        # We have to fool the compiler at this point as we deliberately want to model
+        # the surrogate point.
+        escaped = f"static_cast<wchar_t>(0x{ord(character):04x})"
+
+    # NOTE (mristin):
+    # Mind the intersecting range for surrogate points just above if you ever convert
+    # this if-elif-else statement into a mapping or pattern matching.
+    elif 255 <= code_point < 65536:
+        # Above ASCII
+        escaped = f"L'\\u{ord(character):04x}'"
+    elif code_point >= 65536:
+        # Above Unicode Basic Multilingual Plane
+        escaped = f"L'\\U{ord(character):08x}'"
+    else:
+        raise AssertionError(f"Unexpected unhandled character: {character!r}")
+
+    return Stripped(escaped)
+
+
 # fmt: off
 # NOTE (mristin, 2023-09-24):
 # We use a pre-condition here to simplify the client code. The client must check
@@ -87,7 +151,7 @@ def wstring_literal(text: str) -> Stripped:
 @require(
     lambda text:
     all(
-        ord(character) <= 255
+        ord(character) <= 127
         for character in text
     ),
     "Only ASCII text can be converted to a C++ string literal, otherwise encoding "
@@ -124,7 +188,7 @@ def string_literal(text: str) -> Stripped:
         elif code_point < 32:
             # Non-printable ASCII characters
             escaped.append(f"\\x{ord(character):x}")
-        elif code_point <= 255:
+        elif code_point <= 127:
             escaped.append(character)
         else:
             # Above ASCII
@@ -159,14 +223,20 @@ def needs_escaping(text: str) -> bool:
         elif code_point < 32:
             # Non-printable ASCII characters
             return True
-        elif 255 < code_point < 65536:
+        elif code_point <= 127:
+            # ASCII
+            continue
+        elif 127 < code_point < 255:
+            # Above ASCII, but can be encoded as a single byte
+            return True
+        elif 255 <= code_point < 65536:
             # Above ASCII
             return True
         elif code_point >= 65536:
             # Above Unicode Binary Multilingual Pane
             return True
         else:
-            pass
+            raise AssertionError(f"Unexpected unhandled character: {character!r}")
 
     return False
 
@@ -286,6 +356,8 @@ def _assert_all_primitive_types_are_mapped() -> None:
 VERIFICATION_NAMESPACE = Identifier("verification")
 JSONIZATION_NAMESPACE = Identifier("jsonization")
 XMLIZATION_NAMESPACE = Identifier("xmlization")
+REVM_NAMESPACE = Identifier("revm")
+PATTERN_NAMESPACE = Identifier("pattern")
 
 
 def generate_primitive_type(primitive_type: intermediate.PrimitiveType) -> Stripped:

diff --git a/aas_core_codegen/cpp/main.py b/aas_core_codegen/cpp/main.py
@@ -9,6 +9,8 @@
     enhancing as cpp_enhancing,
     iteration as cpp_iteration,
     jsonization as cpp_jsonization,
+    pattern as cpp_pattern,
+    revm as cpp_revm,
     stringification as cpp_stringification,
     structure as cpp_structure,
     verification as cpp_verification,
@@ -320,6 +322,85 @@ def execute(context: run.Context, stdout: TextIO, stderr: TextIO) -> int:
         return 1
     # endregion
 
+    # region Pattern
+    code = cpp_pattern.generate_header(
+        symbol_table=context.symbol_table, library_namespace=namespace
+    )
+
+    pth = context.output_dir / "pattern.hpp"
+    try:
+        pth.write_text(code, encoding="utf-8")
+    except Exception as exception:
+        run.write_error_report(
+            message=(f"Failed to write the header for the C++ pattern code to {pth}"),
+            errors=[str(exception)],
+            stderr=stderr,
+        )
+        return 1
+
+    code, errors = cpp_pattern.generate_implementation(
+        symbol_table=context.symbol_table,
+        library_namespace=namespace,
+    )
+
+    if errors is not None:
+        run.write_error_report(
+            message=(
+                f"Failed to generate the implementation of the C++ pattern code "
+                f"based on {context.model_path}"
+            ),
+            errors=[context.lineno_columner.error_message(error) for error in errors],
+            stderr=stderr,
+        )
+        return 1
+    assert code is not None
+
+    pth = context.output_dir / "pattern.cpp"
+    try:
+        pth.write_text(code, encoding="utf-8")
+    except Exception as exception:
+        run.write_error_report(
+            message=(
+                f"Failed to write the implementation of the C++ pattern code to {pth}"
+            ),
+            errors=[str(exception)],
+            stderr=stderr,
+        )
+        return 1
+    # endregion
+
+    # region REVM
+    code = cpp_revm.generate_header(library_namespace=namespace)
+
+    pth = context.output_dir / "revm.hpp"
+    try:
+        pth.write_text(code, encoding="utf-8")
+    except Exception as exception:
+        run.write_error_report(
+            message=(f"Failed to write the header for the C++ REVM code to {pth}"),
+            errors=[str(exception)],
+            stderr=stderr,
+        )
+        return 1
+
+    code = cpp_revm.generate_implementation(
+        library_namespace=namespace,
+    )
+
+    pth = context.output_dir / "revm.cpp"
+    try:
+        pth.write_text(code, encoding="utf-8")
+    except Exception as exception:
+        run.write_error_report(
+            message=(
+                f"Failed to write the implementation of the C++ REVM code to {pth}"
+            ),
+            errors=[str(exception)],
+            stderr=stderr,
+        )
+        return 1
+    # endregion
+
     # region Stringification
     code = cpp_stringification.generate_header(
         symbol_table=context.symbol_table, library_namespace=namespace

diff --git a/aas_core_codegen/cpp/pattern/__init__.py b/aas_core_codegen/cpp/pattern/__init__.py
@@ -0,0 +1,6 @@
+"""Generate instructions for a virtual machine for matching regular expressions."""
+
+from aas_core_codegen.cpp.pattern import _generate
+
+generate_header = _generate.generate_header
+generate_implementation = _generate.generate_implementation